From 1983c75938975dc90b14d0cdb42bed2bb5761328 Mon Sep 17 00:00:00 2001 From: Pedram Navid <1045990+PedramNavid@users.noreply.github.com> Date: Wed, 11 Oct 2023 17:35:50 -0700 Subject: [PATCH] [docs] Add tested snippets to embedded-elt ## Summary & Motivation This fixes some formatting issues in a Python code block and uses snippets for the code blocks in the example --- docs/content/api/modules.json | 2 +- docs/content/integrations/embedded-elt.mdx | 58 +++++++++++++++---- .../integrations/embedded_elt/__init__.py | 0 .../embedded_elt/postgres_snowflake.py | 42 ++++++++++++++ .../integrations/embedded_elt/s3_snowflake.py | 42 ++++++++++++++ .../integrations_tests/test_embedded_elt.py | 11 ++++ examples/docs_snippets/setup.py | 1 + examples/docs_snippets/tox.ini | 1 + .../dbt_project/target/manifest.json | 2 +- 9 files changed, 146 insertions(+), 13 deletions(-) create mode 100644 examples/docs_snippets/docs_snippets/integrations/embedded_elt/__init__.py create mode 100644 examples/docs_snippets/docs_snippets/integrations/embedded_elt/postgres_snowflake.py create mode 100644 examples/docs_snippets/docs_snippets/integrations/embedded_elt/s3_snowflake.py create mode 100644 examples/docs_snippets/docs_snippets_tests/integrations_tests/test_embedded_elt.py diff --git a/docs/content/api/modules.json b/docs/content/api/modules.json index ee29ad2ee14d1..a2b134013878e 100644 --- a/docs/content/api/modules.json +++ b/docs/content/api/modules.json @@ -1 +1 @@ -{"": {"dagster_pandera": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandera

\nimport itertools\nimport re\nfrom typing import TYPE_CHECKING, Callable, Sequence, Type, Union\n\nimport dagster._check as check\nimport pandas as pd\nimport pandera as pa\nfrom dagster import (\n    DagsterType,\n    TableColumn,\n    TableColumnConstraints,\n    TableConstraints,\n    TableSchema,\n    TypeCheck,\n    TypeCheckContext,\n)\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.libraries import DagsterLibraryRegistry\n\nfrom .version import __version__\n\n# NOTE: Pandera supports multiple dataframe libraries. Most of the alternatives\n# to pandas implement a pandas-like API wrapper around an underlying library\n# that can handle big data (a weakness of pandas). Typically this means the\n# data is only partly loaded into memory, or is distributed across multiple\n# nodes. Because Dagster types perform runtime validation within a single\n# Python process, it's not clear at present how to interface the more complex\n# validation computations on distributed dataframes with Dagster Types.\n\n# Therefore, for the time being dagster-pandera only supports pandas dataframes.\n# However, some commented-out scaffolding has been left in place for support of\n# alternatives in the future. These sections are marked with "TODO: pending\n# alternative dataframe support".\n\nif TYPE_CHECKING:\n    ValidatableDataFrame = pd.DataFrame\n\nDagsterLibraryRegistry.register("dagster-pandera", __version__)\n\n# ########################\n# ##### VALID DATAFRAME CLASSES\n# ########################\n\n# This layer of indirection is used because we may support alternative dataframe classes in the\n# future.\nVALID_DATAFRAME_CLASSES = (pd.DataFrame,)\n\n\n# ########################\n# ##### PANDERA SCHEMA TO DAGSTER TYPE\n# ########################\n\n\n
[docs]def pandera_schema_to_dagster_type(\n schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],\n) -> DagsterType:\n """Convert a Pandera dataframe schema to a `DagsterType`.\n\n The generated Dagster type will be given an automatically generated `name`. The schema's `title`\n property, `name` property, or class name (in that order) will be used. If neither `title` or\n `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.\n\n Additional metadata is also extracted from the Pandera schema and attached to the returned\n `DagsterType` as a metadata dictionary. The extracted metadata includes:\n\n - Descriptions on the schema and constituent columns and checks.\n - Data types for each column.\n - String representations of all column-wise checks.\n - String representations of all row-wise (i.e. "wide") checks.\n\n The returned `DagsterType` type will call the Pandera schema's `validate()` method in its type\n check function. Validation is done in `lazy` mode, i.e. 
pandera will attempt to validate all\n values in the dataframe, rather than stopping on the first error.\n\n If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:\n\n - `num_failures` total number of validation errors.\n - `failure_sample` a table containing up to the first 10 validation errors.\n\n Args:\n schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]):\n\n Returns:\n DagsterType: Dagster Type constructed from the Pandera schema.\n\n """\n if not (\n isinstance(schema, pa.DataFrameSchema)\n or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))\n ):\n raise TypeError(\n "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"\n )\n\n name = _extract_name_from_pandera_schema(schema)\n norm_schema = (\n schema.to_schema()\n if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)\n else schema\n )\n tschema = _pandera_schema_to_table_schema(norm_schema)\n type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)\n\n return DagsterType(\n type_check_fn=type_check_fn,\n name=name,\n description=norm_schema.description,\n metadata={\n "schema": MetadataValue.table_schema(tschema),\n },\n typing_type=pd.DataFrame,\n )
\n\n\n# call next() on this to generate next unique Dagster Type name for anonymous schemas\n_anonymous_schema_name_generator = (f"DagsterPanderaDataframe{i}" for i in itertools.count(start=1))\n\n\ndef _extract_name_from_pandera_schema(\n schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],\n) -> str:\n if isinstance(schema, type) and issubclass(schema, pa.SchemaModel):\n return (\n getattr(schema.Config, "title", None)\n or getattr(schema.Config, "name", None)\n or schema.__name__\n )\n elif isinstance(schema, pa.DataFrameSchema):\n return schema.title or schema.name or next(_anonymous_schema_name_generator)\n\n\ndef _pandera_schema_to_type_check_fn(\n schema: pa.DataFrameSchema,\n table_schema: TableSchema,\n) -> Callable[[TypeCheckContext, object], TypeCheck]:\n def type_check_fn(_context, value: object) -> TypeCheck:\n if isinstance(value, VALID_DATAFRAME_CLASSES):\n try:\n # `lazy` instructs pandera to capture every (not just the first) validation error\n schema.validate(value, lazy=True)\n except pa.errors.SchemaErrors as e:\n return _pandera_errors_to_type_check(e, table_schema)\n except Exception as e:\n return TypeCheck(\n success=False,\n description=f"Unexpected error during validation: {e}",\n )\n else:\n return TypeCheck(\n success=False,\n description=(\n f"Must be one of {VALID_DATAFRAME_CLASSES}, not {type(value).__name__}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check_fn\n\n\nPANDERA_FAILURE_CASES_SCHEMA = TableSchema(\n columns=[\n TableColumn(\n name="schema_context",\n type="string",\n description="`Column` for column-wise checks, or `DataFrameSchema`",\n ),\n TableColumn(\n name="column",\n type="string",\n description="Column of value that failed the check, or `None` for wide checks.",\n ),\n TableColumn(\n name="check", type="string", description="Description of the failed Pandera check."\n ),\n TableColumn(name="check_number", description="Index of the failed check."),\n TableColumn(\n name="failure_case", 
type="number | string", description="Value that failed a check."\n ),\n TableColumn(\n name="index",\n type="number | string",\n description="Index (row) of value that failed a check.",\n ),\n ]\n)\n\n\ndef _pandera_errors_to_type_check(\n error: pa.errors.SchemaErrors, _table_schema: TableSchema\n) -> TypeCheck:\n return TypeCheck(\n success=False,\n description=str(error),\n )\n\n\ndef _pandera_schema_to_table_schema(schema: pa.DataFrameSchema) -> TableSchema:\n df_constraints = _pandera_schema_wide_checks_to_table_constraints(schema.checks)\n columns = [_pandera_column_to_table_column(col) for k, col in schema.columns.items()]\n return TableSchema(columns=columns, constraints=df_constraints)\n\n\ndef _pandera_schema_wide_checks_to_table_constraints(\n checks: Sequence[Union[pa.Check, pa.Hypothesis]]\n) -> TableConstraints:\n return TableConstraints(other=[_pandera_check_to_table_constraint(check) for check in checks])\n\n\ndef _pandera_check_to_table_constraint(pa_check: Union[pa.Check, pa.Hypothesis]) -> str:\n return _get_pandera_check_identifier(pa_check)\n\n\ndef _pandera_column_to_table_column(pa_column: pa.Column) -> TableColumn:\n constraints = TableColumnConstraints(\n nullable=pa_column.nullable,\n unique=pa_column.unique,\n other=[_pandera_check_to_column_constraint(pa_check) for pa_check in pa_column.checks],\n )\n name = check.not_none(pa_column.name, "name")\n name = name if isinstance(name, str) else "/".join(name)\n return TableColumn(\n name=name,\n type=str(pa_column.dtype),\n description=pa_column.description,\n constraints=constraints,\n )\n\n\nCHECK_OPERATORS = {\n "equal_to": "==",\n "not_equal_to": "!=",\n "less_than": "<",\n "less_than_or_equal_to": "<=",\n "greater_than": ">",\n "greater_than_or_equal_to": ">=",\n}\n\n\ndef _extract_operand(error_str: str) -> str:\n match = re.search(r"(?<=\\().+(?=\\))", error_str)\n return match.group(0) if match else ""\n\n\ndef _pandera_check_to_column_constraint(pa_check: pa.Check) -> str:\n if 
pa_check.description:\n return pa_check.description\n elif pa_check.name in CHECK_OPERATORS:\n assert isinstance(\n pa_check.error, str\n ), "Expected pandera check to have string `error` attr."\n return f"{CHECK_OPERATORS[pa_check.name]} {_extract_operand(pa_check.error)}"\n else:\n return _get_pandera_check_identifier(pa_check)\n\n\ndef _get_pandera_check_identifier(pa_check: Union[pa.Check, pa.Hypothesis]) -> str:\n return pa_check.description or pa_check.error or pa_check.name or str(pa_check)\n\n\n__all__ = [\n "pandera_schema_to_dagster_type",\n]\n
", "current_page_name": "_modules/dagster_pandera", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandera"}, "dagster_pipes": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pipes

\nimport base64\nimport datetime\nimport json\nimport logging\nimport os\nimport sys\nimport time\nimport warnings\nimport zlib\nfrom abc import ABC, abstractmethod\nfrom contextlib import ExitStack, contextmanager\nfrom io import StringIO\nfrom queue import Queue\nfrom threading import Event, Thread\nfrom typing import (\n    IO,\n    TYPE_CHECKING,\n    Any,\n    ClassVar,\n    Dict,\n    Generic,\n    Iterable,\n    Iterator,\n    Literal,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    TextIO,\n    Type,\n    TypedDict,\n    TypeVar,\n    Union,\n    cast,\n    get_args,\n)\n\nif TYPE_CHECKING:\n    from unittest.mock import MagicMock\n\n# ########################\n# ##### PROTOCOL\n# ########################\n\n# This represents the version of the protocol, rather than the version of the package. It must be\n# manually updated whenever there are changes to the protocol.\nPIPES_PROTOCOL_VERSION = "0.1"\n\nPipesExtras = Mapping[str, Any]\nPipesParams = Mapping[str, Any]\n\n\n# ##### MESSAGE\n\n\ndef _make_message(method: str, params: Optional[Mapping[str, Any]]) -> "PipesMessage":\n    return {\n        PIPES_PROTOCOL_VERSION_FIELD: PIPES_PROTOCOL_VERSION,\n        "method": method,\n        "params": params,\n    }\n\n\n# Can't use a constant for TypedDict key so this value is repeated in `ExtMessage` defn.\nPIPES_PROTOCOL_VERSION_FIELD = "__dagster_pipes_version"\n\n\nclass PipesMessage(TypedDict):\n    """A message sent from the external process to the orchestration process."""\n\n    __dagster_pipes_version: str\n    method: str\n    params: Optional[Mapping[str, Any]]\n\n\n###### PIPES CONTEXT\n\n\nclass PipesContextData(TypedDict):\n    """The serializable data passed from the orchestration process to the external process. 
This gets\n    wrapped in a :py:class:`PipesContext`.\n    """\n\n    asset_keys: Optional[Sequence[str]]\n    code_version_by_asset_key: Optional[Mapping[str, Optional[str]]]\n    provenance_by_asset_key: Optional[Mapping[str, Optional["PipesDataProvenance"]]]\n    partition_key: Optional[str]\n    partition_key_range: Optional["PipesPartitionKeyRange"]\n    partition_time_window: Optional["PipesTimeWindow"]\n    run_id: str\n    job_name: Optional[str]\n    retry_number: int\n    extras: Mapping[str, Any]\n\n\nclass PipesPartitionKeyRange(TypedDict):\n    """A range of partition keys."""\n\n    start: str\n    end: str\n\n\nclass PipesTimeWindow(TypedDict):\n    """A span of time delimited by a start and end timestamp. This is defined for time-based partitioning schemes."""\n\n    start: str  # timestamp\n    end: str  # timestamp\n\n\nclass PipesDataProvenance(TypedDict):\n    """Provenance information for an asset."""\n\n    code_version: str\n    input_data_versions: Mapping[str, str]\n    is_user_provided: bool\n\n\nPipesAssetCheckSeverity = Literal["WARN", "ERROR"]\n\nPipesMetadataRawValue = Union[int, float, str, Mapping[str, Any], Sequence[Any], bool, None]\n\n\nclass PipesMetadataValue(TypedDict):\n    type: "PipesMetadataType"\n    raw_value: PipesMetadataRawValue\n\n\n# Infer the type from the raw value on the orchestration end\nPIPES_METADATA_TYPE_INFER = "__infer__"\n\nPipesMetadataType = Literal[\n    "__infer__",\n    "text",\n    "url",\n    "path",\n    "notebook",\n    "json",\n    "md",\n    "float",\n    "int",\n    "bool",\n    "dagster_run",\n    "asset",\n    "null",\n]\n\n# ########################\n# ##### UTIL\n# ########################\n\n_T = TypeVar("_T")\n\n\n
[docs]class DagsterPipesError(Exception):\n pass
\n\n\n
[docs]class DagsterPipesWarning(Warning):\n pass
\n\n\ndef _assert_not_none(value: Optional[_T], desc: Optional[str] = None) -> _T:\n if value is None:\n raise DagsterPipesError(f"Missing required property: {desc}")\n return value\n\n\ndef _assert_defined_asset_property(value: Optional[_T], key: str) -> _T:\n return _assert_not_none(value, f"`{key}` is undefined. Current step does not target an asset.")\n\n\n# This should only be called under the precondition that the current step targets assets.\ndef _assert_single_asset(data: PipesContextData, key: str) -> None:\n asset_keys = data["asset_keys"]\n assert asset_keys is not None\n if len(asset_keys) != 1:\n raise DagsterPipesError(f"`{key}` is undefined. Current step targets multiple assets.")\n\n\ndef _resolve_optionally_passed_asset_key(\n data: PipesContextData,\n asset_key: Optional[str],\n method: str,\n) -> str:\n asset_key = _assert_opt_param_type(asset_key, str, method, "asset_key")\n\n defined_asset_keys = data["asset_keys"]\n if defined_asset_keys:\n if asset_key and asset_key not in defined_asset_keys:\n raise DagsterPipesError(\n f"Invalid asset key. Expected one of `{defined_asset_keys}`, got `{asset_key}`."\n )\n if not asset_key:\n if len(defined_asset_keys) != 1:\n raise DagsterPipesError(\n f"Calling `{method}` without passing an asset key is undefined. Current step"\n " targets multiple assets."\n )\n asset_key = defined_asset_keys[0]\n\n if not asset_key:\n raise DagsterPipesError(\n f"Calling `{method}` without passing an asset key is undefined. Current step"\n " does not target a specific asset."\n )\n\n return asset_key\n\n\ndef _assert_defined_partition_property(value: Optional[_T], key: str) -> _T:\n return _assert_not_none(\n value, f"`{key}` is undefined. 
Current step does not target any partitions."\n )\n\n\n# This should only be called under the precondition that the current steps targets assets.\ndef _assert_single_partition(data: PipesContextData, key: str) -> None:\n partition_key_range = data["partition_key_range"]\n assert partition_key_range is not None\n if partition_key_range["start"] != partition_key_range["end"]:\n raise DagsterPipesError(f"`{key}` is undefined. Current step targets multiple partitions.")\n\n\ndef _assert_defined_extra(extras: PipesExtras, key: str) -> Any:\n if key not in extras:\n raise DagsterPipesError(f"Extra `{key}` is undefined. Extras must be provided by user.")\n return extras[key]\n\n\ndef _assert_param_type(value: _T, expected_type: Any, method: str, param: str) -> _T:\n if not isinstance(value, expected_type):\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. Expected `{expected_type}`, got"\n f" `{type(value)}`."\n )\n return value\n\n\ndef _assert_opt_param_type(value: _T, expected_type: Any, method: str, param: str) -> _T:\n if not (isinstance(value, expected_type) or value is None):\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. Expected"\n f" `Optional[{expected_type}]`, got `{type(value)}`."\n )\n return value\n\n\ndef _assert_env_param_type(\n env_params: PipesParams, key: str, expected_type: Type[_T], cls: Type\n) -> _T:\n value = env_params.get(key)\n if not isinstance(value, expected_type):\n raise DagsterPipesError(\n f"Invalid type for parameter `{key}` passed from orchestration side to"\n f" `{cls.__name__}`. 
Expected `{expected_type}`, got `{type(value)}`."\n )\n return value\n\n\ndef _assert_opt_env_param_type(\n env_params: PipesParams, key: str, expected_type: Type[_T], cls: Type\n) -> Optional[_T]:\n value = env_params.get(key)\n if value is not None and not isinstance(value, expected_type):\n raise DagsterPipesError(\n f"Invalid type for parameter `{key}` passed from orchestration side to"\n f" `{cls.__name__}`. Expected `Optional[{expected_type}]`, got `{type(value)}`."\n )\n return value\n\n\ndef _assert_param_value(value: _T, expected_values: Iterable[_T], method: str, param: str) -> _T:\n if value not in expected_values:\n raise DagsterPipesError(\n f"Invalid value for parameter `{param}` of `{method}`. Expected one of"\n f" `{expected_values}`, got `{value}`."\n )\n return value\n\n\ndef _assert_opt_param_value(\n value: _T, expected_values: Sequence[_T], method: str, param: str\n) -> _T:\n if value is not None and value not in expected_values:\n raise DagsterPipesError(\n f"Invalid value for parameter `{param}` of `{method}`. Expected one of"\n f" `{expected_values}`, got `{value}`."\n )\n return value\n\n\ndef _json_serialize_param(value: Any, method: str, param: str) -> str:\n try:\n serialized = json.dumps(value)\n except (TypeError, OverflowError):\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. 
Expected a JSON-serializable"\n f" type, got `{type(value)}`."\n )\n return serialized\n\n\n_METADATA_VALUE_KEYS = frozenset(PipesMetadataValue.__annotations__.keys())\n_METADATA_TYPES = frozenset(get_args(PipesMetadataType))\n\n\ndef _normalize_param_metadata(\n metadata: Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]],\n method: str,\n param: str,\n) -> Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]:\n _assert_param_type(metadata, dict, method, param)\n new_metadata: Dict[str, PipesMetadataValue] = {}\n for key, value in metadata.items():\n if not isinstance(key, str):\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. Expected a dict with string"\n f" keys, got a key `{key}` of type `{type(key)}`."\n )\n elif isinstance(value, dict):\n if not {*value.keys()} == _METADATA_VALUE_KEYS:\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. Expected a dict with"\n " string keys and values that are either raw metadata values or dictionaries"\n f" with schema `{{raw_value: ..., type: ...}}`. Got a value `{value}`."\n )\n _assert_param_value(value["type"], _METADATA_TYPES, method, f"{param}.{key}.type")\n new_metadata[key] = cast(PipesMetadataValue, value)\n else:\n new_metadata[key] = {"raw_value": value, "type": PIPES_METADATA_TYPE_INFER}\n return new_metadata\n\n\ndef _param_from_env_var(env_var: str) -> Any:\n raw_value = os.environ.get(env_var)\n return decode_env_var(raw_value) if raw_value is not None else None\n\n\n
[docs]def encode_env_var(value: Any) -> str:\n """Encode value by serializing to JSON, compressing with zlib, and finally encoding with base64.\n `base64_encode(compress(to_json(value)))` in function notation.\n\n Args:\n value (Any): The value to encode. Must be JSON-serializable.\n\n Returns:\n str: The encoded value.\n """\n serialized = _json_serialize_param(value, "encode_env_var", "value")\n compressed = zlib.compress(serialized.encode("utf-8"))\n encoded = base64.b64encode(compressed)\n return encoded.decode("utf-8") # as string
\n\n\n
[docs]def decode_env_var(value: str) -> Any:\n """Decode a value by decoding from base64, decompressing with zlib, and finally deserializing from\n JSON. `from_json(decompress(base64_decode(value)))` in function notation.\n\n Args:\n value (Any): The value to decode.\n\n Returns:\n Any: The decoded value.\n """\n decoded = base64.b64decode(value)\n decompressed = zlib.decompress(decoded)\n return json.loads(decompressed.decode("utf-8"))
\n\n\ndef _emit_orchestration_inactive_warning() -> None:\n warnings.warn(\n "This process was not launched by a Dagster orchestration process. All calls to the"\n " `dagster-pipes` context or attempts to initialize `dagster-pipes` abstractions"\n " are no-ops.",\n category=DagsterPipesWarning,\n )\n\n\ndef _get_mock() -> "MagicMock":\n from unittest.mock import MagicMock\n\n return MagicMock()\n\n\nclass _PipesLogger(logging.Logger):\n def __init__(self, context: "PipesContext") -> None:\n super().__init__(name="dagster-pipes")\n self.addHandler(_PipesLoggerHandler(context))\n\n\nclass _PipesLoggerHandler(logging.Handler):\n def __init__(self, context: "PipesContext") -> None:\n super().__init__()\n self._context = context\n\n def emit(self, record: logging.LogRecord) -> None:\n self._context._write_message( # noqa: SLF001\n "log", {"message": record.getMessage(), "level": record.levelname}\n )\n\n\n# ########################\n# ##### IO - BASE\n# ########################\n\n\n
[docs]class PipesContextLoader(ABC):\n
[docs] @abstractmethod\n @contextmanager\n def load_context(self, params: PipesParams) -> Iterator[PipesContextData]:\n """A `@contextmanager` that loads context data injected by the orchestration process.\n\n This method should read and yield the context data from the location specified by the passed in\n `PipesParams`.\n\n Args:\n params (PipesParams): The params provided by the context injector in the orchestration\n process.\n\n Yields:\n PipesContextData: The context data.\n """
\n\n\nT_MessageChannel = TypeVar("T_MessageChannel", bound="PipesMessageWriterChannel")\n\n\n
[docs]class PipesMessageWriter(ABC, Generic[T_MessageChannel]):\n
[docs] @abstractmethod\n @contextmanager\n def open(self, params: PipesParams) -> Iterator[T_MessageChannel]:\n """A `@contextmanager` that initializes a channel for writing messages back to Dagster.\n\n This method should takes the params passed by the orchestration-side\n :py:class:`PipesMessageReader` and use them to construct and yield a\n :py:class:`PipesMessageWriterChannel`.\n\n Args:\n params (PipesParams): The params provided by the message reader in the orchestration\n process.\n\n Yields:\n PipesMessageWriterChannel: Channel for writing messagse back to Dagster.\n """
\n\n\n
[docs]class PipesMessageWriterChannel(ABC, Generic[T_MessageChannel]):\n """Object that writes messages back to the Dagster orchestration process."""\n\n
[docs] @abstractmethod\n def write_message(self, message: PipesMessage) -> None:\n """Write a message to the orchestration process.\n\n Args:\n message (PipesMessage): The message to write.\n """
\n\n\n
[docs]class PipesParamsLoader(ABC):\n """Object that loads params passed from the orchestration process by the context injector and\n message reader. These params are used to respectively bootstrap the\n :py:class:`PipesContextLoader` and :py:class:`PipesMessageWriter`.\n """\n\n
[docs] @abstractmethod\n def is_dagster_pipes_process(self) -> bool:\n """Whether or not this process has been provided with provided with information to create\n a PipesContext or should instead return a mock.\n """
\n\n
[docs] @abstractmethod\n def load_context_params(self) -> PipesParams:\n """PipesParams: Load params passed by the orchestration-side context injector."""
\n\n
[docs] @abstractmethod\n def load_messages_params(self) -> PipesParams:\n """PipesParams: Load params passed by the orchestration-side message reader."""
\n\n\nT_BlobStoreMessageWriterChannel = TypeVar(\n "T_BlobStoreMessageWriterChannel", bound="PipesBlobStoreMessageWriterChannel"\n)\n\n\n
[docs]class PipesBlobStoreMessageWriter(PipesMessageWriter[T_BlobStoreMessageWriterChannel]):\n """Message writer channel that periodically uploads message chunks to some blob store endpoint."""\n\n def __init__(self, *, interval: float = 10):\n self.interval = interval\n\n
[docs] @contextmanager\n def open(self, params: PipesParams) -> Iterator[T_BlobStoreMessageWriterChannel]:\n """Construct and yield a :py:class:`PipesBlobStoreMessageWriterChannel`.\n\n Args:\n params (PipesParams): The params provided by the message reader in the orchestration\n process.\n\n Yields:\n PipesBlobStoreMessageWriterChannel: Channel that periodically uploads message chunks to\n a blob store.\n """\n channel = self.make_channel(params)\n with channel.buffered_upload_loop():\n yield channel
\n\n
[docs] @abstractmethod\n def make_channel(self, params: PipesParams) -> T_BlobStoreMessageWriterChannel: ...
\n\n\n
[docs]class PipesBlobStoreMessageWriterChannel(PipesMessageWriterChannel):\n """Message writer channel that periodically uploads message chunks to some blob store endpoint."""\n\n def __init__(self, *, interval: float = 10):\n self._interval = interval\n self._buffer: Queue[PipesMessage] = Queue()\n self._counter = 1\n\n
[docs] def write_message(self, message: PipesMessage) -> None:\n self._buffer.put(message)
\n\n
[docs] def flush_messages(self) -> Sequence[PipesMessage]:\n items = []\n while not self._buffer.empty():\n items.append(self._buffer.get())\n return items
\n\n
[docs] @abstractmethod\n def upload_messages_chunk(self, payload: StringIO, index: int) -> None: ...
\n\n
[docs] @contextmanager\n def buffered_upload_loop(self) -> Iterator[None]:\n thread = None\n is_task_complete = Event()\n try:\n thread = Thread(target=self._upload_loop, args=(is_task_complete,), daemon=True)\n thread.start()\n yield\n finally:\n is_task_complete.set()\n if thread:\n thread.join(timeout=60)
\n\n def _upload_loop(self, is_task_complete: Event) -> None:\n start_or_last_upload = datetime.datetime.now()\n while True:\n now = datetime.datetime.now()\n if self._buffer.empty() and is_task_complete.is_set():\n break\n elif is_task_complete.is_set() or (now - start_or_last_upload).seconds > self._interval:\n payload = "\\n".join([json.dumps(message) for message in self.flush_messages()])\n if len(payload) > 0:\n self.upload_messages_chunk(StringIO(payload), self._counter)\n start_or_last_upload = now\n self._counter += 1\n time.sleep(1)
\n\n\n
[docs]class PipesBufferedFilesystemMessageWriterChannel(PipesBlobStoreMessageWriterChannel):\n """Message writer channel that periodically writes message chunks to an endpoint mounted on the filesystem.\n\n Args:\n interval (float): interval in seconds between chunk uploads\n """\n\n def __init__(self, path: str, *, interval: float = 10):\n super().__init__(interval=interval)\n self._path = path\n\n
[docs] def upload_messages_chunk(self, payload: IO, index: int) -> None:\n message_path = os.path.join(self._path, f"{index}.json")\n with open(message_path, "w") as f:\n f.write(payload.read())
\n\n\n# ########################\n# ##### IO - DEFAULT\n# ########################\n\n\n
[docs]class PipesDefaultContextLoader(PipesContextLoader):\n """Context loader that loads context data from either a file or directly from the provided params.\n\n The location of the context data is configured by the params received by the loader. If the params\n include a key `path`, then the context data will be loaded from a file at the specified path. If\n the params instead include a key `data`, then the corresponding value should be a dict\n representing the context data.\n """\n\n FILE_PATH_KEY = "path"\n DIRECT_KEY = "data"\n\n
[docs] @contextmanager\n def load_context(self, params: PipesParams) -> Iterator[PipesContextData]:\n if self.FILE_PATH_KEY in params:\n path = _assert_env_param_type(params, self.FILE_PATH_KEY, str, self.__class__)\n with open(path, "r") as f:\n data = json.load(f)\n yield data\n elif self.DIRECT_KEY in params:\n data = _assert_env_param_type(params, self.DIRECT_KEY, dict, self.__class__)\n yield cast(PipesContextData, data)\n else:\n raise DagsterPipesError(\n f'Invalid params for {self.__class__.__name__}, expected key "{self.FILE_PATH_KEY}"'\n f' or "{self.DIRECT_KEY}", received {params}',\n )
\n\n\n
[docs]class PipesDefaultMessageWriter(PipesMessageWriter):\n """Message writer that writes messages to either a file or the stdout or stderr stream.\n\n The write location is configured by the params received by the writer. If the params include a\n key `path`, then messages will be written to a file at the specified path. If the params instead\n include a key `stdio`, then messages then the corresponding value must specify either `stderr`\n or `stdout`, and messages will be written to the selected stream.\n """\n\n FILE_PATH_KEY = "path"\n STDIO_KEY = "stdio"\n STDERR = "stderr"\n STDOUT = "stdout"\n\n
[docs] @contextmanager\n def open(self, params: PipesParams) -> Iterator[PipesMessageWriterChannel]:\n if self.FILE_PATH_KEY in params:\n path = _assert_env_param_type(params, self.FILE_PATH_KEY, str, self.__class__)\n yield PipesFileMessageWriterChannel(path)\n elif self.STDIO_KEY in params:\n stream = _assert_env_param_type(params, self.STDIO_KEY, str, self.__class__)\n if stream == self.STDERR:\n yield PipesStreamMessageWriterChannel(sys.stderr)\n elif stream == self.STDOUT:\n yield PipesStreamMessageWriterChannel(sys.stdout)\n else:\n raise DagsterPipesError(\n f'Invalid value for key "std", expected "{self.STDERR}" or "{self.STDOUT}" but'\n f" received {stream}"\n )\n else:\n raise DagsterPipesError(\n f'Invalid params for {self.__class__.__name__}, expected key "path" or "std",'\n f" received {params}"\n )
\n\n\n
[docs]class PipesFileMessageWriterChannel(PipesMessageWriterChannel):\n """Message writer channel that writes one message per line to a file."""\n\n def __init__(self, path: str):\n self._path = path\n\n
[docs] def write_message(self, message: PipesMessage) -> None:\n with open(self._path, "a") as f:\n f.write(json.dumps(message) + "\\n")
\n\n\n
[docs]class PipesStreamMessageWriterChannel(PipesMessageWriterChannel):\n """Message writer channel that writes one message per line to a `TextIO` stream."""\n\n def __init__(self, stream: TextIO):\n self._stream = stream\n\n
[docs] def write_message(self, message: PipesMessage) -> None:\n self._stream.writelines((json.dumps(message), "\\n"))
\n\n\nDAGSTER_PIPES_CONTEXT_ENV_VAR = "DAGSTER_PIPES_CONTEXT"\nDAGSTER_PIPES_MESSAGES_ENV_VAR = "DAGSTER_PIPES_MESSAGES"\n\n\n
[docs]class PipesEnvVarParamsLoader(PipesParamsLoader):\n """Params loader that extracts params from environment variables."""\n\n
[docs] def is_dagster_pipes_process(self) -> bool:\n # use the presence of DAGSTER_PIPES_CONTEXT to discern if we are in a pipes process\n return DAGSTER_PIPES_CONTEXT_ENV_VAR in os.environ
\n\n
[docs] def load_context_params(self) -> PipesParams:\n return _param_from_env_var(DAGSTER_PIPES_CONTEXT_ENV_VAR)
\n\n
[docs] def load_messages_params(self) -> PipesParams:\n return _param_from_env_var(DAGSTER_PIPES_MESSAGES_ENV_VAR)
\n\n\n# ########################\n# ##### IO - S3\n# ########################\n\n\n
[docs]class PipesS3MessageWriter(PipesBlobStoreMessageWriter):\n """Message writer that writes messages by periodically writing message chunks to an S3 bucket.\n\n Args:\n client (Any): A boto3.client("s3") object.\n interval (float): interval in seconds between upload chunk uploads\n """\n\n # client is a boto3.client("s3") object\n def __init__(self, client: Any, *, interval: float = 10):\n super().__init__(interval=interval)\n # Not checking client type for now because it's a boto3.client object and we don't want to\n # depend on boto3.\n self._client = client\n\n
[docs] def make_channel(\n self,\n params: PipesParams,\n ) -> "PipesS3MessageWriterChannel":\n bucket = _assert_env_param_type(params, "bucket", str, self.__class__)\n key_prefix = _assert_opt_env_param_type(params, "key_prefix", str, self.__class__)\n return PipesS3MessageWriterChannel(\n client=self._client,\n bucket=bucket,\n key_prefix=key_prefix,\n interval=self.interval,\n )
\n\n\n
[docs]class PipesS3MessageWriterChannel(PipesBlobStoreMessageWriterChannel):\n """Message writer channel for writing messages by periodically writing message chunks to an S3 bucket.\n\n Args:\n client (Any): A boto3.client("s3") object.\n bucket (str): The name of the S3 bucket to write to.\n key_prefix (Optional[str]): An optional prefix to use for the keys of written blobs.\n interval (float): interval in seconds between upload chunk uploads\n """\n\n # client is a boto3.client("s3") object\n def __init__(\n self, client: Any, bucket: str, key_prefix: Optional[str], *, interval: float = 10\n ):\n super().__init__(interval=interval)\n self._client = client\n self._bucket = bucket\n self._key_prefix = key_prefix\n\n
[docs] def upload_messages_chunk(self, payload: IO, index: int) -> None:\n key = f"{self._key_prefix}/{index}.json" if self._key_prefix else f"{index}.json"\n self._client.put_object(\n Body=payload.read(),\n Bucket=self._bucket,\n Key=key,\n )
\n\n\n# ########################\n# ##### IO - DBFS\n# ########################\n\n\n
[docs]class PipesDbfsContextLoader(PipesContextLoader):\n """Context loader that reads context from a JSON file on DBFS."""\n\n
[docs] @contextmanager\n def load_context(self, params: PipesParams) -> Iterator[PipesContextData]:\n unmounted_path = _assert_env_param_type(params, "path", str, self.__class__)\n path = os.path.join("/dbfs", unmounted_path.lstrip("/"))\n with open(path, "r") as f:\n yield json.load(f)
\n\n\n
[docs]class PipesDbfsMessageWriter(PipesBlobStoreMessageWriter):\n """Message writer that writes messages by periodically writing message chunks to a directory on DBFS."""\n\n
[docs] def make_channel(\n self,\n params: PipesParams,\n ) -> "PipesBufferedFilesystemMessageWriterChannel":\n unmounted_path = _assert_env_param_type(params, "path", str, self.__class__)\n return PipesBufferedFilesystemMessageWriterChannel(\n path=os.path.join("/dbfs", unmounted_path.lstrip("/")),\n interval=self.interval,\n )
\n\n\n# ########################\n# ##### CONTEXT\n# ########################\n\n\n
[docs]def open_dagster_pipes(\n *,\n context_loader: Optional[PipesContextLoader] = None,\n message_writer: Optional[PipesMessageWriter] = None,\n params_loader: Optional[PipesParamsLoader] = None,\n) -> "PipesContext":\n """Initialize the Dagster Pipes context.\n\n This function should be called near the entry point of a pipes process. It will load injected\n context information from Dagster and spin up the machinery for streaming messages back to\n Dagster.\n\n If the process was not launched by Dagster, this function will emit a warning and return a\n `MagicMock` object. This should make all operations on the context no-ops and prevent your code\n from crashing.\n\n Args:\n context_loader (Optional[PipesContextLoader]): The context loader to use. Defaults to\n :py:class:`PipesDefaultContextLoader`.\n message_writer (Optional[PipesMessageWriter]): The message writer to use. Defaults to\n :py:class:`PipesDefaultMessageWriter`.\n params_loader (Optional[PipesParamsLoader]): The params loader to use. Defaults to\n :py:class:`PipesEnvVarParamsLoader`.\n\n Returns:\n PipesContext: The initialized context.\n """\n if PipesContext.is_initialized():\n return PipesContext.get()\n\n params_loader = params_loader or PipesEnvVarParamsLoader()\n if params_loader.is_dagster_pipes_process():\n context_loader = context_loader or PipesDefaultContextLoader()\n message_writer = message_writer or PipesDefaultMessageWriter()\n context = PipesContext(params_loader, context_loader, message_writer)\n else:\n _emit_orchestration_inactive_warning()\n context = _get_mock()\n PipesContext.set(context)\n return context
\n\n\n
[docs]class PipesContext:\n """The context for a Dagster Pipes process.\n\n This class is analogous to :py:class:`~dagster.OpExecutionContext` on the Dagster side of the Pipes\n connection. It provides access to information such as the asset key(s) and partition key(s) in\n scope for the current step. It also provides methods for logging and emitting results that will\n be streamed back to Dagster.\n\n This class should not be directly instantiated by the user. Instead it should be initialized by\n calling :py:func:`open_dagster_pipes()`, which will return the singleton instance of this class.\n After `open_dagster_pipes()` has been called, the singleton instance can also be retrieved by\n calling :py:func:`PipesContext.get`.\n """\n\n _instance: ClassVar[Optional["PipesContext"]] = None\n\n
[docs] @classmethod\n def is_initialized(cls) -> bool:\n """bool: Whether the context has been initialized."""\n return cls._instance is not None
\n\n
[docs] @classmethod\n def set(cls, context: "PipesContext") -> None:\n """Set the singleton instance of the context."""\n cls._instance = context
\n\n
[docs] @classmethod\n def get(cls) -> "PipesContext":\n """Get the singleton instance of the context. Raises an error if the context has not been initialized."""\n if cls._instance is None:\n raise Exception(\n "PipesContext has not been initialized. You must call `open_dagster_pipes()`."\n )\n return cls._instance
\n\n def __init__(\n self,\n params_loader: PipesParamsLoader,\n context_loader: PipesContextLoader,\n message_writer: PipesMessageWriter,\n ) -> None:\n context_params = params_loader.load_context_params()\n messages_params = params_loader.load_messages_params()\n self._io_stack = ExitStack()\n self._data = self._io_stack.enter_context(context_loader.load_context(context_params))\n self._message_channel = self._io_stack.enter_context(message_writer.open(messages_params))\n self._message_channel.write_message(_make_message("opened", {}))\n self._logger = _PipesLogger(self)\n self._materialized_assets: Set[str] = set()\n self._closed: bool = False\n\n def __enter__(self) -> "PipesContext":\n return self\n\n def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:\n self.close()\n\n
[docs] def close(self) -> None:\n """Close the pipes connection. This will flush all buffered messages to the orchestration\n process and cause any further attempt to write a message to raise an error. This method is\n idempotent-- subsequent calls after the first have no effect.\n """\n if not self._closed:\n self._message_channel.write_message(_make_message("closed", {}))\n self._io_stack.close()\n self._closed = True
\n\n @property\n def is_closed(self) -> bool:\n """bool: Whether the context has been closed."""\n return self._closed\n\n def _write_message(self, method: str, params: Optional[Mapping[str, Any]] = None) -> None:\n if self._closed:\n raise DagsterPipesError("Cannot send message after pipes context is closed.")\n message = _make_message(method, params)\n self._message_channel.write_message(message)\n\n # ########################\n # ##### PUBLIC API\n # ########################\n\n @property\n def is_asset_step(self) -> bool:\n """bool: Whether the current step targets assets."""\n return self._data["asset_keys"] is not None\n\n @property\n def asset_key(self) -> str:\n """str: The AssetKey for the currently scoped asset. Raises an error if 0 or multiple assets\n are in scope.\n """\n asset_keys = _assert_defined_asset_property(self._data["asset_keys"], "asset_key")\n _assert_single_asset(self._data, "asset_key")\n return asset_keys[0]\n\n @property\n def asset_keys(self) -> Sequence[str]:\n """Sequence[str]: The AssetKeys for the currently scoped assets. Raises an error if no\n assets are in scope.\n """\n asset_keys = _assert_defined_asset_property(self._data["asset_keys"], "asset_keys")\n return asset_keys\n\n @property\n def provenance(self) -> Optional[PipesDataProvenance]:\n """Optional[PipesDataProvenance]: The provenance for the currently scoped asset. Raises an\n error if 0 or multiple assets are in scope.\n """\n provenance_by_asset_key = _assert_defined_asset_property(\n self._data["provenance_by_asset_key"], "provenance"\n )\n _assert_single_asset(self._data, "provenance")\n return next(iter(provenance_by_asset_key.values()))\n\n @property\n def provenance_by_asset_key(self) -> Mapping[str, Optional[PipesDataProvenance]]:\n """Mapping[str, Optional[PipesDataProvenance]]: Mapping of asset key to provenance for the\n currently scoped assets. 
Raises an error if no assets are in scope.\n """\n provenance_by_asset_key = _assert_defined_asset_property(\n self._data["provenance_by_asset_key"], "provenance_by_asset_key"\n )\n return provenance_by_asset_key\n\n @property\n def code_version(self) -> Optional[str]:\n """Optional[str]: The code version for the currently scoped asset. Raises an error if 0 or\n multiple assets are in scope.\n """\n code_version_by_asset_key = _assert_defined_asset_property(\n self._data["code_version_by_asset_key"], "code_version"\n )\n _assert_single_asset(self._data, "code_version")\n return next(iter(code_version_by_asset_key.values()))\n\n @property\n def code_version_by_asset_key(self) -> Mapping[str, Optional[str]]:\n """Mapping[str, Optional[str]]: Mapping of asset key to code version for the currently\n scoped assets. Raises an error if no assets are in scope.\n """\n code_version_by_asset_key = _assert_defined_asset_property(\n self._data["code_version_by_asset_key"], "code_version_by_asset_key"\n )\n return code_version_by_asset_key\n\n @property\n def is_partition_step(self) -> bool:\n """bool: Whether the current step is scoped to one or more partitions."""\n return self._data["partition_key_range"] is not None\n\n @property\n def partition_key(self) -> str:\n """str: The partition key for the currently scoped partition. Raises an error if 0 or\n multiple partitions are in scope.\n """\n partition_key = _assert_defined_partition_property(\n self._data["partition_key"], "partition_key"\n )\n return partition_key\n\n @property\n def partition_key_range(self) -> "PipesPartitionKeyRange":\n """PipesPartitionKeyRange: The partition key range for the currently scoped partition or\n partitions. 
Raises an error if no partitions are in scope.\n """\n partition_key_range = _assert_defined_partition_property(\n self._data["partition_key_range"], "partition_key_range"\n )\n return partition_key_range\n\n @property\n def partition_time_window(self) -> Optional["PipesTimeWindow"]:\n """Optional[PipesTimeWindow]: The partition time window for the currently scoped partition\n or partitions. Returns None if partitions in scope are not temporal. Raises an error if no\n partitions are in scope.\n """\n # None is a valid value for partition_time_window, but we check that a partition key range\n # is defined.\n _assert_defined_partition_property(\n self._data["partition_key_range"], "partition_time_window"\n )\n return self._data["partition_time_window"]\n\n @property\n def run_id(self) -> str:\n """str: The run ID for the currently executing pipeline run."""\n return self._data["run_id"]\n\n @property\n def job_name(self) -> Optional[str]:\n """Optional[str]: The job name for the currently executing run. Returns None if the run is\n not derived from a job.\n """\n return self._data["job_name"]\n\n @property\n def retry_number(self) -> int:\n """int: The retry number for the currently executing run."""\n return self._data["retry_number"]\n\n
[docs] def get_extra(self, key: str) -> Any:\n """Get the value of an extra provided by the user. Raises an error if the extra is not defined.\n\n Args:\n key (str): The key of the extra.\n\n Returns:\n Any: The value of the extra.\n """\n return _assert_defined_extra(self._data["extras"], key)
\n\n @property\n def extras(self) -> Mapping[str, Any]:\n """Mapping[str, Any]: Key-value map for all extras provided by the user."""\n return self._data["extras"]\n\n # ##### WRITE\n\n
[docs] def report_asset_materialization(\n self,\n metadata: Optional[Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]] = None,\n data_version: Optional[str] = None,\n asset_key: Optional[str] = None,\n ) -> None:\n """Report to Dagster that an asset has been materialized. Streams a payload containing\n materialization information back to Dagster. If no assets are in scope, raises an error.\n\n Args:\n metadata (Optional[Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]]):\n Metadata for the materialized asset. Defaults to None.\n data_version (Optional[str]): The data version for the materialized asset.\n Defaults to None.\n asset_key (Optional[str]): The asset key for the materialized asset. If only a\n single asset is in scope, default to that asset's key. If multiple assets are in scope,\n this must be set explicitly or an error will be raised.\n """\n asset_key = _resolve_optionally_passed_asset_key(\n self._data, asset_key, "report_asset_materialization"\n )\n if asset_key in self._materialized_assets:\n raise DagsterPipesError(\n f"Calling `report_asset_materialization` with asset key `{asset_key}` is undefined."\n " Asset has already been materialized, so no additional data can be reported"\n " for it."\n )\n metadata = (\n _normalize_param_metadata(metadata, "report_asset_materialization", "metadata")\n if metadata\n else None\n )\n data_version = _assert_opt_param_type(\n data_version, str, "report_asset_materialization", "data_version"\n )\n self._write_message(\n "report_asset_materialization",\n {"asset_key": asset_key, "data_version": data_version, "metadata": metadata},\n )\n self._materialized_assets.add(asset_key)
\n\n
[docs] def report_asset_check(\n self,\n check_name: str,\n passed: bool,\n severity: PipesAssetCheckSeverity = "ERROR",\n metadata: Optional[Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]] = None,\n asset_key: Optional[str] = None,\n ) -> None:\n """Report to Dagster that an asset check has been performed. Streams a payload containing\n check result information back to Dagster. If no assets or associated checks are in scope, raises an error.\n\n Args:\n check_name (str): The name of the check.\n passed (bool): Whether the check passed.\n severity (PipesAssetCheckSeverity): The severity of the check. Defaults to "ERROR".\n metadata (Optional[Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]]):\n Metadata for the check. Defaults to None.\n asset_key (Optional[str]): The asset key for the check. If only a single asset is in\n scope, default to that asset's key. If multiple assets are in scope, this must be\n set explicitly or an error will be raised.\n """\n asset_key = _resolve_optionally_passed_asset_key(\n self._data, asset_key, "report_asset_check"\n )\n check_name = _assert_param_type(check_name, str, "report_asset_check", "check_name")\n passed = _assert_param_type(passed, bool, "report_asset_check", "passed")\n metadata = (\n _normalize_param_metadata(metadata, "report_asset_check", "metadata")\n if metadata\n else None\n )\n self._write_message(\n "report_asset_check",\n {\n "asset_key": asset_key,\n "check_name": check_name,\n "passed": passed,\n "metadata": metadata,\n "severity": severity,\n },\n )
\n\n @property\n def log(self) -> logging.Logger:\n """logging.Logger: A logger that streams log messages back to Dagster."""\n return self._logger
\n
", "current_page_name": "_modules/dagster_pipes", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pipes"}, "index": {"alabaster_version": "0.7.13", "body": "

All modules for which code is available

\n", "current_page_name": "_modules/index", "customsidebar": null, "favicon_url": null, "logo_url": null, "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "Overview: module code"}}, "dagster": {"_config": {"config_schema": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.config_schema

\nfrom typing import TYPE_CHECKING, Any, Dict, List, Mapping, Sequence, Type, Union\n\nfrom typing_extensions import TypeAlias\n\nif TYPE_CHECKING:\n    from dagster._config import ConfigType, Field\n\n# Eventually, the below `UserConfigSchema` should be renamed to `ConfigSchema` and the class\n# definition should be dropped. The reason we don't do this now is that sphinx autodoc doesn't\n# support type aliases, so there is no good way to gracefully attach a docstring to this and have it\n# show up in the docs. See: https://github.com/sphinx-doc/sphinx/issues/8934\n#\n# Unfortunately mypy doesn't support recursive types, which would be used to properly define the\n# List/Dict elements of this union: `Dict[str, ConfigSchema]`, `List[ConfigSchema]`.\nUserConfigSchema: TypeAlias = Union[\n    Type[Union[bool, float, int, str]],\n    Type[Union[Dict[Any, Any], List[Any]]],\n    "ConfigType",\n    "Field",\n    Mapping[str, Any],\n    Sequence[Any],\n]\n\n\n
[docs]class ConfigSchema:\n """Placeholder type for config schemas.\n\n Any time that it appears in documentation, it means that any of the following types are\n acceptable:\n\n #. A Python scalar type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`). For example:\n\n * ``@op(config_schema=int)``\n * ``@op(config_schema=str)``\n\n #. A built-in python collection (:py:class:`~python:list`, or :py:class:`~python:dict`).\n :py:class:`~python:list` is exactly equivalent to :py:class:`~dagster.Array` [\n :py:class:`~dagster.Any` ] and :py:class:`~python:dict` is equivalent to\n :py:class:`~dagster.Permissive`. For example:\n\n * ``@op(config_schema=list)``\n * ``@op(config_schema=dict)``\n\n #. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.Map`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n\n #. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules. For example:\n\n * ``{'some_config': str}`` is equivalent to ``Shape({'some_config: str})``.\n\n * ``{'some_config1': {'some_config2': str}}`` is equivalent to\n ``Shape({'some_config1: Shape({'some_config2: str})})``.\n\n #. A bare python list of length one, whose single element will be wrapped in a\n :py:class:`~dagster.Array` is resolved recursively according to the same\n rules. 
For example:\n\n * ``[str]`` is equivalent to ``Array[str]``.\n\n * ``[[str]]`` is equivalent to ``Array[Array[str]]``.\n\n * ``[{'some_config': str}]`` is equivalent to ``Array(Shape({'some_config: str}))``.\n\n #. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self):\n raise NotImplementedError(\n "ConfigSchema is a placeholder type and should not be instantiated."\n )
\n
", "current_page_name": "_modules/dagster/_config/config_schema", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.config_schema"}, "config_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.config_type

\nimport typing\nfrom enum import Enum as PythonEnum\nfrom typing import TYPE_CHECKING, Dict, Iterator, Optional, Sequence, cast\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import UserConfigSchema\nfrom dagster._serdes import whitelist_for_serdes\n\nif TYPE_CHECKING:\n    from .snap import ConfigSchemaSnapshot, ConfigTypeSnap\n\n\n@whitelist_for_serdes\nclass ConfigTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    ENUM = "ENUM"\n\n    SELECTOR = "SELECTOR"\n    STRICT_SHAPE = "STRICT_SHAPE"\n    PERMISSIVE_SHAPE = "PERMISSIVE_SHAPE"\n    SCALAR_UNION = "SCALAR_UNION"\n\n    MAP = "MAP"\n\n    # Closed generic types\n    ARRAY = "ARRAY"\n    NONEABLE = "NONEABLE"\n\n    @staticmethod\n    def has_fields(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR or ConfigTypeKind.is_shape(kind)\n\n    @staticmethod\n    def is_closed_generic(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return (\n            kind == ConfigTypeKind.ARRAY\n            or kind == ConfigTypeKind.NONEABLE\n            or kind == ConfigTypeKind.SCALAR_UNION\n            or kind == ConfigTypeKind.MAP\n        )\n\n    @staticmethod\n    def is_shape(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.STRICT_SHAPE or kind == ConfigTypeKind.PERMISSIVE_SHAPE\n\n    @staticmethod\n    def is_selector(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR\n\n\nclass ConfigType:\n    """The class backing DagsterTypes as they are used processing configuration data."""\n\n    def __init__(\n        self,\n        key: str,\n        kind: ConfigTypeKind,\n        given_name: Optional[str] = None,\n        
description: Optional[str] = None,\n        type_params: Optional[Sequence["ConfigType"]] = None,\n    ):\n        self.key: str = check.str_param(key, "key")\n        self.kind: ConfigTypeKind = check.inst_param(kind, "kind", ConfigTypeKind)\n        self.given_name: Optional[str] = check.opt_str_param(given_name, "given_name")\n        self._description: Optional[str] = check.opt_str_param(description, "description")\n        self.type_params: Optional[Sequence[ConfigType]] = (\n            check.sequence_param(type_params, "type_params", of_type=ConfigType)\n            if type_params\n            else None\n        )\n\n        # memoized snap representation\n        self._snap: Optional["ConfigTypeSnap"] = None\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @staticmethod\n    def from_builtin_enum(builtin_enum: typing.Any) -> "ConfigType":\n        check.invariant(BuiltinEnum.contains(builtin_enum), "param must be member of BuiltinEnum")\n        return _CONFIG_MAP[builtin_enum]\n\n    def post_process(self, value):\n        """Implement this in order to take a value provided by the user\n        and perform computation on it. This can be done to coerce data types,\n        fetch things from the environment (e.g. environment variables), or\n        to do custom validation. If the value is not valid, throw a\n        PostProcessingError. 
Otherwise return the coerced value.\n        """\n        return value\n\n    def get_snapshot(self) -> "ConfigTypeSnap":\n        from .snap import snap_from_config_type\n\n        if self._snap is None:\n            self._snap = snap_from_config_type(self)\n\n        return self._snap\n\n    def type_iterator(self) -> Iterator["ConfigType"]:\n        yield self\n\n    def get_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n        from .snap import ConfigSchemaSnapshot\n\n        return ConfigSchemaSnapshot({ct.key: ct.get_snapshot() for ct in self.type_iterator()})\n\n\n@whitelist_for_serdes\nclass ConfigScalarKind(PythonEnum):\n    INT = "INT"\n    STRING = "STRING"\n    FLOAT = "FLOAT"\n    BOOL = "BOOL"\n\n\n# Scalars, Composites, Selectors, Lists, Optional, Any\n\n\nclass ConfigScalar(ConfigType):\n    def __init__(\n        self,\n        key: str,\n        given_name: Optional[str],\n        scalar_kind: ConfigScalarKind,\n        **kwargs: typing.Any,\n    ):\n        self.scalar_kind = check.inst_param(scalar_kind, "scalar_kind", ConfigScalarKind)\n        super(ConfigScalar, self).__init__(\n            key, kind=ConfigTypeKind.SCALAR, given_name=given_name, **kwargs\n        )\n\n\nclass BuiltinConfigScalar(ConfigScalar):\n    def __init__(self, scalar_kind, description=None):\n        super(BuiltinConfigScalar, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=scalar_kind,\n            description=description,\n        )\n\n\nclass Int(BuiltinConfigScalar):\n    def __init__(self):\n        super(Int, self).__init__(scalar_kind=ConfigScalarKind.INT, description="")\n\n\nclass String(BuiltinConfigScalar):\n    def __init__(self):\n        super(String, self).__init__(scalar_kind=ConfigScalarKind.STRING, description="")\n\n\nclass Bool(BuiltinConfigScalar):\n    def __init__(self):\n        super(Bool, self).__init__(scalar_kind=ConfigScalarKind.BOOL, description="")\n\n\nclass 
Float(BuiltinConfigScalar):\n    def __init__(self):\n        super(Float, self).__init__(scalar_kind=ConfigScalarKind.FLOAT, description="")\n\n    def post_process(self, value):\n        return float(value)\n\n\nclass Any(ConfigType):\n    def __init__(self):\n        super(Any, self).__init__(\n            key="Any",\n            given_name="Any",\n            kind=ConfigTypeKind.ANY,\n        )\n\n\n
[docs]class Noneable(ConfigType):\n """Defines a configuration type that is the union of ``NoneType`` and the type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n\n **Examples:**\n\n .. code-block:: python\n\n config_schema={"name": Noneable(str)}\n\n config={"name": "Hello"} # Ok\n config={"name": None} # Ok\n config={} # Error\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Noneable, self).__init__(\n key=f"Noneable.{self.inner_type.key}",\n kind=ConfigTypeKind.NONEABLE,\n type_params=[self.inner_type],\n )\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\n
[docs]class Array(ConfigType):\n """Defines an array (list) configuration type that contains values of type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Array, self).__init__(\n key=f"Array.{self.inner_type.key}",\n type_params=[self.inner_type],\n kind=ConfigTypeKind.ARRAY,\n )\n\n @public\n @property\n def description(self) -> str:\n """A human-readable description of this Array type."""\n return f"List of {self.key}"\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\n
[docs]class EnumValue:\n """Define an entry in a :py:class:`Enum`.\n\n Args:\n config_value (str):\n The string representation of the config to accept when passed.\n python_value (Optional[Any]):\n The python value to convert the enum entry in to. Defaults to the ``config_value``.\n description (Optional[str]):\n A human-readable description of the enum entry.\n\n """\n\n def __init__(\n self,\n config_value: str,\n python_value: Optional[object] = None,\n description: Optional[str] = None,\n ):\n self.config_value = check.str_param(config_value, "config_value")\n self.python_value = config_value if python_value is None else python_value\n self.description = check.opt_str_param(description, "description")
\n\n\n
[docs]class Enum(ConfigType):\n """Defines a enum configuration type that allows one of a defined set of possible values.\n\n Args:\n name (str):\n The name of the enum configuration type.\n enum_values (List[EnumValue]):\n The set of possible values for the enum configuration type.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Enum(\n 'CowboyType',\n [\n EnumValue('good'),\n EnumValue('bad'),\n EnumValue('ugly'),\n ]\n )\n )\n )\n def resolve_standoff(context):\n # ...\n """\n\n def __init__(self, name: str, enum_values: Sequence[EnumValue]):\n check.str_param(name, "name")\n super(Enum, self).__init__(key=name, given_name=name, kind=ConfigTypeKind.ENUM)\n self.enum_values = check.sequence_param(enum_values, "enum_values", of_type=EnumValue)\n self._valid_python_values = {ev.python_value for ev in enum_values}\n check.invariant(len(self._valid_python_values) == len(enum_values))\n self._valid_config_values = {ev.config_value for ev in enum_values}\n check.invariant(len(self._valid_config_values) == len(enum_values))\n\n @property\n def config_values(self):\n return [ev.config_value for ev in self.enum_values]\n\n def is_valid_config_enum_value(self, config_value):\n return config_value in self._valid_config_values\n\n def post_process(self, value: typing.Any) -> typing.Any:\n if isinstance(value, PythonEnum):\n value = value.name\n\n for ev in self.enum_values:\n if ev.config_value == value:\n return ev.python_value\n\n check.failed(f"Should never reach this. config_value should be pre-validated. Got {value}")\n\n @classmethod\n def from_python_enum(cls, enum, name=None):\n """Create a Dagster enum corresponding to an existing Python enum.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. 
code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n assert context.op_config["color"] == Color.RED\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v) for v in enum])\n\n @classmethod\n def from_python_enum_direct_values(cls, enum, name=None):\n """Create a Dagster enum corresponding to an existing Python enum, where the direct values are passed instead of symbolic values (IE, enum.symbol.value as opposed to enum.symbol).\n\n This is necessary for internal usage, as the symbolic values are not serializable.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n assert context.op_config["color"] == Color.RED.value\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v.value) for v in enum])
\n\n\n
[docs]class ScalarUnion(ConfigType):\n """Defines a configuration type that accepts a scalar value OR a non-scalar value like a\n :py:class:`~dagster.List`, :py:class:`~dagster.Dict`, or :py:class:`~dagster.Selector`.\n\n This allows runtime scalars to be configured without a dictionary with the key ``value`` and\n instead just use the scalar value directly. However this still leaves the option to\n load scalars from a json or pickle file.\n\n Args:\n scalar_type (type):\n The scalar type of values that this configuration type can hold. For example,\n :py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`.\n non_scalar_schema (ConfigSchema):\n The schema of a non-scalar Dagster configuration type. For example, :py:class:`List`,\n :py:class:`Dict`, or :py:class:`~dagster.Selector`.\n key (Optional[str]):\n The configuation type's unique key. If not set, then the key will be set to\n ``ScalarUnion.{scalar_type}-{non_scalar_schema}``.\n\n **Examples:**\n\n .. code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word:\n value: foobar\n\n\n becomes, optionally,\n\n\n .. 
code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word: foobar\n """\n\n def __init__(\n self,\n scalar_type: typing.Any,\n non_scalar_schema: UserConfigSchema,\n _key: Optional[str] = None,\n ):\n from .field import resolve_to_config_type\n\n self.scalar_type = check.inst(\n cast(ConfigType, resolve_to_config_type(scalar_type)), ConfigType\n )\n self.non_scalar_type = resolve_to_config_type(non_scalar_schema)\n\n check.param_invariant(self.scalar_type.kind == ConfigTypeKind.SCALAR, "scalar_type")\n check.param_invariant(\n self.non_scalar_type.kind\n in {ConfigTypeKind.STRICT_SHAPE, ConfigTypeKind.SELECTOR, ConfigTypeKind.ARRAY},\n "non_scalar_type",\n )\n\n # https://github.com/dagster-io/dagster/issues/2133\n key = check.opt_str_param(\n _key, "_key", f"ScalarUnion.{self.scalar_type.key}-{self.non_scalar_type.key}"\n )\n\n super(ScalarUnion, self).__init__(\n key=key,\n kind=ConfigTypeKind.SCALAR_UNION,\n type_params=[self.scalar_type, self.non_scalar_type],\n )\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.scalar_type.type_iterator()\n yield from self.non_scalar_type.type_iterator()\n yield from super().type_iterator()
\n\n\nConfigAnyInstance: Any = Any()\nConfigBoolInstance: Bool = Bool()\nConfigFloatInstance: Float = Float()\nConfigIntInstance: Int = Int()\nConfigStringInstance: String = String()\n\n_CONFIG_MAP: Dict[check.TypeOrTupleOfTypes, ConfigType] = {\n BuiltinEnum.ANY: ConfigAnyInstance,\n BuiltinEnum.BOOL: ConfigBoolInstance,\n BuiltinEnum.FLOAT: ConfigFloatInstance,\n BuiltinEnum.INT: ConfigIntInstance,\n BuiltinEnum.STRING: ConfigStringInstance,\n}\n\n\n_CONFIG_MAP_BY_NAME: Dict[str, ConfigType] = {\n "Any": ConfigAnyInstance,\n "Bool": ConfigBoolInstance,\n "Float": ConfigFloatInstance,\n "Int": ConfigIntInstance,\n "String": ConfigStringInstance,\n}\n\nALL_CONFIG_BUILTINS = set(_CONFIG_MAP.values())\n\n\ndef get_builtin_scalar_by_name(type_name: str):\n if type_name not in _CONFIG_MAP_BY_NAME:\n check.failed(f"Scalar {type_name} is not supported")\n return _CONFIG_MAP_BY_NAME[type_name]\n
", "current_page_name": "_modules/dagster/_config/config_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.config_type"}, "field": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.field

\nfrom typing import Any, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster._serdes import serialize_value\nfrom dagster._seven import is_subclass\nfrom dagster._utils import is_enum_value\nfrom dagster._utils.typing_api import is_closed_python_optional_type, is_typing_type\n\nfrom .config_type import Array, ConfigAnyInstance, ConfigType, ConfigTypeKind\nfrom .field_utils import FIELD_NO_DEFAULT_PROVIDED, Map, all_optional_type\n\n\ndef _is_config_type_class(obj) -> bool:\n    return isinstance(obj, type) and is_subclass(obj, ConfigType)\n\n\ndef helpful_list_error_string() -> str:\n    return "Please use a python list (e.g. [int]) or dagster.Array (e.g. Array(int)) instead."\n\n\nVALID_CONFIG_DESC = """\n1. A Python primitive type that resolve to dagster config\n   types: int, float, bool, str.\n\n2. A dagster config type: Int, Float, Bool, String, StringSource, Path, Any,\n   Array, Noneable, Selector, Shape, Permissive, etc.\n\n3. A bare python dictionary, which is wrapped in Shape. Any\n   values in the dictionary get resolved by the same rules, recursively.\n\n4. 
A bare python list of length one which itself is config type.\n   Becomes Array with list element as an argument.\n"""\n\n\n@overload\ndef resolve_to_config_type(obj: Union[ConfigType, UserConfigSchema]) -> ConfigType:\n    pass\n\n\n@overload\ndef resolve_to_config_type(obj: object) -> Union[ConfigType, bool]:\n    pass\n\n\ndef resolve_to_config_type(obj: object) -> Union[ConfigType, bool]:\n    from .field_utils import convert_fields_to_dict_type\n\n    # Short circuit if it's already a Config Type\n    if isinstance(obj, ConfigType):\n        return obj\n\n    if isinstance(obj, dict):\n        # Dicts of the special form {type: value} are treated as Maps\n        # mapping from the type to value type, otherwise treat as dict type\n        if len(obj) == 1:\n            key = next(iter(obj.keys()))\n            key_type = resolve_to_config_type(key)\n            if not isinstance(key, str):\n                if not key_type:\n                    raise DagsterInvalidDefinitionError(\n                        f"Invalid key in map specification: {key!r} in map {obj}"\n                    )\n\n                if not key_type.kind == ConfigTypeKind.SCALAR:  # type: ignore\n                    raise DagsterInvalidDefinitionError(\n                        f"Non-scalar key in map specification: {key!r} in map {obj}"\n                    )\n\n                inner_type = resolve_to_config_type(obj[key])\n\n                if not inner_type:\n                    raise DagsterInvalidDefinitionError(\n                        f"Invalid value in map specification: {obj[str]!r} in map {obj}"\n                    )\n                return Map(key_type, inner_type)\n        return convert_fields_to_dict_type(obj)\n\n    if isinstance(obj, list):\n        if len(obj) != 1:\n            raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")\n\n        inner_type = resolve_to_config_type(obj[0])\n\n        if not inner_type:\n            raise 
DagsterInvalidDefinitionError(\n                f"Invalid member of array specification: {obj[0]!r} in list {obj}"\n            )\n        return Array(inner_type)\n\n    if BuiltinEnum.contains(obj):\n        return ConfigType.from_builtin_enum(obj)\n\n    from .primitive_mapping import (\n        is_supported_config_python_builtin,\n        remap_python_builtin_for_config,\n    )\n\n    if is_supported_config_python_builtin(obj):\n        return remap_python_builtin_for_config(obj)\n\n    if obj is None:\n        return ConfigAnyInstance\n\n    # Special error messages for passing a DagsterType\n    from dagster._core.types.dagster_type import DagsterType, List, ListType\n    from dagster._core.types.python_set import Set, _TypedPythonSet\n    from dagster._core.types.python_tuple import Tuple, _TypedPythonTuple\n\n    if _is_config_type_class(obj):\n        check.param_invariant(\n            False,\n            "dagster_type",\n            f"Cannot pass config type class {obj} to resolve_to_config_type. This error usually"\n            " occurs when you pass a dagster config type class instead of a class instance into"\n            ' another dagster config type. E.g. "Noneable(Permissive)" should instead be'\n            ' "Noneable(Permissive())".',\n        )\n\n    if isinstance(obj, type) and is_subclass(obj, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed a DagsterType class {obj!r} to the config system. "\n            "The DagsterType and config schema systems are separate. "\n            f"Valid config values are:\\n{VALID_CONFIG_DESC}"\n        )\n\n    if is_closed_python_optional_type(obj):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use typing.Optional as a config type. 
If you want this field to be "\n            "optional, please use Field(<type>, is_required=False), and if you want this field to "\n            "be required, but accept a value of None, use dagster.Noneable(<type>)."\n        )\n\n    if is_typing_type(obj):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed in {obj} to the config system. Types from "\n            "the typing module in python are not allowed in the config system. "\n            "You must use types that are imported from dagster or primitive types "\n            "such as bool, int, etc."\n        )\n\n    if obj is List or isinstance(obj, ListType):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use List in the context of config. " + helpful_list_error_string()\n        )\n\n    if obj is Set or isinstance(obj, _TypedPythonSet):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Set in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if obj is Tuple or isinstance(obj, _TypedPythonTuple):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Tuple in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if isinstance(obj, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed an instance of DagsterType {obj.display_name} to the config "\n            f"system (Repr of type: {obj!r}). "\n            "The DagsterType and config schema systems are separate. "\n            f"Valid config values are:\\n{VALID_CONFIG_DESC}",\n        )\n\n    # This means that this is an error and we are return False to a callsite\n    # We do the error reporting there because those callsites have more context\n    return False\n\n\ndef has_implicit_default(config_type):\n    if config_type.kind == ConfigTypeKind.NONEABLE:\n        return True\n\n    return all_optional_type(config_type)\n\n\n
[docs]class Field:\n """Defines the schema for a configuration field.\n\n Fields are used in config schema instead of bare types when one wants to add a description,\n a default value, or to mark it as not required.\n\n Config fields are parsed according to their schemas in order to yield values available at\n job execution time through the config system. Config fields can be set on ops, on\n loaders for custom, and on other pluggable components of the system, such as resources, loggers,\n and executors.\n\n\n Args:\n config (Any): The schema for the config. This value can be any of:\n\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n default_value (Any):\n A default value for this field, conformant to the schema set by the ``dagster_type``\n argument. 
If a default value is provided, ``is_required`` should be ``False``.\n\n Note: for config types that do post processing such as Enum, this value must be\n the pre processed version, ie use ``ExampleEnum.VALUE.name`` instead of\n ``ExampleEnum.VALUE``\n\n is_required (bool):\n Whether the presence of this field is required. Defaults to true. If ``is_required``\n is ``True``, no default value should be provided.\n\n description (str):\n A human-readable description of this config field.\n\n Examples:\n .. code-block:: python\n\n @op(\n config_schema={\n 'word': Field(str, description='I am a word.'),\n 'repeats': Field(Int, default_value=1, is_required=False),\n }\n )\n def repeat_word(context):\n return context.op_config['word'] * context.op_config['repeats']\n """\n\n def _resolve_config_arg(self, config):\n if isinstance(config, ConfigType):\n return config\n\n config_type = resolve_to_config_type(config)\n if not config_type:\n raise DagsterInvalidDefinitionError(\n f"Attempted to pass {config!r} to a Field that expects a valid "\n "dagster type usable in config (e.g. 
Dict, Int, String et al)."\n )\n return config_type\n\n def __init__(\n self,\n config: Any,\n default_value: Any = FIELD_NO_DEFAULT_PROVIDED,\n is_required: Optional[bool] = None,\n description: Optional[str] = None,\n ):\n from .post_process import resolve_defaults\n from .validate import validate_config\n\n self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)\n\n self._description = check.opt_str_param(description, "description")\n\n check.opt_bool_param(is_required, "is_required")\n\n if default_value != FIELD_NO_DEFAULT_PROVIDED:\n check.param_invariant(\n not (callable(default_value)), "default_value", "default_value cannot be a callable"\n )\n\n if is_required is True:\n check.param_invariant(\n default_value == FIELD_NO_DEFAULT_PROVIDED,\n "default_value",\n "required arguments should not specify default values",\n )\n\n self._default_value = default_value\n\n # check explicit default value\n if self.default_provided:\n if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed into a python enum value as the default value "\n "into of a config enum type {name}. You must pass in the underlying "\n "string represention as the default value. 
One of {value_set}."\n ).format(\n value_set=[ev.config_value for ev in self.config_type.enum_values],\n name=self.config_type.given_name,\n )\n )\n\n evr = validate_config(self.config_type, default_value)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Invalid default_value for Field.",\n evr.errors,\n default_value,\n )\n\n if is_required is None:\n is_optional = has_implicit_default(self.config_type) or self.default_provided\n is_required = not is_optional\n\n # on implicitly optional - set the default value\n # by resolving the defaults of the type\n if is_optional and not self.default_provided:\n evr = resolve_defaults(self.config_type, None)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Unable to resolve implicit default_value for Field.",\n evr.errors,\n None,\n )\n self._default_value = evr.value\n self._is_required = is_required\n\n @public\n @property\n def is_required(self) -> bool:\n """Whether a value for this field must be provided at runtime.\n\n Cannot be True if a default value is provided.\n """\n return self._is_required\n\n @public\n @property\n def default_provided(self) -> bool:\n """Was a default value provided.\n\n Returns:\n bool: Yes or no\n """\n return self._default_value != FIELD_NO_DEFAULT_PROVIDED\n\n @public\n @property\n def default_value(self) -> Any:\n """The default value for the field.\n\n Raises an exception if no default value was provided.\n """\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return self._default_value\n\n @public\n @property\n def description(self) -> Optional[str]:\n """A human-readable description of this config field, if provided."""\n return self._description\n\n @property\n def default_value_as_json_str(self) -> str:\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return serialize_value(self.default_value)\n\n def __repr__(self) -> str:\n return ("Field({config_type}, default={default}, 
is_required={is_required})").format(\n config_type=self.config_type,\n default=(\n "@" if self._default_value == FIELD_NO_DEFAULT_PROVIDED else self._default_value\n ),\n is_required=self.is_required,\n )
\n\n\ndef check_opt_field_param(obj: object, param_name: str) -> Optional[Field]:\n return check.opt_inst_param(cast(Optional[Field], obj), param_name, Field)\n
", "current_page_name": "_modules/dagster/_config/field", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.field"}, "field_utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.field_utils

\n# encoding: utf-8\nimport hashlib\nimport os\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, List, Mapping, Optional, Sequence, Type\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidConfigDefinitionError\n\nfrom .config_type import Array, ConfigType, ConfigTypeKind\n\nif TYPE_CHECKING:\n    from dagster._config import Field\n\n\ndef all_optional_type(config_type: ConfigType) -> bool:\n    check.inst_param(config_type, "config_type", ConfigType)\n\n    if ConfigTypeKind.is_shape(config_type.kind):\n        for field in config_type.fields.values():  # type: ignore\n            if field.is_required:\n                return False\n        return True\n\n    if ConfigTypeKind.is_selector(config_type.kind):\n        if len(config_type.fields) == 1:  # type: ignore\n            for field in config_type.fields.values():  # type: ignore\n                if field.is_required:\n                    return False\n            return True\n\n    return False\n\n\nclass __FieldValueSentinel:\n    pass\n\n\nclass __InferOptionalCompositeFieldSentinel:\n    pass\n\n\nFIELD_NO_DEFAULT_PROVIDED = __FieldValueSentinel\n\nINFER_OPTIONAL_COMPOSITE_FIELD = __InferOptionalCompositeFieldSentinel\n\n\nclass _ConfigHasFields(ConfigType):\n    def __init__(self, fields, **kwargs):\n        self.fields = expand_fields_dict(fields)\n        super(_ConfigHasFields, self).__init__(**kwargs)\n\n    def type_iterator(self) -> Iterator["ConfigType"]:\n        for field in self.fields.values():\n            yield from field.config_type.type_iterator()\n        yield from super().type_iterator()\n\n\nFIELD_HASH_CACHE: Dict[str, Any] = {}\n\n\ndef _memoize_inst_in_field_cache(passed_cls, defined_cls, key):\n    if key in FIELD_HASH_CACHE:\n        return FIELD_HASH_CACHE[key]\n\n    defined_cls_inst = super(defined_cls, passed_cls).__new__(defined_cls)\n    defined_cls_inst._initialized = False  # noqa: SLF001\n    
FIELD_HASH_CACHE[key] = defined_cls_inst\n    return defined_cls_inst\n\n\ndef _add_hash(m, string):\n    m.update(string.encode("utf-8"))\n\n\ndef compute_fields_hash(fields, description, field_aliases=None):\n    m = hashlib.sha1()  # so that hexdigest is 40, not 64 bytes\n    if description:\n        _add_hash(m, ":description: " + description)\n\n    for field_name in sorted(list(fields.keys())):\n        field = fields[field_name]\n        _add_hash(m, ":fieldname:" + field_name)\n        if field.default_provided:\n            _add_hash(m, ":default_value: " + field.default_value_as_json_str)\n        _add_hash(m, ":is_required: " + str(field.is_required))\n        _add_hash(m, ":type_key: " + field.config_type.key)\n        if field.description:\n            _add_hash(m, ":description: " + field.description)\n\n    field_aliases = check.opt_dict_param(\n        field_aliases, "field_aliases", key_type=str, value_type=str\n    )\n    for field_name in sorted(list(field_aliases.keys())):\n        field_alias = field_aliases[field_name]\n        _add_hash(m, ":fieldname: " + field_name)\n        _add_hash(m, ":fieldalias: " + field_alias)\n\n    return m.hexdigest()\n\n\ndef _define_shape_key_hash(fields, description, field_aliases):\n    return "Shape." + compute_fields_hash(fields, description, field_aliases=field_aliases)\n\n\n
[docs]class Shape(_ConfigHasFields):\n """Schema for configuration data with string keys and typed values via :py:class:`Field`.\n\n Unlike :py:class:`Permissive`, unspecified fields are not allowed and will throw a\n :py:class:`~dagster.DagsterInvalidConfigError`.\n\n Args:\n fields (Dict[str, Field]):\n The specification of the config dict.\n field_aliases (Dict[str, str]):\n Maps a string key to an alias that can be used instead of the original key. For example,\n an entry {"foo": "bar"} means that someone could use "bar" instead of "foo" as a\n top level string key.\n """\n\n def __new__(\n cls,\n fields,\n description=None,\n field_aliases=None,\n ):\n return _memoize_inst_in_field_cache(\n cls,\n Shape,\n _define_shape_key_hash(expand_fields_dict(fields), description, field_aliases),\n )\n\n def __init__(\n self,\n fields,\n description=None,\n field_aliases=None,\n ):\n # if we hit in the field cache - skip double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields)\n super(Shape, self).__init__(\n kind=ConfigTypeKind.STRICT_SHAPE,\n key=_define_shape_key_hash(fields, description, field_aliases),\n description=description,\n fields=fields,\n )\n self.field_aliases = check.opt_dict_param(\n field_aliases, "field_aliases", key_type=str, value_type=str\n )\n self._initialized = True
\n\n\n
[docs]class Map(ConfigType):\n """Defines a config dict with arbitrary scalar keys and typed values.\n\n A map can contrain arbitrary keys of the specified scalar type, each of which has\n type checked values. Unlike :py:class:`Shape` and :py:class:`Permissive`, scalar\n keys other than strings can be used, and unlike :py:class:`Permissive`, all\n values are type checked.\n\n Args:\n key_type (type):\n The type of keys this map can contain. Must be a scalar type.\n inner_type (type):\n The type of the values that this map type can contain.\n key_label_name (string):\n Optional name which describes the role of keys in the map.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Map({str: int})))\n def partially_specified_config(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __init__(self, key_type, inner_type, key_label_name=None):\n from .field import resolve_to_config_type\n\n self.key_type = resolve_to_config_type(key_type)\n self.inner_type = resolve_to_config_type(inner_type)\n self.given_name = key_label_name\n\n check.inst_param(self.key_type, "key_type", ConfigType)\n check.inst_param(self.inner_type, "inner_type", ConfigType)\n check.param_invariant(\n self.key_type.kind == ConfigTypeKind.SCALAR, "key_type", "Key type must be a scalar"\n )\n check.opt_str_param(self.given_name, "name")\n\n super(Map, self).__init__(\n key="Map.{key_type}.{inner_type}{name_key}".format(\n key_type=self.key_type.key,\n inner_type=self.inner_type.key,\n name_key=f":name: {key_label_name}" if key_label_name else "",\n ),\n # We use the given name field to store the key label name\n # this is used elsewhere to give custom types names\n given_name=key_label_name,\n type_params=[self.key_type, self.inner_type],\n kind=ConfigTypeKind.MAP,\n )\n\n @public\n @property\n def key_label_name(self) -> Optional[str]:\n """Name which describes the role of keys in the map, if provided."""\n return self.given_name\n\n def 
type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.key_type.type_iterator()\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\ndef _define_permissive_dict_key(fields, description):\n return (\n "Permissive." + compute_fields_hash(fields, description=description)\n if fields\n else "Permissive"\n )\n\n\n
[docs]class Permissive(_ConfigHasFields):\n """Defines a config dict with a partially specified schema.\n\n A permissive dict allows partial specification of the config schema. Any fields with a\n specified schema will be type checked. Other fields will be allowed, but will be ignored by\n the type checker.\n\n Args:\n fields (Dict[str, Field]): The partial specification of the config dict.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Permissive({'required': Field(String)})))\n def map_config_op(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __new__(cls, fields=None, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Permissive,\n _define_permissive_dict_key(\n expand_fields_dict(fields) if fields else None, description\n ),\n )\n\n def __init__(self, fields=None, description=None):\n # if we hit in field cache avoid double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields) if fields else None\n super(Permissive, self).__init__(\n key=_define_permissive_dict_key(fields, description),\n kind=ConfigTypeKind.PERMISSIVE_SHAPE,\n fields=fields or dict(),\n description=description,\n )\n self._initialized = True
\n\n\ndef _define_selector_key(fields, description):\n return "Selector." + compute_fields_hash(fields, description=description)\n\n\n
[docs]class Selector(_ConfigHasFields):\n """Define a config field requiring the user to select one option.\n\n Selectors are used when you want to be able to present several different options in config but\n allow only one to be selected. For example, a single input might be read in from either a csv\n file or a parquet file, but not both at once.\n\n Note that in some other type systems this might be called an 'input union'.\n\n Functionally, a selector is like a :py:class:`Dict`, except that only one key from the dict can\n be specified in valid config.\n\n Args:\n fields (Dict[str, Field]): The fields from which the user must select.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Selector(\n {\n 'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n 'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n 'en': {'whom': Field(String, default_value='world', is_required=False)},\n }\n ),\n is_required=False,\n default_value={'en': {'whom': 'world'}},\n )\n )\n def hello_world_with_default(context):\n if 'haw' in context.op_config:\n return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n if 'cn' in context.op_config:\n return '\u4f60\u597d, {whom}!'.format(whom=context.op_config['cn']['whom'])\n if 'en' in context.op_config:\n return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n """\n\n def __new__(cls, fields, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Selector,\n _define_selector_key(expand_fields_dict(fields), description),\n )\n\n def __init__(self, fields, description=None):\n # if we hit in field cache avoid double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields)\n super(Selector, self).__init__(\n key=_define_selector_key(fields, description),\n kind=ConfigTypeKind.SELECTOR,\n fields=fields,\n description=description,\n )\n self._initialized = True
\n\n\n# Config syntax expansion code below\n\n\ndef is_potential_field(potential_field: object) -> bool:\n from .field import Field, resolve_to_config_type\n\n return isinstance(potential_field, (Field, dict, list)) or bool(\n resolve_to_config_type(potential_field)\n )\n\n\ndef convert_fields_to_dict_type(fields: Mapping[str, object]):\n return _convert_fields_to_dict_type(fields, fields, [])\n\n\ndef _convert_fields_to_dict_type(\n original_root: object, fields: Mapping[str, object], stack: List[str]\n) -> Shape:\n return Shape(_expand_fields_dict(original_root, fields, stack))\n\n\ndef expand_fields_dict(fields: Mapping[str, object]) -> Mapping[str, "Field"]:\n return _expand_fields_dict(fields, fields, [])\n\n\ndef _expand_fields_dict(\n original_root: object, fields: Mapping[str, object], stack: List[str]\n) -> Mapping[str, "Field"]:\n check.mapping_param(fields, "fields")\n return {\n name: _convert_potential_field(original_root, value, stack + [name])\n for name, value in fields.items()\n }\n\n\ndef expand_list(original_root: object, the_list: Sequence[object], stack: List[str]) -> Array:\n if len(the_list) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_list, stack, "List must be of length 1"\n )\n\n inner_type = _convert_potential_type(original_root, the_list[0], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_list,\n stack,\n "List have a single item and contain a valid type i.e. [int]. 
Got item {}".format(\n repr(the_list[0])\n ),\n )\n\n return Array(inner_type)\n\n\ndef expand_map(original_root: object, the_dict: Mapping[object, object], stack: List[str]) -> Map:\n if len(the_dict) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_dict, stack, "Map dict must be of length 1"\n )\n\n key = next(iter(the_dict.keys()))\n key_type = _convert_potential_type(original_root, key, stack)\n if not key_type or not key_type.kind == ConfigTypeKind.SCALAR:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n f"Map dict must have a scalar type as its only key. Got key {key!r}",\n )\n\n inner_type = _convert_potential_type(original_root, the_dict[key], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n "Map must have a single value and contain a valid type i.e. {{str: int}}. Got item {}"\n .format(repr(the_dict[key])),\n )\n\n return Map(key_type, inner_type)\n\n\ndef convert_potential_field(potential_field: object) -> "Field":\n return _convert_potential_field(potential_field, potential_field, [])\n\n\ndef _convert_potential_type(original_root: object, potential_type, stack: List[str]):\n from .field import resolve_to_config_type\n\n if isinstance(potential_type, Mapping):\n # A dictionary, containing a single key which is a type (int, str, etc) and not a string is interpreted as a Map\n if len(potential_type) == 1:\n key = next(iter(potential_type.keys()))\n if not isinstance(key, str) and _convert_potential_type(original_root, key, stack):\n return expand_map(original_root, potential_type, stack)\n\n # Otherwise, the dictionary is interpreted as a Shape\n return Shape(_expand_fields_dict(original_root, potential_type, stack))\n\n if isinstance(potential_type, list):\n return expand_list(original_root, potential_type, stack)\n\n return resolve_to_config_type(potential_type)\n\n\ndef _convert_potential_field(\n original_root: object, 
potential_field: object, stack: List[str]\n) -> "Field":\n from .field import Field\n\n if potential_field is None:\n raise DagsterInvalidConfigDefinitionError(\n original_root, potential_field, stack, reason="Fields cannot be None"\n )\n\n if not is_potential_field(potential_field):\n raise DagsterInvalidConfigDefinitionError(original_root, potential_field, stack)\n\n if isinstance(potential_field, Field):\n return potential_field\n\n return Field(_convert_potential_type(original_root, potential_field, stack))\n\n\ndef config_dictionary_from_values(\n values: Mapping[str, Any], config_field: "Field"\n) -> Dict[str, Any]:\n """Converts a set of config values into a dictionary representation,\n in particular converting EnvVar objects into Dagster config inputs\n and processing data structures such as dicts, lists, and structured Config classes.\n """\n assert ConfigTypeKind.is_shape(config_field.config_type.kind)\n\n from dagster._config.pythonic_config import _config_value_to_dict_representation\n\n return check.is_dict(_config_value_to_dict_representation(None, values))\n\n\ndef _create_direct_access_exception(cls: Type, env_var_name: str) -> Exception:\n return RuntimeError(\n f'Attempted to directly retrieve environment variable {cls.__name__}("{env_var_name}").'\n f" {cls.__name__} defers resolution of the environment variable value until run time, and"\n " should only be used as input to Dagster config or resources.\\n\\nTo access the"\n f" environment variable value, call `get_value` on the {cls.__name__}, or use os.getenv"\n " directly."\n )\n\n\nclass IntEnvVar(int):\n """Class used to represent an environment variable in the Dagster config system.\n\n The environment variable will be resolved to an int value when the config is\n loaded.\n """\n\n name: str\n\n @classmethod\n def create(cls, name: str) -> "IntEnvVar":\n var = IntEnvVar(0)\n var.name = name\n return var\n\n def __int__(self) -> int:\n """Raises an exception of the EnvVar value is directly 
accessed. Users should instead use\n the `get_value` method, or use the EnvVar as an input to Dagster config or resources.\n """\n raise _create_direct_access_exception(self.__class__, self.env_var_name)\n\n def __str__(self) -> str:\n return str(int(self))\n\n def get_value(self, default: Optional[int] = None) -> Optional[int]:\n """Returns the value of the environment variable, or the default value if the\n environment variable is not set. If no default is provided, None will be returned.\n """\n value = os.getenv(self.name, default=default)\n return int(value) if value else None\n\n @property\n def env_var_name(self) -> str:\n """Returns the name of the environment variable."""\n return self.name\n\n\nclass EnvVar(str):\n """Class used to represent an environment variable in the Dagster config system.\n\n This class is intended to be used to populate config fields or resources.\n The environment variable will be resolved to a string value when the config is\n loaded.\n\n To access the value of the environment variable, use the `get_value` method.\n """\n\n @classmethod\n def int(cls, name: str) -> "IntEnvVar":\n return IntEnvVar.create(name=name)\n\n def __str__(self) -> str:\n """Raises an exception of the EnvVar value is directly accessed. Users should instead use\n the `get_value` method, or use the EnvVar as an input to Dagster config or resources.\n """\n raise _create_direct_access_exception(self.__class__, self.env_var_name)\n\n @property\n def env_var_name(self) -> str:\n """Returns the name of the environment variable."""\n return super().__str__()\n\n def get_value(self, default: Optional[str] = None) -> Optional[str]:\n """Returns the value of the environment variable, or the default value if the\n environment variable is not set. If no default is provided, None will be returned.\n """\n return os.getenv(self.env_var_name, default=default)\n
", "current_page_name": "_modules/dagster/_config/field_utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.field_utils"}, "pythonic_config": {"config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.config

\nimport re\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Dict,\n    List,\n    Mapping,\n    Optional,\n    Set,\n    Type,\n    cast,\n)\n\nfrom pydantic import BaseModel\nfrom typing_extensions import TypeVar\n\nimport dagster._check as check\nfrom dagster import (\n    Field,\n    Field as DagsterField,\n    Shape,\n)\nfrom dagster._config.field_utils import (\n    EnvVar,\n    IntEnvVar,\n    Permissive,\n)\nfrom dagster._core.definitions.definition_config_schema import (\n    DefinitionConfigSchema,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidConfigDefinitionError,\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidPythonicConfigDefinitionError,\n)\nfrom dagster._utils.cached_method import CACHED_METHOD_FIELD_SUFFIX\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .conversion_utils import _convert_pydantic_field, safe_is_subclass\nfrom .pydantic_compat_layer import (\n    USING_PYDANTIC_2,\n    ModelFieldCompat,\n    model_config,\n    model_fields,\n)\nfrom .typing_utils import BaseConfigMeta\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nINTERNAL_MARKER = "__internal__"\n\n# ensure that this ends with the internal marker so we can do a single check\nassert CACHED_METHOD_FIELD_SUFFIX.endswith(INTERNAL_MARKER)\n\n\nclass MakeConfigCacheable(BaseModel):\n    """This class centralizes and implements all the chicanery we need in order\n    to support caching decorators. 
If we decide this is a bad idea we can remove it\n    all in one go.\n    """\n\n    # Pydantic config for this class\n    # Cannot use kwargs for base class as this is not support for pydnatic<1.8\n    class Config:\n        # Various pydantic model config (https://docs.pydantic.dev/usage/model_config/)\n        # Necessary to allow for caching decorators\n        arbitrary_types_allowed = True\n        # Avoid pydantic reading a cached property class as part of the schema\n        if USING_PYDANTIC_2:\n            ignored_types = (cached_property,)\n        else:\n            keep_untouched = (cached_property,)\n        # Ensure the class is serializable, for caching purposes\n        frozen = True\n\n    def __setattr__(self, name: str, value: Any):\n        from .resource import ConfigurableResourceFactory\n\n        # This is a hack to allow us to set attributes on the class that are not part of the\n        # config schema. Pydantic will normally raise an error if you try to set an attribute\n        # that is not part of the schema.\n\n        if self._is_field_internal(name):\n            object.__setattr__(self, name, value)\n            return\n\n        try:\n            return super().__setattr__(name, value)\n        except (TypeError, ValueError) as e:\n            clsname = self.__class__.__name__\n            if "Instance is frozen" in str(  # Pydantic 2.x error\n                e\n            ) or "is immutable and does not support item assignment" in str(  # Pydantic 1.x error\n                e\n            ):\n                if isinstance(self, ConfigurableResourceFactory):\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic resource and does not support item assignment,"\n                        " as it inherits from 'pydantic.BaseModel' with frozen=True. 
If trying to"\n                        " maintain state on this resource, consider building a separate, stateful"\n                        " client class, and provide a method on the resource to construct and"\n                        " return the stateful client."\n                    ) from e\n                else:\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic config class and does not support item"\n                        " assignment, as it inherits from 'pydantic.BaseModel' with frozen=True."\n                    ) from e\n            elif "object has no field" in str(e):\n                field_name = check.not_none(\n                    re.search(r"object has no field \\"(.*)\\"", str(e))\n                ).group(1)\n                if isinstance(self, ConfigurableResourceFactory):\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic resource and does not support manipulating"\n                        f" undeclared attribute '{field_name}' as it inherits from"\n                        " 'pydantic.BaseModel' without extra=\\"allow\\". 
If trying to maintain"\n                        " state on this resource, consider building a separate, stateful client"\n                        " class, and provide a method on the resource to construct and return the"\n                        " stateful client."\n                    ) from e\n                else:\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic config class and does not support manipulating"\n                        f" undeclared attribute '{field_name}' as it inherits from"\n                        " 'pydantic.BaseModel' without extra=\\"allow\\"."\n                    ) from e\n            else:\n                raise\n\n    def _is_field_internal(self, name: str) -> bool:\n        return name.endswith(INTERNAL_MARKER)\n\n\nT = TypeVar("T")\n\n\ndef ensure_env_vars_set_post_init(set_value: T, input_value: Any) -> T:\n    """Pydantic 2.x utility. Ensures that Pydantic field values are set to the appropriate\n    EnvVar or IntEnvVar objects post-model-instantiation, since Pydantic 2.x will cast\n    EnvVar or IntEnvVar values to raw strings or ints as part of the model instantiation process.\n    """\n    if isinstance(set_value, dict) and isinstance(input_value, dict):\n        for key, value in input_value.items():\n            if isinstance(value, (EnvVar, IntEnvVar)):\n                set_value[key] = value\n            elif isinstance(value, (dict, list)):\n                set_value[key] = ensure_env_vars_set_post_init(set_value[key], value)\n    if isinstance(set_value, List) and isinstance(input_value, List):\n        for i in range(len(set_value)):\n            value = input_value[i]\n            if isinstance(value, (EnvVar, IntEnvVar)):\n                set_value[i] = value\n            elif isinstance(value, (dict, list)):\n                set_value[i] = ensure_env_vars_set_post_init(set_value[i], value)\n\n    return set_value\n\n\n
[docs]class Config(MakeConfigCacheable, metaclass=BaseConfigMeta):\n """Base class for Dagster configuration models, used to specify config schema for\n ops and assets. Subclasses :py:class:`pydantic.BaseModel`.\n\n Example definition:\n\n .. code-block:: python\n\n from pydantic import Field\n\n class MyAssetConfig(Config):\n my_str: str = "my_default_string"\n my_int_list: List[int]\n my_bool_with_metadata: bool = Field(default=False, description="A bool field")\n\n\n Example usage:\n\n .. code-block:: python\n\n @asset\n def asset_with_config(config: MyAssetConfig):\n assert config.my_str == "my_default_string"\n assert config.my_int_list == [1, 2, 3]\n assert config.my_bool_with_metadata == False\n\n asset_with_config(MyAssetConfig(my_int_list=[1, 2, 3], my_bool_with_metadata=True))\n\n """\n\n def __init__(self, **config_dict) -> None:\n """This constructor is overridden to handle any remapping of raw config dicts to\n the appropriate config classes. For example, discriminated unions are represented\n in Dagster config as dicts with a single key, which is the discriminator value.\n """\n modified_data = {}\n for key, value in config_dict.items():\n field = model_fields(self).get(key)\n\n # This is useful in Pydantic 2.x when reconstructing a config object from a dict\n # e.g. 
when instantiating a resource at runtime from its config dict\n # In Pydantic 1.x, this is a no-op, since a non-required field without a\n # value provided will default to None (required & optional are the same in 1.x)\n if field and not field.is_required() and value is None:\n continue\n\n if field and field.discriminator:\n nested_dict = value\n\n discriminator_key = check.not_none(field.discriminator)\n if isinstance(value, Config):\n nested_dict = _discriminated_union_config_dict_to_selector_config_dict(\n discriminator_key,\n value._get_non_none_public_field_values(), # noqa: SLF001\n )\n\n nested_items = list(check.is_dict(nested_dict).items())\n check.invariant(\n len(nested_items) == 1,\n "Discriminated union must have exactly one key",\n )\n discriminated_value, nested_values = nested_items[0]\n\n modified_data[key] = {\n **nested_values,\n discriminator_key: discriminated_value,\n }\n else:\n modified_data[key] = value\n\n for key, field in model_fields(self).items():\n if field.is_required() and key not in modified_data:\n modified_data[key] = None\n\n super().__init__(**modified_data)\n if USING_PYDANTIC_2:\n self.__dict__ = ensure_env_vars_set_post_init(self.__dict__, modified_data)\n\n def _convert_to_config_dictionary(self) -> Mapping[str, Any]:\n """Converts this Config object to a Dagster config dictionary, in the same format as the dictionary\n accepted as run config or as YAML in the launchpad.\n\n Inner fields are recursively converted to dictionaries, meaning nested config objects\n or EnvVars will be converted to the appropriate dictionary representation.\n """\n public_fields = self._get_non_none_public_field_values()\n return {\n k: _config_value_to_dict_representation(model_fields(self).get(k), v)\n for k, v in public_fields.items()\n }\n\n def _get_non_none_public_field_values(self) -> Mapping[str, Any]:\n """Returns a dictionary representation of this config object,\n ignoring any private fields, and any optional fields that are None.\n\n 
Inner fields are returned as-is in the dictionary,\n meaning any nested config objects will be returned as config objects, not dictionaries.\n """\n output = {}\n for key, value in self.__dict__.items():\n if self._is_field_internal(key):\n continue\n field = model_fields(self).get(key)\n\n if field:\n alias = field.alias or key\n output[alias] = value\n else:\n output[key] = value\n return output\n\n @classmethod\n def to_config_schema(cls) -> DefinitionConfigSchema:\n """Converts the config structure represented by this class into a DefinitionConfigSchema."""\n return DefinitionConfigSchema(infer_schema_from_config_class(cls))\n\n @classmethod\n def to_fields_dict(cls) -> Dict[str, DagsterField]:\n """Converts the config structure represented by this class into a dictionary of dagster.Fields.\n This is useful when interacting with legacy code that expects a dictionary of fields but you\n want the source of truth to be a config class.\n """\n return cast(Shape, cls.to_config_schema().as_field().config_type).fields
\n\n\ndef _discriminated_union_config_dict_to_selector_config_dict(\n discriminator_key: str, config_dict: Mapping[str, Any]\n):\n """Remaps a config dictionary which is a member of a discriminated union to\n the appropriate structure for a Dagster config selector.\n\n A discriminated union with key "my_key" and value "my_value" will be represented\n as {"my_key": "my_value", "my_field": "my_field_value"}. When converted to a selector,\n this should be represented as {"my_value": {"my_field": "my_field_value"}}.\n """\n updated_dict = dict(config_dict)\n discriminator_value = updated_dict.pop(discriminator_key)\n wrapped_dict = {discriminator_value: updated_dict}\n return wrapped_dict\n\n\ndef _config_value_to_dict_representation(field: Optional[ModelFieldCompat], value: Any):\n """Converts a config value to a dictionary representation. If a field is provided, it will be used\n to determine the appropriate dictionary representation in the case of discriminated unions.\n """\n from dagster._config.field_utils import EnvVar, IntEnvVar\n\n if isinstance(value, dict):\n return {k: _config_value_to_dict_representation(None, v) for k, v in value.items()}\n elif isinstance(value, list):\n return [_config_value_to_dict_representation(None, v) for v in value]\n elif isinstance(value, EnvVar):\n return {"env": value.env_var_name}\n elif isinstance(value, IntEnvVar):\n return {"env": value.name}\n if isinstance(value, Config):\n if field and field.discriminator:\n return {\n k: v\n for k, v in _discriminated_union_config_dict_to_selector_config_dict(\n field.discriminator,\n value._convert_to_config_dictionary(), # noqa: SLF001\n ).items()\n }\n else:\n return {k: v for k, v in value._convert_to_config_dictionary().items()} # noqa: SLF001\n elif isinstance(value, Enum):\n return value.name\n\n return value\n\n\n
[docs]class PermissiveConfig(Config):\n """Subclass of :py:class:`Config` that allows arbitrary extra fields. This is useful for\n config classes which may have open-ended inputs.\n\n Example definition:\n\n .. code-block:: python\n\n class MyPermissiveOpConfig(PermissiveConfig):\n my_explicit_parameter: bool\n my_other_explicit_parameter: str\n\n\n Example usage:\n\n .. code-block:: python\n\n @op\n def op_with_config(config: MyPermissiveOpConfig):\n assert config.my_explicit_parameter == True\n assert config.my_other_explicit_parameter == "foo"\n assert config.dict().get("my_implicit_parameter") == "bar"\n\n op_with_config(\n MyPermissiveOpConfig(\n my_explicit_parameter=True,\n my_other_explicit_parameter="foo",\n my_implicit_parameter="bar"\n )\n )\n\n """\n\n # Pydantic config for this class\n # Cannot use kwargs for base class as this is not support for pydantic<1.8\n class Config:\n extra = "allow"
\n\n\ndef infer_schema_from_config_class(\n model_cls: Type["Config"],\n description: Optional[str] = None,\n fields_to_omit: Optional[Set[str]] = None,\n) -> Field:\n from .config import Config\n from .resource import ConfigurableResourceFactory, _is_annotated_as_resource_type\n\n """Parses a structured config class and returns a corresponding Dagster config Field."""\n fields_to_omit = fields_to_omit or set()\n\n check.param_invariant(\n safe_is_subclass(model_cls, Config),\n "Config type annotation must inherit from dagster.Config",\n )\n\n fields: Dict[str, Field] = {}\n for key, pydantic_field_info in model_fields(model_cls).items():\n if _is_annotated_as_resource_type(\n pydantic_field_info.annotation, pydantic_field_info.metadata\n ):\n continue\n\n alias = pydantic_field_info.alias if pydantic_field_info.alias else key\n if key not in fields_to_omit:\n if isinstance(pydantic_field_info.default, Field):\n raise DagsterInvalidDefinitionError(\n "Using 'dagster.Field' is not supported within a Pythonic config or resource"\n " definition. 'dagster.Field' should only be used in legacy Dagster config"\n " schemas. Did you mean to use 'pydantic.Field' instead?"\n )\n\n try:\n fields[alias] = _convert_pydantic_field(pydantic_field_info)\n except DagsterInvalidConfigDefinitionError as e:\n raise DagsterInvalidPythonicConfigDefinitionError(\n config_class=model_cls,\n field_name=key,\n invalid_type=e.current_value,\n is_resource=model_cls is not None\n and safe_is_subclass(model_cls, ConfigurableResourceFactory),\n )\n\n shape_cls = Permissive if model_config(model_cls).get("extra") == "allow" else Shape\n\n docstring = model_cls.__doc__.strip() if model_cls.__doc__ else None\n\n return Field(config=shape_cls(fields), description=description or docstring)\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.config"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.io_manager

\nfrom abc import abstractmethod\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Mapping,\n    Optional,\n    Type,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeVar\n\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n)\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    ResourceFunction,\n)\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster._utils.cached_method import cached_method\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .config import Config\nfrom .conversion_utils import TResValue\nfrom .resource import (\n    AllowDelayedDependencies,\n    ConfigurableResourceFactory,\n    PartialResource,\n    ResourceId,\n    ResourceWithKeyMapping,\n    Self,\n)\nfrom .type_check_utils import safe_is_subclass\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nTIOManagerValue = TypeVar("TIOManagerValue", bound=IOManager)\n\n\nclass ConfigurableIOManagerFactoryResourceDefinition(IOManagerDefinition, AllowDelayedDependencies):\n    def __init__(\n        self,\n        configurable_resource_cls: Type,\n        resource_fn: ResourceFunction,\n        config_schema: Any,\n        description: Optional[str],\n        resolve_resource_keys: Callable[[Mapping[int, str]], AbstractSet[str]],\n        nested_resources: Mapping[str, Any],\n        input_config_schema: Optional[Union[CoercableToConfigSchema, Type[Config]]] = None,\n        output_config_schema: Optional[Union[CoercableToConfigSchema, Type[Config]]] = None,\n        dagster_maintained: bool = False,\n    ):\n        input_config_schema_resolved: CoercableToConfigSchema = 
(\n            cast(Type[Config], input_config_schema).to_config_schema()\n            if safe_is_subclass(input_config_schema, Config)\n            else cast(CoercableToConfigSchema, input_config_schema)\n        )\n        output_config_schema_resolved: CoercableToConfigSchema = (\n            cast(Type[Config], output_config_schema).to_config_schema()\n            if safe_is_subclass(output_config_schema, Config)\n            else cast(CoercableToConfigSchema, output_config_schema)\n        )\n        super().__init__(\n            resource_fn=resource_fn,\n            config_schema=config_schema,\n            description=description,\n            input_config_schema=input_config_schema_resolved,\n            output_config_schema=output_config_schema_resolved,\n        )\n        self._resolve_resource_keys = resolve_resource_keys\n        self._nested_resources = nested_resources\n        self._configurable_resource_cls = configurable_resource_cls\n        self._dagster_maintained = dagster_maintained\n\n    @property\n    def configurable_resource_cls(self) -> Type:\n        return self._configurable_resource_cls\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        return self._resolve_resource_keys(resource_mapping)\n\n\nclass IOManagerWithKeyMapping(ResourceWithKeyMapping, IOManagerDefinition):\n    """Version of ResourceWithKeyMapping wrapper that also implements IOManagerDefinition."""\n\n    def __init__(\n        self,\n        resource: ResourceDefinition,\n        resource_id_to_key_mapping: Dict[ResourceId, str],\n    ):\n        ResourceWithKeyMapping.__init__(self, resource, resource_id_to_key_mapping)\n        IOManagerDefinition.__init__(\n            self, resource_fn=self.resource_fn, config_schema=resource.config_schema\n        )\n\n\n
[docs]class ConfigurableIOManagerFactory(ConfigurableResourceFactory[TIOManagerValue]):\n """Base class for Dagster IO managers that utilize structured config. This base class\n is useful for cases in which the returned IO manager is not the same as the class itself\n (e.g. when it is a wrapper around the actual IO manager implementation).\n\n This class is a subclass of both :py:class:`IOManagerDefinition` and :py:class:`Config`.\n Implementers should provide an implementation of the :py:meth:`resource_function` method,\n which should return an instance of :py:class:`IOManager`.\n\n\n Example definition:\n\n .. code-block:: python\n\n class ExternalIOManager(IOManager):\n\n def __init__(self, connection):\n self._connection = connection\n\n def handle_output(self, context, obj):\n ...\n\n def load_input(self, context):\n ...\n\n class ConfigurableExternalIOManager(ConfigurableIOManagerFactory):\n username: str\n password: str\n\n def create_io_manager(self, context) -> IOManager:\n with database.connect(username, password) as connection:\n return MyExternalIOManager(connection)\n\n defs = Definitions(\n ...,\n resources={\n "io_manager": ConfigurableExternalIOManager(\n username="dagster",\n password=EnvVar("DB_PASSWORD")\n )\n }\n )\n\n """\n\n def __init__(self, **data: Any):\n ConfigurableResourceFactory.__init__(self, **data)\n\n @abstractmethod\n def create_io_manager(self, context) -> TIOManagerValue:\n """Implement as one would implement a @io_manager decorator function."""\n raise NotImplementedError()\n\n def create_resource(self, context: InitResourceContext) -> TIOManagerValue:\n return self.create_io_manager(context)\n\n @classmethod\n def configure_at_launch(cls: "Type[Self]", **kwargs) -> "PartialIOManager[Self]":\n """Returns a partially initialized copy of the IO manager, with remaining config fields\n set at runtime.\n """\n return PartialIOManager(cls, data=kwargs)\n\n @cached_method\n def get_resource_definition(self) -> 
ConfigurableIOManagerFactoryResourceDefinition:\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.__class__,\n resource_fn=self._get_initialize_and_run_fn(),\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n input_config_schema=self.__class__.input_config_schema(),\n output_config_schema=self.__class__.output_config_schema(),\n dagster_maintained=self._is_dagster_maintained(),\n )\n\n @classmethod\n def input_config_schema(\n cls,\n ) -> Optional[Union[CoercableToConfigSchema, Type[Config]]]:\n return None\n\n @classmethod\n def output_config_schema(\n cls,\n ) -> Optional[Union[CoercableToConfigSchema, Type[Config]]]:\n return None
\n\n\nclass PartialIOManager(Generic[TResValue], PartialResource[TResValue]):\n def __init__(\n self,\n resource_cls: Type[ConfigurableResourceFactory[TResValue]],\n data: Dict[str, Any],\n ):\n PartialResource.__init__(self, resource_cls, data)\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableIOManagerFactoryResourceDefinition:\n input_config_schema = None\n output_config_schema = None\n if safe_is_subclass(self.resource_cls, ConfigurableIOManagerFactory):\n factory_cls: Type[ConfigurableIOManagerFactory] = cast(\n Type[ConfigurableIOManagerFactory], self.resource_cls\n )\n input_config_schema = factory_cls.input_config_schema()\n output_config_schema = factory_cls.output_config_schema()\n\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.resource_cls,\n resource_fn=self._state__internal__.resource_fn,\n config_schema=self._state__internal__.config_schema,\n description=self._state__internal__.description,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self._state__internal__.nested_resources,\n input_config_schema=input_config_schema,\n output_config_schema=output_config_schema,\n dagster_maintained=self.resource_cls._is_dagster_maintained(), # noqa: SLF001\n )\n\n\n
[docs]class ConfigurableIOManager(ConfigurableIOManagerFactory, IOManager):\n """Base class for Dagster IO managers that utilize structured config.\n\n This class is a subclass of both :py:class:`IOManagerDefinition`, :py:class:`Config`,\n and :py:class:`IOManager`. Implementers must provide an implementation of the\n :py:meth:`handle_output` and :py:meth:`load_input` methods.\n\n Example definition:\n\n .. code-block:: python\n\n class MyIOManager(ConfigurableIOManager):\n path_prefix: List[str]\n\n def _get_path(self, context) -> str:\n return "/".join(context.asset_key.path)\n\n def handle_output(self, context, obj):\n write_csv(self._get_path(context), obj)\n\n def load_input(self, context):\n return read_csv(self._get_path(context))\n\n defs = Definitions(\n ...,\n resources={\n "io_manager": MyIOManager(path_prefix=["my", "prefix"])\n }\n )\n\n """\n\n def create_io_manager(self, context) -> IOManager:\n return self
\n\n\nclass ConfigurableLegacyIOManagerAdapter(ConfigurableIOManagerFactory):\n """Adapter base class for wrapping a decorated, function-style I/O manager\n with structured config.\n\n To use this class, subclass it, define config schema fields using Pydantic,\n and implement the ``wrapped_io_manager`` method.\n\n Example:\n .. code-block:: python\n\n class OldIOManager(IOManager):\n def __init__(self, base_path: str):\n ...\n\n @io_manager(config_schema={"base_path": str})\n def old_io_manager(context):\n base_path = context.resource_config["base_path"]\n\n return OldIOManager(base_path)\n\n class MyIOManager(ConfigurableLegacyIOManagerAdapter):\n base_path: str\n\n @property\n def wrapped_io_manager(self) -> IOManagerDefinition:\n return old_io_manager\n """\n\n @property\n @abstractmethod\n def wrapped_io_manager(self) -> IOManagerDefinition:\n raise NotImplementedError()\n\n def create_io_manager(self, context) -> IOManager:\n raise NotImplementedError(\n "Because we override resource_fn in the adapter, this is never called."\n )\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableIOManagerFactoryResourceDefinition:\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.__class__,\n resource_fn=self.wrapped_io_manager.resource_fn,\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self._is_dagster_maintained(),\n )\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.io_manager"}, "resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.resource

\nimport contextlib\nimport inspect\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generator,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, TypeGuard, get_args, get_origin\n\nfrom dagster import (\n    Field as DagsterField,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._config.field_utils import config_dictionary_from_values\nfrom dagster._config.pythonic_config.typing_utils import (\n    TypecheckAllowPartialResourceInitParams,\n)\nfrom dagster._config.validate import validate_config\nfrom dagster._core.definitions.definition_config_schema import (\n    ConfiguredDefinitionConfigSchema,\n    DefinitionConfigSchema,\n)\nfrom dagster._core.errors import DagsterInvalidConfigError\nfrom dagster._core.execution.context.init import InitResourceContext, build_init_resource_context\nfrom dagster._utils.cached_method import cached_method\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nfrom abc import ABC, abstractmethod\n\nfrom pydantic import BaseModel\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    ResourceFunction,\n    ResourceFunctionWithContext,\n    ResourceFunctionWithoutContext,\n    has_at_least_one_parameter,\n)\nfrom dagster._core.storage.io_manager import IOManagerDefinition\n\nfrom .config import Config, MakeConfigCacheable, infer_schema_from_config_class\nfrom .conversion_utils import (\n    TResValue,\n    _curry_config_schema,\n)\nfrom .typing_utils import BaseResourceMeta, 
LateBoundTypesForResourceTypeChecking\n\nSelf = TypeVar("Self", bound="ConfigurableResourceFactory")\nResourceId: TypeAlias = int\n\n\nclass AllowDelayedDependencies:\n    _nested_partial_resources: Mapping[str, ResourceDefinition] = {}\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n        # All dependent resources which are not fully configured\n        # must be specified to the Definitions object so that the\n        # resource can be configured at runtime by the user\n        nested_partial_resource_keys = {\n            attr_name: resource_mapping.get(id(resource_def))\n            for attr_name, resource_def in self._nested_partial_resources.items()\n        }\n        check.invariant(\n            all(pointer_key is not None for pointer_key in nested_partial_resource_keys.values()),\n            "Any partially configured, nested resources must be provided to Definitions"\n            f" object: {nested_partial_resource_keys}",\n        )\n\n        # Recursively get all nested resource keys\n        nested_resource_required_keys: Set[str] = set()\n        for v in self._nested_partial_resources.values():\n            nested_resource_required_keys.update(\n                _resolve_required_resource_keys_for_resource(v, resource_mapping)\n            )\n\n        resources, _ = separate_resource_params(\n            cast(Type[BaseModel], self.__class__), self.__dict__\n        )\n        for v in resources.values():\n            nested_resource_required_keys.update(\n                _resolve_required_resource_keys_for_resource(\n                    wrap_resource_for_execution(v), resource_mapping\n                )\n            )\n\n        out = set(cast(Set[str], nested_partial_resource_keys.values())).union(\n            nested_resource_required_keys\n        )\n        return out\n\n\nclass 
InitResourceContextWithKeyMapping(InitResourceContext):\n    """Passes along a mapping from ResourceDefinition id to resource key alongside the\n    InitResourceContext. This is used to resolve the required resource keys for\n    resources which may hold nested partial resources.\n    """\n\n    def __init__(\n        self,\n        context: InitResourceContext,\n        resource_id_to_key_mapping: Mapping[ResourceId, str],\n    ):\n        super().__init__(\n            resource_config=context.resource_config,\n            resources=context.resources,\n            instance=context.instance,\n            resource_def=context.resource_def,\n            dagster_run=context.dagster_run,\n            log_manager=context.log,\n        )\n        self._resource_id_to_key_mapping = resource_id_to_key_mapping\n        self._resources_by_id = {\n            resource_id: getattr(context.resources, resource_key, None)\n            for resource_id, resource_key in resource_id_to_key_mapping.items()\n        }\n\n    @property\n    def resources_by_id(self) -> Mapping[ResourceId, Any]:\n        return self._resources_by_id\n\n    def replace_config(self, config: Any) -> "InitResourceContext":\n        return InitResourceContextWithKeyMapping(\n            super().replace_config(config), self._resource_id_to_key_mapping\n        )\n\n\nclass ResourceWithKeyMapping(ResourceDefinition):\n    """Wrapper around a ResourceDefinition which helps the inner resource resolve its required\n    resource keys. This is useful for resources which may hold nested resources. 
At construction\n    time, they are unaware of the resource keys of their nested resources - the resource id to\n    key mapping is used to resolve this.\n    """\n\n    def __init__(\n        self,\n        resource: ResourceDefinition,\n        resource_id_to_key_mapping: Dict[ResourceId, str],\n    ):\n        self._resource = resource\n        self._resource_id_to_key_mapping = resource_id_to_key_mapping\n\n        ResourceDefinition.__init__(\n            self,\n            resource_fn=self.setup_context_resources_and_call,\n            config_schema=resource.config_schema,\n            description=resource.description,\n            version=resource.version,\n        )\n\n    def setup_context_resources_and_call(self, context: InitResourceContext):\n        """Wrapper around the wrapped resource's resource_fn which attaches its\n        resource id to key mapping to the context, and then calls the nested resource's resource_fn.\n        """\n        context_with_key_mapping = InitResourceContextWithKeyMapping(\n            context, self._resource_id_to_key_mapping\n        )\n\n        if has_at_least_one_parameter(self._resource.resource_fn):\n            return self._resource.resource_fn(context_with_key_mapping)\n        else:\n            return cast(ResourceFunctionWithoutContext, self._resource.resource_fn)()\n\n    @property\n    def required_resource_keys(self) -> AbstractSet[str]:\n        return _resolve_required_resource_keys_for_resource(\n            self._resource, self._resource_id_to_key_mapping\n        )\n\n    @property\n    def wrapped_resource(self) -> ResourceDefinition:\n        return self._resource\n\n    @property\n    def inner_resource(self):\n        return self._resource\n\n\ndef attach_resource_id_to_key_mapping(\n    resource_def: Any, resource_id_to_key_mapping: Dict[ResourceId, str]\n) -> Any:\n    from .io_manager import IOManagerWithKeyMapping\n\n    if isinstance(resource_def, (ConfigurableResourceFactory, 
PartialResource)):\n        defn = resource_def.get_resource_definition()\n        return (\n            IOManagerWithKeyMapping(defn, resource_id_to_key_mapping)\n            if isinstance(defn, IOManagerDefinition)\n            else ResourceWithKeyMapping(defn, resource_id_to_key_mapping)\n        )\n    return resource_def\n\n\nCoercibleToResource: TypeAlias = Union[\n    ResourceDefinition, "ConfigurableResourceFactory", "PartialResource"\n]\n\n\ndef is_coercible_to_resource(val: Any) -> TypeGuard[CoercibleToResource]:\n    return isinstance(val, (ResourceDefinition, ConfigurableResourceFactory, PartialResource))\n\n\nclass ConfigurableResourceFactoryResourceDefinition(ResourceDefinition, AllowDelayedDependencies):\n    def __init__(\n        self,\n        configurable_resource_cls: Type,\n        resource_fn: ResourceFunction,\n        config_schema: Any,\n        description: Optional[str],\n        resolve_resource_keys: Callable[[Mapping[int, str]], AbstractSet[str]],\n        nested_resources: Mapping[str, Any],\n        dagster_maintained: bool = False,\n    ):\n        super().__init__(\n            resource_fn=resource_fn,\n            config_schema=config_schema,\n            description=description,\n        )\n        self._configurable_resource_cls = configurable_resource_cls\n        self._resolve_resource_keys = resolve_resource_keys\n        self._nested_resources = nested_resources\n        self._dagster_maintained = dagster_maintained\n\n    @property\n    def configurable_resource_cls(self) -> Type:\n        return self._configurable_resource_cls\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        return self._resolve_resource_keys(resource_mapping)\n\n    def _is_dagster_maintained(self) -> bool:\n        return 
self._dagster_maintained\n\n\nclass ConfigurableResourceFactoryState(NamedTuple):\n    nested_partial_resources: Mapping[str, Any]\n    resolved_config_dict: Dict[str, Any]\n    config_schema: DefinitionConfigSchema\n    schema: DagsterField\n    nested_resources: Dict[str, Any]\n    resource_context: Optional[InitResourceContext]\n\n\nclass ConfigurableResourceFactory(\n    Generic[TResValue],\n    Config,\n    TypecheckAllowPartialResourceInitParams,\n    AllowDelayedDependencies,\n    ABC,\n    metaclass=BaseResourceMeta,\n):\n    """Base class for creating and managing the lifecycle of Dagster resources that utilize structured config.\n\n    Users should directly inherit from this class when they want the object passed to user-defined\n    code (such as an asset or op) to be different than the object that defines the configuration\n    schema and is passed to the :py:class:`Definitions` object. Cases where this is useful include is\n    when the object passed to user code is:\n\n    * An existing class from a third-party library that the user does not control.\n    * A complex class that requires substantial internal state management or itself requires arguments beyond its config values.\n    * A class with expensive initialization that should not be invoked on code location load, but rather lazily on first use in an op or asset during a run.\n    * A class that you desire to be a plain Python class, rather than a Pydantic class, for whatever reason.\n\n    This class is a subclass of both :py:class:`ResourceDefinition` and :py:class:`Config`, and\n    must implement ``create_resource``, which creates the resource to pass to user code.\n\n    Example definition:\n\n    .. 
code-block:: python\n\n        class DatabaseResource(ConfigurableResourceFactory[Database]):\n            connection_uri: str\n\n            def create_resource(self, _init_context) -> Database:\n                # For example Database could be from a third-party library or require expensive setup.\n                # Or you could just prefer to separate the concerns of configuration and runtime representation\n                return Database(self.connection_uri)\n\n    To use a resource created by a factory in a job, you must use the Resource type annotation.\n\n    Example usage:\n\n\n    .. code-block:: python\n\n        @asset\n        def asset_that_uses_database(database: ResourceParam[Database]):\n            # Database used directly in user code\n            database.query("SELECT * FROM table")\n\n        defs = Definitions(\n            assets=[asset_that_uses_database],\n            resources={"database": DatabaseResource(connection_uri="some_uri")},\n        )\n\n    """\n\n    def __init__(self, **data: Any):\n        resource_pointers, data_without_resources = separate_resource_params(self.__class__, data)\n\n        schema = infer_schema_from_config_class(\n            self.__class__, fields_to_omit=set(resource_pointers.keys())\n        )\n\n        # Populate config values\n        Config.__init__(self, **{**data_without_resources, **resource_pointers})\n\n        # We pull the values from the Pydantic config object, which may cast values\n        # to the correct type under the hood - useful in particular for enums\n        casted_data_without_resources = {\n            k: v\n            for k, v in self._convert_to_config_dictionary().items()\n            if k in data_without_resources\n        }\n        resolved_config_dict = config_dictionary_from_values(casted_data_without_resources, schema)\n\n        self._state__internal__ = ConfigurableResourceFactoryState(\n            # We keep track of any resources we depend on which are not fully 
configured\n            # so that we can retrieve them at runtime\n            nested_partial_resources={\n                k: v for k, v in resource_pointers.items() if (not _is_fully_configured(v))\n            },\n            resolved_config_dict=resolved_config_dict,\n            # These are unfortunately named very similarily\n            config_schema=_curry_config_schema(schema, resolved_config_dict),\n            schema=schema,\n            nested_resources={k: v for k, v in resource_pointers.items()},\n            resource_context=None,\n        )\n\n    @property\n    def _schema(self):\n        return self._state__internal__.schema\n\n    @property\n    def _config_schema(self):\n        return self._state__internal__.config_schema\n\n    @property\n    def _nested_partial_resources(self):\n        return self._state__internal__.nested_partial_resources\n\n    @property\n    def _nested_resources(self):\n        return self._state__internal__.nested_resources\n\n    @property\n    def _resolved_config_dict(self):\n        return self._state__internal__.resolved_config_dict\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        """This should be overridden to return True by all dagster maintained resources and IO managers."""\n        return False\n\n    @classmethod\n    def _is_cm_resource_cls(cls: Type["ConfigurableResourceFactory"]) -> bool:\n        return (\n            cls.yield_for_execution != ConfigurableResourceFactory.yield_for_execution\n            or cls.teardown_after_execution != ConfigurableResourceFactory.teardown_after_execution\n        )\n\n    @property\n    def _is_cm_resource(self) -> bool:\n        return self.__class__._is_cm_resource_cls()  # noqa: SLF001\n\n    def _get_initialize_and_run_fn(self) -> Callable:\n        return self._initialize_and_run_cm if self._is_cm_resource else self._initialize_and_run\n\n    @cached_method\n    def get_resource_definition(self) -> 
ConfigurableResourceFactoryResourceDefinition:\n        return ConfigurableResourceFactoryResourceDefinition(\n            self.__class__,\n            resource_fn=self._get_initialize_and_run_fn(),\n            config_schema=self._config_schema,\n            description=self.__doc__,\n            resolve_resource_keys=self._resolve_required_resource_keys,\n            nested_resources=self.nested_resources,\n            dagster_maintained=self._is_dagster_maintained(),\n        )\n\n    @abstractmethod\n    def create_resource(self, context: InitResourceContext) -> TResValue:\n        """Returns the object that this resource hands to user code, accessible by ops or assets\n        through the context or resource parameters. This works like the function decorated\n        with @resource when using function-based resources.\n        """\n        raise NotImplementedError()\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    @classmethod\n    def configure_at_launch(cls: "Type[Self]", **kwargs) -> "PartialResource[Self]":\n        """Returns a partially initialized copy of the resource, with remaining config fields\n        set at runtime.\n        """\n        return PartialResource(cls, data=kwargs)\n\n    def _with_updated_values(\n        self, values: Optional[Mapping[str, Any]]\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        """Returns a new instance of the resource with the given values.\n        Used when initializing a resource at runtime.\n        """\n        values = check.opt_mapping_param(values, "values", key_type=str)\n        # Since Resource extends BaseModel and is a dataclass, we know that the\n        # signature of any __init__ method will always consist of the fields\n        # of this class. 
We can therefore safely pass in the values as kwargs.\n        out = self.__class__(**{**self._get_non_none_public_field_values(), **values})\n        out._state__internal__ = out._state__internal__._replace(  # noqa: SLF001\n            resource_context=self._state__internal__.resource_context\n        )\n        return out\n\n    @contextlib.contextmanager\n    def _resolve_and_update_nested_resources(\n        self, context: InitResourceContext\n    ) -> Generator["ConfigurableResourceFactory[TResValue]", None, None]:\n        """Updates any nested resources with the resource values from the context.\n        In this case, populating partially configured resources or\n        resources that return plain Python types.\n\n        Returns a new instance of the resource.\n        """\n        from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n        partial_resources_to_update: Dict[str, Any] = {}\n        if self._nested_partial_resources:\n            context_with_mapping = cast(\n                InitResourceContextWithKeyMapping,\n                check.inst(\n                    context,\n                    InitResourceContextWithKeyMapping,\n                    "This ConfiguredResource contains unresolved partially-specified nested"\n                    " resources, and so can only be initialized using a"\n                    " InitResourceContextWithKeyMapping",\n                ),\n            )\n            partial_resources_to_update = {\n                attr_name: context_with_mapping.resources_by_id[id(resource)]\n                for attr_name, resource in self._nested_partial_resources.items()\n            }\n\n        # Also evaluate any resources that are not partial\n        with contextlib.ExitStack() as stack:\n            resources_to_update, _ = separate_resource_params(self.__class__, self.__dict__)\n            resources_to_update = {\n                attr_name: _call_resource_fn_with_default(\n                   
 stack, wrap_resource_for_execution(resource), context\n                )\n                for attr_name, resource in resources_to_update.items()\n                if attr_name not in partial_resources_to_update\n            }\n\n            to_update = {**resources_to_update, **partial_resources_to_update}\n            yield self._with_updated_values(to_update)\n\n    @deprecated(\n        breaking_version="2.0", additional_warn_text="Use `with_replaced_resource_context` instead"\n    )\n    def with_resource_context(\n        self, resource_context: InitResourceContext\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        return self.with_replaced_resource_context(resource_context)\n\n    def with_replaced_resource_context(\n        self, resource_context: InitResourceContext\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        """Returns a new instance of the resource with the given resource init context bound."""\n        # This utility is used to create a copy of this resource, without adjusting\n        # any values in this case\n        copy = self._with_updated_values({})\n        copy._state__internal__ = copy._state__internal__._replace(  # noqa: SLF001\n            resource_context=resource_context\n        )\n        return copy\n\n    def _initialize_and_run(self, context: InitResourceContext) -> TResValue:\n        with self._resolve_and_update_nested_resources(context) as has_nested_resource:\n            updated_resource = has_nested_resource.with_replaced_resource_context(  # noqa: SLF001\n                context\n            )._with_updated_values(context.resource_config)\n\n            updated_resource.setup_for_execution(context)\n            return updated_resource.create_resource(context)\n\n    @contextlib.contextmanager\n    def _initialize_and_run_cm(\n        self, context: InitResourceContext\n    ) -> Generator[TResValue, None, None]:\n        with self._resolve_and_update_nested_resources(context) as has_nested_resource:\n 
           updated_resource = has_nested_resource.with_replaced_resource_context(  # noqa: SLF001\n                context\n            )._with_updated_values(context.resource_config)\n\n            with updated_resource.yield_for_execution(context) as value:\n                yield value\n\n    def setup_for_execution(self, context: InitResourceContext) -> None:\n        """Optionally override this method to perform any pre-execution steps\n        needed before the resource is used in execution.\n        """\n        pass\n\n    def teardown_after_execution(self, context: InitResourceContext) -> None:\n        """Optionally override this method to perform any post-execution steps\n        needed after the resource is used in execution.\n\n        teardown_after_execution will be called even if any part of the run fails.\n        It will not be called if setup_for_execution fails.\n        """\n        pass\n\n    @contextlib.contextmanager\n    def yield_for_execution(self, context: InitResourceContext) -> Generator[TResValue, None, None]:\n        """Optionally override this method to perform any lifecycle steps\n        before or after the resource is used in execution. 
By default, calls\n        setup_for_execution before yielding, and teardown_after_execution after yielding.\n\n        Note that if you override this method and want setup_for_execution or\n        teardown_after_execution to be called, you must invoke them yourself.\n        """\n        self.setup_for_execution(context)\n        try:\n            yield self.create_resource(context)\n        finally:\n            self.teardown_after_execution(context)\n\n    def get_resource_context(self) -> InitResourceContext:\n        """Returns the context that this resource was initialized with."""\n        return check.not_none(\n            self._state__internal__.resource_context,\n            additional_message="Attempted to get context before resource was initialized.",\n        )\n\n    def process_config_and_initialize(self) -> TResValue:\n        """Initializes this resource, fully processing its config and returning the prepared\n        resource value.\n        """\n        from dagster._config.post_process import post_process_config\n\n        return self.from_resource_context(\n            build_init_resource_context(\n                config=post_process_config(\n                    self._config_schema.config_type, self._convert_to_config_dictionary()\n                ).value\n            )\n        )\n\n    @classmethod\n    def from_resource_context(cls, context: InitResourceContext) -> TResValue:\n        """Creates a new instance of this resource from a populated InitResourceContext.\n        Useful when creating a resource from a function-based resource, for backwards\n        compatibility purposes.\n\n        For resources that have custom teardown behavior, use from_resource_context_cm instead.\n\n        Example usage:\n\n        .. 
code-block:: python\n\n            class MyResource(ConfigurableResource):\n                my_str: str\n\n            @resource(config_schema=MyResource.to_config_schema())\n            def my_resource(context: InitResourceContext) -> MyResource:\n                return MyResource.from_resource_context(context)\n\n        """\n        check.invariant(\n            not cls._is_cm_resource_cls(),\n            "Use from_resource_context_cm for resources which have custom teardown behavior,"\n            " e.g. overriding yield_for_execution or teardown_after_execution",\n        )\n        return cls(**context.resource_config or {})._initialize_and_run(context)  # noqa: SLF001\n\n    @classmethod\n    @contextlib.contextmanager\n    def from_resource_context_cm(\n        cls, context: InitResourceContext\n    ) -> Generator[TResValue, None, None]:\n        """Context which generates a new instance of this resource from a populated InitResourceContext.\n        Useful when creating a resource from a function-based resource, for backwards\n        compatibility purposes. Handles custom teardown behavior.\n\n        Example usage:\n\n        .. code-block:: python\n\n            class MyResource(ConfigurableResource):\n                my_str: str\n\n            @resource(config_schema=MyResource.to_config_schema())\n            def my_resource(context: InitResourceContext) -> Generator[MyResource, None, None]:\n                with MyResource.from_resource_context_cm(context) as my_resource:\n                    yield my_resource\n\n        """\n        with cls(**context.resource_config or {})._initialize_and_run_cm(  # noqa: SLF001\n            context\n        ) as value:\n            yield value\n\n\n
[docs]class ConfigurableResource(ConfigurableResourceFactory[TResValue]):\n """Base class for Dagster resources that utilize structured config.\n\n This class is a subclass of both :py:class:`ResourceDefinition` and :py:class:`Config`.\n\n Example definition:\n\n .. code-block:: python\n\n class WriterResource(ConfigurableResource):\n prefix: str\n\n def output(self, text: str) -> None:\n print(f"{self.prefix}{text}")\n\n Example usage:\n\n .. code-block:: python\n\n @asset\n def asset_that_uses_writer(writer: WriterResource):\n writer.output("text")\n\n defs = Definitions(\n assets=[asset_that_uses_writer],\n resources={"writer": WriterResource(prefix="a_prefix")},\n )\n\n """\n\n def create_resource(self, context: InitResourceContext) -> TResValue:\n """Returns the object that this resource hands to user code, accessible by ops or assets\n through the context or resource parameters. This works like the function decorated\n with @resource when using function-based resources.\n\n For ConfigurableResource, this function will return itself, passing\n the actual ConfigurableResource object to user code.\n """\n return cast(TResValue, self)
\n\n\ndef _is_fully_configured(resource: CoercibleToResource) -> bool:\n from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n actual_resource = wrap_resource_for_execution(resource)\n res = (\n validate_config(\n actual_resource.config_schema.config_type,\n (\n actual_resource.config_schema.default_value\n if actual_resource.config_schema.default_provided\n else {}\n ),\n ).success\n is True\n )\n\n return res\n\n\nclass PartialResourceState(NamedTuple):\n nested_partial_resources: Dict[str, Any]\n config_schema: DagsterField\n resource_fn: Callable[[InitResourceContext], Any]\n description: Optional[str]\n nested_resources: Dict[str, Any]\n\n\nclass PartialResource(Generic[TResValue], AllowDelayedDependencies, MakeConfigCacheable):\n data: Dict[str, Any]\n resource_cls: Type[ConfigurableResourceFactory[TResValue]]\n\n def __init__(\n self,\n resource_cls: Type[ConfigurableResourceFactory[TResValue]],\n data: Dict[str, Any],\n ):\n resource_pointers, _data_without_resources = separate_resource_params(resource_cls, data)\n\n MakeConfigCacheable.__init__(self, data=data, resource_cls=resource_cls) # type: ignore # extends BaseModel, takes kwargs\n\n def resource_fn(context: InitResourceContext):\n instantiated = resource_cls(\n **{**data, **context.resource_config}\n ) # So that collisions are resolved in favor of the latest provided run config\n return instantiated._get_initialize_and_run_fn()(context) # noqa: SLF001\n\n self._state__internal__ = PartialResourceState(\n # We keep track of any resources we depend on which are not fully configured\n # so that we can retrieve them at runtime\n nested_partial_resources={\n k: v for k, v in resource_pointers.items() if (not _is_fully_configured(v))\n },\n config_schema=infer_schema_from_config_class(\n resource_cls, fields_to_omit=set(resource_pointers.keys())\n ),\n resource_fn=resource_fn,\n description=resource_cls.__doc__,\n nested_resources={k: v for k, v in 
resource_pointers.items()},\n )\n\n # to make AllowDelayedDependencies work\n @property\n def _nested_partial_resources(\n self,\n ) -> Mapping[str, Any]:\n return self._state__internal__.nested_partial_resources\n\n @property\n def nested_resources(\n self,\n ) -> Mapping[str, Any]:\n return self._state__internal__.nested_resources\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableResourceFactoryResourceDefinition:\n return ConfigurableResourceFactoryResourceDefinition(\n self.resource_cls,\n resource_fn=self._state__internal__.resource_fn,\n config_schema=self._state__internal__.config_schema,\n description=self._state__internal__.description,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self.resource_cls._is_dagster_maintained(), # noqa: SLF001\n )\n\n\nResourceOrPartial: TypeAlias = Union[\n ConfigurableResourceFactory[TResValue], PartialResource[TResValue]\n]\nResourceOrPartialOrValue: TypeAlias = Union[\n ConfigurableResourceFactory[TResValue],\n PartialResource[TResValue],\n ResourceDefinition,\n TResValue,\n]\n\n\nV = TypeVar("V")\n\n\nclass ResourceDependency(Generic[V]):\n def __set_name__(self, _owner, name):\n self._name = name\n\n def __get__(self, obj: "ConfigurableResourceFactory", __owner: Any) -> V:\n return getattr(obj, self._name)\n\n def __set__(self, obj: Optional[object], value: ResourceOrPartialOrValue[V]) -> None:\n setattr(obj, self._name, value)\n\n\nclass ConfigurableLegacyResourceAdapter(ConfigurableResource, ABC):\n """Adapter base class for wrapping a decorated, function-style resource\n with structured config.\n\n To use this class, subclass it, define config schema fields using Pydantic,\n and implement the ``wrapped_resource`` method.\n\n Example:\n .. 
code-block:: python\n\n @resource(config_schema={"prefix": str})\n def writer_resource(context):\n prefix = context.resource_config["prefix"]\n\n def output(text: str) -> None:\n out_txt.append(f"{prefix}{text}")\n\n return output\n\n class WriterResource(ConfigurableLegacyResourceAdapter):\n prefix: str\n\n @property\n def wrapped_resource(self) -> ResourceDefinition:\n return writer_resource\n """\n\n @property\n @abstractmethod\n def wrapped_resource(self) -> ResourceDefinition:\n raise NotImplementedError()\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableResourceFactoryResourceDefinition:\n return ConfigurableResourceFactoryResourceDefinition(\n self.__class__,\n resource_fn=self.wrapped_resource.resource_fn,\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self._is_dagster_maintained(),\n )\n\n def __call__(self, *args, **kwargs):\n return self.wrapped_resource(*args, **kwargs)\n\n\nclass SeparatedResourceParams(NamedTuple):\n resources: Dict[str, Any]\n non_resources: Dict[str, Any]\n\n\ndef _is_annotated_as_resource_type(annotation: Type, metadata: List[str]) -> bool:\n """Determines if a field in a structured config class is annotated as a resource type or not."""\n from .type_check_utils import safe_is_subclass\n\n if metadata and metadata[0] == "resource_dependency":\n return True\n\n is_annotated_as_resource_dependency = get_origin(annotation) == ResourceDependency or getattr(\n annotation, "__metadata__", None\n ) == ("resource_dependency",)\n\n return is_annotated_as_resource_dependency or safe_is_subclass(\n annotation, (ResourceDefinition, ConfigurableResourceFactory)\n )\n\n\nclass ResourceDataWithAnnotation(NamedTuple):\n key: str\n value: Any\n annotation: Type\n annotation_metadata: List[str]\n\n\ndef separate_resource_params(cls: Type[BaseModel], data: Dict[str, Any]) -> 
SeparatedResourceParams:\n """Separates out the key/value inputs of fields in a structured config Resource class which\n are marked as resources (ie, using ResourceDependency) from those which are not.\n """\n keys_by_alias = {\n field.alias if field.alias else key: field for key, field in model_fields(cls).items()\n }\n data_with_annotation: List[ResourceDataWithAnnotation] = [\n # No longer exists in Pydantic 2.x, will need to be updated when we upgrade\n ResourceDataWithAnnotation(\n key=field_key,\n value=field_value,\n annotation=keys_by_alias[field_key].outer_type_,\n annotation_metadata=keys_by_alias[field_key].metadata,\n )\n for field_key, field_value in data.items()\n if field_key in keys_by_alias\n ]\n # We need to grab metadata from the annotation in order to tell if\n # this key was annotated with a typing.Annotated annotation (which we use for resource/resource deps),\n # since Pydantic 2.0 strips that info out and sticks any Annotated metadata in the\n # metadata field\n out = SeparatedResourceParams(\n resources={\n d.key: d.value\n for d in data_with_annotation\n if _is_annotated_as_resource_type(\n d.annotation,\n d.annotation_metadata,\n )\n },\n non_resources={\n d.key: d.value\n for d in data_with_annotation\n if not _is_annotated_as_resource_type(\n d.annotation,\n d.annotation_metadata,\n )\n },\n )\n return out\n\n\ndef _call_resource_fn_with_default(\n stack: contextlib.ExitStack, obj: ResourceDefinition, context: InitResourceContext\n) -> Any:\n from dagster._config.validate import process_config\n\n if isinstance(obj.config_schema, ConfiguredDefinitionConfigSchema):\n value = cast(Dict[str, Any], obj.config_schema.resolve_config({}).value)\n context = context.replace_config(value["config"])\n elif obj.config_schema.default_provided:\n # To explain why we need to process config here;\n # - The resource available on the init context (context.resource_config) has already been processed\n # - The nested resource's config has also already been 
processed, but is only available in the broader run config dictionary.\n # - The only information we have access to here is the unprocessed default value, so we need to process it a second time.\n unprocessed_config = obj.config_schema.default_value\n evr = process_config(\n {"config": obj.config_schema.config_type}, {"config": unprocessed_config}\n )\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Error in config for nested resource ",\n evr.errors,\n unprocessed_config,\n )\n context = context.replace_config(cast(dict, evr.value)["config"])\n\n if has_at_least_one_parameter(obj.resource_fn):\n result = cast(ResourceFunctionWithContext, obj.resource_fn)(context)\n else:\n result = cast(ResourceFunctionWithoutContext, obj.resource_fn)()\n\n is_fn_generator = inspect.isgenerator(obj.resource_fn) or isinstance(\n obj.resource_fn, contextlib.ContextDecorator\n )\n if is_fn_generator:\n return stack.enter_context(cast(contextlib.AbstractContextManager, result))\n else:\n return result\n\n\nLateBoundTypesForResourceTypeChecking.set_actual_types_for_type_checking(\n resource_dep_type=ResourceDependency,\n resource_type=ConfigurableResourceFactory,\n partial_resource_type=PartialResource,\n)\n\n\ndef validate_resource_annotated_function(fn) -> None:\n """Validates any parameters on the decorated function that are annotated with\n :py:class:`dagster.ResourceDefinition`, raising a :py:class:`dagster.DagsterInvalidDefinitionError`\n if any are not also instances of :py:class:`dagster.ConfigurableResource` (these resources should\n instead be wrapped in the :py:func:`dagster.Resource` Annotation).\n """\n from dagster import DagsterInvalidDefinitionError\n from dagster._config.pythonic_config.resource import (\n ConfigurableResource,\n ConfigurableResourceFactory,\n TResValue,\n )\n\n from .type_check_utils import safe_is_subclass\n\n malformed_params = [\n param\n for param in get_function_params(fn)\n if safe_is_subclass(param.annotation, (ResourceDefinition, 
ConfigurableResourceFactory))\n and not safe_is_subclass(param.annotation, ConfigurableResource)\n ]\n if len(malformed_params) > 0:\n malformed_param = malformed_params[0]\n output_type = None\n if safe_is_subclass(malformed_param.annotation, ConfigurableResourceFactory):\n orig_bases = getattr(malformed_param.annotation, "__orig_bases__", None)\n output_type = get_args(orig_bases[0])[0] if orig_bases and len(orig_bases) > 0 else None\n if output_type == TResValue:\n output_type = None\n\n output_type_name = getattr(output_type, "__name__", str(output_type))\n raise DagsterInvalidDefinitionError(\n """Resource param '{param_name}' is annotated as '{annotation_type}', but '{annotation_type}' outputs {value_message} value to user code such as @ops and @assets. This annotation should instead be {annotation_suggestion}""".format(\n param_name=malformed_param.name,\n annotation_type=malformed_param.annotation,\n value_message=f"a '{output_type}'" if output_type else "an unknown",\n annotation_suggestion=(\n f"'ResourceParam[{output_type_name}]'"\n if output_type\n else "'ResourceParam[Any]' or 'ResourceParam[<output type>]'"\n ),\n )\n )\n\n\ndef _resolve_required_resource_keys_for_resource(\n resource: ResourceDefinition, resource_id_to_key_mapping: Mapping[ResourceId, str]\n) -> AbstractSet[str]:\n """Gets the required resource keys for the provided resource, with the assistance of the passed\n resource-id-to-key mapping. For resources which may hold nested partial resources,\n this mapping is used to obtain the top-level resource keys to depend on.\n """\n if isinstance(resource, AllowDelayedDependencies):\n return resource._resolve_required_resource_keys(resource_id_to_key_mapping) # noqa: SLF001\n return resource.required_resource_keys\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.resource"}}, "source": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.source

\nimport os\n\nimport dagster._check as check\n\nfrom .config_type import ScalarUnion\nfrom .errors import PostProcessingError\nfrom .field_utils import Selector\n\nVALID_STRING_SOURCE_TYPES = (str, dict)\n\n\ndef _ensure_env_variable(var):\n    check.str_param(var, "var")\n    value = os.getenv(var)\n    if value is None:\n        raise PostProcessingError(\n            f'You have attempted to fetch the environment variable "{var}" '\n            "which is not set. In order for this execution to succeed it "\n            "must be set in this environment."\n        )\n    return value\n\n\nclass StringSourceType(ScalarUnion):\n    def __init__(self):\n        super(StringSourceType, self).__init__(\n            scalar_type=str,\n            non_scalar_schema=Selector({"env": str}),\n            _key="StringSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, VALID_STRING_SOURCE_TYPES), "value")\n\n        if not isinstance(value, dict):\n            return value\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        return str(_ensure_env_variable(cfg))\n\n\nclass IntSourceType(ScalarUnion):\n    def __init__(self):\n        super(IntSourceType, self).__init__(\n            scalar_type=int,\n            non_scalar_schema=Selector({"env": str}),\n            _key="IntSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, (dict, int)), "value", "Should be pre-validated")\n\n        if not isinstance(value, dict):\n            return value\n\n        check.invariant(len(value) == 1, "Selector should have one entry")\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        value = _ensure_env_variable(cfg)\n        try:\n            return int(value)\n        except ValueError as e:\n            raise PostProcessingError(\n       
         f'Value "{value}" stored in env variable "{cfg}" cannot be coerced into an int.'\n            ) from e\n\n\nclass BoolSourceType(ScalarUnion):\n    def __init__(self):\n        super(BoolSourceType, self).__init__(\n            scalar_type=bool,\n            non_scalar_schema=Selector({"env": str}),\n            _key="BoolSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, (dict, bool)), "value", "Should be pre-validated")\n\n        if not isinstance(value, dict):\n            return value\n\n        check.invariant(len(value) == 1, "Selector should have one entry")\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        value = _ensure_env_variable(cfg)\n        try:\n            return bool(value)\n        except ValueError as e:\n            raise PostProcessingError(\n                (\n                    'Value "{value}" stored in env variable "{var}" cannot be coerced into an bool.'\n                ).format(value=value, var=cfg)\n            ) from e\n\n\nStringSource: StringSourceType = StringSourceType()\nIntSource: IntSourceType = IntSourceType()\nBoolSource: BoolSourceType = BoolSourceType()\n
", "current_page_name": "_modules/dagster/_config/source", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.source"}}, "_core": {"definitions": {"asset_check_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_check_result

\nfrom typing import TYPE_CHECKING, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationTargetMaterializationData,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSeverity\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.compute import StepExecutionContext\n\n\n
[docs]@experimental\nclass AssetCheckResult(\n NamedTuple(\n "_AssetCheckResult",\n [\n ("passed", PublicAttr[bool]),\n ("asset_key", PublicAttr[Optional[AssetKey]]),\n ("check_name", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("severity", PublicAttr[AssetCheckSeverity]),\n ],\n )\n):\n """The result of an asset check.\n\n Attributes:\n asset_key (Optional[AssetKey]):\n The asset key that was checked.\n check_name (Optional[str]):\n The name of the check.\n passed (bool):\n The pass/fail result of the check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n severity (AssetCheckSeverity):\n Severity of the check. Defaults to ERROR.\n\n """\n\n def __new__(\n cls,\n *,\n passed: bool,\n asset_key: Optional[CoercibleToAssetKey] = None,\n check_name: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n severity: AssetCheckSeverity = AssetCheckSeverity.ERROR,\n ):\n normalized_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n return super().__new__(\n cls,\n asset_key=AssetKey.from_coercible(asset_key) if asset_key is not None else None,\n check_name=check.opt_str_param(check_name, "check_name"),\n passed=check.bool_param(passed, "passed"),\n metadata=normalized_metadata,\n severity=check.inst_param(severity, "severity", AssetCheckSeverity),\n )\n\n def to_asset_check_evaluation(\n self, step_context: "StepExecutionContext"\n ) -> AssetCheckEvaluation:\n spec_check_names_by_asset_key = (\n step_context.job_def.asset_layer.get_check_names_by_asset_key_for_node_handle(\n step_context.node_handle.root\n )\n )\n\n asset_keys_with_specs = spec_check_names_by_asset_key.keys()\n\n if self.asset_key is 
not None:\n if self.asset_key not in asset_keys_with_specs:\n raise DagsterInvariantViolationError(\n "Received unexpected AssetCheckResult. It targets asset"\n f" '{self.asset_key.to_user_string()}' which is not targeted by any of the"\n " checks currently being evaluated. Targeted assets:"\n f" {[asset_key.to_user_string() for asset_key in asset_keys_with_specs]}."\n )\n\n resolved_asset_key = self.asset_key\n\n else:\n if len(spec_check_names_by_asset_key) > 1:\n raise DagsterInvariantViolationError(\n "AssetCheckResult didn't specify an asset key, but there are multiple assets"\n " to choose from:"\n f" {[asset_key.to_user_string() for asset_key in spec_check_names_by_asset_key.keys()]}"\n )\n\n resolved_asset_key = next(iter(asset_keys_with_specs))\n\n check_names_with_specs = spec_check_names_by_asset_key[resolved_asset_key]\n if self.check_name is not None:\n if self.check_name not in check_names_with_specs:\n raise DagsterInvariantViolationError(\n "Received unexpected AssetCheckResult. No checks currently being evaluated"\n f" target asset '{resolved_asset_key.to_user_string()}' and have name"\n f" '{self.check_name}'. 
Checks being evaluated for this asset:"\n f" {check_names_with_specs}"\n )\n\n resolved_check_name = self.check_name\n else:\n if len(check_names_with_specs) > 1:\n raise DagsterInvariantViolationError(\n "AssetCheckResult result didn't specify a check name, but there are multiple"\n " checks to choose from for the this asset key:"\n f" {check_names_with_specs}"\n )\n\n resolved_check_name = next(iter(check_names_with_specs))\n\n input_asset_info = step_context.get_input_asset_version_info(resolved_asset_key)\n if input_asset_info is not None:\n target_materialization_data = AssetCheckEvaluationTargetMaterializationData(\n run_id=input_asset_info.run_id,\n storage_id=input_asset_info.storage_id,\n timestamp=input_asset_info.timestamp,\n )\n else:\n target_materialization_data = None\n\n return AssetCheckEvaluation(\n check_name=resolved_check_name,\n asset_key=resolved_asset_key,\n passed=self.passed,\n metadata=self.metadata,\n target_materialization_data=target_materialization_data,\n severity=self.severity,\n )\n\n def get_spec_python_identifier(\n self, *, asset_key: Optional[AssetKey] = None, check_name: Optional[str] = None\n ) -> str:\n """Returns a string uniquely identifying the asset check spec associated with this result.\n This is used for the output name associated with an `AssetCheckResult`.\n """\n asset_key = asset_key or self.asset_key\n check_name = check_name or self.check_name\n assert asset_key is not None, "Asset key must be provided if not set on spec"\n assert asset_key is not None, "Asset key must be provided if not set on spec"\n return f"{asset_key.to_python_identifier()}_{self.check_name}"
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_check_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_check_result"}, "asset_check_spec": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_check_spec

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._serdes.serdes import whitelist_for_serdes\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.source_asset import SourceAsset\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass AssetCheckSeverity(Enum):\n """Severity level for an asset check.\n\n Severities:\n\n - WARN: If the check fails, don't fail the step.\n - ERROR: If the check fails, fail the step and, within the run, skip materialization of any\n assets that are downstream of the asset being checked.\n """\n\n WARN = "WARN"\n ERROR = "ERROR"
\n\n\n
[docs]@experimental\n@whitelist_for_serdes(old_storage_names={"AssetCheckHandle"})\nclass AssetCheckKey(NamedTuple):\n """Check names are expected to be unique per-asset. Thus, this combination of asset key and\n check name uniquely identifies an asset check within a deployment.\n """\n\n asset_key: PublicAttr[AssetKey]\n name: PublicAttr[str]\n\n @staticmethod\n def from_graphql_input(graphql_input: Mapping[str, Any]) -> "AssetCheckKey":\n return AssetCheckKey(\n asset_key=AssetKey.from_graphql_input(graphql_input["assetKey"]),\n name=graphql_input["name"],\n )
\n\n\n
[docs]@experimental\nclass AssetCheckSpec(\n NamedTuple(\n "_AssetCheckSpec",\n [\n ("name", PublicAttr[str]),\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines information about an asset check, except how to execute it.\n\n AssetCheckSpec is often used as an argument to decorators that decorator a function that can\n execute multiple checks - e.g. `@asset`, and `@multi_asset`. It defines one of the checks that\n will be executed inside that function.\n\n Args:\n name (str): Name of the check.\n asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]): The asset that\n the check applies to.\n description (Optional[str]): Description for the check.\n """\n\n def __new__(\n cls,\n name: str,\n *,\n asset: Union[CoercibleToAssetKey, "AssetsDefinition", "SourceAsset"],\n description: Optional[str] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n asset_key=AssetKey.from_coercible_or_definition(asset),\n description=check.opt_str_param(description, "description"),\n )\n\n def get_python_identifier(self) -> str:\n """Returns a string uniquely identifying the asset check, that uses only the characters\n allowed in a Python identifier.\n """\n return f"{self.asset_key.to_python_identifier()}_{self.name}"\n\n @property\n def key(self) -> AssetCheckKey:\n return AssetCheckKey(self.asset_key, self.name)
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_check_spec", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_check_spec"}, "asset_dep": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_dep

\nfrom typing import NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_spec import AssetSpec\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.partition_mapping import PartitionMapping\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\n\nCoercibleToAssetDep = Union[\n    CoercibleToAssetKey, AssetSpec, AssetsDefinition, SourceAsset, "AssetDep"\n]\n\n\n
[docs]@experimental\nclass AssetDep(\n NamedTuple(\n "_AssetDep",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("partition_mapping", PublicAttr[Optional[PartitionMapping]]),\n ],\n )\n):\n """Specifies a dependency on an upstream asset.\n\n Attributes:\n asset (Union[AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset]): The upstream asset to depend on.\n partition_mapping (Optional[PartitionMapping]): Defines what partitions to depend on in\n the upstream asset. If not provided and the upstream asset is partitioned, defaults to\n the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n\n Examples:\n .. code-block:: python\n\n upstream_asset = AssetSpec("upstream_asset")\n downstream_asset = AssetSpec(\n "downstream_asset",\n deps=[\n AssetDep(\n upstream_asset,\n partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1)\n )\n ]\n )\n """\n\n def __new__(\n cls,\n asset: Union[CoercibleToAssetKey, AssetSpec, AssetsDefinition, SourceAsset],\n *,\n partition_mapping: Optional[PartitionMapping] = None,\n ):\n if isinstance(asset, list):\n check.list_param(asset, "asset", of_type=str)\n else:\n check.inst_param(\n asset, "asset", (AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset)\n )\n if isinstance(asset, AssetsDefinition) and len(asset.keys) > 1:\n # Only AssetsDefinition with a single asset can be passed\n raise DagsterInvalidDefinitionError(\n "Cannot create an AssetDep from a multi_asset AssetsDefinition."\n " Instead, specify dependencies on the assets created by the multi_asset"\n f" via AssetKeys or strings. 
For the multi_asset {asset.node_def.name}, the"\n f" available keys are: {asset.keys}."\n )\n\n asset_key = _get_asset_key(asset)\n\n return super().__new__(\n cls,\n asset_key=asset_key,\n partition_mapping=check.opt_inst_param(\n partition_mapping,\n "partition_mapping",\n PartitionMapping,\n ),\n )\n\n @staticmethod\n def from_coercible(arg: "CoercibleToAssetDep") -> "AssetDep":\n # if arg is AssetDep, return the original object to retain partition_mapping\n return arg if isinstance(arg, AssetDep) else AssetDep(asset=arg)
\n\n\ndef _get_asset_key(arg: "CoercibleToAssetDep") -> AssetKey:\n if isinstance(arg, (AssetsDefinition, SourceAsset, AssetSpec)):\n return arg.key\n elif isinstance(arg, AssetDep):\n return arg.asset_key\n else:\n return AssetKey.from_coercible(arg)\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_dep", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_dep"}, "asset_in": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_in

\nfrom typing import Mapping, NamedTuple, Optional, Sequence, Type, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n)\nfrom dagster._core.definitions.input import NoValueSentinel\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping\nfrom dagster._core.types.dagster_type import DagsterType, resolve_dagster_type\n\nfrom .partition_mapping import PartitionMapping\n\n\n
[docs]class AssetIn(\n NamedTuple(\n "_AssetIn",\n [\n ("key", PublicAttr[Optional[AssetKey]]),\n ("metadata", PublicAttr[Optional[ArbitraryMetadataMapping]]),\n ("key_prefix", PublicAttr[Optional[Sequence[str]]]),\n ("input_manager_key", PublicAttr[Optional[str]]),\n ("partition_mapping", PublicAttr[Optional[PartitionMapping]]),\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ],\n )\n):\n """Defines an asset dependency.\n\n Attributes:\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the input name. Only one of the "key_prefix" and\n "key" arguments should be provided.\n key (Optional[Union[str, Sequence[str], AssetKey]]): The asset's key. Only one of the\n "key_prefix" and "key" arguments should be provided.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the input.\n For example, if you only need a subset of columns from an upstream table, you could\n include that in metadata and the IO manager that loads the upstream table could use the\n metadata to determine which columns to load.\n partition_mapping (Optional[PartitionMapping]): Defines what partitions to depend on in\n the upstream asset. 
If not provided, defaults to the default partition mapping for the\n partitions definition, which is typically maps partition keys to the same partition keys\n in upstream assets.\n dagster_type (DagsterType): Allows specifying type validation functions that\n will be executed on the input of the decorated function before it runs.\n """\n\n def __new__(\n cls,\n key: Optional[CoercibleToAssetKey] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n input_manager_key: Optional[str] = None,\n partition_mapping: Optional[PartitionMapping] = None,\n dagster_type: Union[DagsterType, Type[NoValueSentinel]] = NoValueSentinel,\n ):\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n check.invariant(\n not (key and key_prefix), "key and key_prefix cannot both be set on AssetIn"\n )\n\n return super(AssetIn, cls).__new__(\n cls,\n key=AssetKey.from_coercible(key) if key is not None else None,\n metadata=check.opt_inst_param(metadata, "metadata", Mapping),\n key_prefix=check.opt_list_param(key_prefix, "key_prefix", of_type=str),\n input_manager_key=check.opt_str_param(input_manager_key, "input_manager_key"),\n partition_mapping=check.opt_inst_param(\n partition_mapping, "partition_mapping", PartitionMapping\n ),\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_in", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_in"}, "asset_out": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_out

\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence, Type, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n)\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.input import NoValueSentinel\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.definitions.output import Out\nfrom dagster._core.definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom dagster._core.types.dagster_type import DagsterType, resolve_dagster_type\n\n\n
[docs]class AssetOut(\n NamedTuple(\n "_AssetOut",\n [\n ("key", PublicAttr[Optional[AssetKey]]),\n ("key_prefix", PublicAttr[Optional[Sequence[str]]]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n ("io_manager_key", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("is_required", PublicAttr[bool]),\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("group_name", PublicAttr[Optional[str]]),\n ("code_version", PublicAttr[Optional[str]]),\n ("freshness_policy", PublicAttr[Optional[FreshnessPolicy]]),\n ("auto_materialize_policy", PublicAttr[Optional[AutoMaterializePolicy]]),\n ("backfill_policy", PublicAttr[Optional[BackfillPolicy]]),\n ],\n )\n):\n """Defines one of the assets produced by a :py:func:`@multi_asset <multi_asset>`.\n\n Attributes:\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name. When using ``@multi_asset``, the\n asset name defaults to the key of the "outs" dictionary Only one of the "key_prefix" and\n "key" arguments should be provided.\n key (Optional[Union[str, Sequence[str], AssetKey]]): The asset's key. Only one of the\n "key_prefix" and "key" arguments should be provided.\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. 
(default: True)\n io_manager_key (Optional[str]): The resource key of the IO manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If\n not provided, the name "default" is used.\n code_version (Optional[str]): The version of the code that generates this asset.\n freshness_policy (Optional[FreshnessPolicy]): A policy which indicates how up to date this\n asset is intended to be.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply to\n the specified asset.\n backfill_policy (Optional[BackfillPolicy]): BackfillPolicy to apply to the specified asset.\n """\n\n def __new__(\n cls,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n key: Optional[CoercibleToAssetKey] = None,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n group_name: Optional[str] = None,\n code_version: Optional[str] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n ):\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n return super(AssetOut, cls).__new__(\n cls,\n key=AssetKey.from_coercible(key) if key is not None else None,\n key_prefix=check.opt_list_param(key_prefix, "key_prefix", of_type=str),\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=check.opt_str_param(description, "description"),\n 
is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default=DEFAULT_IO_MANAGER_KEY\n ),\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n group_name=check.opt_str_param(group_name, "group_name"),\n code_version=check.opt_str_param(code_version, "code_version"),\n freshness_policy=check.opt_inst_param(\n freshness_policy, "freshness_policy", FreshnessPolicy\n ),\n auto_materialize_policy=check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n ),\n backfill_policy=check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n ),\n )\n\n def to_out(self) -> Out:\n return Out(\n dagster_type=self.dagster_type,\n description=self.description,\n metadata=self.metadata,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n code_version=self.code_version,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_out", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_out"}, "asset_selection": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_selection

\nimport collections.abc\nimport operator\nfrom abc import ABC, abstractmethod\nfrom functools import reduce\nfrom typing import AbstractSet, Iterable, Optional, Sequence, Union, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, public\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.errors import DagsterInvalidSubsetError\nfrom dagster._core.selector.subset_selector import (\n    fetch_connected,\n    fetch_sinks,\n    fetch_sources,\n    parse_clause,\n)\n\nfrom .asset_check_spec import AssetCheckKey\nfrom .asset_graph import AssetGraph, InternalAssetGraph\nfrom .assets import AssetsDefinition\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n    key_prefix_from_coercible,\n)\nfrom .source_asset import SourceAsset\n\nCoercibleToAssetSelection: TypeAlias = Union[\n    str,\n    Sequence[str],\n    Sequence[AssetKey],\n    Sequence[Union["AssetsDefinition", "SourceAsset"]],\n    "AssetSelection",\n]\n\n\n
[docs]class AssetSelection(ABC):\n """An AssetSelection defines a query over a set of assets and asset checks, normally all that are defined in a code location.\n\n You can use the "|", "&", and "-" operators to create unions, intersections, and differences of selections, respectively.\n\n AssetSelections are typically used with :py:func:`define_asset_job`.\n\n By default, selecting assets will also select all of the asset checks that target those assets.\n\n Examples:\n .. code-block:: python\n\n # Select all assets in group "marketing":\n AssetSelection.groups("marketing")\n\n # Select all assets in group "marketing", as well as the asset with key "promotion":\n AssetSelection.groups("marketing") | AssetSelection.keys("promotion")\n\n # Select all assets in group "marketing" that are downstream of asset "leads":\n AssetSelection.groups("marketing") & AssetSelection.keys("leads").downstream()\n\n # Select a list of assets:\n AssetSelection.assets(*my_assets_list)\n\n # Select all assets except for those in group "marketing"\n AssetSelection.all() - AssetSelection.groups("marketing")\n\n # Select all assets which are materialized by the same op as "projections":\n AssetSelection.keys("projections").required_multi_asset_neighbors()\n\n # Select all assets in group "marketing" and exclude their asset checks:\n AssetSelection.groups("marketing") - AssetSelection.all_asset_checks()\n\n # Select all asset checks that target a list of assets:\n AssetSelection.checks_for_assets(*my_assets_list)\n\n # Select a specific asset check:\n AssetSelection.checks(my_asset_check)\n\n """\n\n
[docs] @public\n @staticmethod\n def all() -> "AllSelection":\n """Returns a selection that includes all assets and asset checks."""\n return AllSelection()
\n\n
[docs] @public\n @staticmethod\n def all_asset_checks() -> "AllAssetCheckSelection":\n """Returns a selection that includes all asset checks."""\n return AllAssetCheckSelection()
\n\n
[docs] @public\n @staticmethod\n def assets(*assets_defs: AssetsDefinition) -> "KeysAssetSelection":\n """Returns a selection that includes all of the provided assets and asset checks that target them."""\n return KeysAssetSelection(*(key for assets_def in assets_defs for key in assets_def.keys))
\n\n
[docs] @public\n @staticmethod\n def keys(*asset_keys: CoercibleToAssetKey) -> "KeysAssetSelection":\n """Returns a selection that includes assets with any of the provided keys and all asset checks that target them.\n\n Examples:\n .. code-block:: python\n\n AssetSelection.keys(AssetKey(["a"]))\n\n AssetSelection.keys("a")\n\n AssetSelection.keys(AssetKey(["a"]), AssetKey(["b"]))\n\n AssetSelection.keys("a", "b")\n\n asset_key_list = [AssetKey(["a"]), AssetKey(["b"])]\n AssetSelection.keys(*asset_key_list)\n """\n _asset_keys = [\n AssetKey.from_user_string(key) if isinstance(key, str) else AssetKey.from_coercible(key)\n for key in asset_keys\n ]\n return KeysAssetSelection(*_asset_keys)
\n\n
[docs] @public\n @staticmethod\n def key_prefixes(\n *key_prefixes: CoercibleToAssetKeyPrefix, include_sources: bool = False\n ) -> "KeyPrefixesAssetSelection":\n """Returns a selection that includes assets that match any of the provided key prefixes and all the asset checks that target them.\n\n Args:\n include_sources (bool): If True, then include source assets matching the key prefix(es)\n in the selection.\n\n Examples:\n .. code-block:: python\n\n # match any asset key where the first segment is equal to "a" or "b"\n # e.g. AssetKey(["a", "b", "c"]) would match, but AssetKey(["abc"]) would not.\n AssetSelection.key_prefixes("a", "b")\n\n # match any asset key where the first two segments are ["a", "b"] or ["a", "c"]\n AssetSelection.key_prefixes(["a", "b"], ["a", "c"])\n """\n _asset_key_prefixes = [key_prefix_from_coercible(key_prefix) for key_prefix in key_prefixes]\n return KeyPrefixesAssetSelection(*_asset_key_prefixes, include_sources=include_sources)
\n\n
[docs] @public\n @staticmethod\n def groups(*group_strs, include_sources: bool = False) -> "GroupsAssetSelection":\n """Returns a selection that includes materializable assets that belong to any of the\n provided groups and all the asset checks that target them.\n\n Args:\n include_sources (bool): If True, then include source assets matching the group in the\n selection.\n """\n check.tuple_param(group_strs, "group_strs", of_type=str)\n return GroupsAssetSelection(*group_strs, include_sources=include_sources)
\n\n
[docs] @public\n @staticmethod\n def checks_for_assets(*assets_defs: AssetsDefinition) -> "AssetChecksForAssetKeys":\n """Returns a selection with the asset checks that target the provided assets."""\n return AssetChecksForAssetKeys(\n [key for assets_def in assets_defs for key in assets_def.keys]\n )
\n\n
[docs] @public\n @staticmethod\n def checks(*asset_checks: AssetChecksDefinition) -> "AssetChecksForHandles":\n """Returns a selection that includes all of the provided asset checks."""\n return AssetChecksForHandles(\n [\n AssetCheckKey(asset_key=AssetKey.from_coercible(spec.asset_key), name=spec.name)\n for checks_def in asset_checks\n for spec in checks_def.specs\n ]\n )
\n\n
[docs] @public\n def downstream(\n self, depth: Optional[int] = None, include_self: bool = True\n ) -> "DownstreamAssetSelection":\n """Returns a selection that includes all assets that are downstream of any of the assets in\n this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates through each\n asset in this selection and returns the union of all downstream assets.\n\n depth (Optional[int]): If provided, then only include assets to the given depth. A depth\n of 2 means all assets that are children or grandchildren of the assets in this\n selection.\n include_self (bool): If True, then include the assets in this selection in the result.\n If the include_self flag is False, return each downstream asset that is not part of the\n original selection. By default, set to True.\n """\n check.opt_int_param(depth, "depth")\n check.opt_bool_param(include_self, "include_self")\n return DownstreamAssetSelection(self, depth=depth, include_self=include_self)
\n\n
[docs] @public\n def upstream(\n self, depth: Optional[int] = None, include_self: bool = True\n ) -> "UpstreamAssetSelection":\n """Returns a selection that includes all materializable assets that are upstream of any of\n the assets in this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates\n through each asset in this selection and returns the union of all upstream assets.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as upstream of regular assets.\n\n Args:\n depth (Optional[int]): If provided, then only include assets to the given depth. A depth\n of 2 means all assets that are parents or grandparents of the assets in this\n selection.\n include_self (bool): If True, then include the assets in this selection in the result.\n If the include_self flag is False, return each upstream asset that is not part of the\n original selection. By default, set to True.\n """\n check.opt_int_param(depth, "depth")\n check.opt_bool_param(include_self, "include_self")\n return UpstreamAssetSelection(self, depth=depth, include_self=include_self)
\n\n
[docs] @public\n def sinks(self) -> "SinkAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the sink\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A sink asset is an asset that has no downstream dependencies within the asset selection.\n The sink asset can have downstream dependencies outside of the asset selection.\n """\n return SinkAssetSelection(self)
\n\n
[docs] @public\n def required_multi_asset_neighbors(self) -> "RequiredNeighborsAssetSelection":\n """Given an asset selection in which some assets are output from a multi-asset compute op\n which cannot be subset, returns a new asset selection that contains all of the assets\n required to execute the original asset selection. Includes the asset checks targeting the returned assets.\n """\n return RequiredNeighborsAssetSelection(self)
\n\n
[docs] @public\n def roots(self) -> "RootAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the root\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A root asset is an asset that has no upstream dependencies within the asset selection.\n The root asset can have downstream dependencies outside of the asset selection.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as roots. To select source assets,\n use the `upstream_source_assets` method.\n """\n return RootAssetSelection(self)
\n\n
[docs] @public\n @deprecated(breaking_version="2.0", additional_warn_text="Use AssetSelection.roots instead.")\n def sources(self) -> "RootAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the root\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A root asset is a materializable asset that has no upstream dependencies within the asset\n selection. The root asset can have downstream dependencies outside of the asset selection.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as roots. To select source assets,\n use the `upstream_source_assets` method.\n """\n return self.roots()
\n\n
[docs] @public\n def upstream_source_assets(self) -> "SourceAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the source\n assets upstream of assets in the original selection. Includes the asset checks targeting the returned assets.\n """\n return SourceAssetSelection(self)
\n\n
[docs] @public\n def without_checks(self) -> "AssetSelection":\n """Removes all asset checks in the selection."""\n return self - AssetSelection.all_asset_checks()
\n\n def __or__(self, other: "AssetSelection") -> "OrAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return OrAssetSelection(self, other)\n\n def __and__(self, other: "AssetSelection") -> "AndAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return AndAssetSelection(self, other)\n\n def __sub__(self, other: "AssetSelection") -> "SubAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return SubAssetSelection(self, other)\n\n def resolve(\n self, all_assets: Union[Iterable[Union[AssetsDefinition, SourceAsset]], AssetGraph]\n ) -> AbstractSet[AssetKey]:\n if isinstance(all_assets, AssetGraph):\n asset_graph = all_assets\n else:\n check.iterable_param(all_assets, "all_assets", (AssetsDefinition, SourceAsset))\n asset_graph = AssetGraph.from_assets(all_assets)\n\n resolved = self.resolve_inner(asset_graph)\n resolved_source_assets = asset_graph.source_asset_keys & resolved\n resolved_regular_assets = resolved - asset_graph.source_asset_keys\n check.invariant(\n not (len(resolved_source_assets) > 0 and len(resolved_regular_assets) > 0),\n "Asset selection specified both regular assets and source assets. This is not"\n " currently supported. Selections must be all regular assets or all source assets.",\n )\n return resolved\n\n @abstractmethod\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n raise NotImplementedError()\n\n def resolve_checks(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n """We don't need this method currently, but it makes things consistent with resolve_inner. Currently\n we don't store checks in the ExternalAssetGraph, so we only support InternalAssetGraph.\n """\n return self.resolve_checks_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n """By default, resolve to checks that target the selected assets. 
This is overriden for particular selections."""\n asset_keys = self.resolve(asset_graph)\n return {handle for handle in asset_graph.asset_check_keys if handle.asset_key in asset_keys}\n\n @staticmethod\n def _selection_from_string(string: str) -> "AssetSelection":\n from dagster._core.definitions import AssetSelection\n\n if string == "*":\n return AssetSelection.all()\n\n parts = parse_clause(string)\n if not parts:\n check.failed(f"Invalid selection string: {string}")\n u, item, d = parts\n\n selection: AssetSelection = AssetSelection.keys(item)\n if u:\n selection = selection.upstream(u)\n if d:\n selection = selection.downstream(d)\n return selection\n\n @classmethod\n def from_coercible(cls, selection: CoercibleToAssetSelection) -> "AssetSelection":\n if isinstance(selection, str):\n return cls._selection_from_string(selection)\n elif isinstance(selection, AssetSelection):\n return selection\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, str) for el in selection\n ):\n return reduce(\n operator.or_, [cls._selection_from_string(cast(str, s)) for s in selection]\n )\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, (AssetsDefinition, SourceAsset)) for el in selection\n ):\n return AssetSelection.keys(\n *(\n key\n for el in selection\n for key in (\n el.keys if isinstance(el, AssetsDefinition) else [cast(SourceAsset, el).key]\n )\n )\n )\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, AssetKey) for el in selection\n ):\n return cls.keys(*cast(Sequence[AssetKey], selection))\n else:\n check.failed(\n "selection argument must be one of str, Sequence[str], Sequence[AssetKey],"\n " Sequence[AssetsDefinition], Sequence[SourceAsset], AssetSelection. Was"\n f" {type(selection)}."\n )
\n\n\nclass AllSelection(AssetSelection):\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return asset_graph.materializable_asset_keys\n\n\nclass AllAssetCheckSelection(AssetSelection):\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return asset_graph.asset_check_keys\n\n\nclass AssetChecksForAssetKeys(AssetSelection):\n def __init__(self, keys: Sequence[AssetKey]):\n self._keys = keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return {handle for handle in asset_graph.asset_check_keys if handle.asset_key in self._keys}\n\n\nclass AssetChecksForHandles(AssetSelection):\n def __init__(self, asset_check_keys: Sequence[AssetCheckKey]):\n self._asset_check_keys = asset_check_keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return {\n handle for handle in asset_graph.asset_check_keys if handle in self._asset_check_keys\n }\n\n\nclass AndAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) & self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) & self._right.resolve_checks_inner(\n asset_graph\n )\n\n\nclass SubAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n 
def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) - self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) - self._right.resolve_checks_inner(\n asset_graph\n )\n\n\nclass SinkAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return fetch_sinks(asset_graph.asset_dep_graph, selection)\n\n\nclass RequiredNeighborsAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n output = set(selection)\n for asset_key in selection:\n output.update(asset_graph.get_required_multi_asset_keys(asset_key))\n return output\n\n\nclass RootAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return fetch_sources(asset_graph.asset_dep_graph, selection)\n\n\nclass DownstreamAssetSelection(AssetSelection):\n def __init__(\n self,\n child: AssetSelection,\n *,\n depth: Optional[int] = None,\n include_self: Optional[bool] = True,\n ):\n self._child = child\n self.depth = depth\n self.include_self = include_self\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return operator.sub(\n reduce(\n operator.or_,\n [\n {asset_key}\n | fetch_connected(\n item=asset_key,\n graph=asset_graph.asset_dep_graph,\n direction="downstream",\n depth=self.depth,\n )\n for asset_key in 
selection\n ],\n ),\n selection if not self.include_self else set(),\n )\n\n\nclass GroupsAssetSelection(AssetSelection):\n def __init__(self, *groups: str, include_sources: bool):\n self._groups = groups\n self._include_sources = include_sources\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n base_set = (\n asset_graph.all_asset_keys\n if self._include_sources\n else asset_graph.materializable_asset_keys\n )\n return {\n asset_key\n for asset_key, group in asset_graph.group_names_by_key.items()\n if group in self._groups and asset_key in base_set\n }\n\n\nclass KeysAssetSelection(AssetSelection):\n def __init__(self, *keys: AssetKey):\n self._keys = keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n specified_keys = set(self._keys)\n invalid_keys = {key for key in specified_keys if key not in asset_graph.all_asset_keys}\n if invalid_keys:\n raise DagsterInvalidSubsetError(\n f"AssetKey(s) {invalid_keys} were selected, but no AssetsDefinition objects supply "\n "these keys. 
Make sure all keys are spelled correctly, and all AssetsDefinitions "\n "are correctly added to the `Definitions`."\n )\n return specified_keys\n\n\nclass KeyPrefixesAssetSelection(AssetSelection):\n def __init__(self, *key_prefixes: Sequence[str], include_sources: bool):\n self._key_prefixes = key_prefixes\n self._include_sources = include_sources\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n base_set = (\n asset_graph.all_asset_keys\n if self._include_sources\n else asset_graph.materializable_asset_keys\n )\n return {\n key for key in base_set if any(key.has_prefix(prefix) for prefix in self._key_prefixes)\n }\n\n\nclass OrAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) | self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) | self._right.resolve_checks_inner(\n asset_graph\n )\n\n\ndef _fetch_all_upstream(\n selection: AbstractSet[AssetKey],\n asset_graph: AssetGraph,\n depth: Optional[int] = None,\n include_self: bool = True,\n) -> AbstractSet[AssetKey]:\n return operator.sub(\n reduce(\n operator.or_,\n [\n {asset_key}\n | fetch_connected(\n item=asset_key,\n graph=asset_graph.asset_dep_graph,\n direction="upstream",\n depth=depth,\n )\n for asset_key in selection\n ],\n set(),\n ),\n selection if not include_self else set(),\n )\n\n\nclass UpstreamAssetSelection(AssetSelection):\n def __init__(\n self,\n child: AssetSelection,\n *,\n depth: Optional[int] = None,\n include_self: bool = True,\n ):\n self._child = child\n self.depth = depth\n self.include_self = include_self\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = 
self._child.resolve_inner(asset_graph)\n if len(selection) == 0:\n return selection\n all_upstream = _fetch_all_upstream(selection, asset_graph, self.depth, self.include_self)\n return {key for key in all_upstream if key not in asset_graph.source_asset_keys}\n\n\nclass SourceAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n if len(selection) == 0:\n return selection\n all_upstream = _fetch_all_upstream(selection, asset_graph)\n return {key for key in all_upstream if key in asset_graph.source_asset_keys}\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_selection", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_selection"}, "asset_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_sensor_definition

\nimport inspect\nfrom typing import Any, Callable, NamedTuple, Optional, Sequence, Set\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.resource_annotation import get_resource_args\n\nfrom .events import AssetKey\nfrom .run_request import RunRequest, SkipReason\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    RawSensorEvaluationFunctionReturn,\n    SensorDefinition,\n    SensorType,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom .utils import check_valid_name\n\n\nclass AssetSensorParamNames(NamedTuple):\n    context_param_name: Optional[str]\n    event_log_entry_param_name: Optional[str]\n\n\ndef get_asset_sensor_param_names(fn: Callable) -> AssetSensorParamNames:\n    """Determines the names of the context and event log entry parameters for an asset sensor function.\n    These are assumed to be the first two non-resource params, in order (context param before event log entry).\n    """\n    resource_params = {param.name for param in get_resource_args(fn)}\n\n    non_resource_params = [\n        param.name for param in get_function_params(fn) if param.name not in resource_params\n    ]\n\n    context_param_name = non_resource_params[0] if len(non_resource_params) > 0 else None\n    event_log_entry_param_name = non_resource_params[1] if len(non_resource_params) > 1 else None\n\n    return AssetSensorParamNames(\n        context_param_name=context_param_name, event_log_entry_param_name=event_log_entry_param_name\n    )\n\n\n
[docs]class AssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a given\n asset.\n\n If the asset has been materialized multiple times between since the last sensor tick, the\n evaluation function will only be invoked once, with the latest materialization.\n\n Args:\n name (str): The name of the sensor to create.\n asset_key (AssetKey): The asset_key this sensor monitors.\n asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.SensorEvaluationContext` and\n an EventLogEntry corresponding to an AssetMaterialization event.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_key: AssetKey,\n job_name: Optional[str],\n asset_materialization_fn: Callable[\n ...,\n RawSensorEvaluationFunctionReturn,\n ],\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n self._asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(asset_materialization_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def _wrap_asset_fn(materialization_fn) -> Any:\n def _fn(context) -> Any:\n after_cursor = None\n if context.cursor:\n try:\n after_cursor = int(context.cursor)\n except ValueError:\n after_cursor = None\n\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=self._asset_key,\n after_cursor=after_cursor,\n ),\n ascending=False,\n limit=1,\n )\n\n if not event_records:\n yield SkipReason(\n f"No new materialization events found for asset key {self._asset_key}"\n )\n return\n\n event_record = event_records[0]\n\n (\n context_param_name,\n event_log_entry_param_name,\n ) = get_asset_sensor_param_names(materialization_fn)\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n # Build asset sensor function args, which can include any subset of\n # context arg, event log entry arg, and any resource args\n 
args = resource_args_populated\n if context_param_name:\n args[context_param_name] = context\n if event_log_entry_param_name:\n args[event_log_entry_param_name] = event_record.event_log_entry\n\n result = materialization_fn(**args)\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n context.update_cursor(str(event_record.storage_id))\n\n return _fn\n\n super(AssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n job_name=job_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn"),\n ),\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n required_resource_keys=combined_required_resource_keys,\n )\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """AssetKey: The key of the asset targeted by this sensor."""\n return self._asset_key\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.ASSET
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_sensor_definition"}, "asset_spec": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_spec

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Iterable, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nfrom .auto_materialize_policy import AutoMaterializePolicy\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\nfrom .freshness_policy import FreshnessPolicy\nfrom .metadata import MetadataUserInput\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_dep import AssetDep, CoercibleToAssetDep\n\n# SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE lives on the metadata of an asset\n# (which currently ends up on the Output associated with the asset key)\n# whih encodes the execution type the of asset. "Unexecutable" assets are assets\n# that cannot be materialized in Dagster, but can have events in the event\n# log keyed off of them, making Dagster usable as a observability and lineage tool\n# for externally materialized assets.\nSYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE = "dagster/asset_execution_type"\n\n\nclass AssetExecutionType(Enum):\n    OBSERVATION = "OBSERVATION"\n    UNEXECUTABLE = "UNEXECUTABLE"\n    MATERIALIZATION = "MATERIALIZATION"\n\n    @staticmethod\n    def is_executable(varietal_str: Optional[str]) -> bool:\n        return AssetExecutionType.str_to_enum(varietal_str) in {\n            AssetExecutionType.MATERIALIZATION,\n            AssetExecutionType.OBSERVATION,\n        }\n\n    @staticmethod\n    def str_to_enum(varietal_str: Optional[str]) -> "AssetExecutionType":\n        return (\n            AssetExecutionType.MATERIALIZATION\n            if varietal_str is None\n            else AssetExecutionType(varietal_str)\n        )\n\n\n
[docs]@experimental\nclass AssetSpec(\n NamedTuple(\n "_AssetSpec",\n [\n ("key", PublicAttr[AssetKey]),\n ("deps", PublicAttr[Iterable["AssetDep"]]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n ("group_name", PublicAttr[Optional[str]]),\n ("skippable", PublicAttr[bool]),\n ("code_version", PublicAttr[Optional[str]]),\n ("freshness_policy", PublicAttr[Optional[FreshnessPolicy]]),\n ("auto_materialize_policy", PublicAttr[Optional[AutoMaterializePolicy]]),\n ],\n )\n):\n """Specifies the core attributes of an asset. This object is attached to the decorated\n function that defines how it materialized.\n\n Attributes:\n key (AssetKey): The unique identifier for this asset.\n deps (Optional[AbstractSet[AssetKey]]): The asset keys for the upstream assets that\n materializing this asset depends on.\n description (Optional[str]): Human-readable description of this asset.\n metadata (Optional[Dict[str, Any]]): A dict of static metadata for this asset.\n For example, users can provide information about the database table this\n asset corresponds to.\n skippable (bool): Whether this asset can be omitted during materialization, causing downstream\n dependencies to skip.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. 
If\n not provided, the name "default" is used.\n code_version (Optional[str]): The version of the code for this specific asset,\n overriding the code version of the materialization function\n freshness_policy (Optional[FreshnessPolicy]): A policy which indicates how up to date this\n asset is intended to be.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply to\n the specified asset.\n backfill_policy (Optional[BackfillPolicy]): BackfillPolicy to apply to the specified asset.\n """\n\n def __new__(\n cls,\n key: CoercibleToAssetKey,\n *,\n deps: Optional[Iterable["CoercibleToAssetDep"]] = None,\n description: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n skippable: bool = False,\n group_name: Optional[str] = None,\n code_version: Optional[str] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n ):\n from dagster._core.definitions.asset_dep import AssetDep\n\n dep_set = {}\n if deps:\n for dep in deps:\n asset_dep = AssetDep.from_coercible(dep)\n\n # we cannot do deduplication via a set because MultiPartitionMappings have an internal\n # dictionary that cannot be hashed. 
Instead deduplicate by making a dictionary and checking\n # for existing keys.\n if asset_dep.asset_key in dep_set.keys():\n raise DagsterInvariantViolationError(\n f"Cannot set a dependency on asset {asset_dep.asset_key} more than once for"\n f" AssetSpec {key}"\n )\n dep_set[asset_dep.asset_key] = asset_dep\n\n return super().__new__(\n cls,\n key=AssetKey.from_coercible(key),\n deps=list(dep_set.values()),\n description=check.opt_str_param(description, "description"),\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n skippable=check.bool_param(skippable, "skippable"),\n group_name=check.opt_str_param(group_name, "group_name"),\n code_version=check.opt_str_param(code_version, "code_version"),\n freshness_policy=check.opt_inst_param(\n freshness_policy,\n "freshness_policy",\n FreshnessPolicy,\n ),\n auto_materialize_policy=check.opt_inst_param(\n auto_materialize_policy,\n "auto_materialize_policy",\n AutoMaterializePolicy,\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_spec", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_spec"}, "assets": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.assets

\nimport hashlib\nimport json\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey, AssetCheckSpec\nfrom dagster._core.definitions.asset_layer import get_dep_node_handles_of_graph_backed_asset\nfrom dagster._core.definitions.asset_spec import AssetExecutionType\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy, BackfillPolicyType\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_invocation import direct_invocation_result\nfrom dagster._core.definitions.op_selection import get_graph_subset\nfrom dagster._core.definitions.partition_mapping import MultiPartitionMapping\nfrom dagster._core.definitions.resource_requirement import (\n    RequiresResources,\n    ResourceAddable,\n    ResourceRequirement,\n    merge_resource_defs,\n)\nfrom dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\nfrom dagster._core.definitions.time_window_partitions import TimeWindowPartitionsDefinition\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import (\n    disable_dagster_warnings,\n)\n\nfrom .dependency import NodeHandle\nfrom .events import AssetKey, CoercibleToAssetKey, 
CoercibleToAssetKeyPrefix\nfrom .node_definition import NodeDefinition\nfrom .op_definition import OpDefinition\nfrom .partition import PartitionsDefinition\nfrom .partition_mapping import (\n    PartitionMapping,\n    get_builtin_partition_mapping_types,\n    infer_partition_mapping,\n)\nfrom .resource_definition import ResourceDefinition\nfrom .source_asset import SourceAsset\nfrom .utils import DEFAULT_GROUP_NAME, validate_group_name\n\nif TYPE_CHECKING:\n    from .graph_definition import GraphDefinition\n\n\n
[docs]class AssetsDefinition(ResourceAddable, RequiresResources, IHasInternalInit):\n """Defines a set of assets that are produced by the same op or graph.\n\n AssetsDefinitions are typically not instantiated directly, but rather produced using the\n :py:func:`@asset <asset>` or :py:func:`@multi_asset <multi_asset>` decorators.\n """\n\n _node_def: NodeDefinition\n _keys_by_input_name: Mapping[str, AssetKey]\n _keys_by_output_name: Mapping[str, AssetKey]\n _partitions_def: Optional[PartitionsDefinition]\n _partition_mappings: Mapping[AssetKey, PartitionMapping]\n _asset_deps: Mapping[AssetKey, AbstractSet[AssetKey]]\n _resource_defs: Mapping[str, ResourceDefinition]\n _group_names_by_key: Mapping[AssetKey, str]\n _selected_asset_keys: AbstractSet[AssetKey]\n _can_subset: bool\n _metadata_by_key: Mapping[AssetKey, ArbitraryMetadataMapping]\n _freshness_policies_by_key: Mapping[AssetKey, FreshnessPolicy]\n _auto_materialize_policies_by_key: Mapping[AssetKey, AutoMaterializePolicy]\n _backfill_policy: Optional[BackfillPolicy]\n _code_versions_by_key: Mapping[AssetKey, Optional[str]]\n _descriptions_by_key: Mapping[AssetKey, str]\n _selected_asset_check_keys: AbstractSet[AssetCheckKey]\n\n def __init__(\n self,\n *,\n keys_by_input_name: Mapping[str, AssetKey],\n keys_by_output_name: Mapping[str, AssetKey],\n node_def: NodeDefinition,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[AssetKey, PartitionMapping]] = None,\n asset_deps: Optional[Mapping[AssetKey, AbstractSet[AssetKey]]] = None,\n selected_asset_keys: Optional[AbstractSet[AssetKey]] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, object]] = None,\n group_names_by_key: Optional[Mapping[AssetKey, str]] = None,\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]] = None,\n freshness_policies_by_key: Optional[Mapping[AssetKey, FreshnessPolicy]] = None,\n auto_materialize_policies_by_key: Optional[Mapping[AssetKey, 
AutoMaterializePolicy]] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n descriptions_by_key: Optional[Mapping[AssetKey, str]] = None,\n check_specs_by_output_name: Optional[Mapping[str, AssetCheckSpec]] = None,\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]] = None,\n # if adding new fields, make sure to handle them in the with_attributes, from_graph, and\n # get_attributes_dict methods\n ):\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n from .graph_definition import GraphDefinition\n\n if isinstance(node_def, GraphDefinition):\n _validate_graph_def(node_def)\n\n self._node_def = node_def\n self._keys_by_input_name = check.mapping_param(\n keys_by_input_name,\n "keys_by_input_name",\n key_type=str,\n value_type=AssetKey,\n )\n self._keys_by_output_name = check.mapping_param(\n keys_by_output_name,\n "keys_by_output_name",\n key_type=str,\n value_type=AssetKey,\n )\n\n check.opt_mapping_param(\n check_specs_by_output_name,\n "check_specs_by_output_name",\n key_type=str,\n value_type=AssetCheckSpec,\n )\n\n # if not specified assume all output assets depend on all input assets\n all_asset_keys = set(keys_by_output_name.values())\n input_asset_keys = set(keys_by_input_name.values())\n\n self._partitions_def = partitions_def\n self._partition_mappings = partition_mappings or {}\n builtin_partition_mappings = get_builtin_partition_mapping_types()\n for asset_key, partition_mapping in self._partition_mappings.items():\n if not isinstance(partition_mapping, builtin_partition_mappings):\n warnings.warn(\n f"Non-built-in PartitionMappings, such as {type(partition_mapping).__name__} "\n "are deprecated and will not work with asset reconciliation. 
The built-in "\n "partition mappings are "\n + ", ".join(\n builtin_partition_mapping.__name__\n for builtin_partition_mapping in builtin_partition_mappings\n )\n + ".",\n category=DeprecationWarning,\n )\n\n if asset_key not in input_asset_keys:\n check.failed(\n f"While constructing AssetsDefinition outputting {all_asset_keys}, received a"\n f" partition mapping for {asset_key} that is not defined in the set of upstream"\n f" assets: {input_asset_keys}"\n )\n\n self._asset_deps = asset_deps or {\n out_asset_key: set(keys_by_input_name.values()) for out_asset_key in all_asset_keys\n }\n check.invariant(\n set(self._asset_deps.keys()) == all_asset_keys,\n "The set of asset keys with dependencies specified in the asset_deps argument must "\n "equal the set of asset keys produced by this AssetsDefinition. \\n"\n f"asset_deps keys: {set(self._asset_deps.keys())} \\n"\n f"expected keys: {all_asset_keys}",\n )\n self._resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resource_defs, "resource_defs")\n )\n\n group_names_by_key = (\n check.mapping_param(group_names_by_key, "group_names_by_key")\n if group_names_by_key\n else {}\n )\n self._group_names_by_key = {}\n # assets that don't have a group name get a DEFAULT_GROUP_NAME\n for key in all_asset_keys:\n group_name = group_names_by_key.get(key)\n self._group_names_by_key[key] = validate_group_name(group_name)\n\n all_check_keys = {spec.key for spec in (check_specs_by_output_name or {}).values()}\n\n # NOTE: this logic mirrors subsetting at the asset layer. 
This is ripe for consolidation.\n if selected_asset_keys is None and selected_asset_check_keys is None:\n # if no selections, include everything\n self._selected_asset_keys = all_asset_keys\n self._selected_asset_check_keys = all_check_keys\n else:\n self._selected_asset_keys = selected_asset_keys or set()\n\n if selected_asset_check_keys is None:\n # if assets were selected but checks are None, then include all checks for selected\n # assets\n self._selected_asset_check_keys = {\n key for key in all_check_keys if key.asset_key in self._selected_asset_keys\n }\n else:\n # otherwise, use the selected checks\n self._selected_asset_check_keys = selected_asset_check_keys\n\n self._check_specs_by_output_name = {\n name: spec\n for name, spec in (check_specs_by_output_name or {}).items()\n if spec.key in self._selected_asset_check_keys\n }\n self._check_specs_by_key = {\n spec.key: spec for spec in self._check_specs_by_output_name.values()\n }\n\n self._can_subset = can_subset\n\n self._code_versions_by_key = {}\n self._metadata_by_key = dict(\n check.opt_mapping_param(\n metadata_by_key, "metadata_by_key", key_type=AssetKey, value_type=dict\n )\n )\n self._descriptions_by_key = dict(\n check.opt_mapping_param(\n descriptions_by_key, "descriptions_by_key", key_type=AssetKey, value_type=str\n )\n )\n for output_name, asset_key in keys_by_output_name.items():\n output_def, _ = node_def.resolve_output_to_origin(output_name, None)\n self._metadata_by_key[asset_key] = merge_dicts(\n output_def.metadata,\n self._metadata_by_key.get(asset_key, {}),\n )\n # We construct description from three sources of truth here. This\n # highly unfortunate. 
See commentary in @multi_asset's call to dagster_internal_init.\n description = (\n self._descriptions_by_key.get(asset_key, output_def.description)\n or node_def.description\n )\n if description:\n self._descriptions_by_key[asset_key] = description\n self._code_versions_by_key[asset_key] = output_def.code_version\n\n for key, freshness_policy in (freshness_policies_by_key or {}).items():\n check.param_invariant(\n not (\n freshness_policy\n and self._partitions_def is not None\n and not isinstance(self._partitions_def, TimeWindowPartitionsDefinition)\n ),\n "freshness_policies_by_key",\n "FreshnessPolicies are currently unsupported for assets with partitions of type"\n f" {type(self._partitions_def)}.",\n )\n\n self._freshness_policies_by_key = check.opt_mapping_param(\n freshness_policies_by_key,\n "freshness_policies_by_key",\n key_type=AssetKey,\n value_type=FreshnessPolicy,\n )\n\n self._auto_materialize_policies_by_key = check.opt_mapping_param(\n auto_materialize_policies_by_key,\n "auto_materialize_policies_by_key",\n key_type=AssetKey,\n value_type=AutoMaterializePolicy,\n )\n\n self._backfill_policy = check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n )\n\n if self._partitions_def is None:\n # check if backfill policy is BackfillPolicyType.SINGLE_RUN if asset is not partitioned\n check.param_invariant(\n (\n backfill_policy.policy_type is BackfillPolicyType.SINGLE_RUN\n if backfill_policy\n else True\n ),\n "backfill_policy",\n "Non partitioned asset can only have single run backfill policy",\n )\n\n _validate_self_deps(\n input_keys=self._keys_by_input_name.values(),\n output_keys=self._selected_asset_keys,\n partition_mappings=self._partition_mappings,\n partitions_def=self._partitions_def,\n )\n\n @staticmethod\n def dagster_internal_init(\n *,\n keys_by_input_name: Mapping[str, AssetKey],\n keys_by_output_name: Mapping[str, AssetKey],\n node_def: NodeDefinition,\n partitions_def: Optional[PartitionsDefinition],\n 
partition_mappings: Optional[Mapping[AssetKey, PartitionMapping]],\n asset_deps: Optional[Mapping[AssetKey, AbstractSet[AssetKey]]],\n selected_asset_keys: Optional[AbstractSet[AssetKey]],\n can_subset: bool,\n resource_defs: Optional[Mapping[str, object]],\n group_names_by_key: Optional[Mapping[AssetKey, str]],\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]],\n freshness_policies_by_key: Optional[Mapping[AssetKey, FreshnessPolicy]],\n auto_materialize_policies_by_key: Optional[Mapping[AssetKey, AutoMaterializePolicy]],\n backfill_policy: Optional[BackfillPolicy],\n descriptions_by_key: Optional[Mapping[AssetKey, str]],\n check_specs_by_output_name: Optional[Mapping[str, AssetCheckSpec]],\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]],\n ) -> "AssetsDefinition":\n return AssetsDefinition(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n node_def=node_def,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n asset_deps=asset_deps,\n selected_asset_keys=selected_asset_keys,\n can_subset=can_subset,\n resource_defs=resource_defs,\n group_names_by_key=group_names_by_key,\n metadata_by_key=metadata_by_key,\n freshness_policies_by_key=freshness_policies_by_key,\n auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n backfill_policy=backfill_policy,\n descriptions_by_key=descriptions_by_key,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=selected_asset_check_keys,\n )\n\n def __call__(self, *args: object, **kwargs: object) -> object:\n from .composition import is_in_composition\n from .graph_definition import GraphDefinition\n\n # defer to GraphDefinition.__call__ for graph backed assets, or if invoked in composition\n if isinstance(self.node_def, GraphDefinition) or is_in_composition():\n return self._node_def(*args, **kwargs)\n\n # invoke against self to allow assets def information to be used\n return 
direct_invocation_result(self, *args, **kwargs)\n\n
[docs] @public\n @experimental_param(param="resource_defs")\n @staticmethod\n def from_graph(\n graph_def: "GraphDefinition",\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ) -> "AssetsDefinition":\n """Constructs an AssetsDefinition from a GraphDefinition.\n\n Args:\n graph_def (GraphDefinition): The GraphDefinition that is an asset.\n keys_by_input_name (Optional[Mapping[str, AssetKey]]): A mapping of the input\n names of the decorated graph to their corresponding asset keys. If not provided,\n the input asset keys will be created from the graph input names.\n keys_by_output_name (Optional[Mapping[str, AssetKey]]): A mapping of the output\n names of the decorated graph to their corresponding asset keys. If not provided,\n the output asset keys will be created from the graph output names.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, key_prefix will be prepended\n to each key in keys_by_output_name. 
Each item in key_prefix must be a valid name in\n dagster (ie only contains letters, numbers, and _) and may not contain python\n reserved keywords.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by the graph depend on all assets that are consumed by that\n graph. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\n either used as input to the asset or produced within the graph.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. Each key in the dictionary\n correponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]):\n (Experimental) A mapping of resource keys to resource definitions. These resources\n will be initialized during execution, and can be accessed from the\n body of ops in the graph during execution.\n group_name (Optional[str]): A group name for the constructed asset. Assets without a\n group name are assigned to a group called "default".\n group_names_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a group name to be\n associated with some or all of the output assets for this node. Keys are names of the\n outputs, and values are the group name. 
Cannot be used with the group_name argument.\n descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a description to be\n associated with each of the output asstes for this graph.\n metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]): Defines metadata to\n be associated with each of the output assets for this node. Keys are names of the\n outputs, and values are dictionaries of metadata to be associated with the related\n asset.\n freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]): Defines a\n FreshnessPolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the FreshnessPolicies to be attached\n to the associated asset.\n auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]): Defines an\n AutoMaterializePolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\n to the associated asset.\n backfill_policy (Optional[BackfillPolicy]): Defines this asset's BackfillPolicy\n """\n return AssetsDefinition._from_node(\n node_def=graph_def,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n key_prefix=key_prefix,\n internal_asset_deps=internal_asset_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n resource_defs=resource_defs,\n group_name=group_name,\n group_names_by_output_name=group_names_by_output_name,\n descriptions_by_output_name=descriptions_by_output_name,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n can_subset=can_subset,\n check_specs=check_specs,\n )
\n\n
[docs] @public\n @staticmethod\n def from_op(\n op_def: OpDefinition,\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n ) -> "AssetsDefinition":\n """Constructs an AssetsDefinition from an OpDefinition.\n\n Args:\n op_def (OpDefinition): The OpDefinition that is an asset.\n keys_by_input_name (Optional[Mapping[str, AssetKey]]): A mapping of the input\n names of the decorated op to their corresponding asset keys. If not provided,\n the input asset keys will be created from the op input names.\n keys_by_output_name (Optional[Mapping[str, AssetKey]]): A mapping of the output\n names of the decorated op to their corresponding asset keys. If not provided,\n the output asset keys will be created from the op output names.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, key_prefix will be prepended\n to each key in keys_by_output_name. 
Each item in key_prefix must be a valid name in\n dagster (ie only contains letters, numbers, and _) and may not contain python\n reserved keywords.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by the op depend on all assets that are consumed by that\n op. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\n either used as input to the asset or produced within the op.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. Each key in the dictionary\n correponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n group_name (Optional[str]): A group name for the constructed asset. Assets without a\n group name are assigned to a group called "default".\n group_names_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a group name to be\n associated with some or all of the output assets for this node. Keys are names of the\n outputs, and values are the group name. Cannot be used with the group_name argument.\n descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a description to be\n associated with each of the output asstes for this graph.\n metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]): Defines metadata to\n be associated with each of the output assets for this node. 
Keys are names of the\n outputs, and values are dictionaries of metadata to be associated with the related\n asset.\n freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]): Defines a\n FreshnessPolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the FreshnessPolicies to be attached\n to the associated asset.\n auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]): Defines an\n AutoMaterializePolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\n to the associated asset.\n backfill_policy (Optional[BackfillPolicy]): Defines this asset's BackfillPolicy\n """\n return AssetsDefinition._from_node(\n node_def=op_def,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n key_prefix=key_prefix,\n internal_asset_deps=internal_asset_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n group_name=group_name,\n group_names_by_output_name=group_names_by_output_name,\n descriptions_by_output_name=descriptions_by_output_name,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n can_subset=can_subset,\n )
\n\n @staticmethod\n def _from_node(\n node_def: Union[OpDefinition, "GraphDefinition"],\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ) -> "AssetsDefinition":\n from dagster._core.definitions.decorators.asset_decorator import (\n _validate_and_assign_output_names_to_check_specs,\n )\n\n node_def = check.inst_param(node_def, "node_def", NodeDefinition)\n keys_by_input_name = _infer_keys_by_input_names(\n node_def,\n check.opt_mapping_param(\n keys_by_input_name, "keys_by_input_name", key_type=str, value_type=AssetKey\n ),\n )\n keys_by_output_name = check.opt_mapping_param(\n keys_by_output_name,\n "keys_by_output_name",\n key_type=str,\n value_type=AssetKey,\n )\n internal_asset_deps = check.opt_mapping_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n resource_defs = check.opt_mapping_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n transformed_internal_asset_deps: 
Dict[AssetKey, AbstractSet[AssetKey]] = {}\n if internal_asset_deps:\n for output_name, asset_keys in internal_asset_deps.items():\n check.invariant(\n output_name in keys_by_output_name,\n f"output_name {output_name} specified in internal_asset_deps does not exist"\n " in the decorated function",\n )\n transformed_internal_asset_deps[keys_by_output_name[output_name]] = asset_keys\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(keys_by_output_name.values())\n )\n\n keys_by_output_name = _infer_keys_by_output_names(\n node_def, keys_by_output_name or {}, check_specs_by_output_name\n )\n\n keys_by_output_name_with_prefix: Dict[str, AssetKey] = {}\n key_prefix_list = [key_prefix] if isinstance(key_prefix, str) else key_prefix\n for output_name, key in keys_by_output_name.items():\n # add key_prefix to the beginning of each asset key\n key_with_key_prefix = AssetKey(\n list(filter(None, [*(key_prefix_list or []), *key.path]))\n )\n keys_by_output_name_with_prefix[output_name] = key_with_key_prefix\n\n check.param_invariant(\n group_name is None or group_names_by_output_name is None,\n "group_name",\n "Cannot use both group_name and group_names_by_output_name",\n )\n\n if group_name:\n group_names_by_key = {\n asset_key: group_name for asset_key in keys_by_output_name_with_prefix.values()\n }\n elif group_names_by_output_name:\n group_names_by_key = {\n keys_by_output_name_with_prefix[output_name]: group_name\n for output_name, group_name in group_names_by_output_name.items()\n if group_name is not None\n }\n else:\n group_names_by_key = None\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name_with_prefix,\n node_def=node_def,\n asset_deps=transformed_internal_asset_deps or None,\n partitions_def=check.opt_inst_param(\n partitions_def,\n "partitions_def",\n PartitionsDefinition,\n ),\n group_names_by_key=group_names_by_key,\n 
resource_defs=resource_defs,\n partition_mappings=(\n {\n keys_by_input_name[input_name]: partition_mapping\n for input_name, partition_mapping in partition_mappings.items()\n }\n if partition_mappings\n else None\n ),\n metadata_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: metadata\n for output_name, metadata in metadata_by_output_name.items()\n if metadata is not None\n }\n if metadata_by_output_name\n else None\n ),\n freshness_policies_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: freshness_policy\n for output_name, freshness_policy in freshness_policies_by_output_name.items()\n if freshness_policy is not None\n }\n if freshness_policies_by_output_name\n else None\n ),\n auto_materialize_policies_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: auto_materialize_policy\n for output_name, auto_materialize_policy in auto_materialize_policies_by_output_name.items()\n if auto_materialize_policy is not None\n }\n if auto_materialize_policies_by_output_name\n else None\n ),\n backfill_policy=check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n ),\n descriptions_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: description\n for output_name, description in descriptions_by_output_name.items()\n if description is not None\n }\n if descriptions_by_output_name\n else None\n ),\n can_subset=can_subset,\n selected_asset_keys=None, # node has no subselection info\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None,\n )\n\n @public\n @property\n def can_subset(self) -> bool:\n """bool: If True, indicates that this AssetsDefinition may materialize any subset of its\n asset keys in a given computation (as opposed to being required to materialize all asset\n keys).\n """\n return self._can_subset\n\n @public\n @property\n def group_names_by_key(self) -> Mapping[AssetKey, str]:\n """Mapping[AssetKey, str]: Returns a mapping from the asset keys in this 
AssetsDefinition\n to the group names assigned to them. If there is no assigned group name for a given AssetKey,\n it will not be present in this dictionary.\n """\n return self._group_names_by_key\n\n @public\n @property\n def descriptions_by_key(self) -> Mapping[AssetKey, str]:\n """Mapping[AssetKey, str]: Returns a mapping from the asset keys in this AssetsDefinition\n to the descriptions assigned to them. If there is no assigned description for a given AssetKey,\n it will not be present in this dictionary.\n """\n return self._descriptions_by_key\n\n @public\n @property\n def op(self) -> OpDefinition:\n """OpDefinition: Returns the OpDefinition that is used to materialize the assets in this\n AssetsDefinition.\n """\n check.invariant(\n isinstance(self._node_def, OpDefinition),\n "The NodeDefinition for this AssetsDefinition is not of type OpDefinition.",\n )\n return cast(OpDefinition, self._node_def)\n\n @public\n @property\n def node_def(self) -> NodeDefinition:\n """NodeDefinition: Returns the OpDefinition or GraphDefinition that is used to materialize\n the assets in this AssetsDefinition.\n """\n return self._node_def\n\n @public\n @property\n def asset_deps(self) -> Mapping[AssetKey, AbstractSet[AssetKey]]:\n """Maps assets that are produced by this definition to assets that they depend on. The\n dependencies can be either "internal", meaning that they refer to other assets that are\n produced by this definition, or "external", meaning that they refer to assets that aren't\n produced by this definition.\n """\n return self._asset_deps\n\n @property\n def input_names(self) -> Iterable[str]:\n """Iterable[str]: The set of input names of the underlying NodeDefinition for this\n AssetsDefinition.\n """\n return self.keys_by_input_name.keys()\n\n @public\n @property\n def key(self) -> AssetKey:\n """AssetKey: The asset key associated with this AssetsDefinition. 
If this AssetsDefinition\n has more than one asset key, this will produce an error.\n """\n check.invariant(\n len(self.keys) == 1,\n "Tried to retrieve asset key from an assets definition with multiple asset keys: "\n + ", ".join([str(ak.to_string()) for ak in self._keys_by_output_name.values()]),\n )\n\n return next(iter(self.keys))\n\n @public\n @property\n def resource_defs(self) -> Mapping[str, ResourceDefinition]:\n """Mapping[str, ResourceDefinition]: A mapping from resource name to ResourceDefinition for\n the resources bound to this AssetsDefinition.\n """\n return dict(self._resource_defs)\n\n @public\n @property\n def keys(self) -> AbstractSet[AssetKey]:\n """AbstractSet[AssetKey]: The asset keys associated with this AssetsDefinition."""\n return self._selected_asset_keys\n\n @public\n @property\n def dependency_keys(self) -> Iterable[AssetKey]:\n """Iterable[AssetKey]: The asset keys which are upstream of any asset included in this\n AssetsDefinition.\n """\n # the input asset keys that are directly upstream of a selected asset key\n upstream_keys = {dep_key for key in self.keys for dep_key in self.asset_deps[key]}\n input_keys = set(self._keys_by_input_name.values())\n return upstream_keys.intersection(input_keys)\n\n @property\n def node_keys_by_output_name(self) -> Mapping[str, AssetKey]:\n """AssetKey for each output on the underlying NodeDefinition."""\n return self._keys_by_output_name\n\n @property\n def node_keys_by_input_name(self) -> Mapping[str, AssetKey]:\n """AssetKey for each input on the underlying NodeDefinition."""\n return self._keys_by_input_name\n\n @property\n def check_specs_by_output_name(self) -> Mapping[str, AssetCheckSpec]:\n return self._check_specs_by_output_name\n\n def get_spec_for_check_key(self, asset_check_key: AssetCheckKey) -> AssetCheckSpec:\n return self._check_specs_by_key[asset_check_key]\n\n @property\n def keys_by_output_name(self) -> Mapping[str, AssetKey]:\n return {\n name: key for name, key in 
self.node_keys_by_output_name.items() if key in self.keys\n }\n\n @property\n def keys_by_input_name(self) -> Mapping[str, AssetKey]:\n upstream_keys = {dep_key for key in self.keys for dep_key in self.asset_deps[key]}\n return {\n name: key for name, key in self.node_keys_by_input_name.items() if key in upstream_keys\n }\n\n @property\n def freshness_policies_by_key(self) -> Mapping[AssetKey, FreshnessPolicy]:\n return self._freshness_policies_by_key\n\n @property\n def auto_materialize_policies_by_key(self) -> Mapping[AssetKey, AutoMaterializePolicy]:\n return self._auto_materialize_policies_by_key\n\n @property\n def backfill_policy(self) -> Optional[BackfillPolicy]:\n return self._backfill_policy\n\n @public\n @property\n def partitions_def(self) -> Optional[PartitionsDefinition]:\n """Optional[PartitionsDefinition]: The PartitionsDefinition for this AssetsDefinition (if any)."""\n return self._partitions_def\n\n @property\n def metadata_by_key(self) -> Mapping[AssetKey, ArbitraryMetadataMapping]:\n return self._metadata_by_key\n\n @property\n def code_versions_by_key(self) -> Mapping[AssetKey, Optional[str]]:\n return self._code_versions_by_key\n\n @property\n def partition_mappings(self) -> Mapping[AssetKey, PartitionMapping]:\n return self._partition_mappings\n\n
[docs] @public\n def get_partition_mapping(self, in_asset_key: AssetKey) -> Optional[PartitionMapping]:\n """Returns the partition mapping between keys in this AssetsDefinition and a given input\n asset key (if any).\n """\n return self._partition_mappings.get(in_asset_key)
\n\n @public\n @property\n def check_specs(self) -> Iterable[AssetCheckSpec]:\n """Returns the asset check specs defined on this AssetsDefinition, i.e. the checks that can\n be executed while materializing the assets.\n\n Returns:\n Iterable[AssetsCheckSpec]:\n """\n return self._check_specs_by_output_name.values()\n\n @property\n def check_keys(self) -> AbstractSet[AssetCheckKey]:\n """Returns the selected asset checks associated by this AssetsDefinition.\n\n Returns:\n AbstractSet[Tuple[AssetKey, str]]: The selected asset checks. An asset check is\n identified by the asset key and the name of the check.\n """\n return self._selected_asset_check_keys\n\n def is_asset_executable(self, asset_key: AssetKey) -> bool:\n """Returns True if the asset key is materializable by this AssetsDefinition.\n\n Args:\n asset_key (AssetKey): The asset key to check.\n\n Returns:\n bool: True if the asset key is materializable by this AssetsDefinition.\n """\n from dagster._core.definitions.asset_spec import (\n SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE,\n AssetExecutionType,\n )\n\n return AssetExecutionType.is_executable(\n self._metadata_by_key.get(asset_key, {}).get(SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE)\n )\n\n def asset_execution_type_for_asset(self, asset_key: AssetKey) -> AssetExecutionType:\n from dagster._core.definitions.asset_spec import (\n SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE,\n AssetExecutionType,\n )\n\n return AssetExecutionType.str_to_enum(\n self._metadata_by_key.get(asset_key, {}).get(SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE)\n )\n\n def get_partition_mapping_for_input(self, input_name: str) -> Optional[PartitionMapping]:\n return self._partition_mappings.get(self._keys_by_input_name[input_name])\n\n def infer_partition_mapping(\n self, upstream_asset_key: AssetKey, upstream_partitions_def: Optional[PartitionsDefinition]\n ) -> PartitionMapping:\n with disable_dagster_warnings():\n partition_mapping = self._partition_mappings.get(upstream_asset_key)\n return 
infer_partition_mapping(\n partition_mapping, self._partitions_def, upstream_partitions_def\n )\n\n def get_output_name_for_asset_key(self, key: AssetKey) -> str:\n for output_name, asset_key in self.keys_by_output_name.items():\n if key == asset_key:\n return output_name\n\n raise DagsterInvariantViolationError(\n f"Asset key {key.to_user_string()} not found in AssetsDefinition"\n )\n\n def get_op_def_for_asset_key(self, key: AssetKey) -> OpDefinition:\n """If this is an op-backed asset, returns the op def. If it's a graph-backed asset,\n returns the op def within the graph that produces the given asset key.\n """\n output_name = self.get_output_name_for_asset_key(key)\n return self.node_def.resolve_output_to_origin_op_def(output_name)\n\n def with_attributes(\n self,\n *,\n output_asset_key_replacements: Optional[Mapping[AssetKey, AssetKey]] = None,\n input_asset_key_replacements: Optional[Mapping[AssetKey, AssetKey]] = None,\n group_names_by_key: Optional[Mapping[AssetKey, str]] = None,\n descriptions_by_key: Optional[Mapping[AssetKey, str]] = None,\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]] = None,\n freshness_policy: Optional[\n Union[FreshnessPolicy, Mapping[AssetKey, FreshnessPolicy]]\n ] = None,\n auto_materialize_policy: Optional[\n Union[AutoMaterializePolicy, Mapping[AssetKey, AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n ) -> "AssetsDefinition":\n output_asset_key_replacements = check.opt_mapping_param(\n output_asset_key_replacements,\n "output_asset_key_replacements",\n key_type=AssetKey,\n value_type=AssetKey,\n )\n input_asset_key_replacements = check.opt_mapping_param(\n input_asset_key_replacements,\n "input_asset_key_replacements",\n key_type=AssetKey,\n value_type=AssetKey,\n )\n group_names_by_key = check.opt_mapping_param(\n group_names_by_key, "group_names_by_key", key_type=AssetKey, value_type=str\n )\n descriptions_by_key = check.opt_mapping_param(\n 
descriptions_by_key, "descriptions_by_key", key_type=AssetKey, value_type=str\n )\n metadata_by_key = check.opt_mapping_param(\n metadata_by_key, "metadata_by_key", key_type=AssetKey, value_type=dict\n )\n\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n if group_names_by_key:\n group_name_conflicts = [\n asset_key\n for asset_key in group_names_by_key\n if asset_key in self.group_names_by_key\n and self.group_names_by_key[asset_key] != DEFAULT_GROUP_NAME\n ]\n if group_name_conflicts:\n raise DagsterInvalidDefinitionError(\n "Group name already exists on assets"\n f" {', '.join(asset_key.to_user_string() for asset_key in group_name_conflicts)}"\n )\n\n replaced_group_names_by_key = {\n output_asset_key_replacements.get(key, key): group_name\n for key, group_name in self.group_names_by_key.items()\n }\n\n if freshness_policy:\n freshness_policy_conflicts = (\n self.freshness_policies_by_key.keys()\n if isinstance(freshness_policy, FreshnessPolicy)\n else (freshness_policy.keys() & self.freshness_policies_by_key.keys())\n )\n if freshness_policy_conflicts:\n raise DagsterInvalidDefinitionError(\n "FreshnessPolicy already exists on assets"\n f" {', '.join(key.to_string() for key in freshness_policy_conflicts)}"\n )\n\n replaced_freshness_policies_by_key = {}\n for key in self.keys:\n if isinstance(freshness_policy, FreshnessPolicy):\n replaced_freshness_policy = freshness_policy\n elif freshness_policy:\n replaced_freshness_policy = freshness_policy.get(key)\n else:\n replaced_freshness_policy = self.freshness_policies_by_key.get(key)\n\n if replaced_freshness_policy:\n replaced_freshness_policies_by_key[output_asset_key_replacements.get(key, key)] = (\n replaced_freshness_policy\n )\n\n if auto_materialize_policy:\n auto_materialize_policy_conflicts = (\n self.auto_materialize_policies_by_key.keys()\n if isinstance(auto_materialize_policy, AutoMaterializePolicy)\n else (auto_materialize_policy.keys() & 
self.auto_materialize_policies_by_key.keys())\n )\n if auto_materialize_policy_conflicts:\n raise DagsterInvalidDefinitionError(\n "AutoMaterializePolicy already exists on assets"\n f" {', '.join(key.to_string() for key in auto_materialize_policy_conflicts)}"\n )\n\n replaced_auto_materialize_policies_by_key = {}\n for key in self.keys:\n if isinstance(auto_materialize_policy, AutoMaterializePolicy):\n replaced_auto_materialize_policy = auto_materialize_policy\n elif auto_materialize_policy:\n replaced_auto_materialize_policy = auto_materialize_policy.get(key)\n else:\n replaced_auto_materialize_policy = self.auto_materialize_policies_by_key.get(key)\n\n if replaced_auto_materialize_policy:\n replaced_auto_materialize_policies_by_key[\n output_asset_key_replacements.get(key, key)\n ] = replaced_auto_materialize_policy\n\n replaced_descriptions_by_key = {\n output_asset_key_replacements.get(key, key): description\n for key, description in descriptions_by_key.items()\n }\n\n if not metadata_by_key:\n metadata_by_key = self.metadata_by_key\n\n replaced_metadata_by_key = {\n output_asset_key_replacements.get(key, key): metadata\n for key, metadata in metadata_by_key.items()\n }\n\n replaced_attributes = dict(\n keys_by_input_name={\n input_name: input_asset_key_replacements.get(key, key)\n for input_name, key in self._keys_by_input_name.items()\n },\n keys_by_output_name={\n output_name: output_asset_key_replacements.get(key, key)\n for output_name, key in self._keys_by_output_name.items()\n },\n partition_mappings={\n input_asset_key_replacements.get(key, key): partition_mapping\n for key, partition_mapping in self._partition_mappings.items()\n },\n asset_deps={\n # replace both the keys and the values in this mapping\n output_asset_key_replacements.get(key, key): {\n input_asset_key_replacements.get(\n upstream_key,\n output_asset_key_replacements.get(upstream_key, upstream_key),\n )\n for upstream_key in value\n }\n for key, value in self.asset_deps.items()\n },\n 
selected_asset_keys={\n output_asset_key_replacements.get(key, key) for key in self._selected_asset_keys\n },\n group_names_by_key={\n **replaced_group_names_by_key,\n **group_names_by_key,\n },\n metadata_by_key=replaced_metadata_by_key,\n freshness_policies_by_key=replaced_freshness_policies_by_key,\n auto_materialize_policies_by_key=replaced_auto_materialize_policies_by_key,\n backfill_policy=backfill_policy if backfill_policy else self.backfill_policy,\n descriptions_by_key=replaced_descriptions_by_key,\n )\n\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n\n def _subset_graph_backed_asset(\n self,\n selected_asset_keys: AbstractSet[AssetKey],\n ):\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n if not isinstance(self.node_def, GraphDefinition):\n raise DagsterInvalidInvocationError(\n "Method _subset_graph_backed_asset cannot subset an asset that is not a graph"\n )\n\n # All asset keys in selected_asset_keys are outputted from the same top-level graph backed asset\n dep_node_handles_by_asset_key = get_dep_node_handles_of_graph_backed_asset(\n self.node_def, self\n )\n op_selection: List[str] = []\n for asset_key in selected_asset_keys:\n dep_node_handles = dep_node_handles_by_asset_key[asset_key]\n for dep_node_handle in dep_node_handles:\n op_selection.append(".".join(dep_node_handle.path[1:]))\n\n return get_graph_subset(self.node_def, op_selection)\n\n def subset_for(\n self,\n selected_asset_keys: AbstractSet[AssetKey],\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]],\n ) -> "AssetsDefinition":\n """Create a subset of this AssetsDefinition that will only materialize the assets and checks\n in the selected set.\n\n Args:\n selected_asset_keys (AbstractSet[AssetKey]): The total set of asset keys\n selected_asset_check_keys (AbstractSet[AssetCheckKey]): The selected asset checks\n """\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n 
check.invariant(\n self.can_subset,\n f"Attempted to subset AssetsDefinition for {self.node_def.name}, but can_subset=False.",\n )\n\n # Set of assets within selected_asset_keys which are outputted by this AssetDefinition\n asset_subselection = selected_asset_keys & self.keys\n if selected_asset_check_keys is None:\n # filter to checks that target selected asset keys\n asset_check_subselection = {\n key for key in self.check_keys if key.asset_key in asset_subselection\n }\n else:\n asset_check_subselection = selected_asset_check_keys & self.check_keys\n\n # Early escape if all assets in AssetsDefinition are selected\n if asset_subselection == self.keys and asset_check_subselection == self.check_keys:\n return self\n elif isinstance(self.node_def, GraphDefinition): # Node is graph-backed asset\n check.invariant(\n selected_asset_check_keys == self.check_keys,\n "Subsetting graph-backed assets with checks is not yet supported",\n )\n\n subsetted_node = self._subset_graph_backed_asset(\n asset_subselection,\n )\n\n # The subsetted node should only include asset inputs that are dependencies of the\n # selected set of assets.\n subsetted_input_names = [input_def.name for input_def in subsetted_node.input_defs]\n subsetted_keys_by_input_name = {\n key: value\n for key, value in self.node_keys_by_input_name.items()\n if key in subsetted_input_names\n }\n\n subsetted_output_names = [output_def.name for output_def in subsetted_node.output_defs]\n subsetted_keys_by_output_name = {\n key: value\n for key, value in self.node_keys_by_output_name.items()\n if key in subsetted_output_names\n }\n\n # An op within the graph-backed asset that yields multiple assets will be run\n # any time any of its output assets are selected. 
Thus, if an op yields multiple assets\n # and only one of them is selected, the op will still run and potentially unexpectedly\n # materialize the unselected asset.\n #\n # Thus, we include unselected assets that may be accidentally materialized in\n # keys_by_output_name and asset_deps so that the webserver can populate an warning when\n # this occurs. This is the same behavior as multi-asset subsetting.\n\n subsetted_asset_deps = {\n out_asset_key: set(self._keys_by_input_name.values())\n for out_asset_key in subsetted_keys_by_output_name.values()\n }\n\n replaced_attributes = dict(\n keys_by_input_name=subsetted_keys_by_input_name,\n keys_by_output_name=subsetted_keys_by_output_name,\n node_def=subsetted_node,\n asset_deps=subsetted_asset_deps,\n selected_asset_keys=selected_asset_keys & self.keys,\n )\n\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n else:\n # multi_asset subsetting\n replaced_attributes = {\n "selected_asset_keys": asset_subselection,\n "selected_asset_check_keys": asset_check_subselection,\n }\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n\n
[docs] @public\n def to_source_assets(self) -> Sequence[SourceAsset]:\n """Returns a SourceAsset for each asset in this definition.\n\n Each produced SourceAsset will have the same key, metadata, io_manager_key, etc. as the\n corresponding asset\n """\n return [\n self._output_to_source_asset(output_name)\n for output_name in self.keys_by_output_name.keys()\n ]
\n\n
[docs] @public\n def to_source_asset(self, key: Optional[CoercibleToAssetKey] = None) -> SourceAsset:\n """Returns a representation of this asset as a :py:class:`SourceAsset`.\n\n If this is a multi-asset, the "key" argument allows selecting which asset to return a\n SourceAsset representation of.\n\n Args:\n key (Optional[Union[str, Sequence[str], AssetKey]]]): If this is a multi-asset, select\n which asset to return a SourceAsset representation of. If not a multi-asset, this\n can be left as None.\n\n Returns:\n SourceAsset\n """\n if len(self.keys) > 1:\n check.invariant(\n key is not None,\n "The 'key' argument is required when there are multiple assets to choose from",\n )\n\n if key is not None:\n resolved_key = AssetKey.from_coercible(key)\n check.invariant(\n resolved_key in self.keys, f"Key {resolved_key} not found in AssetsDefinition"\n )\n else:\n resolved_key = self.key\n\n output_names = [\n output_name\n for output_name, ak in self.keys_by_output_name.items()\n if ak == resolved_key\n ]\n check.invariant(len(output_names) == 1)\n return self._output_to_source_asset(output_names[0])
\n\n def _output_to_source_asset(self, output_name: str) -> SourceAsset:\n with disable_dagster_warnings():\n output_def = self.node_def.resolve_output_to_origin(\n output_name, NodeHandle(self.node_def.name, parent=None)\n )[0]\n key = self._keys_by_output_name[output_name]\n\n return SourceAsset(\n key=key,\n metadata=output_def.metadata,\n io_manager_key=output_def.io_manager_key,\n description=output_def.description,\n resource_defs=self.resource_defs,\n partitions_def=self.partitions_def,\n group_name=self.group_names_by_key[key],\n )\n\n def get_io_manager_key_for_asset_key(self, key: AssetKey) -> str:\n output_name = self.get_output_name_for_asset_key(key)\n return self.node_def.resolve_output_to_origin(\n output_name, NodeHandle(self.node_def.name, parent=None)\n )[0].io_manager_key\n\n def get_resource_requirements(self) -> Iterator[ResourceRequirement]:\n yield from self.node_def.get_resource_requirements() # type: ignore[attr-defined]\n for source_key, resource_def in self.resource_defs.items():\n yield from resource_def.get_resource_requirements(outer_context=source_key)\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this AssetsDefinition."""\n return {requirement.key for requirement in self.get_resource_requirements()}\n\n def __str__(self):\n if len(self.keys) == 1:\n return f"AssetsDefinition with key {self.key.to_string()}"\n else:\n asset_keys = ", ".join(sorted(([asset_key.to_string() for asset_key in self.keys])))\n return f"AssetsDefinition with keys {asset_keys}"\n\n @property\n def unique_id(self) -> str:\n """A unique identifier for the AssetsDefinition that's stable across processes."""\n return hashlib.md5((json.dumps(sorted(self.keys))).encode("utf-8")).hexdigest()\n\n def with_resources(self, resource_defs: Mapping[str, ResourceDefinition]) -> "AssetsDefinition":\n attributes_dict = self.get_attributes_dict()\n attributes_dict["resource_defs"] = 
merge_resource_defs(\n old_resource_defs=self.resource_defs,\n resource_defs_to_merge_in=resource_defs,\n requires_resources=self,\n )\n return self.__class__(**attributes_dict)\n\n def get_attributes_dict(self) -> Dict[str, Any]:\n return dict(\n keys_by_input_name=self._keys_by_input_name,\n keys_by_output_name=self._keys_by_output_name,\n node_def=self._node_def,\n partitions_def=self._partitions_def,\n partition_mappings=self._partition_mappings,\n asset_deps=self.asset_deps,\n selected_asset_keys=self._selected_asset_keys,\n can_subset=self._can_subset,\n resource_defs=self._resource_defs,\n group_names_by_key=self._group_names_by_key,\n metadata_by_key=self._metadata_by_key,\n freshness_policies_by_key=self._freshness_policies_by_key,\n auto_materialize_policies_by_key=self._auto_materialize_policies_by_key,\n backfill_policy=self._backfill_policy,\n descriptions_by_key=self._descriptions_by_key,\n check_specs_by_output_name=self._check_specs_by_output_name,\n selected_asset_check_keys=self._selected_asset_check_keys,\n )
\n\n\ndef _infer_keys_by_input_names(\n node_def: Union["GraphDefinition", OpDefinition], keys_by_input_name: Mapping[str, AssetKey]\n) -> Mapping[str, AssetKey]:\n all_input_names = [input_def.name for input_def in node_def.input_defs]\n if keys_by_input_name:\n check.invariant(\n set(keys_by_input_name.keys()) == set(all_input_names),\n "The set of input names keys specified in the keys_by_input_name argument must "\n f"equal the set of asset keys inputted by '{node_def.name}'. \\n"\n f"keys_by_input_name keys: {set(keys_by_input_name.keys())} \\n"\n f"expected keys: {all_input_names}",\n )\n\n # If asset key is not supplied in keys_by_input_name, create asset key\n # from input name\n inferred_input_names_by_asset_key: Dict[str, AssetKey] = {\n input_name: keys_by_input_name.get(input_name, AssetKey([input_name]))\n for input_name in all_input_names\n }\n\n return inferred_input_names_by_asset_key\n\n\ndef _infer_keys_by_output_names(\n node_def: Union["GraphDefinition", OpDefinition],\n keys_by_output_name: Mapping[str, AssetKey],\n check_specs_by_output_name: Mapping[str, AssetCheckSpec],\n) -> Mapping[str, AssetKey]:\n output_names = [output_def.name for output_def in node_def.output_defs]\n if keys_by_output_name:\n overlapping_asset_and_check_outputs = set(keys_by_output_name.keys()) & set(\n check_specs_by_output_name.keys()\n )\n check.invariant(\n not overlapping_asset_and_check_outputs,\n "The set of output names associated with asset keys and checks overlap:"\n f" {overlapping_asset_and_check_outputs}",\n )\n\n union_asset_and_check_outputs = set(keys_by_output_name.keys()) | set(\n check_specs_by_output_name.keys()\n )\n check.invariant(\n union_asset_and_check_outputs == set(output_names),\n "The union of the set of output names keys specified in the keys_by_output_name and"\n " check_specs_by_output_name arguments must equal the set of asset keys outputted by"\n f" {node_def.name}. 
union keys:"\n f" {union_asset_and_check_outputs} \\nexpected keys: {set(output_names)}",\n )\n\n inferred_keys_by_output_names: Dict[str, AssetKey] = {\n output_name: asset_key for output_name, asset_key in keys_by_output_name.items()\n }\n\n if (\n len(output_names) == 1\n and output_names[0] not in keys_by_output_name\n and output_names[0] not in check_specs_by_output_name\n and output_names[0] == "result"\n ):\n # If there is only one output and the name is the default "result", generate asset key\n # from the name of the node\n inferred_keys_by_output_names[output_names[0]] = AssetKey([node_def.name])\n\n for output_name in output_names:\n if (\n output_name not in inferred_keys_by_output_names\n and output_name not in check_specs_by_output_name\n ):\n inferred_keys_by_output_names[output_name] = AssetKey([output_name])\n return inferred_keys_by_output_names\n\n\ndef _validate_graph_def(graph_def: "GraphDefinition", prefix: Optional[Sequence[str]] = None):\n """Ensure that all leaf nodes are mapped to graph outputs."""\n from dagster._core.definitions.graph_definition import GraphDefinition, create_adjacency_lists\n\n prefix = check.opt_sequence_param(prefix, "prefix")\n\n # recursively validate any sub-graphs\n for inner_node_def in graph_def.node_defs:\n if isinstance(inner_node_def, GraphDefinition):\n _validate_graph_def(inner_node_def, prefix=[*prefix, graph_def.name])\n\n # leaf nodes have no downstream nodes\n forward_edges, _ = create_adjacency_lists(graph_def.nodes, graph_def.dependency_structure)\n leaf_nodes = {\n node_name for node_name, downstream_nodes in forward_edges.items() if not downstream_nodes\n }\n\n # set of nodes that have outputs mapped to a graph output\n mapped_output_nodes = {\n output_mapping.maps_from.node_name for output_mapping in graph_def.output_mappings\n }\n\n # leaf nodes which do not have an associated mapped output\n unmapped_leaf_nodes = {".".join([*prefix, node]) for node in leaf_nodes - mapped_output_nodes}\n\n 
check.invariant(\n not unmapped_leaf_nodes,\n f"All leaf nodes within graph '{graph_def.name}' must generate outputs which are mapped"\n " to outputs of the graph, and produce assets. The following leaf node(s) are"\n f" non-asset producing ops: {unmapped_leaf_nodes}. This behavior is not currently"\n " supported because these ops are not required for the creation of the associated"\n " asset(s).",\n )\n\n\ndef _validate_self_deps(\n input_keys: Iterable[AssetKey],\n output_keys: Iterable[AssetKey],\n partition_mappings: Mapping[AssetKey, PartitionMapping],\n partitions_def: Optional[PartitionsDefinition],\n) -> None:\n output_keys_set = set(output_keys)\n for input_key in input_keys:\n if input_key in output_keys_set:\n if input_key in partition_mappings:\n partition_mapping = partition_mappings[input_key]\n time_window_partition_mapping = get_self_dep_time_window_partition_mapping(\n partition_mapping, partitions_def\n )\n if (\n time_window_partition_mapping is not None\n and (time_window_partition_mapping.start_offset or 0) < 0\n and (time_window_partition_mapping.end_offset or 0) < 0\n ):\n continue\n\n raise DagsterInvalidDefinitionError(\n f'Asset "{input_key.to_user_string()}" depends on itself. 
Assets can only depend'\n " on themselves if they are:\\n(a) time-partitioned and each partition depends on"\n " earlier partitions\\n(b) multipartitioned, with one time dimension that depends"\n " on earlier time partitions"\n )\n\n\ndef get_self_dep_time_window_partition_mapping(\n partition_mapping: Optional[PartitionMapping], partitions_def: Optional[PartitionsDefinition]\n) -> Optional[TimeWindowPartitionMapping]:\n """Returns a time window partition mapping dimension of the provided partition mapping,\n if exists.\n """\n if isinstance(partition_mapping, TimeWindowPartitionMapping):\n return partition_mapping\n elif isinstance(partition_mapping, MultiPartitionMapping):\n if not isinstance(partitions_def, MultiPartitionsDefinition):\n return None\n\n time_partition_mapping = partition_mapping.downstream_mappings_by_upstream_dimension.get(\n partitions_def.time_window_dimension.name\n )\n\n if time_partition_mapping is None or not isinstance(\n time_partition_mapping.partition_mapping, TimeWindowPartitionMapping\n ):\n return None\n\n return time_partition_mapping.partition_mapping\n return None\n
", "current_page_name": "_modules/dagster/_core/definitions/assets", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.assets"}, "auto_materialize_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.auto_materialize_policy

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, AbstractSet, Dict, FrozenSet, NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    UnpackContext,\n    UnpackedValue,\n    whitelist_for_serdes,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.auto_materialize_rule import (\n        AutoMaterializeRule,\n        AutoMaterializeRuleSnapshot,\n    )\n\n\nclass AutoMaterializePolicySerializer(NamedTupleSerializer):\n    def before_unpack(\n        self, context: UnpackContext, unpacked_dict: Dict[str, UnpackedValue]\n    ) -> Dict[str, UnpackedValue]:\n        from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n        backcompat_map = {\n            "on_missing": AutoMaterializeRule.materialize_on_missing(),\n            "on_new_parent_data": AutoMaterializeRule.materialize_on_parent_updated(),\n            "for_freshness": AutoMaterializeRule.materialize_on_required_for_freshness(),\n        }\n\n        # determine if this namedtuple was serialized with the old format (booleans for rules)\n        if any(backcompat_key in unpacked_dict for backcompat_key in backcompat_map):\n            # all old policies had these rules by default\n            rules = {\n                AutoMaterializeRule.skip_on_parent_outdated(),\n                AutoMaterializeRule.skip_on_parent_missing(),\n            }\n            for backcompat_key, rule in backcompat_map.items():\n                if unpacked_dict.get(backcompat_key):\n                    rules.add(rule)\n            unpacked_dict["rules"] = frozenset(rules)\n\n        return unpacked_dict\n\n\nclass AutoMaterializePolicyType(Enum):\n    EAGER = "EAGER"\n    LAZY = "LAZY"\n\n\n
[docs]@experimental\n@whitelist_for_serdes(\n old_fields={"time_window_partition_scope_minutes": 1e-6},\n serializer=AutoMaterializePolicySerializer,\n)\nclass AutoMaterializePolicy(\n NamedTuple(\n "_AutoMaterializePolicy",\n [\n ("rules", FrozenSet["AutoMaterializeRule"]),\n ("max_materializations_per_minute", Optional[int]),\n ],\n )\n):\n """An AutoMaterializePolicy specifies how Dagster should attempt to keep an asset up-to-date.\n\n Each policy consists of a set of AutoMaterializeRules, which are used to determine whether an\n asset or a partition of an asset should or should not be auto-materialized.\n\n The most common policy is `AutoMaterializePolicy.eager()`, which consists of the following rules:\n\n - `AutoMaterializeRule.materialize_on_missing()`\n Materialize an asset or a partition if it has never been materialized.\n - `AutoMaterializeRule.materialize_on_parent_updated()`\n Materialize an asset or a partition if one of its parents have been updated more recently\n than it has.\n - `AutoMaterializeRule.materialize_on_required_for_freshness()`\n Materialize an asset or a partition if it is required to satisfy a freshness policy.\n - `AutoMaterializeRule.skip_on_parent_outdated()`\n Skip materializing an asset or partition if any of its parents have ancestors that have\n been materialized more recently.\n - `AutoMaterializeRule.skip_on_parent_missing()`\n Skip materializing an asset or a partition if any parent has never been materialized or\n observed.\n\n Policies can be customized by adding or removing rules. For example, if you'd like to allow\n an asset to be materialized even if some of its parent partitions are missing:\n\n .. code-block:: python\n\n from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\n my_policy = AutoMaterializePolicy.eager().without_rules(\n AutoMaterializeRule.skip_on_parent_missing(),\n )\n\n If you'd like an asset to wait for all of its parents to be updated before materializing:\n\n .. 
code-block:: python\n\n from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\n my_policy = AutoMaterializePolicy.eager().with_rules(\n AutoMaterializeRule.skip_on_all_parents_not_updated(),\n )\n\n Lastly, the `max_materializations_per_minute` parameter, which is set to 1 by default,\n rate-limits the number of auto-materializations that can occur for a particular asset within\n a short time interval. This mainly matters for partitioned assets. Its purpose is to provide a\n safeguard against "surprise backfills", where user-error causes auto-materialize to be\n accidentally triggered for large numbers of partitions at once.\n\n **Warning:**\n\n Constructing an AutoMaterializePolicy directly is not recommended as the API is subject to change.\n AutoMaterializePolicy.eager() and AutoMaterializePolicy.lazy() are the recommended API.\n\n """\n\n def __new__(\n cls,\n rules: AbstractSet["AutoMaterializeRule"],\n max_materializations_per_minute: Optional[int] = 1,\n ):\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n check.invariant(\n max_materializations_per_minute is None or max_materializations_per_minute > 0,\n "max_materializations_per_minute must be positive. To disable rate-limiting, set it"\n " to None. 
To disable auto materializing, remove the policy.",\n )\n\n return super(AutoMaterializePolicy, cls).__new__(\n cls,\n rules=frozenset(check.set_param(rules, "rules", of_type=AutoMaterializeRule)),\n max_materializations_per_minute=max_materializations_per_minute,\n )\n\n @property\n def materialize_rules(self) -> AbstractSet["AutoMaterializeRule"]:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeDecisionType\n\n return {\n rule\n for rule in self.rules\n if rule.decision_type == AutoMaterializeDecisionType.MATERIALIZE\n }\n\n @property\n def skip_rules(self) -> AbstractSet["AutoMaterializeRule"]:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeDecisionType\n\n return {\n rule for rule in self.rules if rule.decision_type == AutoMaterializeDecisionType.SKIP\n }\n\n
[docs] @public\n @staticmethod\n def eager(max_materializations_per_minute: Optional[int] = 1) -> "AutoMaterializePolicy":\n """Constructs an eager AutoMaterializePolicy.\n\n Args:\n max_materializations_per_minute (Optional[int]): The maximum number of\n auto-materializations for this asset that may be initiated per minute. If this limit\n is exceeded, the partitions which would have been materialized will be discarded,\n and will require manual materialization in order to be updated. Defaults to 1.\n """\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n return AutoMaterializePolicy(\n rules={\n AutoMaterializeRule.materialize_on_missing(),\n AutoMaterializeRule.materialize_on_parent_updated(),\n AutoMaterializeRule.materialize_on_required_for_freshness(),\n AutoMaterializeRule.skip_on_parent_outdated(),\n AutoMaterializeRule.skip_on_parent_missing(),\n },\n max_materializations_per_minute=check.opt_int_param(\n max_materializations_per_minute, "max_materializations_per_minute"\n ),\n )
\n\n
[docs] @public\n @staticmethod\n def lazy(max_materializations_per_minute: Optional[int] = 1) -> "AutoMaterializePolicy":\n """Constructs a lazy AutoMaterializePolicy.\n\n Args:\n max_materializations_per_minute (Optional[int]): The maximum number of\n auto-materializations for this asset that may be initiated per minute. If this limit\n is exceeded, the partitions which would have been materialized will be discarded,\n and will require manual materialization in order to be updated. Defaults to 1.\n """\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n return AutoMaterializePolicy(\n rules={\n AutoMaterializeRule.materialize_on_required_for_freshness(),\n AutoMaterializeRule.skip_on_parent_outdated(),\n AutoMaterializeRule.skip_on_parent_missing(),\n },\n max_materializations_per_minute=check.opt_int_param(\n max_materializations_per_minute, "max_materializations_per_minute"\n ),\n )
\n\n
[docs] @public\n def without_rules(self, *rules_to_remove: "AutoMaterializeRule") -> "AutoMaterializePolicy":\n """Constructs a copy of this policy with the specified rules removed. Raises an error\n if any of the arguments are not rules in this policy.\n """\n non_matching_rules = set(rules_to_remove).difference(self.rules)\n check.param_invariant(\n not non_matching_rules,\n "rules_to_remove",\n f"Rules {[rule for rule in rules_to_remove if rule in non_matching_rules]} do not"\n " exist in this policy.",\n )\n return self._replace(\n rules=self.rules.difference(set(rules_to_remove)),\n )
\n\n
[docs] @public\n def with_rules(self, *rules_to_add: "AutoMaterializeRule") -> "AutoMaterializePolicy":\n """Constructs a copy of this policy with the specified rules added."""\n return self._replace(rules=self.rules.union(set(rules_to_add)))
\n\n @property\n def policy_type(self) -> AutoMaterializePolicyType:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n if AutoMaterializeRule.materialize_on_parent_updated() in self.rules:\n return AutoMaterializePolicyType.EAGER\n return AutoMaterializePolicyType.LAZY\n\n @property\n def rule_snapshots(self) -> Sequence["AutoMaterializeRuleSnapshot"]:\n return [rule.to_snapshot() for rule in self.rules]
\n
", "current_page_name": "_modules/dagster/_core/definitions/auto_materialize_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.auto_materialize_policy"}, "auto_materialize_rule": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.auto_materialize_rule

\nimport datetime\nfrom abc import ABC, abstractmethod, abstractproperty\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Dict,\n    FrozenSet,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.data_time import CachingDataTimeResolver\nfrom dagster._core.definitions.events import AssetKey, AssetKeyPartitionKey\nfrom dagster._core.definitions.freshness_based_auto_materialize import (\n    freshness_evaluation_results_for_asset_key,\n)\nfrom dagster._core.definitions.partition_mapping import IdentityPartitionMapping\nfrom dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    UnpackContext,\n    UnpackedValue,\n    WhitelistMap,\n    whitelist_for_serdes,\n)\nfrom dagster._utils.caching_instance_queryer import CachingInstanceQueryer\n\nfrom .asset_graph import AssetGraph, sort_key_for_asset_partition\nfrom .partition import SerializedPartitionsSubset\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_daemon_context import AssetDaemonContext\n    from dagster._core.definitions.asset_daemon_cursor import AssetDaemonCursor\n    from dagster._core.instance import DynamicPartitionsStore\n\n\n@whitelist_for_serdes\nclass AutoMaterializeDecisionType(Enum):\n    """Represents the set of results of the auto-materialize logic.\n\n    MATERIALIZE: The asset should be materialized by a run kicked off on this tick\n    SKIP: The asset should not be materialized by a run kicked off on this tick, because future\n        ticks are expected to materialize it.\n    DISCARD: The asset should not be materialized by a run kicked off on this tick, but future\n        ticks are not expected to materialize it.\n    """\n\n    MATERIALIZE = 
"MATERIALIZE"\n    SKIP = "SKIP"\n    DISCARD = "DISCARD"\n\n\nclass AutoMaterializeRuleEvaluationData(ABC):\n    pass\n\n\n@whitelist_for_serdes\nclass TextRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple("_TextRuleEvaluationData", [("text", str)]),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass ParentUpdatedRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple(\n        "_ParentUpdatedRuleEvaluationData",\n        [\n            ("updated_asset_keys", FrozenSet[AssetKey]),\n            ("will_update_asset_keys", FrozenSet[AssetKey]),\n        ],\n    ),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass WaitingOnAssetsRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple(\n        "_WaitingOnParentRuleEvaluationData",\n        [("waiting_on_asset_keys", FrozenSet[AssetKey])],\n    ),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass AutoMaterializeRuleSnapshot(NamedTuple):\n    """A serializable snapshot of an AutoMaterializeRule for historical evaluations."""\n\n    class_name: str\n    description: str\n    decision_type: AutoMaterializeDecisionType\n\n    @staticmethod\n    def from_rule(rule: "AutoMaterializeRule") -> "AutoMaterializeRuleSnapshot":\n        return AutoMaterializeRuleSnapshot(\n            class_name=rule.__class__.__name__,\n            description=rule.description,\n            decision_type=rule.decision_type,\n        )\n\n\n@whitelist_for_serdes\nclass AutoMaterializeRuleEvaluation(NamedTuple):\n    rule_snapshot: AutoMaterializeRuleSnapshot\n    evaluation_data: Optional[AutoMaterializeRuleEvaluationData]\n\n\nclass RuleEvaluationContext(NamedTuple):\n    asset_key: AssetKey\n    cursor: "AssetDaemonCursor"\n    instance_queryer: CachingInstanceQueryer\n    data_time_resolver: CachingDataTimeResolver\n    will_materialize_mapping: Mapping[AssetKey, AbstractSet[AssetKeyPartitionKey]]\n    expected_data_time_mapping: Mapping[AssetKey, Optional[datetime.datetime]]\n    
candidates: AbstractSet[AssetKeyPartitionKey]\n    daemon_context: "AssetDaemonContext"\n\n    @property\n    def asset_graph(self) -> AssetGraph:\n        return self.instance_queryer.asset_graph\n\n    def materializable_in_same_run(self, child_key: AssetKey, parent_key: AssetKey) -> bool:\n        """Returns whether a child asset can be materialized in the same run as a parent asset."""\n        from dagster._core.definitions.external_asset_graph import ExternalAssetGraph\n\n        return (\n            # both assets must be materializable\n            child_key in self.asset_graph.materializable_asset_keys\n            and parent_key in self.asset_graph.materializable_asset_keys\n            # the parent must have the same partitioning\n            and self.asset_graph.have_same_partitioning(child_key, parent_key)\n            # the parent must have a simple partition mapping to the child\n            and (\n                not self.asset_graph.is_partitioned(parent_key)\n                or isinstance(\n                    self.asset_graph.get_partition_mapping(child_key, parent_key),\n                    (TimeWindowPartitionMapping, IdentityPartitionMapping),\n                )\n            )\n            # the parent must be in the same repository to be materialized alongside the candidate\n            and (\n                not isinstance(self.asset_graph, ExternalAssetGraph)\n                or self.asset_graph.get_repository_handle(child_key)\n                == self.asset_graph.get_repository_handle(parent_key)\n            )\n        )\n\n    def get_parents_that_will_not_be_materialized_on_current_tick(\n        self, *, asset_partition: AssetKeyPartitionKey\n    ) -> AbstractSet[AssetKeyPartitionKey]:\n        """Returns the set of parent asset partitions that will not be updated in the same run of\n        this asset partition if we launch a run of this asset partition on this tick.\n        """\n        return {\n            parent\n            for 
parent in self.asset_graph.get_parents_partitions(\n                dynamic_partitions_store=self.instance_queryer,\n                current_time=self.instance_queryer.evaluation_time,\n                asset_key=asset_partition.asset_key,\n                partition_key=asset_partition.partition_key,\n            ).parent_partitions\n            if parent not in self.will_materialize_mapping.get(parent.asset_key, set())\n            or not self.materializable_in_same_run(asset_partition.asset_key, parent.asset_key)\n        }\n\n    def get_asset_partitions_by_asset_key(\n        self,\n        asset_partitions: AbstractSet[AssetKeyPartitionKey],\n    ) -> Mapping[AssetKey, Set[AssetKeyPartitionKey]]:\n        asset_partitions_by_asset_key: Dict[AssetKey, Set[AssetKeyPartitionKey]] = defaultdict(set)\n        for parent in asset_partitions:\n            asset_partitions_by_asset_key[parent.asset_key].add(parent)\n\n        return asset_partitions_by_asset_key\n\n\nRuleEvaluationResults = Sequence[Tuple[Optional[AutoMaterializeRuleEvaluationData], AbstractSet]]\n\n\n
[docs]class AutoMaterializeRule(ABC):\n """An AutoMaterializeRule defines a bit of logic which helps determine if a materialization\n should be kicked off for a given asset partition.\n\n Each rule can have one of two decision types, `MATERIALIZE` (indicating that an asset partition\n should be materialized) or `SKIP` (indicating that the asset partition should not be\n materialized).\n\n Materialize rules are evaluated first, and skip rules operate over the set of candidates that\n are produced by the materialize rules. Other than that, there is no ordering between rules.\n """\n\n @abstractproperty\n def decision_type(self) -> AutoMaterializeDecisionType:\n """The decision type of the rule (either `MATERIALIZE` or `SKIP`)."""\n ...\n\n @abstractproperty\n def description(self) -> str:\n """A human-readable description of this rule. As a basic guideline, this string should\n complete the sentence: 'Indicates an asset should be (materialize/skipped) when ____'.\n """\n ...\n\n @abstractmethod\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """The core evaluation function for the rule. This function takes in a context object and\n returns a mapping from evaluated rules to the set of asset partitions that the rule applies\n to.\n """\n ...\n\n
[docs] @public\n @staticmethod\n def materialize_on_required_for_freshness() -> "MaterializeOnRequiredForFreshnessRule":\n """Materialize an asset partition if it is required to satisfy a freshness policy of this\n asset or one of its downstream assets.\n\n Note: This rule has no effect on partitioned assets.\n """\n return MaterializeOnRequiredForFreshnessRule()
\n\n
[docs] @public\n @staticmethod\n def materialize_on_parent_updated() -> "MaterializeOnParentUpdatedRule":\n """Materialize an asset partition if one of its parents has been updated more recently\n than it has.\n\n Note: For time-partitioned or dynamic-partitioned assets downstream of an unpartitioned\n asset, this rule will only fire for the most recent partition of the downstream.\n """\n return MaterializeOnParentUpdatedRule()
\n\n
[docs] @public\n @staticmethod\n def materialize_on_missing() -> "MaterializeOnMissingRule":\n """Materialize an asset partition if it has never been materialized before. This rule will\n not fire for non-root assets unless that asset's parents have been updated.\n """\n return MaterializeOnMissingRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_parent_missing() -> "SkipOnParentMissingRule":\n """Skip materializing an asset partition if one of its parent asset partitions has never\n been materialized (for regular assets) or observed (for observable source assets).\n """\n return SkipOnParentMissingRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_parent_outdated() -> "SkipOnParentOutdatedRule":\n """Skip materializing an asset partition if any of its parents has not incorporated the\n latest data from its ancestors.\n """\n return SkipOnParentOutdatedRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_not_all_parents_updated(\n require_update_for_all_parent_partitions: bool = False,\n ) -> "SkipOnNotAllParentsUpdatedRule":\n """Skip materializing an asset partition if any of its parents have not been updated since\n the asset's last materialization.\n\n Attributes:\n require_update_for_all_parent_partitions (Optional[bool]): Applies only to an unpartitioned\n asset or an asset partition that depends on more than one partition in any upstream asset.\n If true, requires all upstream partitions in each upstream asset to be materialized since\n the downstream asset's last materialization in order to update it. If false, requires at\n least one upstream partition in each upstream asset to be materialized since the downstream\n asset's last materialization in order to update it. Defaults to false.\n """\n return SkipOnNotAllParentsUpdatedRule(require_update_for_all_parent_partitions)
\n\n def to_snapshot(self) -> AutoMaterializeRuleSnapshot:\n """Returns a serializable snapshot of this rule for historical evaluations."""\n return AutoMaterializeRuleSnapshot.from_rule(self)\n\n def __eq__(self, other) -> bool:\n # override the default NamedTuple __eq__ method to factor in types\n return type(self) == type(other) and super().__eq__(other)\n\n def __hash__(self) -> int:\n # override the default NamedTuple __hash__ method to factor in types\n return hash(hash(type(self)) + super().__hash__())
\n\n\n@whitelist_for_serdes\nclass MaterializeOnRequiredForFreshnessRule(\n AutoMaterializeRule, NamedTuple("_MaterializeOnRequiredForFreshnessRule", [])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "required to meet this or downstream asset's freshness policy"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n freshness_conditions = freshness_evaluation_results_for_asset_key(\n asset_key=context.asset_key,\n data_time_resolver=context.data_time_resolver,\n asset_graph=context.asset_graph,\n current_time=context.instance_queryer.evaluation_time,\n will_materialize_mapping=context.will_materialize_mapping,\n expected_data_time_mapping=context.expected_data_time_mapping,\n )\n return freshness_conditions\n\n\n@whitelist_for_serdes\nclass MaterializeOnParentUpdatedRule(\n AutoMaterializeRule, NamedTuple("_MaterializeOnParentUpdatedRule", [])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "upstream data has changed since latest materialization"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """Evaluates the set of asset partitions of this asset whose parents have been updated,\n or will update on this tick.\n """\n conditions = defaultdict(set)\n has_parents_that_will_update = set()\n\n # first, get the set of parents that will be materialized this tick, and see if we\n # can materialize this asset with those parents\n will_update_parents_by_asset_partition = defaultdict(set)\n for parent_key in context.asset_graph.get_parents(context.asset_key):\n if not context.materializable_in_same_run(context.asset_key, parent_key):\n continue\n for parent_partition in context.will_materialize_mapping.get(parent_key, set()):\n 
asset_partition = AssetKeyPartitionKey(\n context.asset_key, parent_partition.partition_key\n )\n will_update_parents_by_asset_partition[asset_partition].add(parent_key)\n has_parents_that_will_update.add(asset_partition)\n\n # next, for each asset partition of this asset which has newly-updated parents, or\n # has a parent that will update, create a ParentUpdatedRuleEvaluationData\n has_or_will_update = (\n context.daemon_context.get_asset_partitions_with_newly_updated_parents_for_key(\n context.asset_key\n )\n | has_parents_that_will_update\n )\n for asset_partition in has_or_will_update:\n parent_asset_partitions = context.asset_graph.get_parents_partitions(\n dynamic_partitions_store=context.instance_queryer,\n current_time=context.instance_queryer.evaluation_time,\n asset_key=asset_partition.asset_key,\n partition_key=asset_partition.partition_key,\n ).parent_partitions\n\n updated_parent_asset_partitions = context.instance_queryer.get_parent_asset_partitions_updated_after_child(\n asset_partition,\n parent_asset_partitions,\n # do a precise check for updated parents, factoring in data versions, as long as\n # we're within reasonable limits on the number of partitions to check\n respect_materialization_data_versions=context.daemon_context.respect_materialization_data_versions\n and len(parent_asset_partitions | has_or_will_update) < 100,\n # ignore self-dependencies when checking for updated parents, to avoid historical\n # rematerializations from causing a chain of materializations to be kicked off\n ignored_parent_keys={context.asset_key},\n )\n updated_parents = {parent.asset_key for parent in updated_parent_asset_partitions}\n will_update_parents = will_update_parents_by_asset_partition[asset_partition]\n\n if updated_parents or will_update_parents:\n conditions[\n ParentUpdatedRuleEvaluationData(\n updated_asset_keys=frozenset(updated_parents),\n will_update_asset_keys=frozenset(will_update_parents),\n )\n ].add(asset_partition)\n if conditions:\n return 
[(k, v) for k, v in conditions.items()]\n return []\n\n\n@whitelist_for_serdes\nclass MaterializeOnMissingRule(AutoMaterializeRule, NamedTuple("_MaterializeOnMissingRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "materialization is missing"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """Evaluates the set of asset partitions for this asset which are missing and were not\n previously discarded. Currently only applies to root asset partitions and asset partitions\n with updated parents.\n """\n missing_asset_partitions = (\n context.daemon_context.get_never_handled_root_asset_partitions_for_key(\n context.asset_key\n )\n )\n # in addition to missing root asset partitions, check any asset partitions with updated\n # parents to see if they're missing\n for (\n candidate\n ) in context.daemon_context.get_asset_partitions_with_newly_updated_parents_for_key(\n context.asset_key\n ):\n if not context.instance_queryer.asset_partition_has_materialization_or_observation(\n candidate\n ):\n missing_asset_partitions |= {candidate}\n if missing_asset_partitions:\n return [(None, missing_asset_partitions)]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnParentOutdatedRule(AutoMaterializeRule, NamedTuple("_SkipOnParentOutdatedRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n return "waiting on upstream data to be up to date"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n outdated_ancestors = set()\n # find the root cause of why this asset partition's parents are outdated (if any)\n for parent in 
context.get_parents_that_will_not_be_materialized_on_current_tick(\n asset_partition=candidate\n ):\n outdated_ancestors.update(\n context.instance_queryer.get_outdated_ancestors(asset_partition=parent)\n )\n if outdated_ancestors:\n asset_partitions_by_waiting_on_asset_keys[frozenset(outdated_ancestors)].add(\n candidate\n )\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnParentMissingRule(AutoMaterializeRule, NamedTuple("_SkipOnParentMissingRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n return "waiting on upstream data to be present"\n\n def evaluate_for_asset(\n self,\n context: RuleEvaluationContext,\n ) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n missing_parent_asset_keys = set()\n for parent in context.get_parents_that_will_not_be_materialized_on_current_tick(\n asset_partition=candidate\n ):\n # ignore non-observable sources, which will never have a materialization or observation\n if context.asset_graph.is_source(\n parent.asset_key\n ) and not context.asset_graph.is_observable(parent.asset_key):\n continue\n if not context.instance_queryer.asset_partition_has_materialization_or_observation(\n parent\n ):\n missing_parent_asset_keys.add(parent.asset_key)\n if missing_parent_asset_keys:\n asset_partitions_by_waiting_on_asset_keys[frozenset(missing_parent_asset_keys)].add(\n candidate\n )\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnNotAllParentsUpdatedRule(\n 
AutoMaterializeRule,\n NamedTuple(\n "_SkipOnNotAllParentsUpdatedRule", [("require_update_for_all_parent_partitions", bool)]\n ),\n):\n """An auto-materialize rule that enforces that an asset can only be materialized if all parents\n have been materialized since the asset's last materialization.\n\n Attributes:\n require_update_for_all_parent_partitions (Optional[bool]): Applies only to an unpartitioned\n asset or an asset partition that depends on more than one partition in any upstream asset.\n If true, requires all upstream partitions in each upstream asset to be materialized since\n the downstream asset's last materialization in order to update it. If false, requires at\n least one upstream partition in each upstream asset to be materialized since the downstream\n asset's last materialization in order to update it. Defaults to false.\n """\n\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n if self.require_update_for_all_parent_partitions is False:\n return "waiting on upstream data to be updated"\n else:\n return "waiting until all upstream partitions are updated"\n\n def evaluate_for_asset(\n self,\n context: RuleEvaluationContext,\n ) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n parent_partitions = context.asset_graph.get_parents_partitions(\n context.instance_queryer,\n context.instance_queryer.evaluation_time,\n context.asset_key,\n candidate.partition_key,\n ).parent_partitions\n\n updated_parent_partitions = (\n context.instance_queryer.get_parent_asset_partitions_updated_after_child(\n candidate,\n parent_partitions,\n context.daemon_context.respect_materialization_data_versions,\n ignored_parent_keys=set(),\n )\n | set().union(\n *[\n context.will_materialize_mapping.get(parent, set())\n for parent in context.asset_graph.get_parents(context.asset_key)\n ]\n )\n 
)\n\n if self.require_update_for_all_parent_partitions:\n # All upstream partitions must be updated in order for the candidate to be updated\n non_updated_parent_keys = {\n parent.asset_key for parent in parent_partitions - updated_parent_partitions\n }\n else:\n # At least one upstream partition in each upstream asset must be updated in order\n # for the candidate to be updated\n parent_asset_keys = context.asset_graph.get_parents(context.asset_key)\n updated_parent_partitions_by_asset_key = context.get_asset_partitions_by_asset_key(\n updated_parent_partitions\n )\n non_updated_parent_keys = {\n parent\n for parent in parent_asset_keys\n if not updated_parent_partitions_by_asset_key.get(parent)\n }\n\n # do not require past partitions of this asset to be updated\n non_updated_parent_keys -= {context.asset_key}\n\n if non_updated_parent_keys:\n asset_partitions_by_waiting_on_asset_keys[frozenset(non_updated_parent_keys)].add(\n candidate\n )\n\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass DiscardOnMaxMaterializationsExceededRule(\n AutoMaterializeRule, NamedTuple("_DiscardOnMaxMaterializationsExceededRule", [("limit", int)])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.DISCARD\n\n @property\n def description(self) -> str:\n return f"exceeds {self.limit} materialization(s) per minute"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n # the set of asset partitions which exceed the limit\n rate_limited_asset_partitions = set(\n sorted(\n context.candidates,\n key=lambda x: sort_key_for_asset_partition(context.asset_graph, x),\n )[self.limit :]\n )\n if rate_limited_asset_partitions:\n return [(None, rate_limited_asset_partitions)]\n return 
[]\n\n\n@whitelist_for_serdes\nclass AutoMaterializeAssetEvaluation(NamedTuple):\n """Represents the results of the auto-materialize logic for a single asset.\n\n Properties:\n asset_key (AssetKey): The asset key that was evaluated.\n partition_subsets_by_condition: The rule evaluations that impact if the asset should be\n materialized, skipped, or discarded. If the asset is partitioned, this will be a list of\n tuples, where the first element is the condition and the second element is the\n serialized subset of partitions that the condition applies to. If it's not partitioned,\n the second element will be None.\n """\n\n asset_key: AssetKey\n partition_subsets_by_condition: Sequence[\n Tuple["AutoMaterializeRuleEvaluation", Optional[SerializedPartitionsSubset]]\n ]\n num_requested: int\n num_skipped: int\n num_discarded: int\n run_ids: Set[str] = set()\n rule_snapshots: Optional[Sequence[AutoMaterializeRuleSnapshot]] = None\n\n @staticmethod\n def from_rule_evaluation_results(\n asset_graph: AssetGraph,\n asset_key: AssetKey,\n asset_partitions_by_rule_evaluation: Sequence[\n Tuple[AutoMaterializeRuleEvaluation, AbstractSet[AssetKeyPartitionKey]]\n ],\n num_requested: int,\n num_skipped: int,\n num_discarded: int,\n dynamic_partitions_store: "DynamicPartitionsStore",\n ) -> "AutoMaterializeAssetEvaluation":\n auto_materialize_policy = asset_graph.auto_materialize_policies_by_key.get(asset_key)\n\n if not auto_materialize_policy:\n check.failed(f"Expected auto materialize policy on asset {asset_key}")\n\n partitions_def = asset_graph.get_partitions_def(asset_key)\n if partitions_def is None:\n return AutoMaterializeAssetEvaluation(\n asset_key=asset_key,\n partition_subsets_by_condition=[\n (rule_evaluation, None)\n for rule_evaluation, _ in asset_partitions_by_rule_evaluation\n ],\n num_requested=num_requested,\n num_skipped=num_skipped,\n num_discarded=num_discarded,\n rule_snapshots=auto_materialize_policy.rule_snapshots,\n )\n else:\n return 
AutoMaterializeAssetEvaluation(\n asset_key=asset_key,\n partition_subsets_by_condition=[\n (\n rule_evaluation,\n SerializedPartitionsSubset.from_subset(\n subset=partitions_def.empty_subset().with_partition_keys(\n check.not_none(ap.partition_key) for ap in asset_partitions\n ),\n partitions_def=partitions_def,\n dynamic_partitions_store=dynamic_partitions_store,\n ),\n )\n for rule_evaluation, asset_partitions in asset_partitions_by_rule_evaluation\n ],\n num_requested=num_requested,\n num_skipped=num_skipped,\n num_discarded=num_discarded,\n rule_snapshots=auto_materialize_policy.rule_snapshots,\n )\n\n\n# BACKCOMPAT GRAVEYARD\n\n\nclass BackcompatAutoMaterializeConditionSerializer(NamedTupleSerializer):\n """This handles backcompat for the old AutoMaterializeCondition objects, turning them into the\n proper AutoMaterializeRuleEvaluation objects. This is necessary because old\n AutoMaterializeAssetEvaluation objects will have serialized AutoMaterializeCondition objects,\n and we need to be able to deserialize them.\n\n In theory, as these serialized objects happen to be purged periodically, we can remove this\n backcompat logic at some point in the future.\n """\n\n def unpack(\n self,\n unpacked_dict: Dict[str, UnpackedValue],\n whitelist_map: WhitelistMap,\n context: UnpackContext,\n ) -> AutoMaterializeRuleEvaluation:\n if self.klass in (\n FreshnessAutoMaterializeCondition,\n DownstreamFreshnessAutoMaterializeCondition,\n ):\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_required_for_freshness().to_snapshot(),\n evaluation_data=None,\n )\n elif self.klass == MissingAutoMaterializeCondition:\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_missing().to_snapshot(),\n evaluation_data=None,\n )\n elif self.klass == ParentMaterializedAutoMaterializeCondition:\n updated_asset_keys = unpacked_dict.get("updated_asset_keys")\n if isinstance(updated_asset_keys, set):\n 
updated_asset_keys = cast(FrozenSet[AssetKey], frozenset(updated_asset_keys))\n else:\n updated_asset_keys = frozenset()\n will_update_asset_keys = unpacked_dict.get("will_update_asset_keys")\n if isinstance(will_update_asset_keys, set):\n will_update_asset_keys = cast(\n FrozenSet[AssetKey], frozenset(will_update_asset_keys)\n )\n else:\n will_update_asset_keys = frozenset()\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_parent_updated().to_snapshot(),\n evaluation_data=ParentUpdatedRuleEvaluationData(\n updated_asset_keys=updated_asset_keys,\n will_update_asset_keys=will_update_asset_keys,\n ),\n )\n elif self.klass == ParentOutdatedAutoMaterializeCondition:\n waiting_on_asset_keys = unpacked_dict.get("waiting_on_asset_keys")\n if isinstance(waiting_on_asset_keys, set):\n waiting_on_asset_keys = cast(FrozenSet[AssetKey], frozenset(waiting_on_asset_keys))\n else:\n waiting_on_asset_keys = frozenset()\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.skip_on_parent_outdated().to_snapshot(),\n evaluation_data=WaitingOnAssetsRuleEvaluationData(\n waiting_on_asset_keys=waiting_on_asset_keys\n ),\n )\n elif self.klass == MaxMaterializationsExceededAutoMaterializeCondition:\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=DiscardOnMaxMaterializationsExceededRule(limit=1).to_snapshot(),\n evaluation_data=None,\n )\n check.failed(f"Unexpected class {self.klass}")\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass FreshnessAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass DownstreamFreshnessAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass ParentMaterializedAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass 
MissingAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass ParentOutdatedAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass MaxMaterializationsExceededAutoMaterializeCondition(NamedTuple): ...\n
", "current_page_name": "_modules/dagster/_core/definitions/auto_materialize_rule", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.auto_materialize_rule"}, "backfill_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.backfill_policy

\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._serdes import whitelist_for_serdes\n\n\nclass BackfillPolicyType(Enum):\n    SINGLE_RUN = "SINGLE_RUN"\n    MULTI_RUN = "MULTI_RUN"\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass BackfillPolicy(\n NamedTuple(\n "_BackfillPolicy",\n [\n ("max_partitions_per_run", Optional[int]),\n ],\n )\n):\n """A BackfillPolicy specifies how Dagster should attempt to backfill a partitioned asset.\n\n There are two main kinds of backfill policies: single-run and multi-run.\n\n An asset with a single-run backfill policy will take a single run to backfill all of its\n partitions at once.\n\n An asset with a multi-run backfill policy will take multiple runs to backfill all of its\n partitions. Each run will backfill a subset of the partitions. The number of partitions to\n backfill in each run is controlled by the `max_partitions_per_run` parameter.\n\n For example:\n\n - If an asset has 100 partitions, and the `max_partitions_per_run` is set to 10, then it will\n be backfilled in 10 runs; each run will backfill 10 partitions.\n\n - If an asset has 100 partitions, and the `max_partitions_per_run` is set to 11, then it will\n be backfilled in 10 runs; the first 9 runs will backfill 11 partitions, and the last one run\n will backfill the remaining 9 partitions.\n\n **Warning:**\n\n Constructing an BackfillPolicy directly is not recommended as the API is subject to change.\n BackfillPolicy.single_run() and BackfillPolicy.multi_run(max_partitions_per_run=x) are the\n recommended APIs.\n """\n\n def __new__(cls, max_partitions_per_run: Optional[int] = 1):\n return super(BackfillPolicy, cls).__new__(\n cls,\n max_partitions_per_run=max_partitions_per_run,\n )\n\n
[docs] @public\n @staticmethod\n def single_run() -> "BackfillPolicy":\n """Creates a BackfillPolicy that executes the entire backfill in a single run."""\n return BackfillPolicy(max_partitions_per_run=None)
\n\n
[docs] @public\n @staticmethod\n def multi_run(max_partitions_per_run: int = 1) -> "BackfillPolicy":\n """Creates a BackfillPolicy that executes the entire backfill in multiple runs.\n Each run will backfill [max_partitions_per_run] number of partitions.\n\n Args:\n max_partitions_per_run (Optional[int]): The maximum number of partitions in each run of\n the multiple runs. Defaults to 1.\n """\n return BackfillPolicy(\n max_partitions_per_run=check.int_param(max_partitions_per_run, "max_partitions_per_run")\n )
\n\n @property\n def policy_type(self) -> BackfillPolicyType:\n if self.max_partitions_per_run:\n return BackfillPolicyType.MULTI_RUN\n else:\n return BackfillPolicyType.SINGLE_RUN
\n
", "current_page_name": "_modules/dagster/_core/definitions/backfill_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.backfill_policy"}, "config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.config

\nfrom typing import Any, Callable, Mapping, NamedTuple, Optional, Union, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import (\n    ConfigType,\n    is_supported_config_python_builtin,\n    process_config,\n    resolve_defaults,\n    validate_config,\n)\nfrom dagster._core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster._core.errors import DagsterInvalidConfigError\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\nConfigMappingFn: TypeAlias = Callable[[Any], Any]\n\n\ndef is_callable_valid_config_arg(config: Union[Callable[..., Any], Mapping[str, object]]) -> bool:\n    return BuiltinEnum.contains(config) or is_supported_config_python_builtin(config)\n\n\n
[docs]class ConfigMapping(\n NamedTuple(\n "_ConfigMapping",\n [\n ("config_fn", Callable[[Any], Any]),\n ("config_schema", IDefinitionConfigSchema),\n ("receive_processed_config_values", Optional[bool]),\n ],\n )\n):\n """Defines a config mapping for a graph (or job).\n\n By specifying a config mapping function, you can override the configuration for the child\n ops and graphs contained within a graph.\n\n Config mappings require the configuration schema to be specified as ``config_schema``, which will\n be exposed as the configuration schema for the graph, as well as a configuration mapping\n function, ``config_fn``, which maps the config provided to the graph to the config\n that will be provided to the child nodes.\n\n Args:\n config_fn (Callable[[dict], dict]): The function that will be called\n to map the graph config to a config appropriate for the child nodes.\n config_schema (ConfigSchema): The schema of the graph config.\n receive_processed_config_values (Optional[bool]): If true, config values provided to the config_fn\n will be converted to their dagster types before being passed in. 
For example, if this\n value is true, enum config passed to config_fn will be actual enums, while if false,\n then enum config passed to config_fn will be strings.\n """\n\n def __new__(\n cls,\n config_fn: ConfigMappingFn,\n config_schema: Optional[Any] = None,\n receive_processed_config_values: Optional[bool] = None,\n ):\n return super(ConfigMapping, cls).__new__(\n cls,\n config_fn=check.callable_param(config_fn, "config_fn"),\n config_schema=convert_user_facing_definition_config_schema(config_schema),\n receive_processed_config_values=check.opt_bool_param(\n receive_processed_config_values, "receive_processed_config_values"\n ),\n )\n\n def resolve_from_unvalidated_config(self, config: Any) -> Any:\n """Validates config against outer config schema, and calls mapping against validated config."""\n receive_processed_config_values = check.opt_bool_param(\n self.receive_processed_config_values, "receive_processed_config_values", default=True\n )\n if receive_processed_config_values:\n outer_evr = process_config(\n self.config_schema.config_type,\n config,\n )\n else:\n outer_evr = validate_config(\n self.config_schema.config_type,\n config,\n )\n if not outer_evr.success:\n raise DagsterInvalidConfigError(\n "Error in config mapping ",\n outer_evr.errors,\n config,\n )\n\n outer_config = outer_evr.value\n if not receive_processed_config_values:\n outer_config = resolve_defaults(\n cast(ConfigType, self.config_schema.config_type),\n outer_config,\n ).value\n\n return self.config_fn(outer_config)\n\n def resolve_from_validated_config(self, config: Any) -> Any:\n if self.receive_processed_config_values is not None:\n check.failed(\n "`receive_processed_config_values` parameter has been set, but only applies to "\n "unvalidated config."\n )\n\n return self.config_fn(config)
\n
", "current_page_name": "_modules/dagster/_core/definitions/config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.config"}, "configurable": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.configurable

\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Callable, NamedTuple, Optional, Type, TypeVar, Union, cast\n\nfrom typing_extensions import Self\n\nfrom dagster import (\n    Field,\n    _check as check,\n)\nfrom dagster._config import EvaluateValueResult\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.decorator_utils import get_function_params\n\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    ConfiguredDefinitionConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\n\nclass ConfigurableDefinition(ABC):\n    @property\n    @abstractmethod\n    def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n        raise NotImplementedError()\n\n    @property\n    def has_config_field(self) -> bool:\n        return self.config_schema is not None and bool(self.config_schema.as_field())\n\n    @property\n    def config_field(self) -> Optional[Field]:\n        return None if not self.config_schema else self.config_schema.as_field()\n\n    # getter for typed access\n    def get_config_field(self) -> Field:\n        field = self.config_field\n        if field is None:\n            check.failed("Must check has_config_Field before calling get_config_field")\n        return field\n\n    def apply_config_mapping(self, config: Any) -> EvaluateValueResult:\n        """Applies user-provided config mapping functions to the given configuration and validates the\n        results against the respective config schema.\n\n        Expects incoming config to be validated and have fully-resolved values (StringSource values\n        resolved, Enum types hydrated, etc.) 
via process_config() during ResolvedRunConfig\n        construction and Graph config mapping.\n\n        Args:\n            config (Any): A validated and resolved configuration dictionary matching this object's\n            config_schema\n\n        Returns (EvaluateValueResult):\n            If successful, the value is a validated and resolved configuration dictionary for the\n            innermost wrapped object after applying the config mapping transformation function.\n        """\n        # If schema is on a mapped schema this is the innermost resource (base case),\n        # so we aren't responsible for validating against anything farther down.\n        # Returns an EVR for type consistency with config_mapping_fn.\n        return (\n            self.config_schema.resolve_config(config)\n            if isinstance(self.config_schema, ConfiguredDefinitionConfigSchema)\n            else EvaluateValueResult.for_value(config)\n        )\n\n\nclass AnonymousConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method not accept a name argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        config_schema: CoercableToConfigSchema = None,\n        description: Optional[str] = None,\n    ) -> Self:\n        """Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Using ``configured`` may result in config values being displayed in\n        the Dagster UI, so it is not recommended to use this API with sensitive values,\n        such as secrets.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this object's\n                config schema.  
In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(description, new_config_schema)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n    ) -> Self:\n        raise NotImplementedError()\n\n\nclass NamedConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method require a positional `name` argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        name: str,\n        config_schema: Optional[UserConfigSchema] = None,\n        description: Optional[str] = None,\n    ) -> Self:\n        """Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Using ``configured`` may result in config values being displayed in\n        the Dagster UI, so it is not recommended to use this API with sensitive values,\n        such as secrets.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this 
object's\n                config schema.  In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            name (str): Name of the new definition. This is a required argument, as this definition\n                type has a name uniqueness constraint.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n        name = check.str_param(name, "name")\n\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(name, description, new_config_schema)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        name: str,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n    ) -> Self: ...\n\n\ndef _check_configurable_param(configurable: ConfigurableDefinition) -> None:\n    from dagster._core.definitions.composition import PendingNodeInvocation\n\n    check.param_invariant(\n        not isinstance(configurable, PendingNodeInvocation),\n        "configurable",\n        "You have invoked `configured` on a PendingNodeInvocation (an intermediate type), which"\n        " is produced by aliasing or tagging a node definition. To configure a node, you must"\n        " call `configured` on either an OpDefinition and GraphDefinition. To fix"\n        " this error, make sure to call `configured` on the definition object *before* using"\n        " the `tag` or `alias` methods. 
For usage examples, see"\n        " https://docs.dagster.io/concepts/configuration/configured",\n    )\n    check.inst_param(\n        configurable,\n        "configurable",\n        ConfigurableDefinition,\n        "Only the following types can be used with the `configured` method: ResourceDefinition,"\n        " ExecutorDefinition, GraphDefinition, NodeDefinition, and LoggerDefinition."\n        " For usage examples of `configured`, see"\n        " https://docs.dagster.io/concepts/configuration/configured",\n    )\n\n\nT_Configurable = TypeVar(\n    "T_Configurable", bound=Union["AnonymousConfigurableDefinition", "NamedConfigurableDefinition"]\n)\n\n\nclass FunctionAndConfigSchema(NamedTuple):\n    function: Callable[[Any], Any]\n    config_schema: Optional[UserConfigSchema]\n\n\ndef _wrap_user_fn_if_pythonic_config(\n    user_fn: Any, config_schema: Optional[UserConfigSchema]\n) -> FunctionAndConfigSchema:\n    """Helper function which allows users to provide a Pythonic config object to a @configurable\n    function. Detects if the function has a single parameter annotated with a Config class.\n    If so, wraps the function to convert the config dictionary into the appropriate Config object.\n    """\n    from dagster._config.pythonic_config import (\n        Config,\n        infer_schema_from_config_annotation,\n        safe_is_subclass,\n    )\n\n    if not isinstance(user_fn, Callable):\n        return FunctionAndConfigSchema(function=user_fn, config_schema=config_schema)\n\n    config_fn_params = get_function_params(user_fn)\n    check.invariant(\n        len(config_fn_params) == 1, "@configured function should have exactly one parameter"\n    )\n\n    param = config_fn_params[0]\n\n    # If the parameter is a subclass of Config, we can infer the config schema from the\n    # type annotation. 
We'll also wrap the config mapping function to convert the config\n    # dictionary into the appropriate Config object.\n    if not safe_is_subclass(param.annotation, Config):\n        return FunctionAndConfigSchema(function=user_fn, config_schema=config_schema)\n\n    check.invariant(\n        config_schema is None,\n        "Cannot provide config_schema to @configured function with Config-annotated param",\n    )\n\n    config_schema_from_class = infer_schema_from_config_annotation(param.annotation, param.default)\n    config_cls = cast(Type[Config], param.annotation)\n\n    param_name = param.name\n\n    def wrapped_fn(config_as_dict) -> Any:\n        config_input = config_cls(**config_as_dict)\n        output = user_fn(**{param_name: config_input})\n\n        if isinstance(output, Config):\n            return output._convert_to_config_dictionary()  # noqa: SLF001\n        else:\n            return output\n\n    return FunctionAndConfigSchema(function=wrapped_fn, config_schema=config_schema_from_class)\n\n\n
[docs]def configured(\n configurable: T_Configurable,\n config_schema: Optional[UserConfigSchema] = None,\n **kwargs: Any,\n) -> Callable[[object], T_Configurable]:\n """A decorator that makes it easy to create a function-configured version of an object.\n\n The following definition types can be configured using this function:\n\n * :py:class:`GraphDefinition`\n * :py:class:`ExecutorDefinition`\n * :py:class:`LoggerDefinition`\n * :py:class:`ResourceDefinition`\n * :py:class:`OpDefinition`\n\n Using ``configured`` may result in config values being displayed in the Dagster UI,\n so it is not recommended to use this API with sensitive values, such as\n secrets.\n\n If the config that will be supplied to the object is constant, you may alternatively invoke this\n and call the result with a dict of config values to be curried. Examples of both strategies\n below.\n\n Args:\n configurable (ConfigurableDefinition): An object that can be configured.\n config_schema (ConfigSchema): The config schema that the inputs to the decorated function\n must satisfy. Alternatively, annotate the config parameter to the decorated function\n with a subclass of :py:class:`Config` and omit this argument.\n **kwargs: Arbitrary keyword arguments that will be passed to the initializer of the returned\n object.\n\n Returns:\n (Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])\n\n **Examples:**\n\n .. code-block:: python\n\n class GreetingConfig(Config):\n message: str\n\n @op\n def greeting_op(config: GreetingConfig):\n print(config.message)\n\n class HelloConfig(Config):\n name: str\n\n @configured(greeting_op)\n def hello_op(config: HelloConfig):\n return GreetingConfig(message=f"Hello, {config.name}!")\n\n .. 
code-block:: python\n\n dev_s3 = configured(S3Resource, name="dev_s3")({'bucket': 'dev'})\n\n @configured(S3Resource)\n def dev_s3(_):\n return {'bucket': 'dev'}\n\n @configured(S3Resource, {'bucket_prefix', str})\n def dev_s3(config):\n return {'bucket': config['bucket_prefix'] + 'dev'}\n\n """\n _check_configurable_param(configurable)\n\n if isinstance(configurable, NamedConfigurableDefinition):\n\n def _configured(config_or_config_fn: object) -> T_Configurable:\n fn_name = (\n getattr(config_or_config_fn, "__name__", None)\n if callable(config_or_config_fn)\n else None\n )\n name: str = check.not_none(kwargs.get("name") or fn_name)\n\n updated_fn, new_config_schema = _wrap_user_fn_if_pythonic_config(\n config_or_config_fn, config_schema\n )\n return configurable.configured(\n config_or_config_fn=updated_fn,\n name=name,\n config_schema=new_config_schema,\n **{k: v for k, v in kwargs.items() if k != "name"},\n )\n\n return _configured\n elif isinstance(configurable, AnonymousConfigurableDefinition):\n\n def _configured(config_or_config_fn: object) -> T_Configurable:\n updated_fn, new_config_schema = _wrap_user_fn_if_pythonic_config(\n config_or_config_fn, config_schema\n )\n return configurable.configured(\n config_schema=new_config_schema, config_or_config_fn=updated_fn, **kwargs\n )\n\n return _configured\n else:\n check.failed(f"Invalid configurable definition type: {type(configurable)}")
\n
", "current_page_name": "_modules/dagster/_core/definitions/configurable", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.configurable"}, "decorators": {"asset_check_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.asset_check_decorator

\nfrom typing import Any, Callable, Mapping, Optional, Set, Tuple, Union, cast\n\nfrom dagster import _check as check\nfrom dagster._annotations import experimental\nfrom dagster._builtins import Nothing\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSpec\nfrom dagster._core.definitions.asset_checks import (\n    AssetChecksDefinition,\n    AssetChecksDefinitionInputOutputProps,\n)\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.output import Out\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import NoValueSentinel\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..input import In\nfrom .asset_decorator import (\n    get_function_params_without_context_or_config_or_resources,\n    stringify_asset_key_to_input_name,\n)\nfrom .op_decorator import _Op\n\nAssetCheckFunctionReturn = AssetCheckResult\nAssetCheckFunction = Callable[..., AssetCheckFunctionReturn]\n\n\ndef _build_asset_check_input(\n    name: str, asset_key: AssetKey, fn: Callable\n) -> Mapping[AssetKey, Tuple[str, In]]:\n    asset_params = get_function_params_without_context_or_config_or_resources(fn)\n\n    if len(asset_params) == 0:\n        input_name = stringify_asset_key_to_input_name(asset_key)\n        in_def = In(cast(type, Nothing))\n    elif len(asset_params) == 1:\n        input_name = asset_params[0].name\n        in_def = In(metadata={}, input_manager_key=None, dagster_type=NoValueSentinel)\n    else:\n        raise DagsterInvalidDefinitionError(\n            f"When defining check '{name}', multiple target assets provided as parameters:"\n            f" {[param.name for param in asset_params]}. 
Only one"\n            " is allowed."\n        )\n\n    return {\n        asset_key: (\n            input_name,\n            in_def,\n        )\n    }\n\n\n
[docs]@experimental\ndef asset_check(\n *,\n asset: Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset],\n name: Optional[str] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n compute_kind: Optional[str] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n) -> Callable[[AssetCheckFunction], AssetChecksDefinition]:\n """Create a definition for how to execute an asset check.\n\n Args:\n asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]): The\n asset that the check applies to.\n name (Optional[str]): The name of the check. If not specified, the name of the decorated\n function will be used. Checks for the same asset must have unique names.\n description (Optional[str]): The description of the check.\n required_resource_keys (Optional[Set[str]]): A set of keys for resources that are required\n by the function that execute the check. These can alternatively be specified by\n including resource-typed parameters in the function signature.\n config_schema (Optional[ConfigSchema): The configuration schema for the check's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. If not set, Dagster will accept any config provided for the op.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that executes the check.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n compute_kind (Optional[str]): A string to represent the kind of computation that executes\n the check, e.g. 
"dbt" or "spark".\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that executes the check.\n\n\n Produces an :py:class:`AssetChecksDefinition` object.\n\n\n Example:\n .. code-block:: python\n\n from dagster import asset, asset_check, AssetCheckResult\n\n @asset\n def my_asset() -> None:\n ...\n\n @asset_check(asset=my_asset, description="Check that my asset has enough rows")\n def my_asset_has_enough_rows() -> AssetCheckResult:\n num_rows = ...\n return AssetCheckResult(passed=num_rows > 5, metadata={"num_rows": num_rows})\n\n\n Example with a DataFrame Output:\n .. code-block:: python\n\n from dagster import asset, asset_check, AssetCheckResult\n from pandas import DataFrame\n\n @asset\n def my_asset() -> DataFrame:\n ...\n\n @asset_check(asset=my_asset, description="Check that my asset has enough rows")\n def my_asset_has_enough_rows(my_asset: DataFrame) -> AssetCheckResult:\n num_rows = my_asset.shape[0]\n return AssetCheckResult(passed=num_rows > 5, metadata={"num_rows": num_rows})\n """\n\n def inner(fn: AssetCheckFunction) -> AssetChecksDefinition:\n check.callable_param(fn, "fn")\n resolved_name = name or fn.__name__\n asset_key = AssetKey.from_coercible_or_definition(asset)\n\n out = Out(dagster_type=None)\n input_tuples_by_asset_key = _build_asset_check_input(resolved_name, asset_key, fn)\n if len(input_tuples_by_asset_key) == 0:\n raise DagsterInvalidDefinitionError(\n f"No target asset provided when defining check '{resolved_name}'"\n )\n\n if len(input_tuples_by_asset_key) > 1:\n raise DagsterInvalidDefinitionError(\n f"When defining check '{resolved_name}', Multiple target assets provided:"\n f" {[key.to_user_string() for key in input_tuples_by_asset_key.keys()]}. 
Only one"\n " is allowed."\n )\n\n resolved_asset_key = next(iter(input_tuples_by_asset_key.keys()))\n spec = AssetCheckSpec(\n name=resolved_name,\n description=description,\n asset=resolved_asset_key,\n )\n\n op_def = _Op(\n name=spec.get_python_identifier(),\n ins=dict(input_tuples_by_asset_key.values()),\n out=out,\n # Any resource requirements specified as arguments will be identified as\n # part of the Op definition instantiation\n required_resource_keys=required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema=config_schema,\n retry_policy=retry_policy,\n )(fn)\n\n checks_def = AssetChecksDefinition(\n node_def=op_def,\n resource_defs={},\n specs=[spec],\n input_output_props=AssetChecksDefinitionInputOutputProps(\n asset_keys_by_input_name={\n input_tuples_by_asset_key[resolved_asset_key][0]: resolved_asset_key\n },\n asset_check_keys_by_output_name={op_def.output_defs[0].name: spec.key},\n ),\n )\n\n return checks_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/asset_check_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.asset_check_decorator"}, "asset_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.asset_decorator

\nfrom collections import Counter\nfrom inspect import Parameter\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param, experimental_param\nfrom dagster._builtins import Nothing\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.decorator_utils import get_function_params, get_valid_name_permutations\nfrom dagster._core.definitions.asset_dep import AssetDep, CoercibleToAssetDep\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.config import ConfigMapping\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping, MetadataUserInput\nfrom dagster._core.definitions.partition_mapping import PartitionMapping\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.types.dagster_type import DagsterType\nfrom dagster._utils.warnings import (\n    disable_dagster_warnings,\n)\n\nfrom ..asset_check_spec import AssetCheckSpec\nfrom ..asset_in import AssetIn\nfrom ..asset_out import AssetOut\nfrom ..asset_spec import AssetSpec\nfrom ..assets import AssetsDefinition\nfrom ..backfill_policy import BackfillPolicy, BackfillPolicyType\nfrom ..decorators.graph_decorator import graph\nfrom ..decorators.op_decorator import _Op\nfrom ..events import AssetKey, CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom ..input import GraphIn, In\nfrom ..output import GraphOut, Out\nfrom ..partition import PartitionsDefinition\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..utils import 
DEFAULT_IO_MANAGER_KEY, DEFAULT_OUTPUT, NoValueSentinel\n\n\n@overload\ndef asset(\n    compute_fn: Callable,\n) -> AssetsDefinition: ...\n\n\n@overload\ndef asset(\n    *,\n    name: Optional[str] = ...,\n    key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n    ins: Optional[Mapping[str, AssetIn]] = ...,\n    deps: Optional[Iterable[CoercibleToAssetDep]] = ...,\n    metadata: Optional[Mapping[str, Any]] = ...,\n    description: Optional[str] = ...,\n    config_schema: Optional[UserConfigSchema] = None,\n    required_resource_keys: Optional[Set[str]] = ...,\n    resource_defs: Optional[Mapping[str, object]] = ...,\n    io_manager_def: Optional[object] = ...,\n    io_manager_key: Optional[str] = ...,\n    compute_kind: Optional[str] = ...,\n    dagster_type: Optional[DagsterType] = ...,\n    partitions_def: Optional[PartitionsDefinition] = ...,\n    op_tags: Optional[Mapping[str, Any]] = ...,\n    group_name: Optional[str] = ...,\n    output_required: bool = ...,\n    freshness_policy: Optional[FreshnessPolicy] = ...,\n    auto_materialize_policy: Optional[AutoMaterializePolicy] = ...,\n    backfill_policy: Optional[BackfillPolicy] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    code_version: Optional[str] = ...,\n    key: Optional[CoercibleToAssetKey] = None,\n    non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = ...,\n    check_specs: Optional[Sequence[AssetCheckSpec]] = ...,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]: ...\n\n\n
[docs]@experimental_param(param="resource_defs")\n@experimental_param(param="io_manager_def")\n@experimental_param(param="auto_materialize_policy")\n@experimental_param(param="backfill_policy")\n@deprecated_param(\n param="non_argument_deps", breaking_version="2.0.0", additional_warn_text="use `deps` instead."\n)\ndef asset(\n compute_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[CoercibleToAssetDep]] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n io_manager_def: Optional[object] = None,\n io_manager_key: Optional[str] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n output_required: bool = True,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n key: Optional[CoercibleToAssetKey] = None,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Create a definition for how to compute an asset.\n\n A software-defined asset is the combination of:\n 1. An asset key, e.g. the name of a table.\n 2. A function, which can be run to compute the contents of the asset.\n 3. 
A set of upstream assets that are provided as inputs to the function when computing the asset.\n\n Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\n about the upstream assets it depends on. The upstream assets are inferred from the arguments\n to the decorated function. The name of the argument designates the name of the upstream asset.\n\n An asset has an op inside it to represent the function that computes it. The name of the op\n will be the segments of the asset key, separated by double-underscores.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function. The asset's name must be a valid name in dagster (ie only contains\n letters, numbers, and _) and may not contain python reserved keywords.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in dagster (ie only\n contains letters, numbers, and _) and may not contain python reserved keywords.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetDep, AssetsDefinition, SourceAsset, AssetKey, str]]]):\n The assets that are upstream dependencies, but do not correspond to a parameter of the\n decorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\n all assets created by the multi_asset will be created.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. 
If not set, Dagster will accept any config provided for the op.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the op.\n io_manager_key (Optional[str]): The resource key of the IOManager used\n for storing the output of the op as an asset, and for loading it in downstream ops\n (default: "io_manager"). Only one of io_manager_key and io_manager_def can be provided.\n io_manager_def (Optional[object]): (Experimental) The IOManager used for\n storing the output of the op as an asset, and for loading it in\n downstream ops. Only one of io_manager_def and io_manager_key can be provided.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in the Dagster UI as a badge on the asset.\n dagster_type (Optional[DagsterType]): Allows specifying type validation functions that\n will be executed on the output of the decorated function after it runs.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If not provided,\n the name "default" is used.\n resource_defs (Optional[Mapping[str, object]]):\n (Experimental) A mapping of resource keys to resources. These resources\n will be initialized during execution, and can be accessed from the\n context within the body of the function.\n output_required (bool): Whether the decorated function will always materialize an asset.\n Defaults to True. 
If False, the function can return None, which will not be materialized to\n storage and will halt execution of downstream assets.\n freshness_policy (FreshnessPolicy): A constraint telling Dagster how often this asset is intended to be updated\n with respect to its root data.\n auto_materialize_policy (AutoMaterializePolicy): (Experimental) Configure Dagster to automatically materialize\n this asset according to its FreshnessPolicy and when upstream dependencies change.\n backfill_policy (BackfillPolicy): (Experimental) Configure Dagster to backfill this asset according to its\n BackfillPolicy.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n code_version (Optional[str]): (Experimental) Version of the code that generates this asset. In\n general, versions should be set only for code that deterministically produces the same\n output when given the same inputs.\n check_specs (Optional[Sequence[AssetCheckSpec]]): (Experimental) Specs for asset checks that\n execute in the decorated function after materializing the asset.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead.\n Set of asset keys that are upstream dependencies, but do not pass an input to the asset.\n key (Optional[CoeercibleToAssetKey]): The key for this asset. If provided, cannot specify key_prefix or name.\n\n Examples:\n .. 
code-block:: python\n\n @asset\n def my_asset(my_upstream_asset: int) -> int:\n return my_upstream_asset + 1\n """\n\n def create_asset():\n upstream_asset_deps = _deps_and_non_argument_deps_to_asset_deps(\n deps=deps, non_argument_deps=non_argument_deps\n )\n\n return _Asset(\n name=cast(Optional[str], name), # (mypy bug that it can't infer name is Optional[str])\n key_prefix=key_prefix,\n ins=ins,\n deps=upstream_asset_deps,\n metadata=metadata,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n io_manager_key=io_manager_key,\n io_manager_def=io_manager_def,\n compute_kind=check.opt_str_param(compute_kind, "compute_kind"),\n dagster_type=dagster_type,\n partitions_def=partitions_def,\n op_tags=op_tags,\n group_name=group_name,\n output_required=output_required,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n retry_policy=retry_policy,\n code_version=code_version,\n check_specs=check_specs,\n key=key,\n )\n\n if compute_fn is not None:\n return create_asset()(compute_fn)\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n check.invariant(\n not (io_manager_key and io_manager_def),\n "Both io_manager_key and io_manager_def were provided to `@asset` decorator. Please"\n " provide one or the other. ",\n )\n return create_asset()(fn)\n\n return inner
\n\n\ndef _resolve_key_and_name(\n *,\n key: Optional[CoercibleToAssetKey],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n name: Optional[str],\n decorator: str,\n fn: Callable[..., Any],\n) -> Tuple[AssetKey, str]:\n if (name or key_prefix) and key:\n raise DagsterInvalidDefinitionError(\n f"Cannot specify a name or key prefix for {decorator} when the key"\n " argument is provided."\n )\n key_prefix_list = [key_prefix] if isinstance(key_prefix, str) else key_prefix\n key = AssetKey.from_coercible(key) if key else None\n assigned_name = name or fn.__name__\n return (\n (\n # the filter here appears unnecessary per typing, but this exists\n # historically so keeping it here to be conservative in case users\n # can get Nones into the key_prefix_list somehow\n AssetKey(list(filter(None, [*(key_prefix_list or []), assigned_name])))\n if not key\n else key\n ),\n assigned_name,\n )\n\n\nclass _Asset:\n def __init__(\n self,\n name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[AssetDep]] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n io_manager_key: Optional[str] = None,\n io_manager_def: Optional[object] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n output_required: bool = True,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n key: 
Optional[CoercibleToAssetKey] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ):\n self.name = name\n self.key_prefix = key_prefix\n self.ins = ins or {}\n self.deps = deps or []\n self.metadata = metadata\n self.description = description\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self.io_manager_key = io_manager_key\n self.io_manager_def = io_manager_def\n self.config_schema = config_schema\n self.compute_kind = compute_kind\n self.dagster_type = dagster_type\n self.partitions_def = partitions_def\n self.op_tags = op_tags\n self.resource_defs = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n self.group_name = group_name\n self.output_required = output_required\n self.freshness_policy = freshness_policy\n self.retry_policy = retry_policy\n self.auto_materialize_policy = auto_materialize_policy\n self.backfill_policy = backfill_policy\n self.code_version = code_version\n self.check_specs = check_specs\n self.key = key\n\n def __call__(self, fn: Callable) -> AssetsDefinition:\n from dagster._config.pythonic_config import (\n validate_resource_annotated_function,\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n validate_resource_annotated_function(fn)\n\n asset_ins = build_asset_ins(fn, self.ins or {}, {dep.asset_key for dep in self.deps})\n\n out_asset_key, asset_name = _resolve_key_and_name(\n key=self.key,\n key_prefix=self.key_prefix,\n name=self.name,\n fn=fn,\n decorator="@asset",\n )\n\n with disable_dagster_warnings():\n arg_resource_keys = {arg.name for arg in get_resource_args(fn)}\n\n bare_required_resource_keys = set(self.required_resource_keys)\n\n resource_defs_dict = self.resource_defs\n resource_defs_keys = set(resource_defs_dict.keys())\n decorator_resource_keys = bare_required_resource_keys | resource_defs_keys\n\n io_manager_key = self.io_manager_key\n if self.io_manager_def:\n if not io_manager_key:\n 
io_manager_key = out_asset_key.to_python_identifier("io_manager")\n\n if (\n io_manager_key in self.resource_defs\n and self.resource_defs[io_manager_key] != self.io_manager_def\n ):\n raise DagsterInvalidDefinitionError(\n f"Provided conflicting definitions for io manager key '{io_manager_key}'."\n " Please provide only one definition per key."\n )\n\n resource_defs_dict[io_manager_key] = self.io_manager_def\n\n wrapped_resource_defs = wrap_resources_for_execution(resource_defs_dict)\n\n check.param_invariant(\n len(bare_required_resource_keys) == 0 or len(arg_resource_keys) == 0,\n "Cannot specify resource requirements in both @asset decorator and as arguments"\n " to the decorated function",\n )\n\n io_manager_key = cast(str, io_manager_key) if io_manager_key else DEFAULT_IO_MANAGER_KEY\n\n out = Out(\n metadata=self.metadata or {},\n io_manager_key=io_manager_key,\n dagster_type=self.dagster_type if self.dagster_type else NoValueSentinel,\n description=self.description,\n is_required=self.output_required,\n code_version=self.code_version,\n )\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n self.check_specs, [out_asset_key]\n )\n check_outs: Mapping[str, Out] = {\n output_name: Out(dagster_type=None)\n for output_name in check_specs_by_output_name.keys()\n }\n\n op_required_resource_keys = decorator_resource_keys - arg_resource_keys\n\n op = _Op(\n name=out_asset_key.to_python_identifier(),\n description=self.description,\n ins=dict(asset_ins.values()),\n out={DEFAULT_OUTPUT: out, **check_outs},\n # Any resource requirements specified as arguments will be identified as\n # part of the Op definition instantiation\n required_resource_keys=op_required_resource_keys,\n tags={\n **({"kind": self.compute_kind} if self.compute_kind else {}),\n **(self.op_tags or {}),\n },\n config_schema=self.config_schema,\n retry_policy=self.retry_policy,\n code_version=self.code_version,\n )(fn)\n\n # check backfill policy is 
BackfillPolicyType.SINGLE_RUN for non-partitioned asset\n if self.partitions_def is None:\n check.param_invariant(\n (\n self.backfill_policy.policy_type is BackfillPolicyType.SINGLE_RUN\n if self.backfill_policy\n else True\n ),\n "backfill_policy",\n "Non partitioned asset can only have single run backfill policy",\n )\n\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n partition_mappings = {\n keys_by_input_name[input_name]: asset_in.partition_mapping\n for input_name, asset_in in self.ins.items()\n if asset_in.partition_mapping is not None\n }\n\n partition_mappings = _get_partition_mappings_from_deps(\n partition_mappings=partition_mappings, deps=self.deps, asset_name=asset_name\n )\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name={"result": out_asset_key},\n node_def=op,\n partitions_def=self.partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n resource_defs=wrapped_resource_defs,\n group_names_by_key={out_asset_key: self.group_name} if self.group_name else None,\n freshness_policies_by_key=(\n {out_asset_key: self.freshness_policy} if self.freshness_policy else None\n ),\n auto_materialize_policies_by_key=(\n {out_asset_key: self.auto_materialize_policy}\n if self.auto_materialize_policy\n else None\n ),\n backfill_policy=self.backfill_policy,\n asset_deps=None, # no asset deps in single-asset decorator\n selected_asset_keys=None, # no subselection in decorator\n can_subset=False,\n metadata_by_key={out_asset_key: self.metadata} if self.metadata else None,\n # see comment in @multi_asset's call to dagster_internal_init for the gory details\n # this is best understood as an _override_ which @asset does not support\n descriptions_by_key=None,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None, # no subselection in decorator\n )\n\n\n
[docs]@experimental_param(param="resource_defs")\n@deprecated_param(\n param="non_argument_deps", breaking_version="2.0.0", additional_warn_text="use `deps` instead."\n)\ndef multi_asset(\n *,\n outs: Optional[Mapping[str, AssetOut]] = None,\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[CoercibleToAssetDep]] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n compute_kind: Optional[str] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, object]] = None,\n group_name: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n specs: Optional[Sequence[AssetSpec]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n # deprecated\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same op and same\n upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n You can set I/O managers keys, auto-materialize policies, freshness policies, group names, etc.\n on an individual asset within the multi-asset by attaching them to the :py:class:`AssetOut`\n corresponding to that asset in the `outs` parameter.\n\n Args:\n name (Optional[str]): The name of the op.\n outs: (Optional[Dict[str, AssetOut]]): The AssetOuts representing the assets materialized by\n this function. 
AssetOuts detail the output, IO management, and core asset properties.\n This argument is required except when AssetSpecs are used.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]):\n The assets that are upstream dependencies, but do not correspond to a parameter of the\n decorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\n all assets created by the multi_asset will be created.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. If not set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the underlying op.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in the Dagster UI as a badge on the asset.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by a multi_asset depend on all assets that are consumed by that\n multi asset. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\n used as input to the asset or produced within the op.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n backfill_policy (Optional[BackfillPolicy]): The backfill policy for the op that computes the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. 
Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n can_subset (bool): If this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. Defaults to False.\n resource_defs (Optional[Mapping[str, object]]):\n (Experimental) A mapping of resource keys to resources. These resources\n will be initialized during execution, and can be accessed from the\n context within the body of the function.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n code_version (Optional[str]): (Experimental) Version of the code encapsulated by the multi-asset. If set,\n this is used as a default code version for all defined assets.\n specs (Optional[Sequence[AssetSpec]]): (Experimental) The specifications for the assets materialized\n by this function.\n check_specs (Optional[Sequence[AssetCheckSpec]]): (Experimental) Specs for asset checks that\n execute in the decorated function after materializing the assets.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead. Set of asset keys that are upstream\n dependencies, but do not pass an input to the multi_asset.\n\n Examples:\n .. 
code-block:: python\n\n # Use IO managers to handle I/O:\n @multi_asset(\n outs={\n "my_string_asset": AssetOut(),\n "my_int_asset": AssetOut(),\n }\n )\n def my_function(upstream_asset: int):\n result = upstream_asset + 1\n return str(result), result\n\n # Handle I/O on your own:\n @multi_asset(\n outs={\n "asset1": AssetOut(),\n "asset2": AssetOut(),\n },\n deps=["asset0"],\n )\n def my_function():\n asset0_value = load(path="asset0")\n asset1_result, asset2_result = do_some_transformation(asset0_value)\n write(asset1_result, path="asset1")\n write(asset2_result, path="asset2")\n return None, None\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n specs = check.opt_list_param(specs, "specs", of_type=AssetSpec)\n\n upstream_asset_deps = _deps_and_non_argument_deps_to_asset_deps(\n deps=deps, non_argument_deps=non_argument_deps\n )\n\n asset_deps = check.opt_mapping_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resource_defs, "resource_defs", key_type=str)\n )\n\n _config_schema = check.opt_mapping_param(\n config_schema, # type: ignore\n "config_schema",\n additional_message="Only dicts are supported for asset config_schema.",\n )\n\n bare_required_resource_keys = set(required_resource_keys)\n resource_defs_keys = set(resource_defs.keys())\n required_resource_keys = bare_required_resource_keys | resource_defs_keys\n\n asset_out_map: Mapping[str, AssetOut] = {} if outs is None else outs\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n op_name = name or fn.__name__\n\n if asset_out_map and specs:\n raise DagsterInvalidDefinitionError("Must specify only outs or specs but not both.")\n elif specs:\n output_tuples_by_asset_key = {}\n for asset_spec in specs:\n # output names are asset keys 
joined with _\n output_name = "_".join(asset_spec.key.path)\n output_tuples_by_asset_key[asset_spec.key] = (\n output_name,\n Out(\n Nothing,\n is_required=not (can_subset or asset_spec.skippable),\n description=asset_spec.description,\n ),\n )\n if upstream_asset_deps:\n raise DagsterInvalidDefinitionError(\n "Can not pass deps and specs to @multi_asset, specify deps on the AssetSpecs"\n " directly."\n )\n if internal_asset_deps:\n raise DagsterInvalidDefinitionError(\n "Can not pass internal_asset_deps and specs to @multi_asset, specify deps on"\n " the AssetSpecs directly."\n )\n\n upstream_keys = set()\n for spec in specs:\n for dep in spec.deps:\n if dep.asset_key not in output_tuples_by_asset_key:\n upstream_keys.add(dep.asset_key)\n if (\n dep.asset_key in output_tuples_by_asset_key\n and dep.partition_mapping is not None\n ):\n # self-dependent asset also needs to be considered an upstream_key\n upstream_keys.add(dep.asset_key)\n\n explicit_ins = ins or {}\n # get which asset keys have inputs set\n loaded_upstreams = build_asset_ins(fn, explicit_ins, deps=set())\n unexpected_upstreams = {\n key for key in loaded_upstreams.keys() if key not in upstream_keys\n }\n if unexpected_upstreams:\n raise DagsterInvalidDefinitionError(\n f"Asset inputs {unexpected_upstreams} do not have dependencies on the passed"\n " AssetSpec(s). 
Set the deps on the appropriate AssetSpec(s)."\n )\n remaining_upstream_keys = {key for key in upstream_keys if key not in loaded_upstreams}\n asset_ins = build_asset_ins(fn, explicit_ins, deps=remaining_upstream_keys)\n else:\n asset_ins = build_asset_ins(\n fn,\n ins or {},\n deps=(\n {dep.asset_key for dep in upstream_asset_deps} if upstream_asset_deps else set()\n ),\n )\n output_tuples_by_asset_key = build_asset_outs(asset_out_map)\n # validate that the asset_deps make sense\n valid_asset_deps = set(asset_ins.keys()) | set(output_tuples_by_asset_key.keys())\n for out_name, asset_keys in asset_deps.items():\n if asset_out_map and out_name not in asset_out_map:\n check.failed(\n f"Invalid out key '{out_name}' supplied to `internal_asset_deps` argument"\n f" for multi-asset {op_name}. Must be one of the outs for this multi-asset"\n f" {list(asset_out_map.keys())[:20]}.",\n )\n invalid_asset_deps = asset_keys.difference(valid_asset_deps)\n check.invariant(\n not invalid_asset_deps,\n f"Invalid asset dependencies: {invalid_asset_deps} specified in"\n f" `internal_asset_deps` argument for multi-asset '{op_name}' on key"\n f" '{out_name}'. Each specified asset key must be associated with an input to"\n " the asset or produced by this asset. 
Valid keys:"\n f" {list(valid_asset_deps)[:20]}",\n )\n\n arg_resource_keys = {arg.name for arg in get_resource_args(fn)}\n check.param_invariant(\n len(bare_required_resource_keys or []) == 0 or len(arg_resource_keys) == 0,\n "Cannot specify resource requirements in both @multi_asset decorator and as"\n " arguments to the decorated function",\n )\n\n asset_outs_by_output_name: Mapping[str, Out] = dict(output_tuples_by_asset_key.values())\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(output_tuples_by_asset_key.keys())\n )\n check_outs_by_output_name: Mapping[str, Out] = {\n output_name: Out(dagster_type=None, is_required=not can_subset)\n for output_name in check_specs_by_output_name.keys()\n }\n overlapping_output_names = (\n asset_outs_by_output_name.keys() & check_outs_by_output_name.keys()\n )\n check.invariant(\n len(overlapping_output_names) == 0,\n f"Check output names overlap with asset output names: {overlapping_output_names}",\n )\n combined_outs_by_output_name: Mapping[str, Out] = {\n **asset_outs_by_output_name,\n **check_outs_by_output_name,\n }\n\n with disable_dagster_warnings():\n op_required_resource_keys = required_resource_keys - arg_resource_keys\n\n op = _Op(\n name=op_name,\n description=description,\n ins=dict(asset_ins.values()),\n out=combined_outs_by_output_name,\n required_resource_keys=op_required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema=_config_schema,\n retry_policy=retry_policy,\n code_version=code_version,\n )(fn)\n\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n keys_by_output_name = {\n output_name: asset_key\n for asset_key, (output_name, _) in output_tuples_by_asset_key.items()\n }\n partition_mappings = {\n keys_by_input_name[input_name]: asset_in.partition_mapping\n for input_name, asset_in in (ins or {}).items()\n if 
asset_in.partition_mapping is not None\n }\n\n if upstream_asset_deps:\n partition_mappings = _get_partition_mappings_from_deps(\n partition_mappings=partition_mappings, deps=upstream_asset_deps, asset_name=op_name\n )\n\n if specs:\n internal_deps = {\n spec.key: {dep.asset_key for dep in spec.deps}\n for spec in specs\n if spec.deps is not None\n }\n props_by_asset_key: Mapping[AssetKey, Union[AssetSpec, AssetOut]] = {\n spec.key: spec for spec in specs\n }\n # Add PartitionMappings specified via AssetSpec.deps to partition_mappings dictionary. Error on duplicates\n for spec in specs:\n for dep in spec.deps:\n if dep.partition_mapping is None:\n continue\n if partition_mappings.get(dep.asset_key, None) is None:\n partition_mappings[dep.asset_key] = dep.partition_mapping\n continue\n if partition_mappings[dep.asset_key] == dep.partition_mapping:\n continue\n else:\n raise DagsterInvalidDefinitionError(\n f"Two different PartitionMappings for {dep.asset_key} provided for"\n f" multi_asset {op_name}. 
Please use the same PartitionMapping for"\n f" {dep.asset_key}."\n )\n\n else:\n internal_deps = {keys_by_output_name[name]: asset_deps[name] for name in asset_deps}\n props_by_asset_key = {\n keys_by_output_name[output_name]: asset_out\n for output_name, asset_out in asset_out_map.items()\n }\n\n # handle properties defined ons AssetSpecs or AssetOuts\n group_names_by_key = {\n asset_key: props.group_name\n for asset_key, props in props_by_asset_key.items()\n if props.group_name is not None\n }\n if group_name:\n check.invariant(\n not group_names_by_key,\n "Cannot set group_name parameter on multi_asset if one or more of the"\n " AssetSpecs/AssetOuts supplied to this multi_asset have a group_name defined.",\n )\n group_names_by_key = {asset_key: group_name for asset_key in props_by_asset_key}\n\n freshness_policies_by_key = {\n asset_key: props.freshness_policy\n for asset_key, props in props_by_asset_key.items()\n if props.freshness_policy is not None\n }\n auto_materialize_policies_by_key = {\n asset_key: props.auto_materialize_policy\n for asset_key, props in props_by_asset_key.items()\n if props.auto_materialize_policy is not None\n }\n metadata_by_key = {\n asset_key: props.metadata\n for asset_key, props in props_by_asset_key.items()\n if props.metadata is not None\n }\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n node_def=op,\n asset_deps=internal_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n can_subset=can_subset,\n resource_defs=resource_defs,\n group_names_by_key=group_names_by_key,\n freshness_policies_by_key=freshness_policies_by_key,\n auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n backfill_policy=backfill_policy,\n selected_asset_keys=None, # no subselection in decorator\n # descriptions by key is more accurately understood as _overriding_ the descriptions\n # by key that 
are in the OutputDefinitions associated with the asset key.\n # This is a dangerous construction liable for bugs. Instead there should be a\n # canonical source of asset descriptions in AssetsDefinintion and if we need\n # to create a memoized cached dictionary of asset keys for perf or something we do\n # that in the `__init__` or on demand.\n #\n # This is actually an override. We do not override descriptions\n # in OutputDefinitions in @multi_asset\n descriptions_by_key=None,\n metadata_by_key=metadata_by_key,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None, # no subselection in decorator\n )\n\n return inner
\n\n\ndef get_function_params_without_context_or_config_or_resources(fn: Callable) -> List[Parameter]:\n params = get_function_params(fn)\n is_context_provided = len(params) > 0 and params[0].name in get_valid_name_permutations(\n "context"\n )\n input_params = params[1:] if is_context_provided else params\n\n resource_arg_names = {arg.name for arg in get_resource_args(fn)}\n\n new_input_args = []\n for input_arg in input_params:\n if input_arg.name != "config" and input_arg.name not in resource_arg_names:\n new_input_args.append(input_arg)\n\n return new_input_args\n\n\ndef stringify_asset_key_to_input_name(asset_key: AssetKey) -> str:\n return "_".join(asset_key.path).replace("-", "_")\n\n\ndef build_asset_ins(\n fn: Callable,\n asset_ins: Mapping[str, AssetIn],\n deps: Optional[AbstractSet[AssetKey]],\n) -> Mapping[AssetKey, Tuple[str, In]]:\n """Creates a mapping from AssetKey to (name of input, In object)."""\n deps = check.opt_set_param(deps, "deps", AssetKey)\n\n new_input_args = get_function_params_without_context_or_config_or_resources(fn)\n\n non_var_input_param_names = [\n param.name for param in new_input_args if param.kind == Parameter.POSITIONAL_OR_KEYWORD\n ]\n has_kwargs = any(param.kind == Parameter.VAR_KEYWORD for param in new_input_args)\n\n all_input_names = set(non_var_input_param_names) | asset_ins.keys()\n\n if not has_kwargs:\n for in_key, asset_in in asset_ins.items():\n if in_key not in non_var_input_param_names and (\n not isinstance(asset_in.dagster_type, DagsterType)\n or not asset_in.dagster_type.is_nothing\n ):\n raise DagsterInvalidDefinitionError(\n f"Key '{in_key}' in provided ins dict does not correspond to any of the names "\n "of the arguments to the decorated function"\n )\n\n ins_by_asset_key: Dict[AssetKey, Tuple[str, In]] = {}\n for input_name in all_input_names:\n asset_key = None\n\n if input_name in asset_ins:\n asset_key = asset_ins[input_name].key\n metadata = asset_ins[input_name].metadata or {}\n key_prefix = 
asset_ins[input_name].key_prefix\n input_manager_key = asset_ins[input_name].input_manager_key\n dagster_type = asset_ins[input_name].dagster_type\n else:\n metadata = {}\n key_prefix = None\n input_manager_key = None\n dagster_type = NoValueSentinel\n\n asset_key = asset_key or AssetKey(list(filter(None, [*(key_prefix or []), input_name])))\n\n ins_by_asset_key[asset_key] = (\n input_name.replace("-", "_"),\n In(metadata=metadata, input_manager_key=input_manager_key, dagster_type=dagster_type),\n )\n\n for asset_key in deps:\n if asset_key in ins_by_asset_key:\n raise DagsterInvalidDefinitionError(\n f"deps value {asset_key} also declared as input/AssetIn"\n )\n # mypy doesn't realize that Nothing is a valid type here\n ins_by_asset_key[asset_key] = (\n stringify_asset_key_to_input_name(asset_key),\n In(cast(type, Nothing)),\n )\n\n return ins_by_asset_key\n\n\n@overload\ndef graph_asset(\n compose_fn: Callable,\n) -> AssetsDefinition: ...\n\n\n@overload\ndef graph_asset(\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n group_name: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n metadata: Optional[MetadataUserInput] = ...,\n freshness_policy: Optional[FreshnessPolicy] = ...,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = ...,\n backfill_policy: Optional[BackfillPolicy] = ...,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = ...,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n key: Optional[CoercibleToAssetKey] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]: ...\n\n\n
[docs]def graph_asset(\n compose_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n group_name: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n metadata: Optional[MetadataUserInput] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n key: Optional[CoercibleToAssetKey] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Creates a software-defined asset that's computed using a graph of ops.\n\n This decorator is meant to decorate a function that composes a set of ops or graphs to define\n the dependencies between them.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function. The asset's name must be a valid name in Dagster (ie only contains\n letters, numbers, and underscores) and may not contain Python reserved keywords.\n description (Optional[str]):\n A human-readable description of the asset.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n config (Optional[Union[ConfigMapping], Mapping[str, Any]):\n Describes how the graph underlying the asset is configured at runtime.\n\n If a :py:class:`ConfigMapping` object is provided, then the graph takes on the config\n schema of this object. The mapping will be applied at runtime to generate the config for\n the graph's constituent nodes.\n\n If a dictionary is provided, then it will be used as the default run config for the\n graph. 
This means it must conform to the config schema of the underlying nodes. Note\n that the values provided will be viewable and editable in the Dagster UI, so be careful\n with secrets. its constituent nodes.\n\n If no value is provided, then the config schema for the graph is the default (derived\n from the underlying nodes).\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in Dagster (ie only\n contains letters, numbers, and underscores) and may not contain Python reserved keywords.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If\n not provided, the name "default" is used.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n metadata (Optional[MetadataUserInput]): Dictionary of metadata to be associated with\n the asset.\n freshness_policy (Optional[FreshnessPolicy]): A constraint telling Dagster how often this asset is\n intended to be updated with respect to its root data.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): The AutoMaterializePolicy to use\n for this asset.\n backfill_policy (Optional[BackfillPolicy]): The BackfillPolicy to use for this asset.\n key (Optional[CoeercibleToAssetKey]): The key for this asset. If provided, cannot specify key_prefix or name.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def fetch_files_from_slack(context) -> pd.DataFrame:\n ...\n\n @op\n def store_files_in_table(files) -> None:\n files.to_sql(name="slack_files", con=create_db_connection())\n\n @graph_asset\n def slack_files_table():\n return store_files(fetch_files_from_slack())\n """\n if compose_fn is None:\n return lambda fn: graph_asset( # type: ignore # (decorator pattern)\n fn,\n name=name,\n description=description,\n ins=ins,\n config=config,\n key_prefix=key_prefix,\n group_name=group_name,\n partitions_def=partitions_def,\n metadata=metadata,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n resource_defs=resource_defs,\n check_specs=check_specs,\n key=key,\n )\n else:\n return graph_asset_no_defaults(\n compose_fn=compose_fn,\n name=name,\n description=description,\n ins=ins,\n config=config,\n key_prefix=key_prefix,\n group_name=group_name,\n partitions_def=partitions_def,\n metadata=metadata,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n resource_defs=resource_defs,\n check_specs=check_specs,\n key=key,\n )
\n\n\ndef graph_asset_no_defaults(\n *,\n compose_fn: Callable,\n name: Optional[str],\n description: Optional[str],\n ins: Optional[Mapping[str, AssetIn]],\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n group_name: Optional[str],\n partitions_def: Optional[PartitionsDefinition],\n metadata: Optional[MetadataUserInput],\n freshness_policy: Optional[FreshnessPolicy],\n auto_materialize_policy: Optional[AutoMaterializePolicy],\n backfill_policy: Optional[BackfillPolicy],\n resource_defs: Optional[Mapping[str, ResourceDefinition]],\n check_specs: Optional[Sequence[AssetCheckSpec]],\n key: Optional[CoercibleToAssetKey],\n) -> AssetsDefinition:\n ins = ins or {}\n asset_ins = build_asset_ins(compose_fn, ins or {}, set())\n out_asset_key, _asset_name = _resolve_key_and_name(\n key=key,\n key_prefix=key_prefix,\n name=name,\n decorator="@graph_asset",\n fn=compose_fn,\n )\n\n keys_by_input_name = {input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()}\n partition_mappings = {\n input_name: asset_in.partition_mapping\n for input_name, asset_in in ins.items()\n if asset_in.partition_mapping\n }\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, [out_asset_key]\n )\n check_outs_by_output_name: Mapping[str, GraphOut] = {\n output_name: GraphOut() for output_name in check_specs_by_output_name.keys()\n }\n\n combined_outs_by_output_name: Mapping = {\n "result": GraphOut(),\n **check_outs_by_output_name,\n }\n\n op_graph = graph(\n name=out_asset_key.to_python_identifier(),\n description=description,\n config=config,\n ins={input_name: GraphIn() for _, (input_name, _) in asset_ins.items()},\n out=combined_outs_by_output_name,\n )(compose_fn)\n return AssetsDefinition.from_graph(\n op_graph,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name={"result": out_asset_key},\n partitions_def=partitions_def,\n 
partition_mappings=partition_mappings if partition_mappings else None,\n group_name=group_name,\n metadata_by_output_name={"result": metadata} if metadata else None,\n freshness_policies_by_output_name=(\n {"result": freshness_policy} if freshness_policy else None\n ),\n auto_materialize_policies_by_output_name=(\n {"result": auto_materialize_policy} if auto_materialize_policy else None\n ),\n backfill_policy=backfill_policy,\n descriptions_by_output_name={"result": description} if description else None,\n resource_defs=resource_defs,\n check_specs=check_specs,\n )\n\n\n
[docs]def graph_multi_asset(\n *,\n outs: Mapping[str, AssetOut],\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n group_name: Optional[str] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same graph of\n ops, and the same upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n Args:\n name (Optional[str]): The name of the graph.\n outs: (Optional[Dict[str, AssetOut]]): The AssetOuts representing the produced assets.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n backfill_policy (Optional[BackfillPolicy]): The backfill policy for the asset.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n can_subset (bool): Whether this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. 
Defaults to False.\n """\n\n def inner(fn: Callable) -> AssetsDefinition:\n partition_mappings = {\n input_name: asset_in.partition_mapping\n for input_name, asset_in in (ins or {}).items()\n if asset_in.partition_mapping\n }\n\n asset_ins = build_asset_ins(fn, ins or {}, set())\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n asset_outs = build_asset_outs(outs)\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(asset_outs.keys())\n )\n check_outs_by_output_name: Mapping[str, GraphOut] = {\n output_name: GraphOut() for output_name in check_specs_by_output_name.keys()\n }\n\n combined_outs_by_output_name = {\n **{output_name: GraphOut() for output_name, _ in asset_outs.values()},\n **check_outs_by_output_name,\n }\n\n op_graph = graph(\n name=name or fn.__name__,\n out=combined_outs_by_output_name,\n )(fn)\n\n # source metadata from the AssetOuts (if any)\n metadata_by_output_name = {\n output_name: out.metadata\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.metadata is not None\n }\n\n # source freshness policies from the AssetOuts (if any)\n freshness_policies_by_output_name = {\n output_name: out.freshness_policy\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.freshness_policy is not None\n }\n\n # source auto materialize policies from the AssetOuts (if any)\n auto_materialize_policies_by_output_name = {\n output_name: out.auto_materialize_policy\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.auto_materialize_policy is not None\n }\n\n # source descriptions from the AssetOuts (if any)\n descriptions_by_output_name = {\n output_name: out.description\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.description is not None\n }\n\n return AssetsDefinition.from_graph(\n op_graph,\n keys_by_input_name=keys_by_input_name,\n 
keys_by_output_name={\n output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n },\n partitions_def=partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n group_name=group_name,\n can_subset=can_subset,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n descriptions_by_output_name=descriptions_by_output_name,\n resource_defs=resource_defs,\n check_specs=check_specs,\n )\n\n return inner
\n\n\ndef build_asset_outs(asset_outs: Mapping[str, AssetOut]) -> Mapping[AssetKey, Tuple[str, Out]]:\n """Creates a mapping from AssetKey to (name of output, Out object)."""\n outs_by_asset_key: Dict[AssetKey, Tuple[str, Out]] = {}\n for output_name, asset_out in asset_outs.items():\n out = asset_out.to_out()\n asset_key = asset_out.key or AssetKey(\n list(filter(None, [*(asset_out.key_prefix or []), output_name]))\n )\n\n outs_by_asset_key[asset_key] = (output_name.replace("-", "_"), out)\n\n return outs_by_asset_key\n\n\ndef _deps_and_non_argument_deps_to_asset_deps(\n deps: Optional[Iterable[CoercibleToAssetDep]],\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]],\n) -> Optional[Iterable[AssetDep]]:\n """Helper function for managing deps and non_argument_deps while non_argument_deps is still an accepted parameter.\n Ensures only one of deps and non_argument_deps is provided, then converts the deps to AssetDeps.\n """\n if non_argument_deps is not None and deps is not None:\n raise DagsterInvalidDefinitionError(\n "Cannot specify both deps and non_argument_deps to @asset. Use only deps instead."\n )\n\n if deps is not None:\n return _make_asset_deps(deps)\n\n if non_argument_deps is not None:\n check.set_param(non_argument_deps, "non_argument_deps", of_type=(AssetKey, str))\n return _make_asset_deps(non_argument_deps)\n\n\ndef _make_asset_deps(deps: Optional[Iterable[CoercibleToAssetDep]]) -> Optional[Iterable[AssetDep]]:\n if deps is None:\n return None\n\n # expand any multi_assets into a list of keys\n all_deps = []\n for dep in deps:\n if isinstance(dep, AssetsDefinition) and len(dep.keys) > 1:\n all_deps.extend(dep.keys)\n else:\n all_deps.append(dep)\n\n with disable_dagster_warnings():\n dep_dict = {}\n for dep in all_deps:\n asset_dep = AssetDep.from_coercible(dep)\n\n # we cannot do deduplication via a set because MultiPartitionMappings have an internal\n # dictionary that cannot be hashed. 
Instead deduplicate by making a dictionary and checking\n # for existing keys. If an asset is specified as a dependency more than once, only error if the\n # dependency is different (ie has a different PartitionMapping)\n if (\n asset_dep.asset_key in dep_dict.keys()\n and asset_dep != dep_dict[asset_dep.asset_key]\n ):\n raise DagsterInvariantViolationError(\n f"Cannot set a dependency on asset {asset_dep.asset_key} more than once per"\n " asset."\n )\n dep_dict[asset_dep.asset_key] = asset_dep\n\n return list(dep_dict.values())\n\n\ndef _validate_and_assign_output_names_to_check_specs(\n check_specs: Optional[Sequence[AssetCheckSpec]], valid_asset_keys: Sequence[AssetKey]\n) -> Mapping[str, AssetCheckSpec]:\n check_specs_by_output_name = {spec.get_python_identifier(): spec for spec in check_specs or []}\n if check_specs and len(check_specs_by_output_name) != len(check_specs):\n duplicates = {\n item: count\n for item, count in Counter(\n [(spec.asset_key, spec.name) for spec in check_specs]\n ).items()\n if count > 1\n }\n\n raise DagsterInvalidDefinitionError(f"Duplicate check specs: {duplicates}")\n\n for spec in check_specs_by_output_name.values():\n if spec.asset_key not in valid_asset_keys:\n raise DagsterInvalidDefinitionError(\n f"Invalid asset key {spec.asset_key} in check spec {spec.name}. Must be one of"\n f" {valid_asset_keys}"\n )\n\n return check_specs_by_output_name\n\n\ndef _get_partition_mappings_from_deps(\n partition_mappings: Dict[AssetKey, PartitionMapping], deps: Iterable[AssetDep], asset_name: str\n):\n # Add PartitionMappings specified via AssetDeps to partition_mappings dictionary. 
Error on duplicates\n for dep in deps:\n if dep.partition_mapping is None:\n continue\n if partition_mappings.get(dep.asset_key, None) is None:\n partition_mappings[dep.asset_key] = dep.partition_mapping\n continue\n if partition_mappings[dep.asset_key] == dep.partition_mapping:\n continue\n else:\n raise DagsterInvalidDefinitionError(\n f"Two different PartitionMappings for {dep.asset_key} provided for"\n f" asset {asset_name}. Please use the same PartitionMapping for"\n f" {dep.asset_key}."\n )\n\n return partition_mappings\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/asset_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.asset_decorator"}, "graph_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.graph_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Mapping, Optional, Sequence, Union, overload\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..input import GraphIn, InputDefinition\nfrom ..output import GraphOut, OutputDefinition\n\n\nclass _Graph:\n    name: Optional[str]\n    description: Optional[str]\n    input_defs: Sequence[InputDefinition]\n    output_defs: Optional[Sequence[OutputDefinition]]\n    ins: Optional[Mapping[str, GraphIn]]\n    out: Optional[Union[GraphOut, Mapping[str, GraphOut]]]\n    tags: Optional[Mapping[str, str]]\n    config_mapping: Optional[ConfigMapping]\n\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        input_defs: Optional[Sequence[InputDefinition]] = None,\n        output_defs: Optional[Sequence[OutputDefinition]] = None,\n        ins: Optional[Mapping[str, GraphIn]] = None,\n        out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        config_mapping: Optional[ConfigMapping] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.input_defs = check.opt_sequence_param(\n            input_defs, "input_defs", of_type=InputDefinition\n        )\n        self.did_pass_outputs = output_defs is not None or out is not None\n        self.output_defs = check.opt_nullable_sequence_param(\n            output_defs, "output_defs", of_type=OutputDefinition\n        )\n        self.ins = ins\n        self.out = out\n        self.tags = tags\n        self.config_mapping = check.opt_inst_param(config_mapping, "config_mapping", ConfigMapping)\n\n    def __call__(self, fn: Callable[..., Any]) -> GraphDefinition:\n        
check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        if self.ins is not None:\n            input_defs = [inp.to_definition(name) for name, inp in self.ins.items()]\n        else:\n            input_defs = check.opt_list_param(\n                self.input_defs, "input_defs", of_type=InputDefinition\n            )\n\n        if self.out is None:\n            output_defs = self.output_defs\n        elif isinstance(self.out, GraphOut):\n            output_defs = [self.out.to_definition(name=None)]\n        else:\n            check.dict_param(self.out, "out", key_type=str, value_type=GraphOut)\n            output_defs = [out.to_definition(name=name) for name, out in self.out.items()]\n\n        from dagster._core.definitions.composition import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            node_defs,\n            config_mapping,\n            positional_inputs,\n            node_input_source_assets,\n        ) = do_composition(\n            decorator_name="@graph",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=input_defs,\n            provided_output_defs=output_defs,\n            ignore_output_from_composition_fn=False,\n            config_mapping=self.config_mapping,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=node_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n            node_input_source_assets=node_input_source_assets,\n        )\n        update_wrapper(graph_def, fn)\n        return graph_def\n\n\n@overload\ndef graph(compose_fn: Callable) -> 
GraphDefinition: ...\n\n\n@overload\ndef graph(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    input_defs: Optional[Sequence[InputDefinition]] = ...,\n    output_defs: Optional[Sequence[OutputDefinition]] = ...,\n    ins: Optional[Mapping[str, GraphIn]] = ...,\n    out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = ...,\n) -> _Graph: ...\n\n\n
[docs]def graph(\n compose_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n input_defs: Optional[Sequence[InputDefinition]] = None,\n output_defs: Optional[Sequence[OutputDefinition]] = None,\n ins: Optional[Mapping[str, GraphIn]] = None,\n out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n) -> Union[GraphDefinition, _Graph]:\n """Create an op graph with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up a dependency graph by writing a\n function that invokes ops (or other graphs) and passes the output to subsequent invocations.\n\n Args:\n name (Optional[str]):\n The name of the op graph. Must be unique within any :py:class:`RepositoryDefinition` containing the graph.\n description (Optional[str]):\n A human-readable description of the graph.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit InputDefinitions taking precedence.\n\n Uses of inputs in the body of the decorated composition function will determine\n the :py:class:`InputMappings <InputMapping>` passed to the underlying\n :py:class:`GraphDefinition`.\n output_defs (Optional[List[OutputDefinition]]):\n Output definitions for the graph. 
If not provided explicitly, these will be inferred from typehints.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`GraphDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n ins (Optional[Dict[str, GraphIn]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit GraphIn taking precedence.\n out (Optional[Union[GraphOut, Dict[str, GraphOut]]]):\n Information about the outputs that this graph maps. Information provided here will be\n combined with what can be inferred from the return type signature if the function does\n not use yield.\n\n To map multiple outputs, return a dictionary from the composition function.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n config (Optional[Union[ConfigMapping], Mapping[str, Any]):\n Describes how the graph is configured at runtime.\n\n If a :py:class:`ConfigMapping` object is provided, then the graph takes on the config\n schema of this object. The mapping will be applied at runtime to generate the config for\n the graph's constituent nodes.\n\n If a dictionary is provided, then it will be used as the default run config for the\n graph. This means it must conform to the config schema of the underlying nodes. Note\n that the values provided will be viewable and editable in the Dagster UI, so be careful\n with secrets. 
its constituent nodes.\n\n If no value is provided, then the config schema for the graph is the default (derived\n from the underlying nodes).\n """\n if compose_fn is not None:\n check.invariant(description is None)\n return _Graph()(compose_fn)\n\n config_mapping = None\n # Case 1: a dictionary of config is provided, convert to config mapping.\n if config is not None and not isinstance(config, ConfigMapping):\n config = check.dict_param(config, "config", key_type=str)\n config_mapping = ConfigMapping(config_fn=lambda _: config, config_schema=None)\n # Case 2: actual config mapping is provided.\n else:\n config_mapping = config\n\n return _Graph(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n ins=ins,\n out=out,\n tags=tags,\n config_mapping=config_mapping,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/graph_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.graph_decorator"}, "hook_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.hook_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ...decorator_utils import get_function_params, validate_expected_params\nfrom ..events import HookExecutionResult\nfrom ..hook_definition import HookDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.hook import HookContext\n\n\ndef _validate_hook_fn_params(fn, expected_positionals):\n    params = get_function_params(fn)\n    missing_positional = validate_expected_params(params, expected_positionals)\n    if missing_positional:\n        raise DagsterInvalidDefinitionError(\n            f"'{fn.__name__}' decorated function does not have required positional "\n            f"parameter '{missing_positional}'. Hook functions should only have keyword arguments "\n            "that match input names and a first positional parameter named 'context' and "\n            "a second positional parameter named 'event_list'."\n        )\n\n\nclass _Hook:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        decorated_fn: Optional[Callable[..., Any]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.required_resource_keys = check.opt_set_param(\n            required_resource_keys, "required_resource_keys"\n        )\n        self.decorated_fn = check.opt_callable_param(decorated_fn, "decorated_fn")\n\n    def __call__(self, fn) -> HookDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        expected_positionals = ["context", "event_list"]\n\n        _validate_hook_fn_params(fn, expected_positionals)\n\n        
hook_def = HookDefinition(\n            name=self.name or "",\n            hook_fn=fn,\n            required_resource_keys=self.required_resource_keys,\n            decorated_fn=self.decorated_fn or fn,\n        )\n        update_wrapper(cast(Callable[..., Any], hook_def), fn)\n        return hook_def\n\n\n@overload\ndef event_list_hook(\n    hook_fn: Callable,\n) -> HookDefinition:\n    pass\n\n\n@overload\ndef event_list_hook(\n    *,\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    decorated_fn: Optional[Callable[..., Any]] = ...,\n) -> _Hook:\n    pass\n\n\ndef event_list_hook(\n    hook_fn: Optional[Callable] = None,\n    *,\n    name: Optional[str] = None,\n    required_resource_keys: Optional[AbstractSet[str]] = None,\n    decorated_fn: Optional[Callable[..., Any]] = None,\n) -> Union[HookDefinition, _Hook]:\n    """Create a generic hook with the specified parameters from the decorated function.\n\n    This decorator is currently used internally by Dagster machinery to support success_hook and\n    failure_hook.\n\n    The user-defined hook function requires two parameters:\n    - A `context` object is passed as the first parameter. The context is an instance of\n        :py:class:`context <HookContext>`, and provides access to system\n        information, such as loggers (context.log), resources (context.resources), the op\n        (context.op) and its execution step (context.step) which triggers this hook.\n    - An `event_list` object is passed as the second paramter. It provides the full event list of the\n        associated execution step.\n\n    Args:\n        name (Optional[str]): The name of this hook.\n        required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n            hook.\n\n    Examples:\n        .. 
code-block:: python\n\n            @event_list_hook(required_resource_keys={'slack'})\n            def slack_on_materializations(context, event_list):\n                for event in event_list:\n                    if event.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n                        message = f'{context.op_name} has materialized an asset {event.asset_key}.'\n                        # send a slack message every time a materialization event occurs\n                        context.resources.slack.send_message(message)\n\n\n    """\n    # This case is for when decorator is used bare, without arguments.\n    # e.g. @event_list_hook versus @event_list_hook()\n    if hook_fn is not None:\n        check.invariant(required_resource_keys is None)\n        return _Hook()(hook_fn)\n\n    return _Hook(\n        name=name, required_resource_keys=required_resource_keys, decorated_fn=decorated_fn\n    )\n\n\nSuccessOrFailureHookFn = Callable[["HookContext"], Any]\n\n\n@overload\ndef success_hook(hook_fn: SuccessOrFailureHookFn) -> HookDefinition: ...\n\n\n@overload\ndef success_hook(\n    *,\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]: ...\n\n\n
[docs]def success_hook(\n hook_fn: Optional[SuccessOrFailureHookFn] = None,\n *,\n name: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step success events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n .. code-block:: python\n\n @success_hook(required_resource_keys={'slack'})\n def slack_message_on_success(context):\n message = 'op {} succeeded'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @success_hook\n def do_something_on_success(context):\n do_something()\n\n\n """\n\n def wrapper(fn: SuccessOrFailureHookFn) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(name=_name, required_resource_keys=required_resource_keys, decorated_fn=fn)\n def _success_hook(\n context: "HookContext", event_list: Sequence["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_success:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _success_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @success_hook\n if hook_fn is not None:\n check.invariant(required_resource_keys is None)\n return wrapper(hook_fn)\n\n return wrapper
\n\n\n@overload\ndef failure_hook(name: SuccessOrFailureHookFn) -> HookDefinition: ...\n\n\n@overload\ndef failure_hook(\n name: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]: ...\n\n\n
[docs]def failure_hook(\n name: Optional[Union[SuccessOrFailureHookFn, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step failure events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n .. code-block:: python\n\n @failure_hook(required_resource_keys={'slack'})\n def slack_message_on_failure(context):\n message = 'op {} failed'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @failure_hook\n def do_something_on_failure(context):\n do_something()\n\n\n """\n\n def wrapper(fn: Callable[["HookContext"], Any]) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(name=_name, required_resource_keys=required_resource_keys, decorated_fn=fn)\n def _failure_hook(\n context: "HookContext", event_list: Sequence["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_failure:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _failure_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @failure_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/hook_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.hook_decorator"}, "job_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.job_decorator

\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Mapping, Optional, Union, overload\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..hook_definition import HookDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..metadata import RawMetadataValue\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from ..executor_definition import ExecutorDefinition\n    from ..partition import PartitionedConfig, PartitionsDefinition\n    from ..run_config import RunConfig\n\n\nclass _Job:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n        config: Optional[\n            Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"]\n        ] = None,\n        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet[HookDefinition]] = None,\n        op_retry_policy: Optional[RetryPolicy] = None,\n        version_strategy: Optional[VersionStrategy] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        input_values: Optional[Mapping[str, object]] = None,\n    ):\n        from dagster._core.definitions.run_config import convert_config_input\n\n        self.name = name\n        self.description = description\n        self.tags = tags\n        self.metadata = 
metadata\n        self.resource_defs = resource_defs\n        self.config = convert_config_input(config)\n        self.logger_defs = logger_defs\n        self.executor_def = executor_def\n        self.hooks = hooks\n        self.op_retry_policy = op_retry_policy\n        self.version_strategy = version_strategy\n        self.partitions_def = partitions_def\n        self.input_values = input_values\n\n    def __call__(self, fn: Callable[..., Any]) -> JobDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        from dagster._core.definitions.composition import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            node_defs,\n            config_mapping,\n            positional_inputs,\n            node_input_source_assets,\n        ) = do_composition(\n            decorator_name="@job",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=[],\n            provided_output_defs=[],\n            ignore_output_from_composition_fn=False,\n            config_mapping=None,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=node_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n            node_input_source_assets=node_input_source_assets,\n        )\n\n        job_def = graph_def.to_job(\n            description=self.description or format_docstring_for_description(fn),\n            resource_defs=self.resource_defs,\n            config=self.config,\n            tags=self.tags,\n            metadata=self.metadata,\n            logger_defs=self.logger_defs,\n            
executor_def=self.executor_def,\n            hooks=self.hooks,\n            op_retry_policy=self.op_retry_policy,\n            version_strategy=self.version_strategy,\n            partitions_def=self.partitions_def,\n            input_values=self.input_values,\n        )\n        update_wrapper(job_def, fn)\n        return job_def\n\n\n@overload\ndef job(compose_fn: Callable[..., Any]) -> JobDefinition: ...\n\n\n@overload\ndef job(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    resource_defs: Optional[Mapping[str, object]] = ...,\n    config: Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    metadata: Optional[Mapping[str, RawMetadataValue]] = ...,\n    logger_defs: Optional[Mapping[str, LoggerDefinition]] = ...,\n    executor_def: Optional["ExecutorDefinition"] = ...,\n    hooks: Optional[AbstractSet[HookDefinition]] = ...,\n    op_retry_policy: Optional[RetryPolicy] = ...,\n    version_strategy: Optional[VersionStrategy] = ...,\n    partitions_def: Optional["PartitionsDefinition"] = ...,\n    input_values: Optional[Mapping[str, object]] = ...,\n) -> _Job: ...\n\n\n
[docs]@deprecated_param(\n param="version_strategy",\n breaking_version="2.0",\n additional_warn_text="Use asset versioning instead.",\n)\ndef job(\n compose_fn: Optional[Callable[..., Any]] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"]\n ] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n) -> Union[JobDefinition, _Job]:\n """Creates a job with the specified parameters from the decorated graph/op invocation function.\n\n Using this decorator allows you to build an executable job by writing a function that invokes\n ops (or graphs).\n\n Args:\n compose_fn (Callable[..., Any]:\n The decorated function. The body should contain op or graph invocations. Unlike op\n functions, does not accept a context argument.\n name (Optional[str]):\n The name for the Job. 
Defaults to the name of the this graph.\n resource_defs (Optional[Mapping[str, object]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`RunConfig` object is provided, then it will be used directly as the run config\n for the job whenever the job is executed, similar to providing a dictionary.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagster UI, so be careful with secrets.\n tags (Optional[Dict[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. 
These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\n Keys must be strings, and values must be python primitive types or one of the provided\n MetadataValue types\n logger_defs (Optional[Dict[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multiprocess_executor` .\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoization will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition keys\n that can parameterize the job. If this argument is supplied, the config argument\n can't also be supplied.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(in1):\n return in1 + 1\n\n @job\n def job1():\n add_one(return_one())\n """\n if compose_fn is not None:\n check.invariant(description is None)\n return _Job()(compose_fn)\n\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return _Job(\n name=name,\n description=description,\n resource_defs=wrap_resources_for_execution(resource_defs),\n config=config,\n tags=tags,\n metadata=metadata,\n logger_defs=logger_defs,\n executor_def=executor_def,\n hooks=hooks,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n partitions_def=partitions_def,\n input_values=input_values,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/job_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.job_decorator"}, "op_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.op_decorator

\nfrom functools import lru_cache, update_wrapper\nfrom inspect import Parameter\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.decorator_utils import (\n    format_docstring_for_description,\n    get_function_params,\n    get_valid_name_permutations,\n    param_is_var_keyword,\n    positional_arg_name_list,\n)\nfrom dagster._core.definitions.inference import infer_input_props\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import DagsterTypeKind\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom ..input import In, InputDefinition\nfrom ..output import Out\nfrom ..policy import RetryPolicy\nfrom ..utils import DEFAULT_OUTPUT\n\nif TYPE_CHECKING:\n    from ..op_definition import OpDefinition\n\n\nclass _Op:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        code_version: Optional[str] = None,\n        decorator_takes_context: Optional[bool] = True,\n        retry_policy: Optional[RetryPolicy] = None,\n        ins: Optional[Mapping[str, In]] = None,\n        out: Optional[Union[Out, Mapping[str, Out]]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.decorator_takes_context = check.bool_param(\n            decorator_takes_context, "decorator_takes_context"\n        )\n\n        self.description = 
check.opt_str_param(description, "description")\n\n        # these will be checked within OpDefinition\n        self.required_resource_keys = required_resource_keys\n        self.tags = tags\n        self.code_version = code_version\n        self.retry_policy = retry_policy\n\n        # config will be checked within OpDefinition\n        self.config_schema = config_schema\n\n        self.ins = check.opt_nullable_mapping_param(ins, "ins", key_type=str, value_type=In)\n        self.out = out\n\n    def __call__(self, fn: Callable[..., Any]) -> "OpDefinition":\n        from dagster._config.pythonic_config import validate_resource_annotated_function\n\n        from ..op_definition import OpDefinition\n\n        validate_resource_annotated_function(fn)\n\n        if not self.name:\n            self.name = fn.__name__\n\n        compute_fn = (\n            DecoratedOpFunction(decorated_fn=fn)\n            if self.decorator_takes_context\n            else NoContextDecoratedOpFunction(decorated_fn=fn)\n        )\n\n        if compute_fn.has_config_arg():\n            check.param_invariant(\n                self.config_schema is None or self.config_schema == {},\n                "If the @op has a config arg, you cannot specify a config schema",\n            )\n\n            from dagster._config.pythonic_config import infer_schema_from_config_annotation\n\n            # Parse schema from the type annotation of the config arg\n            config_arg = compute_fn.get_config_arg()\n            config_arg_type = config_arg.annotation\n            config_arg_default = config_arg.default\n            self.config_schema = infer_schema_from_config_annotation(\n                config_arg_type, config_arg_default\n            )\n\n        outs: Optional[Mapping[str, Out]] = None\n        if self.out is not None and isinstance(self.out, Out):\n            outs = {DEFAULT_OUTPUT: self.out}\n        elif self.out is not None:\n            outs = check.mapping_param(self.out, "out", 
key_type=str, value_type=Out)\n\n        arg_resource_keys = {arg.name for arg in compute_fn.get_resource_args()}\n        decorator_resource_keys = set(self.required_resource_keys or [])\n        check.param_invariant(\n            len(decorator_resource_keys) == 0 or len(arg_resource_keys) == 0,\n            "Cannot specify resource requirements in both @op decorator and as arguments to the"\n            " decorated function",\n        )\n        resolved_resource_keys = decorator_resource_keys.union(arg_resource_keys)\n\n        op_def = OpDefinition.dagster_internal_init(\n            name=self.name,\n            ins=self.ins,\n            outs=outs,\n            compute_fn=compute_fn,\n            config_schema=self.config_schema,\n            description=self.description or format_docstring_for_description(fn),\n            required_resource_keys=resolved_resource_keys,\n            tags=self.tags,\n            code_version=self.code_version,\n            retry_policy=self.retry_policy,\n            version=None,  # code_version has replaced version\n        )\n        update_wrapper(op_def, compute_fn.decorated_fn)\n        return op_def\n\n\n@overload\ndef op(compute_fn: Callable[..., Any]) -> "OpDefinition": ...\n\n\n@overload\ndef op(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    ins: Optional[Mapping[str, In]] = ...,\n    out: Optional[Union[Out, Mapping[str, Out]]] = ...,\n    config_schema: Optional[UserConfigSchema] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    version: Optional[str] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    code_version: Optional[str] = ...,\n) -> _Op: ...\n\n\n
[docs]@deprecated_param(\n param="version", breaking_version="2.0", additional_warn_text="Use `code_version` instead"\n)\ndef op(\n compute_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, In]] = None,\n out: Optional[Union[Out, Mapping[str, Out]]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n) -> Union["OpDefinition", _Op]:\n """Create an op with the specified parameters from the decorated function.\n\n Ins and outs will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the op's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n @op supports ``async def`` functions as well, including async generators when yielding multiple\n events or outputs. Note that async ops will generally be run on their own unless using a custom\n :py:class:`Executor` implementation that supports running them together.\n\n Args:\n name (Optional[str]): Name of op. 
Must be unique within any :py:class:`GraphDefinition`\n using the op.\n description (Optional[str]): Human-readable description of this op. If not provided, and\n the decorated function has docstring, that docstring will be used as the description.\n ins (Optional[Dict[str, In]]):\n Information about the inputs to the op. Information provided here will be combined\n with what can be inferred from the function signature.\n out (Optional[Union[Out, Dict[str, Out]]]):\n Information about the op outputs. Information provided here will be combined with\n what can be inferred from the return type signature if the function does not use yield.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the op matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Values that are not strings\n will be json encoded and must meet the criteria that `json.loads(json.dumps(value)) == value`.\n code_version (Optional[str]): (Experimental) Version of the logic encapsulated by the op. If set,\n this is used as a default version for all outputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def hello_world():\n print('hello')\n\n @op\n def echo(msg: str) -> str:\n return msg\n\n @op(\n ins={'msg': In(str)},\n out=Out(str)\n )\n def echo_2(msg): # same as above\n return msg\n\n @op(\n out={'word': Out(), 'num': Out()}\n )\n def multi_out() -> Tuple[str, int]:\n return 'cool', 4\n """\n code_version = normalize_renamed_param(\n code_version,\n "code_version",\n version,\n "version",\n )\n\n if compute_fn is not None:\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Op()(compute_fn)\n\n return _Op(\n name=name,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n code_version=code_version,\n retry_policy=retry_policy,\n ins=ins,\n out=out,\n )
\n\n\nclass DecoratedOpFunction(NamedTuple):\n """Wrapper around the decorated op function to provide commonly used util methods."""\n\n decorated_fn: Callable[..., Any]\n\n @property\n def name(self):\n return self.decorated_fn.__name__\n\n @lru_cache(maxsize=1)\n def has_context_arg(self) -> bool:\n return is_context_provided(get_function_params(self.decorated_fn))\n\n def get_context_arg(self) -> Parameter:\n if self.has_context_arg():\n return get_function_params(self.decorated_fn)[0]\n check.failed("Requested context arg on function that does not have one")\n\n @lru_cache(maxsize=1)\n def _get_function_params(self) -> Sequence[Parameter]:\n return get_function_params(self.decorated_fn)\n\n def has_config_arg(self) -> bool:\n for param in get_function_params(self.decorated_fn):\n if param.name == "config":\n return True\n\n return False\n\n def get_config_arg(self) -> Parameter:\n for param in get_function_params(self.decorated_fn):\n if param.name == "config":\n return param\n\n check.failed("Requested config arg on function that does not have one")\n\n def get_resource_args(self) -> Sequence[Parameter]:\n return get_resource_args(self.decorated_fn)\n\n def positional_inputs(self) -> Sequence[str]:\n params = self._get_function_params()\n input_args = params[1:] if self.has_context_arg() else params\n resource_arg_names = [arg.name for arg in self.get_resource_args()]\n input_args_filtered = [\n input_arg\n for input_arg in input_args\n if input_arg.name != "config" and input_arg.name not in resource_arg_names\n ]\n return positional_arg_name_list(input_args_filtered)\n\n def has_var_kwargs(self) -> bool:\n params = self._get_function_params()\n # var keyword arg has to be the last argument\n return len(params) > 0 and param_is_var_keyword(params[-1])\n\n def get_output_annotation(self) -> Any:\n from ..inference import infer_output_props\n\n return infer_output_props(self.decorated_fn).annotation\n\n\nclass NoContextDecoratedOpFunction(DecoratedOpFunction):\n 
"""Wrapper around a decorated op function, when the decorator does not permit a context\n parameter.\n """\n\n @lru_cache(maxsize=1)\n def has_context_arg(self) -> bool:\n return False\n\n\ndef is_context_provided(params: Sequence[Parameter]) -> bool:\n if len(params) == 0:\n return False\n return params[0].name in get_valid_name_permutations("context")\n\n\ndef resolve_checked_op_fn_inputs(\n decorator_name: str,\n fn_name: str,\n compute_fn: DecoratedOpFunction,\n explicit_input_defs: Sequence[InputDefinition],\n exclude_nothing: bool,\n) -> Sequence[InputDefinition]:\n """Validate provided input definitions and infer the remaining from the type signature of the compute_fn.\n Returns the resolved set of InputDefinitions.\n\n Args:\n decorator_name (str): Name of the decorator that is wrapping the op function.\n fn_name (str): Name of the decorated function.\n compute_fn (DecoratedOpFunction): The decorated function, wrapped in the\n DecoratedOpFunction wrapper.\n explicit_input_defs (List[InputDefinition]): The input definitions that were explicitly\n provided in the decorator.\n exclude_nothing (bool): True if Nothing type inputs should be excluded from compute_fn\n arguments.\n """\n explicit_names = set()\n if exclude_nothing:\n explicit_names = set(\n inp.name\n for inp in explicit_input_defs\n if not inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n nothing_names = set(\n inp.name\n for inp in explicit_input_defs\n if inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n else:\n explicit_names = set(inp.name for inp in explicit_input_defs)\n nothing_names = set()\n\n params = get_function_params(compute_fn.decorated_fn)\n\n input_args = params[1:] if compute_fn.has_context_arg() else params\n\n # filter out config arg\n resource_arg_names = {arg.name for arg in compute_fn.get_resource_args()}\n explicit_names = explicit_names - resource_arg_names\n\n if compute_fn.has_config_arg() or resource_arg_names:\n new_input_args = []\n for input_arg in 
input_args:\n if input_arg.name != "config" and input_arg.name not in resource_arg_names:\n new_input_args.append(input_arg)\n input_args = new_input_args\n\n # Validate input arguments\n used_inputs = set()\n inputs_to_infer = set()\n has_kwargs = False\n\n for param in cast(List[Parameter], input_args):\n if param.kind == Parameter.VAR_KEYWORD:\n has_kwargs = True\n elif param.kind == Parameter.VAR_POSITIONAL:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has positional vararg parameter "\n f"'{param}'. {decorator_name} decorated functions should only have keyword "\n "arguments that match input names and, if system information is required, a first "\n "positional parameter named 'context'."\n )\n\n else:\n if param.name not in explicit_names:\n if param.name in nothing_names:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has parameter"\n f" '{param.name}' that is one of the input_defs of type 'Nothing' which"\n " should not be included since no data will be passed for it. "\n )\n else:\n inputs_to_infer.add(param.name)\n\n else:\n used_inputs.add(param.name)\n\n undeclared_inputs = explicit_names - used_inputs\n if not has_kwargs and undeclared_inputs:\n undeclared_inputs_printed = ", '".join(undeclared_inputs)\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function does not have argument(s)"\n f" '{undeclared_inputs_printed}'. {decorator_name}-decorated functions should have a"\n " keyword argument for each of their Ins, except for Ins that have the Nothing"\n " dagster_type. 
Alternatively, they can accept **kwargs."\n )\n\n inferred_props = {\n inferred.name: inferred\n for inferred in infer_input_props(compute_fn.decorated_fn, compute_fn.has_context_arg())\n }\n input_defs = []\n for input_def in explicit_input_defs:\n if input_def.name in inferred_props:\n # combine any information missing on the explicit def that can be inferred\n input_defs.append(input_def.combine_with_inferred(inferred_props[input_def.name]))\n else:\n # pass through those that don't have any inference info, such as Nothing type inputs\n input_defs.append(input_def)\n\n # build defs from the inferred props for those without explicit entries\n inferred_input_defs = [\n InputDefinition.create_from_inferred(inferred)\n for inferred in inferred_props.values()\n if inferred.name in inputs_to_infer\n ]\n\n if exclude_nothing:\n for in_def in inferred_input_defs:\n if in_def.dagster_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n f"Input parameter {in_def.name} is annotated with"\n f" {in_def.dagster_type.display_name} which is a type that represents passing"\n " no data. This type must be used via In() and no parameter should be included"\n f" in the {decorator_name} decorated function."\n )\n\n input_defs.extend(inferred_input_defs)\n\n return input_defs\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/op_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.op_decorator"}, "repository_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.repository_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.metadata import (\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..asset_checks import AssetChecksDefinition\nfrom ..executor_definition import ExecutorDefinition\nfrom ..graph_definition import GraphDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..partitioned_schedule import UnresolvedPartitionedAssetScheduleDefinition\nfrom ..repository_definition import (\n    VALID_REPOSITORY_DATA_DICT_KEYS,\n    CachingRepositoryData,\n    PendingRepositoryDefinition,\n    PendingRepositoryListDefinition,\n    RepositoryData,\n    RepositoryDefinition,\n    RepositoryListDefinition,\n)\nfrom ..schedule_definition import ScheduleDefinition\nfrom ..sensor_definition import SensorDefinition\nfrom ..unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nT = TypeVar("T")\n\nRepositoryDictSpec: TypeAlias = Dict[str, Dict[str, RepositoryListDefinition]]\n\n\ndef _flatten(items: Iterable[Union[T, List[T]]]) -> Iterator[T]:\n    for x in items:\n        if isinstance(x, List):\n            # switch to `yield from _flatten(x)` to support multiple layers of nesting\n            yield from x\n        else:\n            yield x\n\n\nclass _Repository:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        metadata: Optional[Dict[str, RawMetadataValue]] = None,\n        default_executor_def: 
Optional[ExecutorDefinition] = None,\n        default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n        resource_key_mapping: Optional[Mapping[int, str]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.metadata = normalize_metadata(\n            check.opt_mapping_param(metadata, "metadata", key_type=str)\n        )\n        self.default_executor_def = check.opt_inst_param(\n            default_executor_def, "default_executor_def", ExecutorDefinition\n        )\n        self.default_logger_defs = check.opt_mapping_param(\n            default_logger_defs, "default_logger_defs", key_type=str, value_type=LoggerDefinition\n        )\n        self.top_level_resources = check.opt_mapping_param(\n            top_level_resources, "top_level_resources", key_type=str, value_type=ResourceDefinition\n        )\n        self.resource_key_mapping = check.opt_mapping_param(\n            resource_key_mapping, "resource_key_mapping", key_type=int, value_type=str\n        )\n\n    @overload\n    def __call__(\n        self,\n        fn: Union[\n            Callable[[], Sequence[RepositoryListDefinition]],\n            Callable[[], RepositoryDictSpec],\n        ],\n    ) -> RepositoryDefinition: ...\n\n    @overload\n    def __call__(\n        self, fn: Callable[[], Sequence[PendingRepositoryListDefinition]]\n    ) -> PendingRepositoryDefinition: ...\n\n    def __call__(\n        self,\n        fn: Union[\n            Callable[[], Sequence[PendingRepositoryListDefinition]],\n            Callable[[], RepositoryDictSpec],\n        ],\n    ) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n        from dagster._core.definitions import AssetsDefinition, SourceAsset\n        from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n        
check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        repository_definitions = fn()\n\n        repository_data: Optional[Union[CachingRepositoryData, RepositoryData]]\n        if isinstance(repository_definitions, list):\n            bad_defns = []\n            repository_defns = []\n            defer_repository_data = False\n            for i, definition in enumerate(_flatten(repository_definitions)):\n                if isinstance(definition, CacheableAssetsDefinition):\n                    defer_repository_data = True\n                elif not isinstance(\n                    definition,\n                    (\n                        JobDefinition,\n                        ScheduleDefinition,\n                        UnresolvedPartitionedAssetScheduleDefinition,\n                        SensorDefinition,\n                        GraphDefinition,\n                        AssetsDefinition,\n                        SourceAsset,\n                        UnresolvedAssetJobDefinition,\n                        AssetChecksDefinition,\n                    ),\n                ):\n                    bad_defns.append((i, type(definition)))\n                else:\n                    repository_defns.append(definition)\n\n            if bad_defns:\n                bad_definitions_str = ", ".join(\n                    [f"value of type {type_} at index {i}" for i, type_ in bad_defns]\n                )\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: all elements of list "\n                    "must be of type JobDefinition, GraphDefinition, "\n                    "ScheduleDefinition, SensorDefinition, "\n                    "AssetsDefinition, SourceAsset, or AssetChecksDefinition."\n                    f"Got {bad_definitions_str}."\n                )\n\n            repository_data = (\n                None\n                if 
defer_repository_data\n                else CachingRepositoryData.from_list(\n                    repository_defns,\n                    default_executor_def=self.default_executor_def,\n                    default_logger_defs=self.default_logger_defs,\n                    top_level_resources=self.top_level_resources,\n                    resource_key_mapping=self.resource_key_mapping,\n                )\n            )\n\n        elif isinstance(repository_definitions, dict):\n            if not set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS):\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: dict must not contain "\n                    "keys other than {{'schedules', 'sensors', 'jobs'}}: found "\n                    "{bad_keys}".format(\n                        bad_keys=", ".join(\n                            [\n                                f"'{key}'"\n                                for key in repository_definitions.keys()\n                                if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n                            ]\n                        )\n                    )\n                )\n            repository_data = CachingRepositoryData.from_dict(repository_definitions)\n        elif isinstance(repository_definitions, RepositoryData):\n            repository_data = repository_definitions\n        else:\n            raise DagsterInvalidDefinitionError(\n                "Bad return value of type {type_} from repository construction function: must "\n                "return list, dict, or RepositoryData. 
See the @repository decorator docstring for "\n                "details and examples".format(type_=type(repository_definitions)),\n            )\n\n        if isinstance(repository_definitions, list) and repository_data is None:\n            return PendingRepositoryDefinition(\n                self.name,\n                repository_definitions=list(_flatten(repository_definitions)),\n                description=self.description,\n                metadata=self.metadata,\n                default_executor_def=self.default_executor_def,\n                default_logger_defs=self.default_logger_defs,\n                _top_level_resources=self.top_level_resources,\n            )\n        else:\n            repository_def = RepositoryDefinition(\n                name=self.name,\n                description=self.description,\n                metadata=self.metadata,\n                repository_data=repository_data,\n            )\n\n            update_wrapper(repository_def, fn)\n            return repository_def\n\n\n@overload\ndef repository(\n    definitions_fn: Union[\n        Callable[[], Sequence[RepositoryListDefinition]], Callable[[], RepositoryDictSpec]\n    ],\n) -> RepositoryDefinition: ...\n\n\n@overload\ndef repository(\n    definitions_fn: Callable[..., Sequence[PendingRepositoryListDefinition]]\n) -> PendingRepositoryDefinition: ...\n\n\n@overload\ndef repository(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    metadata: Optional[Dict[str, RawMetadataValue]] = ...,\n    default_executor_def: Optional[ExecutorDefinition] = ...,\n    default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = ...,\n    _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = ...,\n    _resource_key_mapping: Optional[Mapping[int, str]] = ...,\n) -> _Repository: ...\n\n\n
[docs]def repository(\n definitions_fn: Optional[\n Union[\n Callable[[], Sequence[PendingRepositoryListDefinition]],\n Callable[[], RepositoryDictSpec],\n ]\n ] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n default_executor_def: Optional[ExecutorDefinition] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n _resource_key_mapping: Optional[Mapping[int, str]] = None,\n) -> Union[RepositoryDefinition, PendingRepositoryDefinition, _Repository]:\n """Create a repository from the decorated function.\n\n The decorated function should take no arguments and its return value should one of:\n\n 1. ``List[Union[JobDefinition, ScheduleDefinition, SensorDefinition]]``.\n Use this form when you have no need to lazy load jobs or other definitions. This is the\n typical use case.\n\n 2. A dict of the form:\n\n .. code-block:: python\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n 'sensors': Dict[str, Callable[[], SensorDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n\n 3. A :py:class:`RepositoryData`. Return this object if you need fine-grained\n control over the construction and indexing of definitions within the repository, e.g., to\n create definitions dynamically from .yaml files in a directory.\n\n Args:\n name (Optional[str]): The name of the repository. 
Defaults to the name of the decorated\n function.\n description (Optional[str]): A string description of the repository.\n metadata (Optional[Dict[str, RawMetadataValue]]): Arbitrary metadata for the repository.\n top_level_resources (Optional[Mapping[str, ResourceDefinition]]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n\n Example:\n .. code-block:: python\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n ######################################################################\n\n @op(config_schema={n: Field(Int)})\n def return_n(context):\n return context.op_config['n']\n\n @job\n def simple_job():\n return_n()\n\n @job\n def some_job():\n ...\n\n @sensor(job=some_job)\n def some_sensor():\n if foo():\n yield RunRequest(\n run_key= ...,\n run_config={\n 'ops': {'return_n': {'config': {'n': bar()}}}\n }\n )\n\n @job\n def my_job():\n ...\n\n my_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n @repository\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n # and custom metadata that will be displayed in the UI\n ######################################################################\n\n ...\n\n @repository(\n name='my_repo',\n metadata={\n 'team': 'Team A',\n 'repository_version': '1.2.3',\n 'environment': 'production',\n })\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n ######################################################################\n # A lazy-loaded repository\n ######################################################################\n\n def make_expensive_job():\n @job\n def expensive_job():\n for i in range(10000):\n return_n.alias(f'return_n_{i}')()\n\n return expensive_job\n\n 
def make_expensive_schedule():\n @job\n def other_expensive_job():\n for i in range(11000):\n return_n.alias(f'my_return_n_{i}')()\n\n return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n @repository\n def lazy_loaded_repository():\n return {\n 'jobs': {'expensive_job': make_expensive_job},\n 'schedules': {'expensive_schedule': make_expensive_schedule}\n }\n\n\n ######################################################################\n # A complex repository that lazily constructs jobs from a directory\n # of files in a bespoke YAML format\n ######################################################################\n\n class ComplexRepositoryData(RepositoryData):\n def __init__(self, yaml_directory):\n self._yaml_directory = yaml_directory\n\n def get_all_jobs(self):\n return [\n self._construct_job_def_from_yaml_file(\n self._yaml_file_for_job_name(file_name)\n )\n for file_name in os.listdir(self._yaml_directory)\n ]\n\n ...\n\n @repository\n def complex_repository():\n return ComplexRepositoryData('some_directory')\n """\n if definitions_fn is not None:\n check.invariant(description is None)\n check.invariant(len(get_function_params(definitions_fn)) == 0)\n\n return _Repository()(definitions_fn)\n\n return _Repository(\n name=name,\n description=description,\n metadata=metadata,\n default_executor_def=default_executor_def,\n default_logger_defs=default_logger_defs,\n top_level_resources=_top_level_resources,\n resource_key_mapping=_resource_key_mapping,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/repository_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.repository_decorator"}, "schedule_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.schedule_decorator

\nimport copy\nfrom functools import update_wrapper\nfrom typing import (\n    Callable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.definitions.sensor_definition import get_context_param_name\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._utils import ensure_gen\n\nfrom ..run_request import RunRequest, SkipReason\nfrom ..schedule_definition import (\n    DecoratedScheduleFunction,\n    DefaultScheduleStatus,\n    RawScheduleEvaluationFunction,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n    has_at_least_one_parameter,\n    validate_and_get_schedule_resource_dict,\n)\nfrom ..target import ExecutableDefinition\nfrom ..utils import validate_tags\n\n\n
[docs]def schedule(\n cron_schedule: Union[str, Sequence[str]],\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n tags_fn: Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]] = None,\n should_execute: Optional[Callable[[ScheduleEvaluationContext], bool]] = None,\n environment_vars: Optional[Mapping[str, str]] = None,\n execution_timezone: Optional[str] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[RawScheduleEvaluationFunction], ScheduleDefinition]:\n """Creates a schedule following the provided cron schedule and requests runs for the provided job.\n\n The decorated function takes in a :py:class:`~dagster.ScheduleEvaluationContext` as its only\n argument, and does one of the following:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Return a run config dictionary.\n 6. Yield a `SkipReason` or yield one ore more `RunRequest` objects.\n\n Returns a :py:class:`~dagster.ScheduleDefinition`.\n\n Args:\n cron_schedule (Union[str, Sequence[str]]): A valid cron string or sequence of cron strings\n specifying when the schedule will run, e.g., ``'45 23 * * 6'`` for a schedule that runs\n at 11:45 PM every Saturday. 
If a sequence is provided, then the schedule will run for\n the union of all execution times for the provided cron strings, e.g.,\n ``['45 23 * * 6', '30 9 * * 0]`` for a schedule that runs at 11:45 PM every Saturday and\n 9:30 AM every Sunday.\n name (Optional[str]): The name of the schedule to create.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A function\n that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags`` and ``tags_fn``.\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs at\n schedule execution time to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n that should execute when this schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n required_resource_keys (Optional[Set[str]]): The set of resource keys required by the schedule.\n """\n\n def inner(fn: RawScheduleEvaluationFunction) -> ScheduleDefinition:\n from dagster._config.pythonic_config import validate_resource_annotated_function\n\n check.callable_param(fn, "fn")\n validate_resource_annotated_function(fn)\n\n schedule_name = name or fn.__name__\n\n validated_tags = None\n\n # perform upfront validation of schedule tags\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n validated_tags = validate_tags(tags, allow_reserved_tags=False)\n\n context_param_name = get_context_param_name(fn)\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(fn)}\n\n def _wrapped_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n if should_execute:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n "Error occurred during the execution of should_execute for schedule"\n f" {schedule_name}"\n ),\n ):\n if not should_execute(context):\n yield SkipReason(\n f"should_execute function for {schedule_name} returned false."\n )\n return\n resources = validate_and_get_schedule_resource_dict(\n context.resources, schedule_name, resource_arg_names\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the evaluation of schedule {schedule_name}",\n ):\n context_param = {context_param_name: context} if context_param_name else {}\n result = fn(**context_param, **resources)\n\n if isinstance(result, dict):\n # this is the run-config based decorated function, wrap the evaluated run config\n # and tags in a RunRequest\n evaluated_run_config = copy.deepcopy(result)\n evaluated_tags = (\n validated_tags\n or (tags_fn and validate_tags(tags_fn(context), 
allow_reserved_tags=False))\n or None\n )\n yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n elif isinstance(result, list):\n yield from cast(List[RunRequest], result)\n else:\n # this is a run-request based decorated function\n yield from cast(RunRequestIterator, ensure_gen(result))\n\n has_context_arg = has_at_least_one_parameter(fn)\n evaluation_fn = DecoratedScheduleFunction(\n decorated_fn=fn,\n wrapped_fn=_wrapped_fn,\n has_context_arg=has_context_arg,\n )\n\n schedule_def = ScheduleDefinition.dagster_internal_init(\n name=schedule_name,\n cron_schedule=cron_schedule,\n job_name=job_name,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n description=description,\n execution_fn=evaluation_fn,\n job=job,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n run_config=None, # cannot supply run_config or run_config_fn to decorator\n run_config_fn=None,\n tags=None, # cannot supply tags or tags_fn to decorator\n tags_fn=None,\n should_execute=None, # already encompassed in evaluation_fn\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n\n return schedule_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/schedule_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.schedule_decorator"}, "sensor_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.sensor_decorator

\nimport collections.abc\nimport inspect\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Optional, Sequence, Set, Union\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.asset_selection import AssetSelection\n\nfrom ...errors import DagsterInvariantViolationError\nfrom ..asset_sensor_definition import AssetSensorDefinition\nfrom ..events import AssetKey\nfrom ..multi_asset_sensor_definition import (\n    AssetMaterializationFunction,\n    MultiAssetMaterializationFunction,\n    MultiAssetSensorDefinition,\n)\nfrom ..run_request import SensorResult\nfrom ..sensor_definition import (\n    DefaultSensorStatus,\n    RawSensorEvaluationFunction,\n    RunRequest,\n    SensorDefinition,\n    SkipReason,\n)\nfrom ..target import ExecutableDefinition\n\n\n
[docs]def sensor(\n job_name: Optional[str] = None,\n *,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n asset_selection: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[RawSensorEvaluationFunction], SensorDefinition]:\n """Creates a sensor where the decorated function is used as the sensor's evaluation function.\n\n The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n Args:\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]):\n The job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n asset_selection (AssetSelection): (Experimental) an asset selection to launch a run for if\n the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: RawSensorEvaluationFunction) -> SensorDefinition:\n check.callable_param(fn, "fn")\n\n sensor_def = SensorDefinition.dagster_internal_init(\n name=name,\n job_name=job_name,\n evaluation_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=asset_selection,\n required_resource_keys=required_resource_keys,\n )\n\n update_wrapper(sensor_def, wrapped=fn)\n\n return sensor_def\n\n return inner
\n\n\n
[docs]def asset_sensor(\n asset_key: AssetKey,\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[AssetMaterializationFunction,], AssetSensorDefinition,]:\n """Creates an asset sensor where the decorated function is used as the asset sensor's evaluation\n function.\n\n If the asset has been materialized multiple times between since the last sensor tick, the\n evaluation function will only be invoked once, with the latest materialization.\n\n The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext` and an EventLogEntry corresponding to an\n AssetMaterialization event.\n\n Args:\n asset_key (AssetKey): The asset_key this sensor monitors.\n name (Optional[str]): The name of the sensor. 
Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The\n job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n\n\n Example:\n .. code-block:: python\n\n from dagster import AssetKey, EventLogEntry, SensorEvaluationContext, asset_sensor\n\n\n @asset_sensor(asset_key=AssetKey("my_table"), job=my_job)\n def my_asset_sensor(context: SensorEvaluationContext, asset_event: EventLogEntry):\n return RunRequest(\n run_key=context.cursor,\n run_config={\n "ops": {\n "read_materialization": {\n "config": {\n "asset_key": asset_event.dagster_event.asset_key.path,\n }\n }\n }\n },\n )\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: AssetMaterializationFunction) -> AssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(*args, **kwargs) -> Any:\n result = fn(*args, **kwargs)\n\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (RunRequest, SkipReason)):\n yield result\n\n elif isinstance(result, SensorResult):\n if result.cursor:\n raise DagsterInvariantViolationError(\n f"Error in asset sensor {sensor_name}: Sensor returned a SensorResult"\n " with a cursor value. 
The cursor is managed by the asset sensor and"\n " should not be modified by a user."\n )\n yield result\n\n elif result is not None:\n raise DagsterInvariantViolationError(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{result} of type {type(result)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n\n # Preserve any resource arguments from the underlying function, for when we inspect the\n # wrapped function later on\n _wrapped_fn = update_wrapper(_wrapped_fn, wrapped=fn)\n\n return AssetSensorDefinition(\n name=sensor_name,\n asset_key=asset_key,\n job_name=job_name,\n asset_materialization_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n )\n\n return inner
\n\n\n
[docs]@experimental\ndef multi_asset_sensor(\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_assets: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[MultiAssetMaterializationFunction,], MultiAssetSensorDefinition,]:\n """Creates an asset sensor that can monitor multiple assets.\n\n The decorated function is used as the asset sensor's evaluation\n function. The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.MultiAssetSensorEvaluationContext`.\n\n Args:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets this\n sensor monitors. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n name (Optional[str]): The name of the sensor. 
Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The\n job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_assets (Optional[AssetSelection]): (Experimental) an asset selection to launch a run\n for if the sensor condition is met. This can be provided instead of specifying a job.\n """\n check.opt_str_param(name, "name")\n\n if not isinstance(monitored_assets, AssetSelection) and not (\n isinstance(monitored_assets, collections.abc.Sequence)\n and all(isinstance(el, AssetKey) for el in monitored_assets)\n ):\n check.failed(\n "The value passed to monitored_assets param must be either an AssetSelection"\n f" or a Sequence of AssetKeys, but was a {type(monitored_assets)}"\n )\n\n def inner(fn: MultiAssetMaterializationFunction) -> MultiAssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n sensor_def = MultiAssetSensorDefinition(\n name=sensor_name,\n monitored_assets=monitored_assets,\n job_name=job_name,\n asset_materialization_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n request_assets=request_assets,\n required_resource_keys=required_resource_keys,\n )\n update_wrapper(sensor_def, wrapped=fn)\n return sensor_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/sensor_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.sensor_decorator"}}, "definitions_class": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.definitions_class

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Type,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental, public\nfrom dagster._config.pythonic_config import (\n    attach_resource_id_to_key_mapping,\n)\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.definitions.asset_graph import InternalAssetGraph\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.build_resources import wrap_resources_for_execution\nfrom dagster._core.execution.with_resources import with_resources\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._utils.cached_method import cached_method\n\nfrom .assets import AssetsDefinition, SourceAsset\nfrom .cacheable_assets import CacheableAssetsDefinition\nfrom .decorators import repository\nfrom .job_definition import JobDefinition, default_job_io_manager\nfrom .partitioned_schedule import UnresolvedPartitionedAssetScheduleDefinition\nfrom .repository_definition import (\n    SINGLETON_REPOSITORY_NAME,\n    PendingRepositoryDefinition,\n    RepositoryDefinition,\n)\nfrom .schedule_definition import ScheduleDefinition\nfrom .sensor_definition import SensorDefinition\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n\n
[docs]@public\n@experimental\ndef create_repository_using_definitions_args(\n name: str,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n """Create a named repository using the same arguments as :py:class:`Definitions`. In older\n versions of Dagster, repositories were the mechanism for organizing assets, schedules, sensors,\n and jobs. There could be many repositories per code location. This was a complicated ontology but\n gave users a way to organize code locations that contained large numbers of heterogenous definitions.\n\n As a stopgap for those who both want to 1) use the new :py:class:`Definitions` API and 2) but still\n want multiple logical groups of assets in the same code location, we have introduced this function.\n\n Example usage:\n\n .. code-block:: python\n\n named_repo = create_repository_using_definitions_args(\n name="a_repo",\n assets=[asset_one, asset_two],\n schedules=[a_schedule],\n sensors=[a_sensor],\n jobs=[a_job],\n resources={\n "a_resource": some_resource,\n }\n )\n\n """\n return _create_repository_using_definitions_args(\n name=name,\n assets=assets,\n schedules=schedules,\n sensors=sensors,\n jobs=jobs,\n resources=resources,\n executor=executor,\n loggers=loggers,\n asset_checks=asset_checks,\n )
\n\n\nclass _AttachedObjects(NamedTuple):\n jobs: Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]\n schedules: Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n sensors: Iterable[SensorDefinition]\n\n\ndef _io_manager_needs_replacement(job: JobDefinition, resource_defs: Mapping[str, Any]) -> bool:\n """Explicitly replace the default IO manager in jobs that don't specify one, if a top-level\n I/O manager is provided to Definitions.\n """\n return (\n job.resource_defs.get("io_manager") == default_job_io_manager\n and "io_manager" in resource_defs\n )\n\n\ndef _jobs_which_will_have_io_manager_replaced(\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]],\n resource_defs: Mapping[str, Any],\n) -> List[Union[JobDefinition, UnresolvedAssetJobDefinition]]:\n """Returns whether any jobs will have their I/O manager replaced by an `io_manager` override from\n the top-level `resource_defs` provided to `Definitions` in 1.3. We will warn users if this is\n the case.\n """\n jobs = jobs or []\n return [\n job\n for job in jobs\n if isinstance(job, JobDefinition) and _io_manager_needs_replacement(job, resource_defs)\n ]\n\n\ndef _attach_resources_to_jobs_and_instigator_jobs(\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]],\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ],\n sensors: Optional[Iterable[SensorDefinition]],\n resource_defs: Mapping[str, Any],\n) -> _AttachedObjects:\n """Given a list of jobs, schedules, and sensors along with top-level resource definitions,\n attach the resource definitions to the jobs, schedules, and sensors which require them.\n """\n jobs = jobs or []\n schedules = schedules or []\n sensors = sensors or []\n\n # Add jobs in schedules and sensors as well\n jobs = [\n *jobs,\n *[\n schedule.job\n for schedule in schedules\n if isinstance(schedule, ScheduleDefinition)\n and 
schedule.has_loadable_target()\n and isinstance(schedule.job, (JobDefinition, UnresolvedAssetJobDefinition))\n ],\n *[\n job\n for sensor in sensors\n if sensor.has_loadable_targets()\n for job in sensor.jobs\n if isinstance(job, (JobDefinition, UnresolvedAssetJobDefinition))\n ],\n ]\n # Dedupe\n jobs = list({id(job): job for job in jobs}.values())\n\n # Find unsatisfied jobs\n unsatisfied_jobs = [\n job\n for job in jobs\n if isinstance(job, JobDefinition)\n and (\n job.is_missing_required_resources() or _io_manager_needs_replacement(job, resource_defs)\n )\n ]\n\n # Create a mapping of job id to a version of the job with the resource defs bound\n unsatisfied_job_to_resource_bound_job = {\n id(job): job.with_top_level_resources(\n {\n **resource_defs,\n **job.resource_defs,\n # special case for IO manager - the job-level IO manager does not take precedence\n # if it is the default and a top-level IO manager is provided\n **(\n {"io_manager": resource_defs["io_manager"]}\n if _io_manager_needs_replacement(job, resource_defs)\n else {}\n ),\n }\n )\n for job in jobs\n if job in unsatisfied_jobs\n }\n\n # Update all jobs to use the resource bound version\n jobs_with_resources = [\n unsatisfied_job_to_resource_bound_job[id(job)] if job in unsatisfied_jobs else job\n for job in jobs\n ]\n\n # Update all schedules and sensors to use the resource bound version\n updated_schedules = [\n (\n schedule.with_updated_job(unsatisfied_job_to_resource_bound_job[id(schedule.job)])\n if (\n isinstance(schedule, ScheduleDefinition)\n and schedule.has_loadable_target()\n and schedule.job in unsatisfied_jobs\n )\n else schedule\n )\n for schedule in schedules\n ]\n updated_sensors = [\n (\n sensor.with_updated_jobs(\n [\n (\n unsatisfied_job_to_resource_bound_job[id(job)]\n if job in unsatisfied_jobs\n else job\n )\n for job in sensor.jobs\n ]\n )\n if sensor.has_loadable_targets() and any(job in unsatisfied_jobs for job in sensor.jobs)\n else sensor\n )\n for sensor in sensors\n 
]\n\n return _AttachedObjects(jobs_with_resources, updated_schedules, updated_sensors)\n\n\ndef _create_repository_using_definitions_args(\n name: str,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n):\n check.opt_iterable_param(\n assets, "assets", (AssetsDefinition, SourceAsset, CacheableAssetsDefinition)\n )\n check.opt_iterable_param(\n schedules, "schedules", (ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition)\n )\n check.opt_iterable_param(sensors, "sensors", SensorDefinition)\n check.opt_iterable_param(jobs, "jobs", (JobDefinition, UnresolvedAssetJobDefinition))\n\n check.opt_inst_param(executor, "executor", (ExecutorDefinition, Executor))\n executor_def = (\n executor\n if isinstance(executor, ExecutorDefinition) or executor is None\n else ExecutorDefinition.hardcoded_executor(executor)\n )\n\n # Generate a mapping from each top-level resource instance ID to its resource key\n resource_key_mapping = {id(v): k for k, v in resources.items()} if resources else {}\n\n # Provide this mapping to each resource instance so that it can be used to resolve\n # nested resources\n resources_with_key_mapping = (\n {\n k: attach_resource_id_to_key_mapping(v, resource_key_mapping)\n for k, v in resources.items()\n }\n if resources\n else {}\n )\n\n resource_defs = wrap_resources_for_execution(resources_with_key_mapping)\n\n check.opt_mapping_param(loggers, "loggers", key_type=str, 
value_type=LoggerDefinition)\n\n # Binds top-level resources to jobs and any jobs attached to schedules or sensors\n (\n jobs_with_resources,\n schedules_with_resources,\n sensors_with_resources,\n ) = _attach_resources_to_jobs_and_instigator_jobs(jobs, schedules, sensors, resource_defs)\n\n @repository(\n name=name,\n default_executor_def=executor_def,\n default_logger_defs=loggers,\n _top_level_resources=resource_defs,\n _resource_key_mapping=resource_key_mapping,\n )\n def created_repo():\n return [\n *with_resources(assets or [], resource_defs),\n *with_resources(asset_checks or [], resource_defs),\n *(schedules_with_resources),\n *(sensors_with_resources),\n *(jobs_with_resources),\n ]\n\n return created_repo\n\n\n@deprecated(\n breaking_version="2.0",\n additional_warn_text=(\n "Instantiations can be removed. Since it's behavior is now the default, this class is now a"\n " no-op."\n ),\n)\nclass BindResourcesToJobs(list):\n """Used to instruct Dagster to bind top-level resources to jobs and any jobs attached to schedules\n and sensors. Now deprecated since this behavior is the default.\n """\n\n\n
[docs]class Definitions:\n """A set of definitions explicitly available and loadable by Dagster tools.\n\n Parameters:\n assets (Optional[Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]]):\n A list of assets. Assets can be created by annotating\n a function with :py:func:`@asset <asset>` or\n :py:func:`@observable_source_asset <observable_source_asset>`.\n Or they can by directly instantiating :py:class:`AssetsDefinition`,\n :py:class:`SourceAsset`, or :py:class:`CacheableAssetsDefinition`.\n\n asset_checks (Optional[Iterable[AssetChecksDefinition]]):\n A list of asset checks.\n\n schedules (Optional[Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]]):\n List of schedules.\n\n sensors (Optional[Iterable[SensorDefinition]]):\n List of sensors, typically created with :py:func:`@sensor <sensor>`.\n\n jobs (Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]]):\n List of jobs. Typically created with :py:func:`define_asset_job <define_asset_job>`\n or with :py:func:`@job <job>` for jobs defined in terms of ops directly.\n Jobs created with :py:func:`@job <job>` must already have resources bound\n at job creation time. They do not respect the `resources` argument here.\n\n resources (Optional[Mapping[str, Any]]): Dictionary of resources to bind to assets.\n The resources dictionary takes raw Python objects,\n not just instances of :py:class:`ResourceDefinition`. If that raw object inherits from\n :py:class:`IOManager`, it gets coerced to an :py:class:`IOManagerDefinition`.\n Any other object is coerced to a :py:class:`ResourceDefinition`.\n These resources will be automatically bound\n to any assets passed to this Definitions instance using\n :py:func:`with_resources <with_resources>`. 
Assets passed to Definitions with\n resources already bound using :py:func:`with_resources <with_resources>` will\n override this dictionary.\n\n executor (Optional[Union[ExecutorDefinition, Executor]]):\n Default executor for jobs. Individual jobs can override this and define their own executors\n by setting the executor on :py:func:`@job <job>` or :py:func:`define_asset_job <define_asset_job>`\n explicitly. This executor will also be used for materializing assets directly\n outside of the context of jobs. If an :py:class:`Executor` is passed, it is coerced into\n an :py:class:`ExecutorDefinition`.\n\n loggers (Optional[Mapping[str, LoggerDefinition]):\n Default loggers for jobs. Individual jobs\n can define their own loggers by setting them explictly.\n\n Example usage:\n\n .. code-block:: python\n\n defs = Definitions(\n assets=[asset_one, asset_two],\n schedules=[a_schedule],\n sensors=[a_sensor],\n jobs=[a_job],\n resources={\n "a_resource": some_resource,\n },\n asset_checks=[asset_one_check_one]\n )\n\n Dagster separates user-defined code from system tools such the web server and\n the daemon. Rather than loading code directly into process, a tool such as the\n webserver interacts with user-defined code over a serialization boundary.\n\n These tools must be able to locate and load this code when they start. 
Via CLI\n arguments or config, they specify a Python module to inspect.\n\n A Python module is loadable by Dagster tools if there is a top-level variable\n that is an instance of :py:class:`Definitions`.\n\n Before the introduction of :py:class:`Definitions`,\n :py:func:`@repository <repository>` was the API for organizing defintions.\n :py:class:`Definitions` provides a few conveniences for dealing with resources\n that do not apply to old-style :py:func:`@repository <repository>` declarations:\n\n * It takes a dictionary of top-level resources which are automatically bound\n (via :py:func:`with_resources <with_resources>`) to any asset passed to it.\n If you need to apply different resources to different assets, use legacy\n :py:func:`@repository <repository>` and use\n :py:func:`with_resources <with_resources>` as before.\n * The resources dictionary takes raw Python objects, not just instances\n of :py:class:`ResourceDefinition`. If that raw object inherits from\n :py:class:`IOManager`, it gets coerced to an :py:class:`IOManagerDefinition`.\n Any other object is coerced to a :py:class:`ResourceDefinition`.\n """\n\n def __init__(\n self,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n ):\n self._created_pending_or_normal_repo = _create_repository_using_definitions_args(\n name=SINGLETON_REPOSITORY_NAME,\n assets=assets,\n schedules=schedules,\n sensors=sensors,\n jobs=jobs,\n resources=resources,\n 
executor=executor,\n loggers=loggers,\n asset_checks=asset_checks,\n )\n\n
[docs] @public\n def get_job_def(self, name: str) -> JobDefinition:\n """Get a job definition by name. If you passed in a an :py:class:`UnresolvedAssetJobDefinition`\n (return value of :py:func:`define_asset_job`) it will be resolved to a :py:class:`JobDefinition` when returned\n from this function.\n """\n check.str_param(name, "name")\n return self.get_repository_def().get_job(name)
\n\n
[docs] @public\n def get_sensor_def(self, name: str) -> SensorDefinition:\n """Get a sensor definition by name."""\n check.str_param(name, "name")\n return self.get_repository_def().get_sensor_def(name)
\n\n
[docs] @public\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n """Get a schedule definition by name."""\n check.str_param(name, "name")\n return self.get_repository_def().get_schedule_def(name)
\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n ) -> object:\n """Load the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n If you want to load the values of multiple assets, it's more efficient to use\n :py:meth:`~dagster.Definitions.get_asset_value_loader`, which avoids spinning up\n resources separately for each asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n\n Returns:\n The contents of an asset as a Python object.\n """\n return self.get_repository_def().load_asset_value(\n asset_key=asset_key,\n python_type=python_type,\n instance=instance,\n partition_key=partition_key,\n metadata=metadata,\n )
\n\n
[docs] @public\n def get_asset_value_loader(\n self, instance: Optional[DagsterInstance] = None\n ) -> "AssetValueLoader":\n """Returns an object that can load the contents of assets as Python objects.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the assets. Avoids\n spinning up resources separately for each asset.\n\n Usage:\n\n .. code-block:: python\n\n with defs.get_asset_value_loader() as loader:\n asset1 = loader.load_asset_value("asset1")\n asset2 = loader.load_asset_value("asset2")\n """\n return self.get_repository_def().get_asset_value_loader(\n instance=instance,\n )
\n\n def get_all_job_defs(self) -> Sequence[JobDefinition]:\n """Get all the Job definitions in the code location."""\n return self.get_repository_def().get_all_jobs()\n\n def has_implicit_global_asset_job_def(self) -> bool:\n return self.get_repository_def().has_implicit_global_asset_job_def()\n\n def get_implicit_global_asset_job_def(self) -> JobDefinition:\n """A useful conveninence method when there is a single defined global asset job.\n This occurs when all assets in the code location use a single partitioning scheme.\n If there are multiple partitioning schemes you must use get_implicit_job_def_for_assets\n instead to access to the correct implicit asset one.\n """\n return self.get_repository_def().get_implicit_global_asset_job_def()\n\n def get_implicit_job_def_for_assets(\n self, asset_keys: Iterable[AssetKey]\n ) -> Optional[JobDefinition]:\n return self.get_repository_def().get_implicit_job_def_for_assets(asset_keys)\n\n def get_assets_def(self, key: CoercibleToAssetKey) -> AssetsDefinition:\n asset_key = AssetKey.from_coercible(key)\n for assets_def in self.get_asset_graph().assets:\n if asset_key in assets_def.keys:\n return assets_def\n\n raise DagsterInvariantViolationError(f"Could not find asset {asset_key}")\n\n @cached_method\n def get_repository_def(self) -> RepositoryDefinition:\n """Definitions is implemented by wrapping RepositoryDefinition. Get that underlying object\n in order to access an functionality which is not exposed on Definitions. 
This method\n also resolves a PendingRepositoryDefinition to a RepositoryDefinition.\n """\n return (\n self._created_pending_or_normal_repo.compute_repository_definition()\n if isinstance(self._created_pending_or_normal_repo, PendingRepositoryDefinition)\n else self._created_pending_or_normal_repo\n )\n\n def get_inner_repository_for_loading_process(\n self,\n ) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n """This method is used internally to access the inner repository during the loading process\n at CLI entry points. We explicitly do not want to resolve the pending repo because the entire\n point is to defer that resolution until later.\n """\n return self._created_pending_or_normal_repo\n\n def get_asset_graph(self) -> InternalAssetGraph:\n """Get the AssetGraph for this set of definitions."""\n return self.get_repository_def().asset_graph
\n
", "current_page_name": "_modules/dagster/_core/definitions/definitions_class", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.definitions_class"}, "dependency": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.dependency

\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    DefaultDict,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, TypeVar\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._serdes.serdes import (\n    whitelist_for_serdes,\n)\nfrom dagster._utils import hash_collection\n\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .output import OutputDefinition\nfrom .utils import DEFAULT_OUTPUT, struct_to_string, validate_tags\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.op_definition import OpDefinition\n\n    from .asset_layer import AssetLayer\n    from .composition import MappedInputPlaceholder\n    from .graph_definition import GraphDefinition\n    from .node_definition import NodeDefinition\n    from .resource_requirement import ResourceRequirement\n\nT_DependencyKey = TypeVar("T_DependencyKey", str, "NodeInvocation")\nDependencyMapping: TypeAlias = Mapping[T_DependencyKey, Mapping[str, "IDependencyDefinition"]]\n\n\n
[docs]class NodeInvocation(\n NamedTuple(\n "Node",\n [\n ("name", PublicAttr[str]),\n ("alias", PublicAttr[Optional[str]]),\n ("tags", PublicAttr[Mapping[str, Any]]),\n ("hook_defs", PublicAttr[AbstractSet[HookDefinition]]),\n ("retry_policy", PublicAttr[Optional[RetryPolicy]]),\n ],\n )\n):\n """Identifies an instance of a node in a graph dependency structure.\n\n Args:\n name (str): Name of the node of which this is an instance.\n alias (Optional[str]): Name specific to this instance of the node. Necessary when there are\n multiple instances of the same node.\n tags (Optional[Dict[str, Any]]): Optional tags values to extend or override those\n set on the node definition.\n hook_defs (Optional[AbstractSet[HookDefinition]]): A set of hook definitions applied to the\n node instance.\n\n Examples:\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n from dagster import job\n\n @job\n def my_job():\n other_name = some_op.alias('other_name')\n some_graph(other_name(some_op))\n\n """\n\n def __new__(\n cls,\n name: str,\n alias: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n alias=check.opt_str_param(alias, "alias"),\n tags=check.opt_mapping_param(tags, "tags", value_type=str, key_type=str),\n hook_defs=check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition),\n retry_policy=check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy),\n )\n\n # Needs to be hashable because this class is used as a key in dependencies dicts\n def __hash__(self) -> int:\n if not hasattr(self, "_hash"):\n self._hash = hash_collection(self)\n return self._hash
\n\n\nclass Node(ABC):\n """Node invocation within a graph. Identified by its name inside the graph."""\n\n name: str\n definition: "NodeDefinition"\n graph_definition: "GraphDefinition"\n _additional_tags: Mapping[str, str]\n _hook_defs: AbstractSet[HookDefinition]\n _retry_policy: Optional[RetryPolicy]\n _inputs: Mapping[str, "NodeInput"]\n _outputs: Mapping[str, "NodeOutput"]\n\n def __init__(\n self,\n name: str,\n definition: "NodeDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n from .node_definition import NodeDefinition\n\n self.name = check.str_param(name, "name")\n self.definition = check.inst_param(definition, "definition", NodeDefinition)\n self.graph_definition = check.inst_param(\n graph_definition,\n "graph_definition",\n GraphDefinition,\n )\n self._additional_tags = validate_tags(tags)\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n self._inputs = {\n name: NodeInput(self, input_def)\n for name, input_def in self.definition.input_dict.items()\n }\n self._outputs = {\n name: NodeOutput(self, output_def)\n for name, output_def in self.definition.output_dict.items()\n }\n\n def inputs(self) -> Iterable["NodeInput"]:\n return self._inputs.values()\n\n def outputs(self) -> Iterable["NodeOutput"]:\n return self._outputs.values()\n\n def get_input(self, name: str) -> "NodeInput":\n check.str_param(name, "name")\n return self._inputs[name]\n\n def get_output(self, name: str) -> "NodeOutput":\n check.str_param(name, "name")\n return self._outputs[name]\n\n def has_input(self, name: str) -> bool:\n return self.definition.has_input(name)\n\n def input_def_named(self, name: str) -> InputDefinition:\n return 
self.definition.input_def_named(name)\n\n def has_output(self, name: str) -> bool:\n return self.definition.has_output(name)\n\n def output_def_named(self, name: str) -> OutputDefinition:\n return self.definition.output_def_named(name)\n\n @property\n def input_dict(self) -> Mapping[str, InputDefinition]:\n return self.definition.input_dict\n\n @property\n def output_dict(self) -> Mapping[str, OutputDefinition]:\n return self.definition.output_dict\n\n @property\n def tags(self) -> Mapping[str, str]:\n return {**self.definition.tags, **self._additional_tags}\n\n def container_maps_input(self, input_name: str) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(InputPointer(self.name, input_name))\n is not None\n )\n\n def container_mapped_input(self, input_name: str) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n InputPointer(self.name, input_name)\n )\n if mapping is None:\n check.failed(\n f"container does not map input {input_name}, check container_maps_input first"\n )\n return mapping\n\n def container_maps_fan_in_input(self, input_name: str, fan_in_index: int) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n is not None\n )\n\n def container_mapped_fan_in_input(self, input_name: str, fan_in_index: int) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n if mapping is None:\n check.failed(\n f"container does not map fan-in {input_name} idx {fan_in_index}, check "\n "container_maps_fan_in_input first"\n )\n\n return mapping\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n return self._retry_policy\n\n @abstractmethod\n def describe_node(self) -> str: ...\n\n @abstractmethod\n def get_resource_requirements(\n self,\n 
outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]: ...\n\n\nclass GraphNode(Node):\n definition: "GraphDefinition"\n\n def __init__(\n self,\n name: str,\n definition: "GraphDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n\n check.inst_param(definition, "definition", GraphDefinition)\n super().__init__(name, definition, graph_definition, tags, hook_defs, retry_policy)\n\n def get_resource_requirements(\n self,\n outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]:\n cur_node_handle = NodeHandle(self.name, parent_handle)\n\n for node in self.definition.node_dict.values():\n yield from node.get_resource_requirements(\n asset_layer=asset_layer,\n outer_container=self.definition,\n parent_handle=cur_node_handle,\n )\n\n def describe_node(self) -> str:\n return f"graph '{self.name}'"\n\n\nclass OpNode(Node):\n definition: "OpDefinition"\n\n def __init__(\n self,\n name: str,\n definition: "OpDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .op_definition import OpDefinition\n\n check.inst_param(definition, "definition", OpDefinition)\n super().__init__(name, definition, graph_definition, tags, hook_defs, retry_policy)\n\n def get_resource_requirements(\n self,\n outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]:\n from .resource_requirement import 
InputManagerRequirement\n\n cur_node_handle = NodeHandle(self.name, parent_handle)\n\n for requirement in self.definition.get_resource_requirements(\n (cur_node_handle, asset_layer)\n ):\n # If requirement is a root input manager requirement, but the corresponding node has an upstream output, then ignore the requirement.\n if (\n isinstance(requirement, InputManagerRequirement)\n and outer_container.dependency_structure.has_deps(\n NodeInput(self, self.definition.input_def_named(requirement.input_name))\n )\n and requirement.root_input\n ):\n continue\n yield requirement\n for hook_def in self.hook_defs:\n yield from hook_def.get_resource_requirements(self.describe_node())\n\n def describe_node(self) -> str:\n return f"op '{self.name}'"\n\n\n@whitelist_for_serdes(storage_name="SolidHandle")\nclass NodeHandle(NamedTuple("_NodeHandle", [("name", str), ("parent", Optional["NodeHandle"])])):\n """A structured object to identify nodes in the potentially recursive graph structure."""\n\n def __new__(cls, name: str, parent: Optional["NodeHandle"]):\n return super(NodeHandle, cls).__new__(\n cls,\n check.str_param(name, "name"),\n check.opt_inst_param(parent, "parent", NodeHandle),\n )\n\n def __str__(self):\n return self.to_string()\n\n @property\n def root(self):\n if self.parent:\n return self.parent.root\n else:\n return self\n\n @property\n def path(self) -> Sequence[str]:\n """Return a list representation of the handle.\n\n Inverse of NodeHandle.from_path.\n\n Returns:\n List[str]:\n """\n path: List[str] = []\n cur = self\n while cur:\n path.append(cur.name)\n cur = cur.parent\n path.reverse()\n return path\n\n def to_string(self) -> str:\n """Return a unique string representation of the handle.\n\n Inverse of NodeHandle.from_string.\n """\n return self.parent.to_string() + "." 
+ self.name if self.parent else self.name\n\n def is_or_descends_from(self, handle: "NodeHandle") -> bool:\n """Check if the handle is or descends from another handle.\n\n Args:\n handle (NodeHandle): The handle to check against.\n\n Returns:\n bool:\n """\n check.inst_param(handle, "handle", NodeHandle)\n\n for idx in range(len(handle.path)):\n if idx >= len(self.path):\n return False\n if self.path[idx] != handle.path[idx]:\n return False\n return True\n\n def pop(self, ancestor: "NodeHandle") -> Optional["NodeHandle"]:\n """Return a copy of the handle with some of its ancestors pruned.\n\n Args:\n ancestor (NodeHandle): Handle to an ancestor of the current handle.\n\n Returns:\n NodeHandle:\n\n Example:\n .. code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('bar', NodeHandle('foo', None))\n assert handle.pop(ancestor) == NodeHandle('baz', None)\n """\n check.inst_param(ancestor, "ancestor", NodeHandle)\n check.invariant(\n self.is_or_descends_from(ancestor),\n f"Handle {self.to_string()} does not descend from {ancestor.to_string()}",\n )\n\n return NodeHandle.from_path(self.path[len(ancestor.path) :])\n\n def with_ancestor(self, ancestor: Optional["NodeHandle"]) -> "NodeHandle":\n """Returns a copy of the handle with an ancestor grafted on.\n\n Args:\n ancestor (NodeHandle): Handle to the new ancestor.\n\n Returns:\n NodeHandle:\n\n Example:\n .. 
code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('quux' None)\n assert handle.with_ancestor(ancestor) == NodeHandle(\n 'baz', NodeHandle('bar', NodeHandle('foo', NodeHandle('quux', None)))\n )\n """\n check.opt_inst_param(ancestor, "ancestor", NodeHandle)\n\n return NodeHandle.from_path([*(ancestor.path if ancestor else []), *self.path])\n\n @staticmethod\n def from_path(path: Sequence[str]) -> "NodeHandle":\n check.sequence_param(path, "path", of_type=str)\n\n cur: Optional["NodeHandle"] = None\n _path = list(path)\n while len(_path) > 0:\n cur = NodeHandle(name=_path.pop(0), parent=cur)\n\n if cur is None:\n check.failed(f"Invalid handle path {path}")\n\n return cur\n\n @staticmethod\n def from_string(handle_str: str) -> "NodeHandle":\n check.str_param(handle_str, "handle_str")\n\n path = handle_str.split(".")\n return NodeHandle.from_path(path)\n\n @classmethod\n def from_dict(cls, dict_repr: Mapping[str, Any]) -> "NodeHandle":\n """This method makes it possible to load a potentially nested NodeHandle after a\n roundtrip through json.loads(json.dumps(NodeHandle._asdict())).\n """\n check.dict_param(dict_repr, "dict_repr", key_type=str)\n check.invariant(\n "name" in dict_repr, "Dict representation of NodeHandle must have a 'name' key"\n )\n check.invariant(\n "parent" in dict_repr, "Dict representation of NodeHandle must have a 'parent' key"\n )\n\n if isinstance(dict_repr["parent"], (list, tuple)):\n parent = NodeHandle.from_dict(\n {\n "name": dict_repr["parent"][0],\n "parent": dict_repr["parent"][1],\n }\n )\n else:\n parent = dict_repr["parent"]\n\n return NodeHandle(name=dict_repr["name"], parent=parent)\n\n\nclass NodeInputHandle(\n NamedTuple("_NodeInputHandle", [("node_handle", NodeHandle), ("input_name", str)])\n):\n """A structured object to uniquely identify inputs in the potentially recursive graph structure."""\n\n\nclass NodeOutputHandle(\n NamedTuple("_NodeOutputHandle", 
[("node_handle", NodeHandle), ("output_name", str)])\n):\n """A structured object to uniquely identify outputs in the potentially recursive graph structure."""\n\n\nclass NodeInput(NamedTuple("_NodeInput", [("node", Node), ("input_def", InputDefinition)])):\n def __new__(cls, node: Node, input_def: InputDefinition):\n return super(NodeInput, cls).__new__(\n cls,\n check.inst_param(node, "node", Node),\n check.inst_param(input_def, "input_def", InputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "NodeInput",\n node_name=self.node.name,\n input_name=self.input_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self):\n return hash((self.node.name, self.input_def.name))\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, NodeInput)\n and self.node.name == other.node.name\n and self.input_def.name == other.input_def.name\n )\n\n @property\n def node_name(self) -> str:\n return self.node.name\n\n @property\n def input_name(self) -> str:\n return self.input_def.name\n\n\nclass NodeOutput(NamedTuple("_NodeOutput", [("node", Node), ("output_def", OutputDefinition)])):\n def __new__(cls, node: Node, output_def: OutputDefinition):\n return super(NodeOutput, cls).__new__(\n cls,\n check.inst_param(node, "node", Node),\n check.inst_param(output_def, "output_def", OutputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "NodeOutput",\n node_name=self.node.name,\n output_name=self.output_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self) -> int:\n return hash((self.node.name, self.output_def.name))\n\n def __eq__(self, other: Any) -> bool:\n return self.node.name == other.node.name and self.output_def.name == other.output_def.name\n\n def describe(self) -> str:\n return f"{self.node_name}:{self.output_def.name}"\n\n @property\n def 
node_name(self) -> str:\n return self.node.name\n\n @property\n def is_dynamic(self) -> bool:\n return self.output_def.is_dynamic\n\n @property\n def output_name(self) -> str:\n return self.output_def.name\n\n\nclass DependencyType(Enum):\n DIRECT = "DIRECT"\n FAN_IN = "FAN_IN"\n DYNAMIC_COLLECT = "DYNAMIC_COLLECT"\n\n\nclass IDependencyDefinition(ABC):\n @abstractmethod\n def get_node_dependencies(self) -> Sequence["DependencyDefinition"]:\n pass\n\n @abstractmethod\n def is_fan_in(self) -> bool:\n """The result passed to the corresponding input will be a List made from different node outputs."""\n\n\n
[docs]class DependencyDefinition(\n NamedTuple(\n "_DependencyDefinition", [("node", str), ("output", str), ("description", Optional[str])]\n ),\n IDependencyDefinition,\n):\n """Represents an edge in the DAG of nodes (ops or graphs) forming a job.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent node and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_b depends on the output named 'result' of\n op_a, and the output named 'other_result' of graph_a, the structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_op', 'result')\n }\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_graph', 'result')\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n @job\n def the_job():\n node_b(node_a())\n\n\n Args:\n node (str): The name of the node (op or graph) that is depended on, that is, from which the value\n passed between the two nodes originates.\n output (Optional[str]): The name of the output that is depended on. (default: "result")\n description (Optional[str]): Human-readable description of this dependency.\n """\n\n def __new__(\n cls,\n node: str,\n output: str = DEFAULT_OUTPUT,\n description: Optional[str] = None,\n ):\n return super(DependencyDefinition, cls).__new__(\n cls,\n check.str_param(node, "node"),\n check.str_param(output, "output"),\n check.opt_str_param(description, "description"),\n )\n\n def get_node_dependencies(self) -> Sequence["DependencyDefinition"]:\n return [self]\n\n
[docs] @public\n def is_fan_in(self) -> bool:\n """Return True if the dependency is fan-in (always False for DependencyDefinition)."""\n return False
\n\n def get_op_dependencies(self) -> Sequence["DependencyDefinition"]:\n return [self]
\n\n\n
[docs]class MultiDependencyDefinition(\n NamedTuple(\n "_MultiDependencyDefinition",\n [\n (\n "dependencies",\n PublicAttr[Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]],\n )\n ],\n ),\n IDependencyDefinition,\n):\n """Represents a fan-in edge in the DAG of op instances forming a job.\n\n This object is used only when an input of type ``List[T]`` is assembled by fanning-in multiple\n upstream outputs of type ``T``.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent ops or graphs and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_c depends on the outputs named 'result' of\n op_a and op_b, this structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'op_c': {\n 'input': MultiDependencyDefinition(\n [\n DependencyDefinition('op_a', 'result'),\n DependencyDefinition('op_b', 'result')\n ]\n )\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. 
code-block:: python\n\n @job\n def the_job():\n op_c(op_a(), op_b())\n\n Args:\n dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]): List of\n upstream dependencies fanned in to this input.\n """\n\n def __new__(\n cls,\n dependencies: Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]],\n ):\n from .composition import MappedInputPlaceholder\n\n deps = check.sequence_param(dependencies, "dependencies")\n seen = {}\n for dep in deps:\n if isinstance(dep, DependencyDefinition):\n key = dep.node + ":" + dep.output\n if key in seen:\n raise DagsterInvalidDefinitionError(\n f'Duplicate dependencies on node "{dep.node}" output "{dep.output}" '\n "used in the same MultiDependencyDefinition."\n )\n seen[key] = True\n elif dep is MappedInputPlaceholder:\n pass\n else:\n check.failed(f"Unexpected dependencies entry {dep}")\n\n return super(MultiDependencyDefinition, cls).__new__(cls, deps)\n\n
[docs] @public\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n """Return the list of :py:class:`DependencyDefinition` contained by this object."""\n return [dep for dep in self.dependencies if isinstance(dep, DependencyDefinition)]
\n\n
[docs] @public\n def is_fan_in(self) -> bool:\n """Return `True` if the dependency is fan-in (always True for MultiDependencyDefinition)."""\n return True
\n\n
[docs] @public\n def get_dependencies_and_mappings(\n self,\n ) -> Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]:\n """Return the combined list of dependencies contained by this object, inculding of :py:class:`DependencyDefinition` and :py:class:`MappedInputPlaceholder` objects."""\n return self.dependencies
\n\n\nclass BlockingAssetChecksDependencyDefinition(\n IDependencyDefinition,\n NamedTuple(\n "_BlockingAssetChecksDependencyDefinition",\n [\n (\n "asset_check_dependencies",\n Sequence[DependencyDefinition],\n ),\n ("other_dependency", Optional[DependencyDefinition]),\n ],\n ),\n):\n """An input that depends on a set of outputs that correspond to upstream asset checks, and also\n optionally depends on a single upstream output that does not correspond to an asset check.\n\n We model this with a different kind of DependencyDefinition than MultiDependencyDefinition,\n because we treat the value that's passed to the input parameter differently: we ignore the asset\n check dependencies and only pass a single value, instead of a fanned-in list.\n """\n\n @public\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n """Return the list of :py:class:`DependencyDefinition` contained by this object."""\n if self.other_dependency:\n return [*self.asset_check_dependencies, self.other_dependency]\n else:\n return self.asset_check_dependencies\n\n @public\n def is_fan_in(self) -> bool:\n return False\n\n @public\n def get_dependencies_and_mappings(\n self,\n ) -> Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]:\n return self.get_node_dependencies()\n\n\nclass DynamicCollectDependencyDefinition(\n NamedTuple("_DynamicCollectDependencyDefinition", [("node_name", str), ("output_name", str)]),\n IDependencyDefinition,\n):\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n return [DependencyDefinition(self.node_name, self.output_name)]\n\n def is_fan_in(self) -> bool:\n return True\n\n\nDepTypeAndOutputs: TypeAlias = Tuple[\n DependencyType,\n Union[NodeOutput, List[Union[NodeOutput, Type["MappedInputPlaceholder"]]]],\n]\n\nInputToOutputMap: TypeAlias = Dict[NodeInput, DepTypeAndOutputs]\n\n\ndef _create_handle_dict(\n node_dict: Mapping[str, Node],\n dep_dict: DependencyMapping[str],\n) -> InputToOutputMap:\n from 
.composition import MappedInputPlaceholder\n\n check.mapping_param(node_dict, "node_dict", key_type=str, value_type=Node)\n check.two_dim_mapping_param(dep_dict, "dep_dict", value_type=IDependencyDefinition)\n\n handle_dict: InputToOutputMap = {}\n\n for node_name, input_dict in dep_dict.items():\n from_node = node_dict[node_name]\n for input_name, dep_def in input_dict.items():\n if isinstance(\n dep_def, (MultiDependencyDefinition, BlockingAssetChecksDependencyDefinition)\n ):\n handles: List[Union[NodeOutput, Type[MappedInputPlaceholder]]] = []\n for inner_dep in dep_def.get_dependencies_and_mappings():\n if isinstance(inner_dep, DependencyDefinition):\n handles.append(node_dict[inner_dep.node].get_output(inner_dep.output))\n elif inner_dep is MappedInputPlaceholder:\n handles.append(inner_dep)\n else:\n check.failed(\n f"Unexpected MultiDependencyDefinition dependencies type {inner_dep}"\n )\n\n handle_dict[from_node.get_input(input_name)] = (DependencyType.FAN_IN, handles)\n\n elif isinstance(dep_def, DependencyDefinition):\n handle_dict[from_node.get_input(input_name)] = (\n DependencyType.DIRECT,\n node_dict[dep_def.node].get_output(dep_def.output),\n )\n elif isinstance(dep_def, DynamicCollectDependencyDefinition):\n handle_dict[from_node.get_input(input_name)] = (\n DependencyType.DYNAMIC_COLLECT,\n node_dict[dep_def.node_name].get_output(dep_def.output_name),\n )\n\n else:\n check.failed(f"Unknown dependency type {dep_def}")\n\n return handle_dict\n\n\nclass DependencyStructure:\n @staticmethod\n def from_definitions(\n nodes: Mapping[str, Node], dep_dict: DependencyMapping[str]\n ) -> "DependencyStructure":\n return DependencyStructure(\n list(dep_dict.keys()),\n _create_handle_dict(nodes, dep_dict),\n dep_dict,\n )\n\n _node_input_index: DefaultDict[str, Dict[NodeInput, List[NodeOutput]]]\n _node_output_index: Dict[str, DefaultDict[NodeOutput, List[NodeInput]]]\n _dynamic_fan_out_index: Dict[str, NodeOutput]\n _collect_index: Dict[str, 
Set[NodeOutput]]\n _deps_by_node_name: DependencyMapping[str]\n\n def __init__(\n self,\n node_names: Sequence[str],\n input_to_output_map: InputToOutputMap,\n deps_by_node_name: DependencyMapping[str],\n ):\n self._node_names = node_names\n self._input_to_output_map = input_to_output_map\n self._deps_by_node_name = deps_by_node_name\n\n # Building up a couple indexes here so that one can look up all the upstream output handles\n # or downstream input handles in O(1). Without this, this can become O(N^2) where N is node\n # count during the GraphQL query in particular\n\n # node_name => input_handle => list[output_handle]\n self._node_input_index = defaultdict(dict)\n\n # node_name => output_handle => list[input_handle]\n self._node_output_index = defaultdict(lambda: defaultdict(list))\n\n # node_name => dynamic output_handle that this node will dupe for\n self._dynamic_fan_out_index = {}\n\n # node_name => set of dynamic output_handle this collects over\n self._collect_index = defaultdict(set)\n\n for node_input, (dep_type, node_output_or_list) in self._input_to_output_map.items():\n if dep_type == DependencyType.FAN_IN:\n node_output_list: List[NodeOutput] = []\n for node_output in node_output_or_list:\n if not isinstance(node_output, NodeOutput):\n continue\n\n if node_output.is_dynamic:\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of"\n " dynamic outputs. Problematic dependency on dynamic output"\n f' "{node_output.describe()}".'\n )\n if self._dynamic_fan_out_index.get(node_output.node_name):\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of"\n " dynamic outputs. 
Problematic dependency on output"\n f' "{node_output.describe()}", downstream of'\n f' "{self._dynamic_fan_out_index[node_output.node_name].describe()}".'\n )\n\n node_output_list.append(node_output)\n elif dep_type == DependencyType.DIRECT:\n node_output = cast(NodeOutput, node_output_or_list)\n\n if node_output.is_dynamic:\n self._validate_and_set_fan_out(node_input, node_output)\n\n if self._dynamic_fan_out_index.get(node_output.node_name):\n self._validate_and_set_fan_out(\n node_input, self._dynamic_fan_out_index[node_output.node_name]\n )\n\n node_output_list = [node_output]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n node_output = cast(NodeOutput, node_output_or_list)\n\n if node_output.is_dynamic:\n self._validate_and_set_collect(node_input, node_output)\n\n elif self._dynamic_fan_out_index.get(node_output.node_name):\n self._validate_and_set_collect(\n node_input,\n self._dynamic_fan_out_index[node_output.node_name],\n )\n else:\n check.failed(\n f"Unexpected dynamic fan in dep created {node_output} -> {node_input}"\n )\n\n node_output_list = [node_output]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n self._node_input_index[node_input.node.name][node_input] = node_output_list\n for node_output in node_output_list:\n self._node_output_index[node_output.node.name][node_output].append(node_input)\n\n def _validate_and_set_fan_out(self, node_input: NodeInput, node_output: NodeOutput) -> None:\n """Helper function for populating _dynamic_fan_out_index."""\n if not node_input.node.definition.input_supports_dynamic_output_dep(node_input.input_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of dynamic output"\n f' "{node_output.describe()}" since input "{node_input.input_name}" maps to a'\n " node that is already downstream of another dynamic output. 
Nodes cannot be"\n " downstream of more than one dynamic output"\n )\n\n if self._collect_index.get(node_input.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be both downstream of dynamic output "\n f"{node_output.describe()} and collect over dynamic output "\n f"{next(iter(self._collect_index[node_input.node_name])).describe()}."\n )\n\n if self._dynamic_fan_out_index.get(node_input.node_name) is None:\n self._dynamic_fan_out_index[node_input.node_name] = node_output\n return\n\n if self._dynamic_fan_out_index[node_input.node_name] != node_output:\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of more than one dynamic"\n f' output. It is downstream of both "{node_output.describe()}" and'\n f' "{self._dynamic_fan_out_index[node_input.node_name].describe()}"'\n )\n\n def _validate_and_set_collect(\n self,\n node_input: NodeInput,\n node_output: NodeOutput,\n ) -> None:\n if self._dynamic_fan_out_index.get(node_input.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot both collect over dynamic output "\n f"{node_output.describe()} and be downstream of the dynamic output "\n f"{self._dynamic_fan_out_index[node_input.node_name].describe()}."\n )\n\n self._collect_index[node_input.node_name].add(node_output)\n\n # if the output is already fanned out\n if self._dynamic_fan_out_index.get(node_output.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of more than one dynamic"\n f' output. 
It is downstream of both "{node_output.describe()}" and'\n f' "{self._dynamic_fan_out_index[node_output.node_name].describe()}"'\n )\n\n def all_upstream_outputs_from_node(self, node_name: str) -> Sequence[NodeOutput]:\n check.str_param(node_name, "node_name")\n\n # flatten out all outputs that feed into the inputs of this node\n return [\n output_handle\n for output_handle_list in self._node_input_index[node_name].values()\n for output_handle in output_handle_list\n ]\n\n def input_to_upstream_outputs_for_node(\n self, node_name: str\n ) -> Mapping[NodeInput, Sequence[NodeOutput]]:\n """Returns a Dict[NodeInput, List[NodeOutput]] that encodes\n where all the the inputs are sourced from upstream. Usually the\n List[NodeOutput] will be a list of one, except for the\n multi-dependency case.\n """\n check.str_param(node_name, "node_name")\n return self._node_input_index[node_name]\n\n def output_to_downstream_inputs_for_node(\n self, node_name: str\n ) -> Mapping[NodeOutput, Sequence[NodeInput]]:\n """Returns a Dict[NodeOutput, List[NodeInput]] that\n represents all the downstream inputs for each output in the\n dictionary.\n """\n check.str_param(node_name, "node_name")\n return self._node_output_index[node_name]\n\n def has_direct_dep(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.DIRECT\n\n def get_direct_dep(self, node_input: NodeInput) -> NodeOutput:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, dep = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.DIRECT,\n f"Cannot call get_direct_dep when dep is not singular, got {dep_type}",\n )\n return cast(NodeOutput, dep)\n\n def get_dependency_definition(self, node_input: NodeInput) -> Optional[IDependencyDefinition]:\n return 
self._deps_by_node_name[node_input.node_name].get(node_input.input_name)\n\n def has_fan_in_deps(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.FAN_IN\n\n def get_fan_in_deps(\n self, node_input: NodeInput\n ) -> Sequence[Union[NodeOutput, Type["MappedInputPlaceholder"]]]:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, deps = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.FAN_IN,\n f"Cannot call get_multi_dep when dep is not fan in, got {dep_type}",\n )\n return cast(List[Union[NodeOutput, Type["MappedInputPlaceholder"]]], deps)\n\n def has_dynamic_fan_in_dep(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.DYNAMIC_COLLECT\n\n def get_dynamic_fan_in_dep(self, node_input: NodeInput) -> NodeOutput:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, dep = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.DYNAMIC_COLLECT,\n f"Cannot call get_dynamic_fan_in_dep when dep is not, got {dep_type}",\n )\n return cast(NodeOutput, dep)\n\n def has_deps(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n return node_input in self._input_to_output_map\n\n def get_deps_list(self, node_input: NodeInput) -> Sequence[NodeOutput]:\n check.inst_param(node_input, "node_input", NodeInput)\n check.invariant(self.has_deps(node_input))\n dep_type, handle_or_list = self._input_to_output_map[node_input]\n if dep_type == DependencyType.DIRECT:\n return [cast(NodeOutput, handle_or_list)]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n 
return [cast(NodeOutput, handle_or_list)]\n elif dep_type == DependencyType.FAN_IN:\n return [handle for handle in handle_or_list if isinstance(handle, NodeOutput)]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n def inputs(self) -> Sequence[NodeInput]:\n return list(self._input_to_output_map.keys())\n\n def get_upstream_dynamic_output_for_node(self, node_name: str) -> Optional[NodeOutput]:\n return self._dynamic_fan_out_index.get(node_name)\n\n def get_dependency_type(self, node_input: NodeInput) -> Optional[DependencyType]:\n result = self._input_to_output_map.get(node_input)\n if result is None:\n return None\n dep_type, _ = result\n return dep_type\n\n def is_dynamic_mapped(self, node_name: str) -> bool:\n return node_name in self._dynamic_fan_out_index\n\n def has_dynamic_downstreams(self, node_name: str) -> bool:\n for node_output in self._dynamic_fan_out_index.values():\n if node_output.node_name == node_name:\n return True\n\n return False\n
", "current_page_name": "_modules/dagster/_core/definitions/dependency", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.dependency"}, "events": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.events

\nimport re\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import PublicAttr, deprecated, experimental_param, public\nfrom dagster._core.definitions.data_version import DATA_VERSION_TAG, DataVersion\nfrom dagster._core.storage.tags import MULTIDIMENSIONAL_PARTITION_PREFIX, SYSTEM_TAG_PREFIX\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._serdes.serdes import NamedTupleSerializer\n\nfrom .metadata import (\n    MetadataFieldSerializer,\n    MetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.source_asset import SourceAsset\n    from dagster._core.execution.context.output import OutputContext\n\n\nASSET_KEY_SPLIT_REGEX = re.compile("[^a-zA-Z0-9_]")\nASSET_KEY_DELIMITER = "/"\n\n\ndef parse_asset_key_string(s: str) -> Sequence[str]:\n    return list(filter(lambda x: x, re.split(ASSET_KEY_SPLIT_REGEX, s)))\n\n\n
[docs]@whitelist_for_serdes\nclass AssetKey(NamedTuple("_AssetKey", [("path", PublicAttr[Sequence[str]])])):\n """Object representing the structure of an asset key. Takes in a sanitized string, list of\n strings, or tuple of strings.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import op\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey('flat_asset_key'),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(['parent', 'child', 'grandchild']),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key_2(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(('parent', 'child', 'grandchild')),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n Args:\n path (Sequence[str]): String, list of strings, or tuple of strings. A list of strings\n represent the hierarchical structure of the asset_key.\n """\n\n def __new__(cls, path: Sequence[str]):\n if isinstance(path, str):\n path = [path]\n else:\n path = list(check.sequence_param(path, "path", of_type=str))\n\n return super(AssetKey, cls).__new__(cls, path=path)\n\n def __str__(self):\n return f"AssetKey({self.path})"\n\n def __repr__(self):\n return f"AssetKey({self.path})"\n\n def __hash__(self):\n return hash(tuple(self.path))\n\n def __eq__(self, other):\n if not isinstance(other, AssetKey):\n return False\n if len(self.path) != len(other.path):\n return False\n for i in range(0, len(self.path)):\n if self.path[i] != other.path[i]:\n return False\n return True\n\n def to_string(self) -> str:\n """E.g. '["first_component", "second_component"]'."""\n return seven.json.dumps(self.path)\n\n def to_user_string(self) -> str:\n """E.g. 
"first_component/second_component"."""\n return ASSET_KEY_DELIMITER.join(self.path)\n\n def to_python_identifier(self, suffix: Optional[str] = None) -> str:\n """Build a valid Python identifier based on the asset key that can be used for\n operation names or I/O manager keys.\n """\n path = list(self.path)\n\n if suffix is not None:\n path.append(suffix)\n\n return "__".join(path).replace("-", "_")\n\n @staticmethod\n def from_user_string(asset_key_string: str) -> "AssetKey":\n return AssetKey(asset_key_string.split(ASSET_KEY_DELIMITER))\n\n @staticmethod\n def from_db_string(asset_key_string: Optional[str]) -> Optional["AssetKey"]:\n if not asset_key_string:\n return None\n if asset_key_string[0] == "[":\n # is a json string\n try:\n path = seven.json.loads(asset_key_string)\n except seven.JSONDecodeError:\n path = parse_asset_key_string(asset_key_string)\n else:\n path = parse_asset_key_string(asset_key_string)\n return AssetKey(path)\n\n @staticmethod\n def get_db_prefix(path: Sequence[str]):\n check.sequence_param(path, "path", of_type=str)\n return seven.json.dumps(path)[:-2] # strip trailing '"]' from json string\n\n @staticmethod\n def from_graphql_input(graphql_input_asset_key: Mapping[str, Sequence[str]]) -> "AssetKey":\n return AssetKey(graphql_input_asset_key["path"])\n\n def to_graphql_input(self) -> Mapping[str, Sequence[str]]:\n return {"path": self.path}\n\n @staticmethod\n def from_coercible(arg: "CoercibleToAssetKey") -> "AssetKey":\n if isinstance(arg, AssetKey):\n return check.inst_param(arg, "arg", AssetKey)\n elif isinstance(arg, str):\n return AssetKey([arg])\n elif isinstance(arg, list):\n check.list_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n elif isinstance(arg, tuple):\n check.tuple_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n else:\n check.failed(f"Unexpected type for AssetKey: {type(arg)}")\n\n @staticmethod\n def from_coercible_or_definition(\n arg: Union["CoercibleToAssetKey", "AssetsDefinition", "SourceAsset"]\n 
) -> "AssetKey":\n from dagster._core.definitions.assets import AssetsDefinition\n from dagster._core.definitions.source_asset import SourceAsset\n\n if isinstance(arg, AssetsDefinition):\n return arg.key\n elif isinstance(arg, SourceAsset):\n return arg.key\n else:\n return AssetKey.from_coercible(arg)\n\n # @staticmethod\n # def from_coercible_to_asset_dep(arg: "CoercibleToAssetDep") -> "AssetKey":\n # from dagster._core.definitions.asset_dep import AssetDep\n # from dagster._core.definitions.asset_spec import AssetSpec\n # from dagster._core.definitions.assets import AssetsDefinition\n # from dagster._core.definitions.source_asset import SourceAsset\n\n # if isinstance(arg, AssetsDefinition):\n # if len(arg.keys) > 1:\n # # Only AssetsDefinition with a single asset can be passed\n # raise DagsterInvalidDefinitionError(\n # "Cannot pass a multi_asset AssetsDefinition as an argument to deps."\n # " Instead, specify dependencies on the assets created by the multi_asset"\n # f" via AssetKeys or strings. For the multi_asset {arg.node_def.name}, the"\n # f" available keys are: {arg.keys}."\n # )\n # return arg.key\n # elif isinstance(arg, SourceAsset):\n # return arg.key\n # elif isinstance(arg, AssetDep):\n # return arg.asset_key\n # elif isinstance(arg, AssetSpec):\n # return arg.asset_key\n # else:\n # return AssetKey.from_coercible(arg)\n\n def has_prefix(self, prefix: Sequence[str]) -> bool:\n return len(self.path) >= len(prefix) and self.path[: len(prefix)] == prefix\n\n def with_prefix(self, prefix: "CoercibleToAssetKeyPrefix") -> "AssetKey":\n prefix = key_prefix_from_coercible(prefix)\n return AssetKey(list(prefix) + list(self.path))
\n\n\nclass AssetKeyPartitionKey(NamedTuple):\n """An AssetKey with an (optional) partition key. Refers either to a non-partitioned asset or a\n partition of a partitioned asset.\n """\n\n asset_key: AssetKey\n partition_key: Optional[str] = None\n\n\nCoercibleToAssetKey = Union[AssetKey, str, Sequence[str]]\nCoercibleToAssetKeyPrefix = Union[str, Sequence[str]]\n\n\ndef check_opt_coercible_to_asset_key_prefix_param(\n prefix: Optional[CoercibleToAssetKeyPrefix], param_name: str\n) -> Optional[Sequence[str]]:\n try:\n return key_prefix_from_coercible(prefix) if prefix is not None else None\n except check.CheckError:\n raise check.ParameterCheckError(\n f'Param "{param_name}" is not a string or a sequence of strings'\n )\n\n\ndef key_prefix_from_coercible(key_prefix: CoercibleToAssetKeyPrefix) -> Sequence[str]:\n if isinstance(key_prefix, str):\n return [key_prefix]\n elif isinstance(key_prefix, list):\n return key_prefix\n else:\n check.failed(f"Unexpected type for key_prefix: {type(key_prefix)}")\n\n\nDynamicAssetKey = Callable[["OutputContext"], Optional[AssetKey]]\n\n\n@whitelist_for_serdes\nclass AssetLineageInfo(\n NamedTuple("_AssetLineageInfo", [("asset_key", AssetKey), ("partitions", AbstractSet[str])])\n):\n def __new__(cls, asset_key: AssetKey, partitions: Optional[AbstractSet[str]] = None):\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partitions = check.opt_set_param(partitions, "partitions", str)\n return super(AssetLineageInfo, cls).__new__(cls, asset_key=asset_key, partitions=partitions)\n\n\nT = TypeVar("T")\n\n\n
[docs]@experimental_param(param="data_version")\nclass Output(Generic[T]):\n """Event corresponding to one of a op's outputs.\n\n Op compute functions must explicitly yield events of this type when they have more than\n one output, or when they also yield events of other types, or when defining a op using the\n :py:class:`OpDefinition` API directly.\n\n Outputs are values produced by ops that will be consumed by downstream ops in a job.\n They are type-checked at op boundaries when their corresponding :py:class:`Out`\n or the downstream :py:class:`In` is typed.\n\n Args:\n value (Any): The value returned by the compute function.\n output_name (Optional[str]): Name of the corresponding out. (default:\n "result")\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n data_version (Optional[DataVersion]): (Experimental) A data version to manually set\n for the asset.\n """\n\n def __init__(\n self,\n value: T,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n data_version: Optional[DataVersion] = None,\n ):\n self._value = value\n self._output_name = check.str_param(output_name, "output_name")\n self._data_version = check.opt_inst_param(data_version, "data_version", DataVersion)\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n @property\n def metadata(self) -> MetadataMapping:\n return self._metadata\n\n @public\n @property\n def value(self) -> Any:\n """Any: The value returned by the compute function."""\n return self._value\n\n @public\n @property\n def output_name(self) -> str:\n """str: Name of the corresponding :py:class:`Out`."""\n return self._output_name\n\n @public\n 
@property\n def data_version(self) -> Optional[DataVersion]:\n """Optional[DataVersion]: A data version that was manually set on the `Output`."""\n return self._data_version\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, Output)\n and self.value == other.value\n and self.output_name == other.output_name\n and self.metadata == other.metadata\n )
\n\n\n
[docs]class DynamicOutput(Generic[T]):\n """Variant of :py:class:`Output <dagster.Output>` used to support\n dynamic mapping & collect. Each ``DynamicOutput`` produced by an op represents\n one item in a set that can be processed individually with ``map`` or gathered\n with ``collect``.\n\n Each ``DynamicOutput`` must have a unique ``mapping_key`` to distinguish it with it's set.\n\n Args:\n value (Any):\n The value returned by the compute function.\n mapping_key (str):\n The key that uniquely identifies this dynamic value relative to its peers.\n This key will be used to identify the downstream ops when mapped, ie\n ``mapped_op[example_mapping_key]``\n output_name (Optional[str]):\n Name of the corresponding :py:class:`DynamicOut` defined on the op.\n (default: "result")\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __init__(\n self,\n value: T,\n mapping_key: str,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n self._mapping_key = check_valid_name(check.str_param(mapping_key, "mapping_key"))\n self._output_name = check.str_param(output_name, "output_name")\n self._value = value\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n @property\n def metadata(self) -> Mapping[str, MetadataValue]:\n return self._metadata\n\n @public\n @property\n def mapping_key(self) -> str:\n """The mapping_key that was set for this DynamicOutput at instantiation."""\n return self._mapping_key\n\n @public\n @property\n def value(self) -> T:\n """The value that is returned by the compute function for this DynamicOut."""\n return self._value\n\n @public\n @property\n def 
output_name(self) -> str:\n """Name of the :py:class:`DynamicOut` defined on the op that this DynamicOut is associated with."""\n return self._output_name\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, DynamicOutput)\n and self.value == other.value\n and self.output_name == other.output_name\n and self.mapping_key == other.mapping_key\n and self.metadata == other.metadata\n )
\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass AssetObservation(\n NamedTuple(\n "_AssetObservation",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("partition", PublicAttr[Optional[str]]),\n ("tags", PublicAttr[Mapping[str, str]]),\n ],\n )\n):\n """Event that captures metadata about an asset at a point in time.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the asset.\n partition (Optional[str]): The name of a partition of the asset that the metadata\n corresponds to.\n tags (Optional[Mapping[str, str]]): A mapping containing system-populated tags for the\n observation. Users should not pass values into this argument.\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoercibleToAssetKey,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n ):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n if any([not tag.startswith(SYSTEM_TAG_PREFIX) for tag in tags or {}]):\n check.failed(\n "Users should not pass values into the tags argument for AssetMaterializations. 
"\n "The tags argument is reserved for system-populated tags."\n )\n\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(AssetObservation, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n tags=tags,\n partition=check.opt_str_param(partition, "partition"),\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n @property\n def data_version(self) -> Optional[str]:\n return self.tags.get(DATA_VERSION_TAG)\n\n\nUNDEFINED_ASSET_KEY_PATH = ["__undefined__"]\n\n\nclass AssetMaterializationSerializer(NamedTupleSerializer):\n # There are old `Materialization` objects in storage. We set the default value for asset key to\n # be `AssetKey(["__undefined__"])` to ensure that we can load these objects, without needing to\n # allow for the construction of new `AssetMaterialization` objects with no defined AssetKey.\n def before_unpack(self, context, unpacked_dict: Any) -> Any:\n # cover both the case where "asset_key" is not present at all and where it is None\n if unpacked_dict.get("asset_key") is None:\n unpacked_dict["asset_key"] = AssetKey(UNDEFINED_ASSET_KEY_PATH)\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n old_storage_names={"Materialization"},\n serializer=AssetMaterializationSerializer,\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass AssetMaterialization(\n NamedTuple(\n "_AssetMaterialization",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("partition", PublicAttr[Optional[str]]),\n ("tags", Optional[Mapping[str, str]]),\n ],\n )\n):\n """Event indicating that an op has materialized an asset.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. Unlike outputs, asset materializations can not be passed to other\n ops, and their persistence is controlled by op logic, rather than by the Dagster\n framework.\n\n Op authors should use these events to organize metadata about the side effects of their\n computations, enabling tooling like the Assets dashboard in the Dagster UI.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the materialized asset across\n job runs\n description (Optional[str]): A longer human-readable description of the materialized value.\n partition (Optional[str]): The name of the partition\n that was materialized.\n tags (Optional[Mapping[str, str]]): A mapping containing system-populated tags for the\n materialization. Users should not pass values into this argument.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. 
Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoercibleToAssetKey,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n ):\n from dagster._core.definitions.multi_dimensional_partitions import MultiPartitionKey\n\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n invalid_tags = [tag for tag in tags or {} if not tag.startswith(SYSTEM_TAG_PREFIX)]\n if len(invalid_tags) > 0:\n check.failed(\n f"Invalid tags: {tags} Users should not pass values into the tags argument for"\n " AssetMaterializations. 
The tags argument is reserved for system-populated tags."\n )\n\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n partition = check.opt_str_param(partition, "partition")\n\n if not isinstance(partition, MultiPartitionKey):\n # When event log records are unpacked from storage, cast the partition key as a\n # MultiPartitionKey if multi-dimensional partition tags exist\n multi_dimensional_partitions = {\n dimension[len(MULTIDIMENSIONAL_PARTITION_PREFIX) :]: partition_key\n for dimension, partition_key in (tags or {}).items()\n if dimension.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX)\n }\n if multi_dimensional_partitions:\n partition = MultiPartitionKey(multi_dimensional_partitions)\n\n return super(AssetMaterialization, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n tags=tags,\n partition=partition,\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n
[docs] @public\n @staticmethod\n def file(\n path: str,\n description: Optional[str] = None,\n asset_key: Optional[Union[str, Sequence[str], AssetKey]] = None,\n ) -> "AssetMaterialization":\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n if not asset_key:\n asset_key = path\n\n return AssetMaterialization(\n asset_key=cast(Union[str, AssetKey, List[str]], asset_key),\n description=description,\n metadata={"path": MetadataValue.path(path)},\n )
\n\n\n
[docs]@deprecated(\n breaking_version="1.7",\n additional_warn_text="Please use AssetCheckResult and @asset_check instead.",\n)\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass ExpectationResult(\n NamedTuple(\n "_ExpectationResult",\n [\n ("success", PublicAttr[bool]),\n ("label", PublicAttr[Optional[str]]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ],\n )\n):\n """Event corresponding to a data quality test.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that a data quality test has produced a (positive or\n negative) result.\n\n Args:\n success (bool): Whether the expectation passed or not.\n label (Optional[str]): Short display name for expectation. Defaults to "result".\n description (Optional[str]): A longer human-readable description of the expectation.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n label: Optional[str] = None,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(ExpectationResult, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n label=check.opt_str_param(label, "label", "result"),\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n )
\n\n\n
[docs]@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\n@whitelist_for_serdes\nclass TypeCheck(\n NamedTuple(\n "_TypeCheck",\n [\n ("success", PublicAttr[bool]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ],\n )\n):\n """Event corresponding to a successful typecheck.\n\n Events of this type should be returned by user-defined type checks when they need to encapsulate\n additional metadata about a type check's success or failure. (i.e., when using\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or the underlying\n :py:func:`PythonObjectDagsterType` API.)\n\n Op compute functions should generally avoid yielding events of this type to avoid confusion.\n\n Args:\n success (bool): ``True`` if the type check succeeded, ``False`` otherwise.\n description (Optional[str]): A human-readable description of the type check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(TypeCheck, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n )
\n\n\n
[docs]class Failure(Exception):\n """Event indicating op failure.\n\n Raise events of this type from within op compute functions or custom type checks in order to\n indicate an unrecoverable failure in user code to the Dagster machinery and return\n structured metadata about the failure.\n\n Args:\n description (Optional[str]): A human-readable description of the failure.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n allow_retries (Optional[bool]):\n Whether this Failure should respect the retry policy or bypass it and immediately fail.\n Defaults to True, respecting the retry policy and allowing retries.\n """\n\n def __init__(\n self,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n allow_retries: Optional[bool] = None,\n ):\n super(Failure, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n self.allow_retries = check.opt_bool_param(allow_retries, "allow_retries", True)
\n\n\n
[docs]class RetryRequested(Exception):\n """An exception to raise from an op to indicate that it should be retried.\n\n Args:\n max_retries (Optional[int]):\n The max number of retries this step should attempt before failing\n seconds_to_wait (Optional[Union[float,int]]):\n Seconds to wait before restarting the step after putting the step in\n to the up_for_retry state\n\n Example:\n .. code-block:: python\n\n @op\n def flakes():\n try:\n flakey_operation()\n except Exception as e:\n raise RetryRequested(max_retries=3) from e\n """\n\n def __init__(\n self, max_retries: Optional[int] = 1, seconds_to_wait: Optional[Union[float, int]] = None\n ):\n super(RetryRequested, self).__init__()\n self.max_retries = check.int_param(max_retries, "max_retries")\n self.seconds_to_wait = check.opt_numeric_param(seconds_to_wait, "seconds_to_wait")
\n\n\nclass ObjectStoreOperationType(Enum):\n SET_OBJECT = "SET_OBJECT"\n GET_OBJECT = "GET_OBJECT"\n RM_OBJECT = "RM_OBJECT"\n CP_OBJECT = "CP_OBJECT"\n\n\nclass ObjectStoreOperation(\n NamedTuple(\n "_ObjectStoreOperation",\n [\n ("op", ObjectStoreOperationType),\n ("key", str),\n ("dest_key", Optional[str]),\n ("obj", Any),\n ("serialization_strategy_name", Optional[str]),\n ("object_store_name", Optional[str]),\n ("value_name", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n """This event is used internally by Dagster machinery when values are written to and read from\n an ObjectStore.\n\n Users should not import this class or yield events of this type from user code.\n\n Args:\n op (ObjectStoreOperationType): The type of the operation on the object store.\n key (str): The key of the object on which the operation was performed.\n dest_key (Optional[str]): The destination key, if any, to which the object was copied.\n obj (Any): The object, if any, retrieved by the operation.\n serialization_strategy_name (Optional[str]): The name of the serialization strategy, if any,\n employed by the operation\n object_store_name (Optional[str]): The name of the object store that performed the\n operation.\n value_name (Optional[str]): The name of the input/output\n version (Optional[str]): (Experimental) The version of the stored data.\n mapping_key (Optional[str]): The mapping key when a dynamic output is used.\n """\n\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n key: str,\n dest_key: Optional[str] = None,\n obj: Any = None,\n serialization_strategy_name: Optional[str] = None,\n object_store_name: Optional[str] = None,\n value_name: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperation, cls).__new__(\n cls,\n op=op,\n key=check.str_param(key, "key"),\n dest_key=check.opt_str_param(dest_key, "dest_key"),\n obj=obj,\n 
serialization_strategy_name=check.opt_str_param(\n serialization_strategy_name, "serialization_strategy_name"\n ),\n object_store_name=check.opt_str_param(object_store_name, "object_store_name"),\n value_name=check.opt_str_param(value_name, "value_name"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n @classmethod\n def serializable(cls, inst, **kwargs):\n return cls(\n **dict(\n {\n "op": inst.op.value,\n "key": inst.key,\n "dest_key": inst.dest_key,\n "obj": None,\n "serialization_strategy_name": inst.serialization_strategy_name,\n "object_store_name": inst.object_store_name,\n "value_name": inst.value_name,\n "version": inst.version,\n },\n **kwargs,\n )\n )\n\n\nclass HookExecutionResult(\n NamedTuple("_HookExecutionResult", [("hook_name", str), ("is_skipped", bool)])\n):\n """This event is used internally to indicate the execution result of a hook, e.g. whether the\n user-defined hook function is skipped.\n\n Args:\n hook_name (str): The name of the hook.\n is_skipped (bool): ``False`` if the hook_fn is executed, ``True`` otheriwse.\n """\n\n def __new__(cls, hook_name: str, is_skipped: Optional[bool] = None):\n return super(HookExecutionResult, cls).__new__(\n cls,\n hook_name=check.str_param(hook_name, "hook_name"),\n is_skipped=cast(bool, check.opt_bool_param(is_skipped, "is_skipped", default=False)),\n )\n\n\nUserEvent = Union[AssetMaterialization, AssetObservation, ExpectationResult]\n
", "current_page_name": "_modules/dagster/_core/definitions/events", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.events"}, "executor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.executor_definition

\nfrom enum import Enum as PyEnum\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, Sequence, Union, overload\n\nfrom typing_extensions import Self, TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import Int\nfrom dagster._config import Field, Noneable, Selector, UserConfigSchema\nfrom dagster._core.definitions.configurable import (\n    ConfiguredDefinitionConfigSchema,\n    NamedConfigurableDefinition,\n)\nfrom dagster._core.definitions.job_base import IJob\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.executor.base import Executor\n    from dagster._core.executor.in_process import InProcessExecutor\n    from dagster._core.executor.init import InitExecutorContext\n    from dagster._core.executor.multiprocess import MultiprocessExecutor\n    from dagster._core.instance import DagsterInstance\n\n\nclass ExecutorRequirement(PyEnum):\n    """An ExecutorDefinition can include a list of requirements that the system uses to\n    check whether the executor will be able to work for a particular job execution.\n    """\n\n    # The passed in IJob must be reconstructable across process boundaries\n    RECONSTRUCTABLE_PIPELINE = (  # This needs to still exist for folks who may have written their own executor\n        "RECONSTRUCTABLE_PIPELINE"\n    )\n    RECONSTRUCTABLE_JOB = "RECONSTRUCTABLE_PIPELINE"\n\n    # The DagsterInstance must be loadable in a different process\n    NON_EPHEMERAL_INSTANCE = "NON_EPHEMERAL_INSTANCE"\n\n    # Any op 
outputs on the job must be persisted\n    PERSISTENT_OUTPUTS = "PERSISTENT_OUTPUTS"\n\n\ndef multiple_process_executor_requirements() -> Sequence[ExecutorRequirement]:\n    return [\n        ExecutorRequirement.RECONSTRUCTABLE_JOB,\n        ExecutorRequirement.NON_EPHEMERAL_INSTANCE,\n        ExecutorRequirement.PERSISTENT_OUTPUTS,\n    ]\n\n\nExecutorConfig = Mapping[str, object]\nExecutorCreationFunction: TypeAlias = Callable[["InitExecutorContext"], "Executor"]\nExecutorRequirementsFunction: TypeAlias = Callable[[ExecutorConfig], Sequence[ExecutorRequirement]]\n\n\n
[docs]class ExecutorDefinition(NamedConfigurableDefinition):\n """An executor is responsible for executing the steps of a job.\n\n Args:\n name (str): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available in `init_context.executor_config`. If not set, Dagster will accept any config\n provided.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular job execution.\n executor_creation_fn(Optional[Callable]): Should accept an :py:class:`InitExecutorContext`\n and return an instance of :py:class:`Executor`\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n executor.\n description (Optional[str]): A description of the executor.\n """\n\n def __init__(\n self,\n name: str,\n config_schema: Optional[UserConfigSchema] = None,\n requirements: Union[\n ExecutorRequirementsFunction, Optional[Sequence[ExecutorRequirement]]\n ] = None,\n executor_creation_fn: Optional[ExecutorCreationFunction] = None,\n description: Optional[str] = None,\n ):\n self._name = check.str_param(name, "name")\n self._requirements_fn: ExecutorRequirementsFunction\n if callable(requirements):\n self._requirements_fn = requirements\n else:\n requirements_lst = check.opt_list_param(\n requirements, "requirements", of_type=ExecutorRequirement\n )\n self._requirements_fn = lambda _: requirements_lst\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._executor_creation_fn = check.opt_callable_param(\n executor_creation_fn, "executor_creation_fn"\n )\n self._description = check.opt_str_param(description, "description")\n\n @public\n @property\n def name(self) -> str:\n """Name of the executor."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Description of executor, if provided."""\n return self._description\n\n @property\n def 
config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n def get_requirements(\n self, executor_config: Mapping[str, object]\n ) -> Sequence[ExecutorRequirement]:\n return self._requirements_fn(executor_config)\n\n @public\n @property\n def executor_creation_fn(self) -> Optional[ExecutorCreationFunction]:\n """Callable that takes an :py:class:`InitExecutorContext` and returns an instance of\n :py:class:`Executor`.\n """\n return self._executor_creation_fn\n\n def copy_for_configured(self, name, description, config_schema) -> "ExecutorDefinition":\n return ExecutorDefinition(\n name=name,\n config_schema=config_schema, # type: ignore\n executor_creation_fn=self.executor_creation_fn,\n description=description or self.description,\n requirements=self._requirements_fn,\n )\n\n @staticmethod\n def hardcoded_executor(executor: "Executor"):\n return ExecutorDefinition(\n # Executor name was only relevant in the pipeline/solid/mode world, so we\n # can put a dummy value\n name="__executor__",\n executor_creation_fn=lambda _init_context: executor,\n )\n\n # Backcompat: Overrides configured method to provide name as a keyword argument.\n # If no name is provided, the name is pulled off of this ExecutorDefinition.\n
[docs] @public\n def configured(\n self,\n config_or_config_fn: Any,\n name: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n description: Optional[str] = None,\n ) -> Self:\n """Wraps this object in an object of the same type that provides configuration to the inner\n object.\n\n Using ``configured`` may result in config values being displayed in\n the Dagster UI, so it is not recommended to use this API with sensitive values,\n such as secrets.\n\n Args:\n config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n that fully satisfies this object's config schema or (2) A function that accepts run\n configuration and returns run configuration that fully satisfies this object's\n config schema. In the latter case, config_schema must be specified. When\n passing a function, it's easiest to use :py:func:`configured`.\n name (Optional[str]): Name of the new definition. If not provided, the emitted\n definition will inherit the name of the `ExecutorDefinition` upon which this\n function is called.\n config_schema (Optional[ConfigSchema]): If config_or_config_fn is a function, the config\n schema that its input must satisfy. If not set, Dagster will accept any config\n provided.\n description (Optional[str]): Description of the new definition. If not specified,\n inherits the description of the definition being configured.\n\n Returns (ConfigurableDefinition): A configured version of this object.\n """\n name = check.opt_str_param(name, "name")\n\n new_config_schema = ConfiguredDefinitionConfigSchema(\n self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n )\n\n return self.copy_for_configured(name or self.name, description, new_config_schema)
\n\n\n@overload\ndef executor(name: ExecutorCreationFunction) -> ExecutorDefinition: ...\n\n\n@overload\ndef executor(\n name: Optional[str] = ...,\n config_schema: Optional[UserConfigSchema] = ...,\n requirements: Optional[\n Union[ExecutorRequirementsFunction, Sequence[ExecutorRequirement]]\n ] = ...,\n) -> "_ExecutorDecoratorCallable": ...\n\n\n
[docs]def executor(\n name: Union[ExecutorCreationFunction, Optional[str]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n requirements: Optional[\n Union[ExecutorRequirementsFunction, Sequence[ExecutorRequirement]]\n ] = None,\n) -> Union[ExecutorDefinition, "_ExecutorDecoratorCallable"]:\n """Define an executor.\n\n The decorated function should accept an :py:class:`InitExecutorContext` and return an instance\n of :py:class:`Executor`.\n\n Args:\n name (Optional[str]): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.executor_config`. If not set, Dagster will accept any config provided for.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular job execution.\n """\n if callable(name):\n check.invariant(config_schema is None)\n check.invariant(requirements is None)\n return _ExecutorDecoratorCallable()(name)\n\n return _ExecutorDecoratorCallable(\n name=name, config_schema=config_schema, requirements=requirements\n )
\n\n\nclass _ExecutorDecoratorCallable:\n def __init__(self, name=None, config_schema=None, requirements=None):\n self.name = check.opt_str_param(name, "name")\n self.config_schema = config_schema # type check in definition\n self.requirements = requirements\n\n def __call__(self, fn: ExecutorCreationFunction) -> ExecutorDefinition:\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n executor_def = ExecutorDefinition(\n name=self.name,\n config_schema=self.config_schema,\n executor_creation_fn=fn,\n requirements=self.requirements,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(executor_def, wrapped=fn) # type: ignore\n\n return executor_def\n\n\ndef _core_in_process_executor_creation(config: ExecutorConfig) -> "InProcessExecutor":\n from dagster._core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n # shouldn't need to .get() here - issue with defaults in config setup\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore # (possible none)\n marker_to_close=config.get("marker_to_close"), # type: ignore # (should be str)\n )\n\n\nIN_PROC_CONFIG = Field(\n {\n "retries": get_retries_config(),\n "marker_to_close": Field(\n str,\n is_required=False,\n description="[DEPRECATED]",\n ),\n },\n description="Execute all steps in a single process.",\n)\n\n\n
[docs]@executor(\n name="in_process",\n config_schema=IN_PROC_CONFIG,\n)\ndef in_process_executor(init_context):\n """The in-process executor executes all steps in a single process.\n\n To select it, include the following top-level fragment in config:\n\n .. code-block:: yaml\n\n execution:\n in_process:\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_in_process_executor_creation(init_context.executor_config)
\n\n\n@executor(name="execute_in_process_executor")\ndef execute_in_process_executor(_) -> "InProcessExecutor":\n """Executor used by execute_in_process.\n\n Use of this executor triggers special behavior in the config system that ignores all incoming\n executor config. This is because someone might set executor config on a job, and when we foist\n this executor onto the job for `execute_in_process`, that config becomes nonsensical.\n """\n from dagster._core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n retries=RetryMode.ENABLED,\n marker_to_close=None,\n )\n\n\ndef _core_multiprocess_executor_creation(config: ExecutorConfig) -> "MultiprocessExecutor":\n from dagster._core.executor.multiprocess import MultiprocessExecutor\n\n # unpack optional selector\n start_method = None\n start_cfg: Dict[str, object] = {}\n start_selector = check.opt_dict_elem(config, "start_method")\n if start_selector:\n start_method, start_cfg = next(iter(start_selector.items()))\n\n return MultiprocessExecutor(\n max_concurrent=check.opt_int_elem(config, "max_concurrent"),\n tag_concurrency_limits=check.opt_list_elem(config, "tag_concurrency_limits"),\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore\n start_method=start_method,\n explicit_forkserver_preload=check.opt_list_elem(start_cfg, "preload_modules", of_type=str),\n )\n\n\nMULTI_PROC_CONFIG = Field(\n {\n "max_concurrent": Field(\n Noneable(Int),\n default_value=None,\n description=(\n "The number of processes that may run concurrently. 
"\n "By default, this is set to be the return value of `multiprocessing.cpu_count()`."\n ),\n ),\n "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n "start_method": Field(\n Selector(\n fields={\n "spawn": Field(\n {},\n description=(\n "Configure the multiprocess executor to start subprocesses "\n "using `spawn`."\n ),\n ),\n "forkserver": Field(\n {\n "preload_modules": Field(\n [str],\n is_required=False,\n description=(\n "Explicitly specify the modules to preload in the forkserver."\n " Otherwise, there are two cases for default values if modules"\n " are not specified. If the Dagster job was loaded from a"\n " module, the same module will be preloaded. If not, the"\n " `dagster` module is preloaded."\n ),\n ),\n },\n description=(\n "Configure the multiprocess executor to start subprocesses "\n "using `forkserver`."\n ),\n ),\n # fork currently unsupported due to threads usage\n }\n ),\n is_required=False,\n description=(\n "Select how subprocesses are created. By default, `spawn` is selected. See "\n "https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods."\n ),\n ),\n "retries": get_retries_config(),\n },\n description="Execute each step in an individual process.",\n)\n\n\n
[docs]@executor(\n name="multiprocess",\n config_schema=MULTI_PROC_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef multiprocess_executor(init_context):\n """The multiprocess executor executes each step in an individual process.\n\n Any job that does not specify custom executors will use the multiprocess_executor by default.\n To configure the multiprocess executor, include a fragment such as the following in your run\n config:\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be None or 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_multiprocess_executor_creation(init_context.executor_config)
\n\n\ndef check_cross_process_constraints(init_context: "InitExecutorContext") -> None:\n from dagster._core.executor.init import InitExecutorContext\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n requirements_lst = init_context.executor_def.get_requirements(init_context.executor_config)\n\n if ExecutorRequirement.RECONSTRUCTABLE_JOB in requirements_lst:\n _check_intra_process_job(init_context.job)\n\n if ExecutorRequirement.NON_EPHEMERAL_INSTANCE in requirements_lst:\n _check_non_ephemeral_instance(init_context.instance)\n\n\ndef _check_intra_process_job(job: IJob) -> None:\n if not isinstance(job, ReconstructableJob):\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with the job"\n f' "{job.get_definition().name}" that is not reconstructable. Job must be loaded in a'\n " way that allows dagster to reconstruct them in a new process. This means: \\n *"\n " using the file, module, or workspace.yaml arguments of"\n " dagster-webserver/dagster-graphql/dagster\\n * loading the job through the"\n " reconstructable() function\\n"\n )\n\n\ndef _check_non_ephemeral_instance(instance: "DagsterInstance") -> None:\n if instance.is_ephemeral:\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with an ephemeral"\n " DagsterInstance. A non-ephemeral instance is needed to coordinate execution between"\n " multiple processes. You can configure your default instance via $DAGSTER_HOME or"\n " ensure a valid one is passed when invoking the python APIs. 
You can learn more about"\n " setting up a persistent DagsterInstance from the DagsterInstance docs here:"\n " https://docs.dagster.io/deployment/dagster-instance#default-local-behavior"\n )\n\n\ndef _get_default_executor_requirements(\n executor_config: ExecutorConfig,\n) -> Sequence[ExecutorRequirement]:\n return multiple_process_executor_requirements() if "multiprocess" in executor_config else []\n\n\n
[docs]@executor(\n name="multi_or_in_process_executor",\n config_schema=Field(\n Selector(\n {"multiprocess": MULTI_PROC_CONFIG, "in_process": IN_PROC_CONFIG},\n ),\n default_value={"multiprocess": {}},\n ),\n requirements=_get_default_executor_requirements,\n)\ndef multi_or_in_process_executor(init_context: "InitExecutorContext") -> "Executor":\n """The default executor for a job.\n\n This is the executor available by default on a :py:class:`JobDefinition`\n that does not provide custom executors. This executor has a multiprocessing-enabled mode, and a\n single-process mode. By default, multiprocessing mode is enabled. Switching between multiprocess\n mode and in-process mode can be achieved via config.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n\n\n execution:\n config:\n in_process:\n\n When using the multiprocess mode, ``max_concurrent`` and ``retries`` can also be configured.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n retries:\n enabled:\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n When using the in_process mode, then only retries can be configured.\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n if "multiprocess" in init_context.executor_config:\n return _core_multiprocess_executor_creation(\n check.dict_elem(init_context.executor_config, "multiprocess")\n )\n else:\n return _core_in_process_executor_creation(\n check.dict_elem(init_context.executor_config, "in_process")\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/executor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.executor_definition"}, "freshness_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.freshness_policy

\nimport datetime\nfrom typing import AbstractSet, NamedTuple, Optional\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils.schedules import (\n    is_valid_cron_schedule,\n    reverse_cron_string_iterator,\n)\n\nfrom .events import AssetKey\n\n\nclass FreshnessConstraint(NamedTuple):\n    asset_keys: AbstractSet[AssetKey]\n    required_data_time: datetime.datetime\n    required_by_time: datetime.datetime\n\n\nclass FreshnessMinutes(NamedTuple):\n    overdue_minutes: float\n    lag_minutes: float\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass FreshnessPolicy(\n NamedTuple(\n "_FreshnessPolicy",\n [\n ("maximum_lag_minutes", float),\n ("cron_schedule", Optional[str]),\n ("cron_schedule_timezone", Optional[str]),\n ],\n )\n):\n """A FreshnessPolicy specifies how up-to-date you want a given asset to be.\n\n Attaching a FreshnessPolicy to an asset definition encodes an expectation on the upstream data\n that you expect to be incorporated into the current state of that asset at certain points in time.\n How this is calculated differs depending on if the asset is unpartitioned or time-partitioned\n (other partitioning schemes are not supported).\n\n For time-partitioned assets, the current data time for the asset is simple to calculate. The\n upstream data that is incorporated into the asset is exactly the set of materialized partitions\n for that asset. Thus, the current data time for the asset is simply the time up to which all\n partitions have been materialized.\n\n For unpartitioned assets, the current data time is based on the upstream materialization records\n that were read to generate the current state of the asset. More specifically,\n imagine you have two assets, where A depends on B. If `B` has a FreshnessPolicy defined, this\n means that at time T, the most recent materialization of `B` should have come after a\n materialization of `A` which was no more than `maximum_lag_minutes` ago. This calculation is\n recursive: any given asset is expected to incorporate up-to-date data from all of its upstream\n assets.\n\n It is assumed that all asset definitions with no upstream asset definitions consume from some\n always-updating source. That is, if you materialize that asset at time T, it will incorporate\n all data up to time T.\n\n If `cron_schedule` is not defined, the given asset will be expected to incorporate upstream\n data from no more than `maximum_lag_minutes` ago at all points in time. 
For example, "The events\n table should always have data from at most 1 hour ago".\n\n If `cron_schedule` is defined, the given asset will be expected to incorporate upstream data\n from no more than `maximum_lag_minutes` ago at each cron schedule tick. For example, "By 9AM,\n the signups table should contain all of yesterday's data".\n\n The freshness status of assets with policies defined will be visible in the UI. If you are using\n an asset reconciliation sensor, this sensor will kick off runs to help keep your assets up to\n date with respect to their FreshnessPolicy.\n\n Args:\n maximum_lag_minutes (float): An upper bound for how old the data contained within this\n asset may be.\n cron_schedule (Optional[str]): A cron schedule string (e.g. ``"0 1 * * *"``) specifying a\n series of times by which the `maximum_lag_minutes` constraint must be satisfied. If\n no cron schedule is provided, then this constraint must be satisfied at all times.\n cron_schedule_timezone (Optional[str]): Timezone in which the cron schedule should be evaluated.\n If not specified, defaults to UTC. Supported strings for timezones are the ones provided\n by the `IANA time zone database <https://www.iana.org/time-zones>` - e.g.\n "America/Los_Angeles".\n\n .. 
code-block:: python\n\n # At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n @asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\n def fresh_asset():\n ...\n\n # At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n @asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\n def cron_up_to_date_asset():\n ...\n\n """\n\n def __new__(\n cls,\n *,\n maximum_lag_minutes: float,\n cron_schedule: Optional[str] = None,\n cron_schedule_timezone: Optional[str] = None,\n ):\n if cron_schedule is not None:\n if not is_valid_cron_schedule(cron_schedule):\n raise DagsterInvalidDefinitionError(f"Invalid cron schedule '{cron_schedule}'.")\n check.param_invariant(\n is_valid_cron_schedule(cron_schedule),\n "cron_schedule",\n f"Invalid cron schedule '{cron_schedule}'.",\n )\n if cron_schedule_timezone is not None:\n check.param_invariant(\n cron_schedule is not None,\n "cron_schedule_timezone",\n "Cannot specify cron_schedule_timezone without a cron_schedule.",\n )\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(cron_schedule_timezone) # type: ignore\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n "Invalid cron schedule timezone '{cron_schedule_timezone}'. "\n ) from e\n return super(FreshnessPolicy, cls).__new__(\n cls,\n maximum_lag_minutes=float(\n check.numeric_param(maximum_lag_minutes, "maximum_lag_minutes")\n ),\n cron_schedule=check.opt_str_param(cron_schedule, "cron_schedule"),\n cron_schedule_timezone=check.opt_str_param(\n cron_schedule_timezone, "cron_schedule_timezone"\n ),\n )\n\n @classmethod\n def _create(cls, *args):\n """Pickle requires a method with positional arguments to construct\n instances of a class. 
Since the constructor for this class has\n keyword arguments only, we define this method to be used by pickle.\n """\n return cls(maximum_lag_minutes=args[0], cron_schedule=args[1])\n\n def __reduce__(self):\n return (self._create, (self.maximum_lag_minutes, self.cron_schedule))\n\n @property\n def maximum_lag_delta(self) -> datetime.timedelta:\n return datetime.timedelta(minutes=self.maximum_lag_minutes)\n\n def get_evaluation_tick(\n self,\n evaluation_time: datetime.datetime,\n ) -> Optional[datetime.datetime]:\n if self.cron_schedule:\n # most recent cron schedule tick\n schedule_ticks = reverse_cron_string_iterator(\n end_timestamp=evaluation_time.timestamp(),\n cron_string=self.cron_schedule,\n execution_timezone=self.cron_schedule_timezone,\n )\n return next(schedule_ticks)\n else:\n return evaluation_time\n\n def minutes_overdue(\n self,\n data_time: Optional[datetime.datetime],\n evaluation_time: datetime.datetime,\n ) -> Optional[FreshnessMinutes]:\n """Returns a number of minutes past the specified freshness policy that this asset currently\n is. If the asset is missing upstream data, or is not materialized at all, then it is unknown\n how overdue it is, and this will return None.\n\n Args:\n data_time (Optional[datetime]): The timestamp of the data that was used to create the\n current version of this asset.\n evaluation_time (datetime): The time at which we're evaluating the overdueness of this\n asset. Generally, this is the current time.\n """\n if data_time is None:\n return None\n evaluation_tick = self.get_evaluation_tick(evaluation_time)\n if evaluation_tick is None:\n return None\n required_time = evaluation_tick - self.maximum_lag_delta\n\n return FreshnessMinutes(\n lag_minutes=max(0.0, (evaluation_tick - data_time).total_seconds() / 60),\n overdue_minutes=max(0.0, (required_time - data_time).total_seconds() / 60),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/freshness_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.freshness_policy"}, "freshness_policy_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.freshness_policy_sensor_definition

\nfrom typing import Callable, Dict, Mapping, NamedTuple, Optional, Set, cast\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_selection import AssetSelection\nfrom dagster._core.definitions.data_time import CachingDataTimeResolver\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    FreshnessPolicySensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._serdes import (\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._seven import JSONDecodeError\n\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorType,\n    SkipReason,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\n\n\n@whitelist_for_serdes\nclass FreshnessPolicySensorCursor(\n    NamedTuple(\n        "_FreshnessPolicySensorCursor",\n        [("minutes_late_by_key_str", Mapping[str, Optional[float]])],\n    )\n):\n    def __new__(cls, minutes_late_by_key_str: Mapping[str, Optional[float]]):\n        return super(FreshnessPolicySensorCursor, cls).__new__(\n            cls,\n            minutes_late_by_key_str=check.mapping_param(\n                minutes_late_by_key_str, "minutes_late_by_key_str", key_type=str\n            ),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            
deserialize_value(json_str, FreshnessPolicySensorCursor)\n            return True\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    @staticmethod\n    def from_dict(\n        minutes_late_by_key: Mapping[AssetKey, Optional[float]]\n    ) -> "FreshnessPolicySensorCursor":\n        return FreshnessPolicySensorCursor(\n            minutes_late_by_key_str={k.to_user_string(): v for k, v in minutes_late_by_key.items()}\n        )\n\n    @property\n    def minutes_late_by_key(self) -> Mapping[AssetKey, Optional[float]]:\n        return {AssetKey.from_user_string(k): v for k, v in self.minutes_late_by_key_str.items()}\n\n    def to_json(self) -> str:\n        return serialize_value(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> "FreshnessPolicySensorCursor":\n        return deserialize_value(json_str, FreshnessPolicySensorCursor)\n\n\n
[docs]class FreshnessPolicySensorContext(\n NamedTuple(\n "_FreshnessPolicySensorContext",\n [\n ("sensor_name", PublicAttr[str]),\n ("asset_key", PublicAttr[AssetKey]),\n ("freshness_policy", PublicAttr[FreshnessPolicy]),\n ("minutes_overdue", PublicAttr[Optional[float]]),\n ("previous_minutes_overdue", PublicAttr[Optional[float]]),\n ("instance", PublicAttr[DagsterInstance]),\n ("resources", Resources),\n ],\n )\n):\n """The ``context`` object available to a decorated function of ``freshness_policy_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n asset_key (AssetKey): the key of the asset being monitored\n freshness_policy (FreshnessPolicy): the freshness policy of the asset being monitored\n minutes_overdue (Optional[float])\n previous_minutes_overdue (Optional[float]): the minutes_overdue value for this asset on the\n previous sensor tick.\n instance (DagsterInstance): the current instance.\n """\n\n def __new__(\n cls,\n sensor_name: str,\n asset_key: AssetKey,\n freshness_policy: FreshnessPolicy,\n minutes_overdue: Optional[float],\n previous_minutes_overdue: Optional[float],\n instance: DagsterInstance,\n resources: Optional[Resources] = None,\n ):\n minutes_overdue = check.opt_numeric_param(minutes_overdue, "minutes_overdue")\n previous_minutes_overdue = check.opt_numeric_param(\n previous_minutes_overdue, "previous_minutes_overdue"\n )\n return super(FreshnessPolicySensorContext, cls).__new__(\n cls,\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n freshness_policy=check.inst_param(freshness_policy, "FreshnessPolicy", FreshnessPolicy),\n minutes_overdue=float(minutes_overdue) if minutes_overdue is not None else None,\n previous_minutes_overdue=(\n float(previous_minutes_overdue) if previous_minutes_overdue is not None else None\n ),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n resources=resources or 
ScopedResourcesBuilder.build_empty(),\n )
\n\n\n
[docs]@experimental\ndef build_freshness_policy_sensor_context(\n sensor_name: str,\n asset_key: AssetKey,\n freshness_policy: FreshnessPolicy,\n minutes_overdue: Optional[float],\n previous_minutes_overdue: Optional[float] = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Resources] = None,\n) -> FreshnessPolicySensorContext:\n """Builds freshness policy sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@freshness_policy_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n asset_key (AssetKey): The AssetKey for the monitored asset\n freshness_policy (FreshnessPolicy): The FreshnessPolicy for the monitored asset\n minutes_overdue (Optional[float]): How overdue the monitored asset currently is\n previous_minutes_overdue (Optional[float]): How overdue the monitored asset was on the\n previous tick.\n instance (DagsterInstance): The dagster instance configured for the context.\n\n Examples:\n .. code-block:: python\n\n context = build_freshness_policy_sensor_context(\n sensor_name="freshness_policy_sensor_to_invoke",\n asset_key=AssetKey("some_asset"),\n freshness_policy=FreshnessPolicy(maximum_lag_minutes=30)<\n minutes_overdue=10.0,\n )\n freshness_policy_sensor_to_invoke(context)\n """\n return FreshnessPolicySensorContext(\n sensor_name=sensor_name,\n asset_key=asset_key,\n freshness_policy=freshness_policy,\n minutes_overdue=minutes_overdue,\n previous_minutes_overdue=previous_minutes_overdue,\n instance=instance or DagsterInstance.ephemeral(),\n resources=resources,\n )
\n\n\n
[docs]class FreshnessPolicySensorDefinition(SensorDefinition):\n """Define a sensor that reacts to the status of a given set of asset freshness policies,\n where the decorated function will be evaluated on every sensor tick.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n asset_selection (AssetSelection): The asset selection monitored by the sensor.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_selection: AssetSelection,\n freshness_policy_sensor_fn: Callable[..., None],\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n check.str_param(name, "name")\n check.inst_param(asset_selection, "asset_selection", AssetSelection)\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n self._freshness_policy_sensor_fn = check.callable_param(\n freshness_policy_sensor_fn, "freshness_policy_sensor_fn"\n )\n\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(freshness_policy_sensor_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def 
_wrapped_fn(context: SensorEvaluationContext):\n from dagster._utils.caching_instance_queryer import (\n CachingInstanceQueryer, # expensive import\n )\n\n if context.repository_def is None:\n raise DagsterInvalidInvocationError(\n "The `repository_def` property on the `SensorEvaluationContext` passed into a "\n "`FreshnessPolicySensorDefinition` must not be None."\n )\n\n if context.cursor is None or not FreshnessPolicySensorCursor.is_valid(context.cursor):\n new_cursor = FreshnessPolicySensorCursor({})\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initializing {name}.")\n return\n\n evaluation_time = pendulum.now("UTC")\n asset_graph = context.repository_def.asset_graph\n instance_queryer = CachingInstanceQueryer(\n context.instance, asset_graph, evaluation_time\n )\n data_time_resolver = CachingDataTimeResolver(instance_queryer=instance_queryer)\n monitored_keys = asset_selection.resolve(asset_graph)\n\n # get the previous status from the cursor\n previous_minutes_late_by_key = FreshnessPolicySensorCursor.from_json(\n context.cursor\n ).minutes_late_by_key\n\n minutes_late_by_key: Dict[AssetKey, Optional[float]] = {}\n for asset_key in monitored_keys:\n freshness_policy = asset_graph.freshness_policies_by_key.get(asset_key)\n if freshness_policy is None:\n continue\n\n # get the current minutes_overdue value for this asset\n result = data_time_resolver.get_minutes_overdue(\n evaluation_time=evaluation_time,\n asset_key=asset_key,\n )\n minutes_late_by_key[asset_key] = result.overdue_minutes if result else None\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n context_param_name = get_context_param_name(freshness_policy_sensor_fn)\n freshness_context = FreshnessPolicySensorContext(\n sensor_name=name,\n asset_key=asset_key,\n freshness_policy=freshness_policy,\n minutes_overdue=minutes_late_by_key[asset_key],\n 
previous_minutes_overdue=previous_minutes_late_by_key.get(asset_key),\n instance=context.instance,\n resources=context.resources,\n )\n\n with user_code_error_boundary(\n FreshnessPolicySensorExecutionError,\n lambda: f'Error occurred during the execution of sensor "{name}".',\n ):\n context_param = (\n {context_param_name: freshness_context} if context_param_name else {}\n )\n result = freshness_policy_sensor_fn(\n **context_param,\n **resource_args_populated,\n )\n if result is not None:\n raise DagsterInvalidDefinitionError(\n "Functions decorated by `@freshness_policy_sensor` may not return or yield"\n " a value."\n )\n\n context.update_cursor(\n FreshnessPolicySensorCursor.from_dict(minutes_late_by_key).to_json()\n )\n\n super(FreshnessPolicySensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> None:\n context_param_name = get_context_param_name(self._freshness_policy_sensor_fn)\n\n sensor_context = get_sensor_context_from_args_or_kwargs(\n self._freshness_policy_sensor_fn,\n args,\n kwargs,\n context_type=FreshnessPolicySensorContext,\n )\n context_param = (\n {context_param_name: sensor_context} if context_param_name and sensor_context else {}\n )\n\n resources = validate_and_get_resource_dict(\n sensor_context.resources if sensor_context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n\n return self._freshness_policy_sensor_fn(**context_param, **resources)\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.FRESHNESS_POLICY
\n\n\n
[docs]@experimental\ndef freshness_policy_sensor(\n asset_selection: AssetSelection,\n *,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[[Callable[..., None]], FreshnessPolicySensorDefinition,]:\n """Define a sensor that reacts to the status of a given set of asset freshness policies, where the\n decorated function will be evaluated on every tick for each asset in the selection that has a\n FreshnessPolicy defined.\n\n Note: returning or yielding a value from the annotated function will result in an error.\n\n Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n\n Args:\n asset_selection (AssetSelection): The asset selection monitored by the sensor.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def inner(fn: Callable[..., None]) -> FreshnessPolicySensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n return FreshnessPolicySensorDefinition(\n name=sensor_name,\n freshness_policy_sensor_fn=fn,\n asset_selection=asset_selection,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n )\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/freshness_policy_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.freshness_policy_sensor_definition"}, "graph_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.graph_definition

\nfrom collections import OrderedDict, defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom toposort import CircularDependencyError, toposort_flatten\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.config import ConfigMapping\nfrom dagster._core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.selector.subset_selector import AssetSelectionData\nfrom dagster._core.types.dagster_type import (\n    DagsterType,\n    DagsterTypeKind,\n    construct_dagster_type_dictionary,\n)\n\nfrom .dependency import (\n    DependencyMapping,\n    DependencyStructure,\n    GraphNode,\n    Node,\n    NodeHandle,\n    NodeInput,\n    NodeInputHandle,\n    NodeInvocation,\n)\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .logger_definition import LoggerDefinition\nfrom .metadata import RawMetadataValue\nfrom .node_container import create_execution_structure, normalize_dependency_dict\nfrom .node_definition import NodeDefinition\nfrom .output import OutputDefinition, OutputMapping\nfrom .resource_requirement import ResourceRequirement\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster._core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster._core.instance import DagsterInstance\n\n    from .asset_layer import AssetLayer\n    from .composition import PendingNodeInvocation\n    from .executor_definition import ExecutorDefinition\n    from .job_definition import 
JobDefinition\n    from .op_definition import OpDefinition\n    from .partition import PartitionedConfig, PartitionsDefinition\n    from .run_config import RunConfig\n    from .source_asset import SourceAsset\n\nT = TypeVar("T")\n\n\ndef _check_node_defs_arg(\n    graph_name: str, node_defs: Optional[Sequence[NodeDefinition]]\n) -> Sequence[NodeDefinition]:\n    node_defs = node_defs or []\n\n    _node_defs = check.opt_sequence_param(node_defs, "node_defs")\n    for node_def in _node_defs:\n        if isinstance(node_def, NodeDefinition):\n            continue\n        elif callable(node_def):\n            raise DagsterInvalidDefinitionError(\n                """You have passed a lambda or function {func} into {name} that is\n                not a node. You have likely forgetten to annotate this function with\n                the @op or @graph decorators.'\n                """.format(name=graph_name, func=node_def.__name__)\n            )\n        else:\n            raise DagsterInvalidDefinitionError(f"Invalid item in node list: {node_def!r}")\n\n    return node_defs\n\n\ndef create_adjacency_lists(\n    nodes: Sequence[Node],\n    dep_structure: DependencyStructure,\n) -> Tuple[Mapping[str, Set[str]], Mapping[str, Set[str]]]:\n    visit_dict = {s.name: False for s in nodes}\n    forward_edges: Dict[str, Set[str]] = {s.name: set() for s in nodes}\n    backward_edges: Dict[str, Set[str]] = {s.name: set() for s in nodes}\n\n    def visit(node_name: str) -> None:\n        if visit_dict[node_name]:\n            return\n\n        visit_dict[node_name] = True\n\n        for node_output in dep_structure.all_upstream_outputs_from_node(node_name):\n            forward_node = node_output.node.name\n            backward_node = node_name\n            if forward_node in forward_edges:\n                forward_edges[forward_node].add(backward_node)\n                backward_edges[backward_node].add(forward_node)\n                visit(forward_node)\n\n    for s in nodes:\n      
  visit(s.name)\n\n    return (forward_edges, backward_edges)\n\n\n
[docs]class GraphDefinition(NodeDefinition):\n """Defines a Dagster op graph.\n\n An op graph is made up of\n\n - Nodes, which can either be an op (the functional unit of computation), or another graph.\n - Dependencies, which determine how the values produced by nodes as outputs flow from\n one node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n (DAG) of compute.\n\n End users should prefer the :func:`@graph <graph>` decorator. GraphDefinition is generally\n intended to be used by framework authors or for programatically generated graphs.\n\n Args:\n name (str): The name of the graph. Must be unique within any :py:class:`GraphDefinition`\n or :py:class:`JobDefinition` containing the graph.\n description (Optional[str]): A human-readable description of the job.\n node_defs (Optional[Sequence[NodeDefinition]]): The set of ops / graphs used in this graph.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the graph. 
Keys of the top level dict are either the string names of ops in the\n graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[Sequence[InputMapping]]): Defines the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[Sequence[OutputMapping]]): Defines the outputs of the nested graph,\n and how they map from the outputs of its constituent ops.\n config (Optional[ConfigMapping]): Defines the config of the graph, and how its schema maps\n to the config of its constituent ops.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(num):\n return num + 1\n\n graph_def = GraphDefinition(\n name='basic',\n node_defs=[return_one, add_one],\n dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n )\n """\n\n _node_defs: Sequence[NodeDefinition]\n _dagster_type_dict: Mapping[str, DagsterType]\n _dependencies: DependencyMapping[NodeInvocation]\n _dependency_structure: DependencyStructure\n _node_dict: Mapping[str, Node]\n _input_mappings: Sequence[InputMapping]\n _output_mappings: Sequence[OutputMapping]\n _config_mapping: Optional[ConfigMapping]\n _nodes_in_topological_order: Sequence[Node]\n\n # (node name within the graph -> (input name -> SourceAsset to load that input from))\n # Does NOT include keys for:\n # - Inputs to the graph itself\n # - Inputs to nodes within sub-graphs of the graph\n _node_input_source_assets: Mapping[str, Mapping[str, "SourceAsset"]]\n\n def __init__(\n self,\n name: str,\n *,\n description: Optional[str] = None,\n node_defs: Optional[Sequence[NodeDefinition]] = None,\n dependencies: Optional[\n Union[DependencyMapping[str], DependencyMapping[NodeInvocation]]\n ] = None,\n input_mappings: Optional[Sequence[InputMapping]] = None,\n output_mappings: Optional[Sequence[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Mapping[str, str]] = None,\n node_input_source_assets: Optional[Mapping[str, Mapping[str, "SourceAsset"]]] = None,\n **kwargs: Any,\n ):\n self._node_defs = _check_node_defs_arg(name, node_defs)\n\n # `dependencies` will be converted to `dependency_structure` and `node_dict`, which may\n # alternatively be passed directly (useful when copying)\n self._dependencies = normalize_dependency_dict(dependencies)\n self._dependency_structure, self._node_dict = create_execution_structure(\n self._node_defs, self._dependencies, graph_definition=self\n )\n\n # Sequence[InputMapping]\n self._input_mappings = 
check.opt_sequence_param(input_mappings, "input_mappings")\n input_defs = _validate_in_mappings(\n self._input_mappings,\n self._node_dict,\n self._dependency_structure,\n name,\n class_name=type(self).__name__,\n )\n\n # Sequence[OutputMapping]\n self._output_mappings, output_defs = _validate_out_mappings(\n check.opt_sequence_param(output_mappings, "output_mappings"),\n self._node_dict,\n name,\n class_name=type(self).__name__,\n )\n\n self._config_mapping = check.opt_inst_param(config, "config", ConfigMapping)\n\n super(GraphDefinition, self).__init__(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n tags=tags,\n **kwargs,\n )\n\n # must happen after base class construction as properties are assumed to be there\n # eager computation to detect cycles\n self._nodes_in_topological_order = self._get_nodes_in_topological_order()\n self._dagster_type_dict = construct_dagster_type_dictionary([self])\n self._node_input_source_assets = check.opt_mapping_param(\n node_input_source_assets, "node_input_source_assets", key_type=str, value_type=dict\n )\n\n def _get_nodes_in_topological_order(self) -> Sequence[Node]:\n _forward_edges, backward_edges = create_adjacency_lists(\n self.nodes, self.dependency_structure\n )\n\n try:\n order = toposort_flatten(backward_edges)\n except CircularDependencyError as err:\n raise DagsterInvalidDefinitionError(str(err)) from err\n\n return [self.node_named(node_name) for node_name in order]\n\n def get_inputs_must_be_resolved_top_level(\n self, asset_layer: "AssetLayer", handle: Optional[NodeHandle] = None\n ) -> Sequence[InputDefinition]:\n unresolveable_input_defs: List[InputDefinition] = []\n for node in self.node_dict.values():\n cur_handle = NodeHandle(node.name, handle)\n for input_def in node.definition.get_inputs_must_be_resolved_top_level(\n asset_layer, cur_handle\n ):\n if self.dependency_structure.has_deps(NodeInput(node, input_def)):\n continue\n elif not 
node.container_maps_input(input_def.name):\n raise DagsterInvalidDefinitionError(\n f"Input '{input_def.name}' of {node.describe_node()} "\n "has no way of being resolved. Must provide a resolution to this "\n "input via another op/graph, or via a direct input value mapped from the "\n "top-level graph. To "\n "learn more, see the docs for unconnected inputs: "\n "https://docs.dagster.io/concepts/io-management/unconnected-inputs#unconnected-inputs."\n )\n else:\n mapped_input = node.container_mapped_input(input_def.name)\n unresolveable_input_defs.append(mapped_input.get_definition())\n return unresolveable_input_defs\n\n @property\n def node_type_str(self) -> str:\n return "graph"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @property\n def nodes(self) -> Sequence[Node]:\n return list(set(self._node_dict.values()))\n\n @property\n def node_dict(self) -> Mapping[str, Node]:\n return self._node_dict\n\n @property\n def node_defs(self) -> Sequence[NodeDefinition]:\n return self._node_defs\n\n @property\n def nodes_in_topological_order(self) -> Sequence[Node]:\n return self._nodes_in_topological_order\n\n @property\n def node_input_source_assets(self) -> Mapping[str, Mapping[str, "SourceAsset"]]:\n return self._node_input_source_assets\n\n def has_node_named(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._node_dict\n\n def node_named(self, name: str) -> Node:\n check.str_param(name, "name")\n if name not in self._node_dict:\n raise DagsterInvariantViolationError(f"{self._name} has no op named {name}.")\n\n return self._node_dict[name]\n\n def get_node(self, handle: NodeHandle) -> Node:\n check.inst_param(handle, "handle", NodeHandle)\n current = handle\n lineage: List[str] = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n name = lineage.pop()\n node = self.node_named(name)\n while lineage:\n name = lineage.pop()\n # We know that this is a current node is a graph while 
ascending lineage\n definition = cast(GraphDefinition, node.definition)\n node = definition.node_named(name)\n\n return node\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_node_defs()\n\n def iterate_op_defs(self) -> Iterator["OpDefinition"]:\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_op_defs()\n\n def iterate_node_handles(\n self, parent_node_handle: Optional[NodeHandle] = None\n ) -> Iterator[NodeHandle]:\n for node in self.node_dict.values():\n cur_node_handle = NodeHandle(node.name, parent_node_handle)\n if isinstance(node, GraphNode):\n yield from node.definition.iterate_node_handles(cur_node_handle)\n yield cur_node_handle\n\n @public\n @property\n def input_mappings(self) -> Sequence[InputMapping]:\n """Input mappings for the graph.\n\n An input mapping is a mapping from an input of the graph to an input of a child node.\n """\n return self._input_mappings\n\n @public\n @property\n def output_mappings(self) -> Sequence[OutputMapping]:\n """Output mappings for the graph.\n\n An output mapping is a mapping from an output of the graph to an output of a child node.\n """\n return self._output_mappings\n\n @public\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n """The config mapping for the graph, if present.\n\n By specifying a config mapping function, you can override the configuration for the child nodes contained within a graph.\n """\n return self._config_mapping\n\n @property\n def has_config_mapping(self) -> bool:\n return self._config_mapping is not None\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._dagster_type_dict.values()\n\n def has_dagster_type(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._dagster_type_dict\n\n def dagster_type_named(self, name: str) -> DagsterType:\n check.str_param(name, "name")\n return 
self._dagster_type_dict[name]\n\n def get_input_mapping(self, input_name: str) -> InputMapping:\n check.str_param(input_name, "input_name")\n for mapping in self._input_mappings:\n if mapping.graph_input_name == input_name:\n return mapping\n check.failed(f"Could not find input mapping {input_name}")\n\n def input_mapping_for_pointer(\n self, pointer: Union[InputPointer, FanInInputPointer]\n ) -> Optional[InputMapping]:\n check.inst_param(pointer, "pointer", (InputPointer, FanInInputPointer))\n\n for mapping in self._input_mappings:\n if mapping.maps_to == pointer:\n return mapping\n return None\n\n def get_output_mapping(self, output_name: str) -> OutputMapping:\n check.str_param(output_name, "output_name")\n for mapping in self._output_mappings:\n if mapping.graph_output_name == output_name:\n return mapping\n check.failed(f"Could not find output mapping {output_name}")\n\n T_Handle = TypeVar("T_Handle", bound=Optional[NodeHandle])\n\n def resolve_output_to_origin(\n self, output_name: str, handle: Optional[NodeHandle]\n ) -> Tuple[OutputDefinition, Optional[NodeHandle]]:\n check.str_param(output_name, "output_name")\n check.opt_inst_param(handle, "handle", NodeHandle)\n\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n mapped_node = self.node_named(mapping.maps_from.node_name)\n return mapped_node.definition.resolve_output_to_origin(\n mapping.maps_from.output_name,\n NodeHandle(mapped_node.name, handle),\n )\n\n def resolve_output_to_origin_op_def(self, output_name: str) -> "OpDefinition":\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n return self.node_named(\n mapping.maps_from.node_name\n ).definition.resolve_output_to_origin_op_def(output_name)\n\n def default_value_for_input(self, input_name: str) -> object:\n check.str_param(input_name, "input_name")\n\n # base case\n if 
self.input_def_named(input_name).has_default_value:\n return self.input_def_named(input_name).default_value\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_node = self.node_named(mapping.maps_to.node_name)\n\n return mapped_node.definition.default_value_for_input(mapping.maps_to.input_name)\n\n def input_has_default(self, input_name: str) -> bool:\n check.str_param(input_name, "input_name")\n\n # base case\n if self.input_def_named(input_name).has_default_value:\n return True\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_node = self.node_named(mapping.maps_to.node_name)\n\n return mapped_node.definition.input_has_default(mapping.maps_to.input_name)\n\n @property\n def dependencies(self) -> DependencyMapping[NodeInvocation]:\n return self._dependencies\n\n @property\n def dependency_structure(self) -> DependencyStructure:\n return self._dependency_structure\n\n @property\n def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self.config_mapping.config_schema if self.config_mapping is not None else None\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n mapping = self.get_input_mapping(input_name)\n target_node = mapping.maps_to.node_name\n # check if input mapped to node which is downstream of another dynamic output within\n if self.dependency_structure.is_dynamic_mapped(target_node):\n return False\n\n # check if input mapped to node which starts new dynamic downstream\n if self.dependency_structure.has_dynamic_downstreams(target_node):\n return False\n\n return self.node_named(target_node).definition.input_supports_dynamic_output_dep(\n mapping.maps_to.input_name\n )\n\n def copy(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n input_mappings: Optional[Sequence[InputMapping]] = None,\n output_mappings: 
Optional[Sequence[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Mapping[str, str]] = None,\n node_input_source_assets: Optional[Mapping[str, Mapping[str, "SourceAsset"]]] = None,\n ) -> Self:\n return GraphDefinition(\n node_defs=self.node_defs,\n dependencies=self.dependencies,\n name=name or self.name,\n description=description or self.description,\n input_mappings=input_mappings or self._input_mappings,\n output_mappings=output_mappings or self._output_mappings,\n config=config or self.config_mapping,\n tags=tags or self.tags,\n node_input_source_assets=node_input_source_assets or self.node_input_source_assets,\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: Any,\n ) -> "GraphDefinition":\n if not self.has_config_mapping:\n raise DagsterInvalidDefinitionError(\n "Only graphs utilizing config mapping can be pre-configured. The graph "\n f'"{self.name}" does not have a config mapping, and thus has nothing to be '\n "configured."\n )\n config_mapping = cast(ConfigMapping, self.config_mapping)\n return self.copy(\n name=name,\n description=check.opt_str_param(description, "description", default=self.description),\n config=ConfigMapping(\n config_mapping.config_fn,\n config_schema=config_schema,\n receive_processed_config_values=config_mapping.receive_processed_config_values,\n ),\n )\n\n def node_names(self) -> Sequence[str]:\n return list(self._node_dict.keys())\n\n
[docs] @public\n def to_job(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config: Optional[\n Union["RunConfig", ConfigMapping, Mapping[str, object], "PartitionedConfig"]\n ] = None,\n tags: Optional[Mapping[str, str]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n op_selection: Optional[Sequence[str]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n _asset_selection_data: Optional[AssetSelectionData] = None,\n ) -> "JobDefinition":\n """Make this graph in to an executable Job by providing remaining components required for execution.\n\n Args:\n name (Optional[str]):\n The name for the Job. 
Defaults to the name of the this graph.\n resource_defs (Optional[Mapping [str, object]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagster UI, so be careful with secrets.\n tags (Optional[Mapping[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. 
These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Mapping[str, RawMetadataValue]]):\n Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\n Keys must be strings, and values must be python primitive types or one of the provided\n MetadataValue types\n logger_defs (Optional[Mapping[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoizaton will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition\n keys that can parameterize the job. If this argument is supplied, the config\n argument can't also be supplied.\n asset_layer (Optional[AssetLayer]): Top level information about the assets this job\n will produce. 
Generally should not be set manually.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Returns:\n JobDefinition\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n from .job_definition import JobDefinition\n\n wrapped_resource_defs = wrap_resources_for_execution(resource_defs)\n\n return JobDefinition.dagster_internal_init(\n name=name,\n description=description or self.description,\n graph_def=self,\n resource_defs=wrapped_resource_defs,\n logger_defs=logger_defs,\n executor_def=executor_def,\n config=config,\n partitions_def=partitions_def,\n tags=tags,\n metadata=metadata,\n hook_defs=hooks,\n version_strategy=version_strategy,\n op_retry_policy=op_retry_policy,\n asset_layer=asset_layer,\n input_values=input_values,\n _subset_selection_data=_asset_selection_data,\n _was_explicitly_provided_resources=None, # None means this is determined by whether resource_defs contains any explicitly provided resources\n ).get_subset(op_selection=op_selection)
\n\n def coerce_to_job(self) -> "JobDefinition":\n # attempt to coerce a Graph in to a Job, raising a useful error if it doesn't work\n try:\n return self.to_job()\n except DagsterInvalidDefinitionError as err:\n raise DagsterInvalidDefinitionError(\n f"Failed attempting to coerce Graph {self.name} in to a Job. "\n "Use to_job instead, passing the required information."\n ) from err\n\n
[docs] @public\n def execute_in_process(\n self,\n run_config: Any = None,\n instance: Optional["DagsterInstance"] = None,\n resources: Optional[Mapping[str, object]] = None,\n raise_on_error: bool = True,\n op_selection: Optional[Sequence[str]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """Execute this graph in-process, collecting results in-memory.\n\n Args:\n run_config (Optional[Mapping[str, Any]]):\n Run config to provide to execution. The configuration for the underlying graph\n should exist under the "ops" key.\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n resources (Optional[Mapping[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single op\n names) to execute. 
For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the graph.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n from dagster._core.instance import DagsterInstance\n\n from .executor_definition import execute_in_process_executor\n from .job_definition import JobDefinition\n\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n resource_defs = wrap_resources_for_execution(resources)\n\n ephemeral_job = JobDefinition(\n name=self._name,\n graph_def=self,\n executor_def=execute_in_process_executor,\n resource_defs=resource_defs,\n input_values=input_values,\n ).get_subset(op_selection=op_selection)\n\n run_config = run_config if run_config is not None else {}\n op_selection = check.opt_sequence_param(op_selection, "op_selection", str)\n\n return ephemeral_job.execute_in_process(\n run_config=run_config,\n instance=instance,\n raise_on_error=raise_on_error,\n run_id=run_id,\n )
\n\n @property\n def parent_graph_def(self) -> Optional["GraphDefinition"]:\n return None\n\n @property\n def is_subselected(self) -> bool:\n return False\n\n def get_resource_requirements(\n self, asset_layer: Optional["AssetLayer"] = None\n ) -> Iterator[ResourceRequirement]:\n for node in self.node_dict.values():\n yield from node.get_resource_requirements(outer_container=self, asset_layer=asset_layer)\n\n for dagster_type in self.all_dagster_types():\n yield from dagster_type.get_resource_requirements()\n\n @public\n @property\n def name(self) -> str:\n """The name of the graph."""\n return super(GraphDefinition, self).name\n\n @public\n @property\n def tags(self) -> Mapping[str, str]:\n """The tags associated with the graph."""\n return super(GraphDefinition, self).tags\n\n
[docs] @public\n def alias(self, name: str) -> "PendingNodeInvocation":\n """Aliases the graph with a new name.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.alias("my_graph_alias")\n """\n return super(GraphDefinition, self).alias(name)
\n\n
[docs] @public\n def tag(self, tags: Optional[Mapping[str, str]]) -> "PendingNodeInvocation":\n """Attaches the provided tags to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.tag({"my_tag": "my_value"})\n """\n return super(GraphDefinition, self).tag(tags)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PendingNodeInvocation":\n """Attaches the provided hooks to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.with_hooks({my_hook})\n """\n return super(GraphDefinition, self).with_hooks(hook_defs)
\n\n
[docs] @public\n def with_retry_policy(self, retry_policy: RetryPolicy) -> "PendingNodeInvocation":\n """Attaches the provided retry policy to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.with_retry_policy(RetryPolicy(max_retries=5))\n """\n return super(GraphDefinition, self).with_retry_policy(retry_policy)
\n\n def resolve_input_to_destinations(\n self, input_handle: NodeInputHandle\n ) -> Sequence[NodeInputHandle]:\n all_destinations: List[NodeInputHandle] = []\n for mapping in self.input_mappings:\n if mapping.graph_input_name != input_handle.input_name:\n continue\n # recurse into graph structure\n all_destinations += self.node_named(\n mapping.maps_to.node_name\n ).definition.resolve_input_to_destinations(\n NodeInputHandle(\n NodeHandle(mapping.maps_to.node_name, parent=input_handle.node_handle),\n mapping.maps_to.input_name,\n ),\n )\n\n return all_destinations
\n\n\nclass SubselectedGraphDefinition(GraphDefinition):\n """Defines a subselected graph.\n\n Args:\n parent_graph_def (GraphDefinition): The parent graph that this current graph is subselected\n from. This is used for tracking where the subselected graph originally comes from.\n Note that we allow subselecting a subselected graph, and this field refers to the direct\n parent graph of the current subselection, rather than the original root graph.\n node_defs (Optional[Sequence[NodeDefinition]]): A list of all top level nodes in the graph. A\n node can be an op or a graph that contains other nodes.\n dependencies (Optional[Mapping[Union[str, NodeInvocation], Mapping[str, IDependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the subselected graph. Keys of the top level dict are either the string names of\n ops in the graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[Sequence[InputMapping]]): Define the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[Sequence[OutputMapping]]): Define the outputs of the nested graph, and\n how they map from the outputs of its constituent ops.\n """\n\n def __init__(\n self,\n parent_graph_def: GraphDefinition,\n node_defs: Optional[Sequence[NodeDefinition]],\n dependencies: Optional[\n Union[\n DependencyMapping[str],\n DependencyMapping[NodeInvocation],\n ]\n ],\n input_mappings: Optional[Sequence[InputMapping]],\n output_mappings: Optional[Sequence[OutputMapping]],\n ):\n self._parent_graph_def = check.inst_param(\n parent_graph_def, "parent_graph_def", GraphDefinition\n )\n super(SubselectedGraphDefinition, self).__init__(\n name=parent_graph_def.name, # should we 
create special name for subselected graphs\n node_defs=node_defs,\n dependencies=dependencies,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config=parent_graph_def.config_mapping,\n tags=parent_graph_def.tags,\n )\n\n @property\n def parent_graph_def(self) -> GraphDefinition:\n return self._parent_graph_def\n\n def get_top_level_omitted_nodes(self) -> Sequence[Node]:\n return [node for node in self.parent_graph_def.nodes if not self.has_node_named(node.name)]\n\n @property\n def is_subselected(self) -> bool:\n return True\n\n\ndef _validate_in_mappings(\n input_mappings: Sequence[InputMapping],\n nodes_by_name: Mapping[str, Node],\n dependency_structure: DependencyStructure,\n name: str,\n class_name: str,\n) -> Sequence[InputDefinition]:\n from .composition import MappedInputPlaceholder\n\n input_defs_by_name: Dict[str, InputDefinition] = OrderedDict()\n mapping_keys: Set[str] = set()\n\n target_input_types_by_graph_input_name: Dict[str, Set[DagsterType]] = defaultdict(set)\n\n for mapping in input_mappings:\n # handle incorrect objects passed in as mappings\n if not isinstance(mapping, InputMapping):\n if isinstance(mapping, InputDefinition):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' you passed an InputDefinition "\n f"named '{mapping.name}' directly in to input_mappings. Return "\n "an InputMapping by calling mapping_to on the InputDefinition."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' received unexpected type '{type(mapping)}' in"\n " input_mappings. 
Provide an InputMapping using InputMapping(...)"\n )\n\n input_defs_by_name[mapping.graph_input_name] = mapping.get_definition()\n\n target_node = nodes_by_name.get(mapping.maps_to.node_name)\n if target_node is None:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping references node "\n f"'{mapping.maps_to.node_name}' which it does not contain."\n )\n if not target_node.has_input(mapping.maps_to.input_name):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping to node '{mapping.maps_to.node_name}' "\n f"which contains no input named '{mapping.maps_to.input_name}'"\n )\n\n target_input_def = target_node.input_def_named(mapping.maps_to.input_name)\n node_input = NodeInput(target_node, target_input_def)\n\n if mapping.maps_to_fan_in:\n maps_to = cast(FanInInputPointer, mapping.maps_to)\n if not dependency_structure.has_fan_in_deps(node_input):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target"\n f' "{maps_to.node_name}.{maps_to.input_name}" (index'\n f" {maps_to.fan_in_index} of fan-in) is not a MultiDependencyDefinition."\n )\n inner_deps = dependency_structure.get_fan_in_deps(node_input)\n if (maps_to.fan_in_index >= len(inner_deps)) or (\n inner_deps[maps_to.fan_in_index] is not MappedInputPlaceholder\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n f'"{maps_to.node_name}.{maps_to.input_name}" index {maps_to.fan_in_index} in '\n "the MultiDependencyDefinition is not a MappedInputPlaceholder"\n )\n mapping_keys.add(f"{maps_to.node_name}.{maps_to.input_name}.{maps_to.fan_in_index}")\n target_input_types_by_graph_input_name[mapping.graph_input_name].add(\n target_input_def.dagster_type.get_inner_type_for_fan_in()\n )\n else:\n if dependency_structure.has_deps(node_input):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n 
f'"{mapping.maps_to.node_name}.{mapping.maps_to.input_name}" '\n "is already satisfied by output"\n )\n\n mapping_keys.add(f"{mapping.maps_to.node_name}.{mapping.maps_to.input_name}")\n target_input_types_by_graph_input_name[mapping.graph_input_name].add(\n target_input_def.dagster_type\n )\n\n for node_input in dependency_structure.inputs():\n if dependency_structure.has_fan_in_deps(node_input):\n for idx, dep in enumerate(dependency_structure.get_fan_in_deps(node_input)):\n if dep is MappedInputPlaceholder:\n mapping_str = f"{node_input.node_name}.{node_input.input_name}.{idx}"\n if mapping_str not in mapping_keys:\n raise DagsterInvalidDefinitionError(\n f"Unsatisfied MappedInputPlaceholder at index {idx} in"\n " MultiDependencyDefinition for"\n f" '{node_input.node_name}.{node_input.input_name}'"\n )\n\n # if the dagster type on a graph input is Any and all its target inputs have the\n # same dagster type, then use that dagster type for the graph input\n for graph_input_name, graph_input_def in input_defs_by_name.items():\n if graph_input_def.dagster_type.kind == DagsterTypeKind.ANY:\n target_input_types = target_input_types_by_graph_input_name[graph_input_name]\n if len(target_input_types) == 1:\n input_defs_by_name[graph_input_name] = graph_input_def.with_dagster_type(\n next(iter(target_input_types))\n )\n\n return list(input_defs_by_name.values())\n\n\ndef _validate_out_mappings(\n output_mappings: Sequence[OutputMapping],\n node_dict: Mapping[str, Node],\n name: str,\n class_name: str,\n) -> Tuple[Sequence[OutputMapping], Sequence[OutputDefinition]]:\n output_defs: List[OutputDefinition] = []\n for mapping in output_mappings:\n if isinstance(mapping, OutputMapping):\n target_node = node_dict.get(mapping.maps_from.node_name)\n if target_node is None:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' output mapping references node "\n f"'{mapping.maps_from.node_name}' which it does not contain."\n )\n if not 
target_node.has_output(mapping.maps_from.output_name):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} {name} output mapping from {target_node.describe_node()} "\n f"which contains no output named '{mapping.maps_from.output_name}'"\n )\n\n target_output = target_node.output_def_named(mapping.maps_from.output_name)\n output_def = mapping.get_definition(is_dynamic=target_output.is_dynamic)\n output_defs.append(output_def)\n\n if (\n mapping.dagster_type\n and mapping.dagster_type.kind != DagsterTypeKind.ANY\n and (target_output.dagster_type != mapping.dagster_type)\n and class_name != "GraphDefinition"\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' output '{mapping.graph_output_name}' of type"\n f" {mapping.dagster_type.display_name} maps from"\n f" {mapping.maps_from.node_name}.{mapping.maps_from.output_name} of different"\n f" type {target_output.dagster_type.display_name}. OutputMapping source and"\n " destination must have the same type."\n )\n\n elif isinstance(mapping, OutputDefinition):\n raise DagsterInvalidDefinitionError(\n f"You passed an OutputDefinition named '{mapping.name}' directly "\n "in to output_mappings. Return an OutputMapping by calling "\n "mapping_from on the OutputDefinition."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"Received unexpected type '{type(mapping)}' in output_mappings. "\n "Provide an OutputMapping using OutputDefinition(...).mapping_from(...)"\n )\n return output_mappings, output_defs\n
", "current_page_name": "_modules/dagster/_core/definitions/graph_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.graph_definition"}, "hook_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.hook_definition

\nfrom typing import AbstractSet, Any, Callable, Iterator, NamedTuple, Optional, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\n\nfrom ..decorator_utils import get_function_params\nfrom ..errors import DagsterInvalidInvocationError\nfrom .resource_requirement import HookResourceRequirement, RequiresResources, ResourceRequirement\nfrom .utils import check_valid_name\n\n\n
[docs]class HookDefinition(\n NamedTuple(\n "_HookDefinition",\n [\n ("name", PublicAttr[str]),\n ("hook_fn", PublicAttr[Callable]),\n ("required_resource_keys", PublicAttr[AbstractSet[str]]),\n ("decorated_fn", PublicAttr[Optional[Callable]]),\n ],\n ),\n RequiresResources,\n):\n """Define a hook which can be triggered during a op execution (e.g. a callback on the step\n execution failure event during a op execution).\n\n Args:\n name (str): The name of this hook.\n hook_fn (Callable): The callback function that will be triggered.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n """\n\n def __new__(\n cls,\n *,\n name: str,\n hook_fn: Callable[..., Any],\n required_resource_keys: Optional[AbstractSet[str]] = None,\n decorated_fn: Optional[Callable[..., Any]] = None,\n ):\n return super(HookDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n hook_fn=check.callable_param(hook_fn, "hook_fn"),\n required_resource_keys=frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n ),\n decorated_fn=check.opt_callable_param(decorated_fn, "decorated_fn"),\n )\n\n def __call__(self, *args, **kwargs):\n """This is invoked when the hook is used as a decorator.\n\n We currently support hooks to decorate the following:\n\n - JobDefinition: when the hook decorates a job definition, it will be added to\n all the op invocations within the job.\n\n Example:\n .. 
code-block:: python\n\n @success_hook\n def slack_message_on_success(_):\n ...\n\n @slack_message_on_success\n @job\n def a_job():\n foo(bar())\n\n """\n from ..execution.context.hook import HookContext\n from .graph_definition import GraphDefinition\n from .hook_invocation import hook_invocation_result\n from .job_definition import JobDefinition\n\n if len(args) > 0 and isinstance(args[0], (JobDefinition, GraphDefinition)):\n # when it decorates a job, we apply this hook to all the op invocations within\n # the job.\n return args[0].with_hooks({self})\n else:\n if not self.decorated_fn:\n raise DagsterInvalidInvocationError(\n "Only hook definitions created using one of the hook decorators can be invoked."\n )\n fxn_args = get_function_params(self.decorated_fn)\n # If decorated fxn has two arguments, then this is an event list hook fxn, and parameter\n # names are always context and event_list\n if len(fxn_args) == 2:\n context_arg_name = fxn_args[0].name\n event_list_arg_name = fxn_args[1].name\n if len(args) + len(kwargs) != 2:\n raise DagsterInvalidInvocationError(\n "Decorated function expects two parameters, context and event_list, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], "context", HookContext)\n event_list = check.opt_list_param(\n args[1] if len(args) > 1 else kwargs[event_list_arg_name],\n event_list_arg_name,\n )\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n if event_list_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{event_list_arg_name}'. 
Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n event_list = check.opt_list_param(\n kwargs[event_list_arg_name], event_list_arg_name\n )\n return hook_invocation_result(self, context, event_list)\n else:\n context_arg_name = fxn_args[0].name\n if len(args) + len(kwargs) != 1:\n raise DagsterInvalidInvocationError(\n f"Decorated function expects one parameter, {context_arg_name}, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], context_arg_name, HookContext)\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n return hook_invocation_result(self, context)\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n # outer_context in this case is a string of (job, job name) or (node, node name)\n attached_to = cast(Optional[str], outer_context)\n for resource_key in sorted(list(self.required_resource_keys)):\n yield HookResourceRequirement(\n key=resource_key, attached_to=attached_to, hook_name=self.name\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/hook_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.hook_definition"}, "input": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.input

\nimport inspect\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Type,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated_param, experimental_param\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import (  # BuiltinScalarDagsterType,\n    DagsterType,\n    resolve_dagster_type,\n)\n\nfrom .inference import InferredInputProps\nfrom .utils import NoValueSentinel, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.input import InputContext\n\nT = TypeVar("T")\n\n\n# unfortunately since type_check functions need TypeCheckContext which is only available\n# at runtime, we can only check basic types before runtime\ndef _check_default_value(input_name: str, dagster_type: DagsterType, default_value: T) -> T:\n    from dagster._core.types.dagster_type import BuiltinScalarDagsterType\n\n    if default_value is not NoValueSentinel:\n        if dagster_type.is_nothing:\n            raise DagsterInvalidDefinitionError(\n                "Setting a default_value is invalid on InputDefinitions of type Nothing"\n            )\n\n        if isinstance(dagster_type, BuiltinScalarDagsterType):\n            type_check = dagster_type.type_check_scalar_value(default_value)\n            if not type_check.success:\n                raise DagsterInvalidDefinitionError(\n                    "Type check failed for the default_value of InputDefinition "\n                    f"{input_name} of type {dagster_type.display_name}. 
"\n                    f"Received value {default_value} of type {type(default_value)}",\n                )\n\n    return default_value\n\n\n@experimental_param(param="asset_key")\n@experimental_param(param="asset_partitions")\nclass InputDefinition:\n    """Defines an argument to an op's compute function.\n\n    Inputs may flow from previous op outputs, or be stubbed using config. They may optionally\n    be typed using the Dagster type system.\n\n    Args:\n        name (str): Name of the input.\n        dagster_type (Optional[Union[Type, DagsterType]]]): The type of this input.\n            Users should provide the Python type of the objects that they expect to be passed for\n            this input, or a :py:class:`DagsterType` that defines a runtime check that they want\n            to be run on this input. Defaults to :py:class:`Any`.\n        description (Optional[str]): Human-readable description of the input.\n        default_value (Optional[Any]): The default value to use if no input is provided.\n        metadata (Optional[Dict[str, Any]]): A dict of metadata for the input.\n        asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n            (or function that produces an AssetKey from the InputContext) which should be associated\n            with this InputDefinition. 
Used for tracking lineage information through Dagster.\n        asset_partitions (Optional[Union[AbstractSet[str], InputContext -> AbstractSet[str]]]): (Experimental) A\n            set of partitions of the given asset_key (or a function that produces this list of\n            partitions from the InputContext) which should be associated with this InputDefinition.\n        input_manager_key (Optional[str]): (Experimental) The resource key for the\n            :py:class:`InputManager` used for loading this input when it is not connected to an\n            upstream output.\n    """\n\n    _name: str\n    _type_not_set: bool\n    _dagster_type: DagsterType\n    _description: Optional[str]\n    _default_value: Any\n    _input_manager_key: Optional[str]\n    _raw_metadata: ArbitraryMetadataMapping\n    _metadata: Mapping[str, MetadataValue]\n    _asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]\n    _asset_partitions_fn: Optional[Callable[["InputContext"], Set[str]]]\n\n    def __init__(\n        self,\n        name: str,\n        dagster_type: object = None,\n        description: Optional[str] = None,\n        default_value: object = NoValueSentinel,\n        metadata: Optional[ArbitraryMetadataMapping] = None,\n        asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n        asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n        input_manager_key: Optional[str] = None,\n        # when adding new params, make sure to update combine_with_inferred and with_dagster_type below\n    ):\n        self._name = check_valid_name(name, allow_list=["config"])\n\n        self._type_not_set = dagster_type is None\n        self._dagster_type = check.inst(resolve_dagster_type(dagster_type), DagsterType)\n\n        self._description = check.opt_str_param(description, "description")\n\n        self._default_value = _check_default_value(self._name, self._dagster_type, 
default_value)\n\n        self._input_manager_key = check.opt_str_param(input_manager_key, "input_manager_key")\n\n        self._raw_metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n        self._metadata = normalize_metadata(self._raw_metadata, allow_invalid=True)\n\n        if not callable(asset_key):\n            check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n        self._asset_key = asset_key\n\n        if asset_partitions:\n            check.param_invariant(\n                asset_key is not None,\n                "asset_partitions",\n                'Cannot specify "asset_partitions" argument without also specifying "asset_key"',\n            )\n        if callable(asset_partitions):\n            self._asset_partitions_fn = asset_partitions\n        elif asset_partitions is not None:\n            _asset_partitions = check.set_param(asset_partitions, "asset_partitions", of_type=str)\n            self._asset_partitions_fn = lambda _: _asset_partitions\n        else:\n            self._asset_partitions_fn = None\n\n    @property\n    def name(self) -> str:\n        return self._name\n\n    @property\n    def dagster_type(self) -> DagsterType:\n        return self._dagster_type\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @property\n    def has_default_value(self) -> bool:\n        return self._default_value is not NoValueSentinel\n\n    @property\n    def default_value(self) -> Any:\n        check.invariant(self.has_default_value, "Can only fetch default_value if has_default_value")\n        return self._default_value\n\n    @property\n    def input_manager_key(self) -> Optional[str]:\n        return self._input_manager_key\n\n    @property\n    def metadata(self) -> ArbitraryMetadataMapping:\n        return self._raw_metadata\n\n    @property\n    def is_asset(self) -> bool:\n        return self._asset_key is not None\n\n    @property\n    def hardcoded_asset_key(self) 
-> Optional[AssetKey]:\n        if not callable(self._asset_key):\n            return self._asset_key\n        else:\n            return None\n\n    def get_asset_key(self, context: "InputContext") -> Optional[AssetKey]:\n        """Get the AssetKey associated with this InputDefinition for the given\n        :py:class:`InputContext` (if any).\n\n        Args:\n            context (InputContext): The InputContext that this InputDefinition is being evaluated\n                in\n        """\n        if callable(self._asset_key):\n            return self._asset_key(context)\n        else:\n            return self.hardcoded_asset_key\n\n    def get_asset_partitions(self, context: "InputContext") -> Optional[Set[str]]:\n        """Get the set of partitions that this op will read from this InputDefinition for the given\n        :py:class:`InputContext` (if any).\n\n        Args:\n            context (InputContext): The InputContext that this InputDefinition is being evaluated\n                in\n        """\n        if self._asset_partitions_fn is None:\n            return None\n\n        return self._asset_partitions_fn(context)\n\n    def mapping_to(\n        self, node_name: str, input_name: str, fan_in_index: Optional[int] = None\n    ) -> "InputMapping":\n        """Create an input mapping to an input of a child node.\n\n        In a GraphDefinition, you can use this helper function to construct\n        an :py:class:`InputMapping` to the input of a child node.\n\n        Args:\n            node_name (str): The name of the child node to which to map this input.\n            input_name (str): The name of the child node' input to which to map this input.\n            fan_in_index (Optional[int]): The index in to a fanned in input, else None\n\n        Examples:\n            .. 
code-block:: python\n\n                input_mapping = InputDefinition('composite_input', Int).mapping_to(\n                    'child_node', 'int_input'\n                )\n        """\n        check.str_param(node_name, "node_name")\n        check.str_param(input_name, "input_name")\n        check.opt_int_param(fan_in_index, "fan_in_index")\n\n        return InputMapping(\n            graph_input_name=self.name,\n            mapped_node_name=node_name,\n            mapped_node_input_name=input_name,\n            fan_in_index=fan_in_index,\n            graph_input_description=self.description,\n            dagster_type=self.dagster_type,\n        )\n\n    @staticmethod\n    def create_from_inferred(inferred: InferredInputProps) -> "InputDefinition":\n        return InputDefinition(\n            name=inferred.name,\n            dagster_type=_checked_inferred_type(inferred),\n            description=inferred.description,\n            default_value=inferred.default_value,\n        )\n\n    def combine_with_inferred(self, inferred: InferredInputProps) -> "InputDefinition":\n        """Return a new InputDefinition that merges this ones properties with those inferred from type signature.\n        This can update: dagster_type, description, and default_value if they are not set.\n        """\n        check.invariant(\n            self.name == inferred.name,\n            f"InferredInputProps name {inferred.name} did not align with InputDefinition name"\n            f" {self.name}",\n        )\n\n        dagster_type = self._dagster_type\n        if self._type_not_set:\n            dagster_type = _checked_inferred_type(inferred)\n\n        description = self._description\n        if description is None and inferred.description is not None:\n            description = inferred.description\n\n        default_value = self._default_value\n        if not self.has_default_value:\n            default_value = inferred.default_value\n\n        return InputDefinition(\n            
name=self.name,\n            dagster_type=dagster_type,\n            description=description,\n            default_value=default_value,\n            metadata=self.metadata,\n            asset_key=self._asset_key,\n            asset_partitions=self._asset_partitions_fn,\n            input_manager_key=self._input_manager_key,\n        )\n\n    def with_dagster_type(self, dagster_type: DagsterType) -> "InputDefinition":\n        return InputDefinition(\n            name=self.name,\n            dagster_type=dagster_type,\n            description=self.description,\n            default_value=self.default_value if self.has_default_value else NoValueSentinel,\n            metadata=self.metadata,\n            asset_key=self._asset_key,\n            asset_partitions=self._asset_partitions_fn,\n            input_manager_key=self._input_manager_key,\n        )\n\n\ndef _checked_inferred_type(inferred: InferredInputProps) -> DagsterType:\n    try:\n        if inferred.annotation == inspect.Parameter.empty:\n            resolved_type = resolve_dagster_type(None)\n        elif inferred.annotation is None:\n            # When inferred.annotation is None, it means someone explicitly put "None" as the\n            # annotation, so want to map it to a DagsterType that checks for the None type\n            resolved_type = resolve_dagster_type(type(None))\n        else:\n            resolved_type = resolve_dagster_type(inferred.annotation)\n\n    except DagsterError as e:\n        raise DagsterInvalidDefinitionError(\n            f"Problem using type '{inferred.annotation}' from type annotation for argument "\n            f"'{inferred.name}', correct the issue or explicitly set the dagster_type "\n            "via In()."\n        ) from e\n\n    return resolved_type\n\n\nclass InputPointer(NamedTuple("_InputPointer", [("node_name", str), ("input_name", str)])):\n    def __new__(cls, node_name: str, input_name: str):\n        return super(InputPointer, cls).__new__(\n            cls,\n  
          check.str_param(node_name, "node_name"),\n            check.str_param(input_name, "input_name"),\n        )\n\n\nclass FanInInputPointer(\n    NamedTuple(\n        "_FanInInputPointer", [("node_name", str), ("input_name", str), ("fan_in_index", int)]\n    )\n):\n    def __new__(cls, node_name: str, input_name: str, fan_in_index: int):\n        return super(FanInInputPointer, cls).__new__(\n            cls,\n            check.str_param(node_name, "node_name"),\n            check.str_param(input_name, "input_name"),\n            check.int_param(fan_in_index, "fan_in_index"),\n        )\n\n\n
[docs]@deprecated_param(\n param="dagster_type",\n breaking_version="2.0",\n additional_warn_text="Any defined `dagster_type` should come from the upstream op `Output`.",\n # Disabling warning here since we're passing this internally and I'm not sure whether it is\n # actually used or discarded.\n emit_runtime_warning=False,\n)\nclass InputMapping(NamedTuple):\n """Defines an input mapping for a graph.\n\n Args:\n graph_input_name (str): Name of the input in the graph being mapped from.\n mapped_node_name (str): Named of the node (op/graph) that the input is being mapped to.\n mapped_node_input_name (str): Name of the input in the node (op/graph) that is being mapped to.\n fan_in_index (Optional[int]): The index in to a fanned input, otherwise None.\n graph_input_description (Optional[str]): A description of the input in the graph being mapped from.\n dagster_type (Optional[DagsterType]): The dagster type of the graph's input\n being mapped from.\n\n Examples:\n .. code-block:: python\n\n from dagster import InputMapping, GraphDefinition, op, graph\n\n @op\n def needs_input(x):\n return x + 1\n\n # The following two graph definitions are equivalent\n GraphDefinition(\n name="the_graph",\n node_defs=[needs_input],\n input_mappings=[\n InputMapping(\n graph_input_name="maps_x", mapped_node_name="needs_input",\n mapped_node_input_name="x"\n )\n ]\n )\n\n @graph\n def the_graph(maps_x):\n needs_input(maps_x)\n """\n\n graph_input_name: str\n mapped_node_name: str\n mapped_node_input_name: str\n fan_in_index: Optional[int] = None\n graph_input_description: Optional[str] = None\n dagster_type: Optional[DagsterType] = None\n\n @property\n def maps_to(self) -> Union[InputPointer, FanInInputPointer]:\n if self.fan_in_index is not None:\n return FanInInputPointer(\n self.mapped_node_name, self.mapped_node_input_name, self.fan_in_index\n )\n return InputPointer(self.mapped_node_name, self.mapped_node_input_name)\n\n @property\n def maps_to_fan_in(self) -> bool:\n return 
isinstance(self.maps_to, FanInInputPointer)\n\n def describe(self) -> str:\n idx = self.maps_to.fan_in_index if isinstance(self.maps_to, FanInInputPointer) else ""\n return f"{self.graph_input_name} -> {self.maps_to.node_name}:{self.maps_to.input_name}{idx}"\n\n def get_definition(self) -> "InputDefinition":\n return InputDefinition(\n name=self.graph_input_name,\n description=self.graph_input_description,\n dagster_type=self.dagster_type,\n )
\n\n\n
[docs]class In(\n NamedTuple(\n "_In",\n [\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("description", PublicAttr[Optional[str]]),\n ("default_value", PublicAttr[Any]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n (\n "asset_key",\n PublicAttr[Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]],\n ),\n (\n "asset_partitions",\n PublicAttr[Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]]],\n ),\n ("input_manager_key", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines an argument to an op's compute function.\n\n Inputs may flow from previous op's outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this input. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n metadata (Optional[Dict[str, RawMetadataValue]]): A dict of metadata for the input.\n asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n (or function that produces an AssetKey from the InputContext) which should be associated\n with this In. 
Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the InputContext) which should be associated with this In.\n input_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`InputManager` used for loading this input when it is not connected to an\n upstream output.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n default_value: Any = NoValueSentinel,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n input_manager_key: Optional[str] = None,\n ):\n return super(In, cls).__new__(\n cls,\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=check.opt_str_param(description, "description"),\n default_value=default_value,\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n asset_key=check.opt_inst_param(asset_key, "asset_key", (AssetKey, FunctionType)),\n asset_partitions=asset_partitions,\n input_manager_key=check.opt_str_param(input_manager_key, "input_manager_key"),\n )\n\n @staticmethod\n def from_definition(input_def: InputDefinition) -> "In":\n return In(\n dagster_type=input_def.dagster_type,\n description=input_def.description,\n default_value=input_def._default_value, # noqa: SLF001\n metadata=input_def.metadata,\n asset_key=input_def._asset_key, # noqa: SLF001\n asset_partitions=input_def._asset_partitions_fn, # noqa: SLF001\n input_manager_key=input_def.input_manager_key,\n )\n\n def to_definition(self, name: str) -> InputDefinition:\n dagster_type = self.dagster_type if 
self.dagster_type is not NoValueSentinel else None\n return InputDefinition(\n name=name,\n dagster_type=dagster_type,\n description=self.description,\n default_value=self.default_value,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n input_manager_key=self.input_manager_key,\n )
\n\n\n
[docs]class GraphIn(NamedTuple("_GraphIn", [("description", PublicAttr[Optional[str]])])):\n """Represents information about an input that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the input.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphIn, cls).__new__(cls, description=description)\n\n def to_definition(self, name: str) -> InputDefinition:\n return InputDefinition(name=name, description=self.description)
\n
", "current_page_name": "_modules/dagster/_core/definitions/input", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.input"}, "job_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.job_definition

\nimport importlib\nimport os\nimport warnings\nfrom datetime import datetime\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental_param, public\nfrom dagster._config import Field, Shape, StringSource\nfrom dagster._config.config_type import ConfigType\nfrom dagster._config.validate import validate_config\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.dependency import (\n    Node,\n    NodeHandle,\n    NodeInputHandle,\n    NodeInvocation,\n)\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.node_definition import NodeDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.op_selection import OpSelection, get_graph_subset\nfrom dagster._core.definitions.partition import DynamicPartitionsDefinition\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.resource_requirement import (\n    ResourceRequirement,\n    ensure_requirements_satisfied,\n)\nfrom dagster._core.definitions.utils import check_valid_name\nfrom dagster._core.errors import (\n    DagsterInvalidConfigError,\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.selector.subset_selector import (\n    AssetSelectionData,\n    OpSelectionData,\n)\nfrom dagster._core.storage.io_manager import (\n    IOManagerDefinition,\n    dagster_maintained_io_manager,\n    io_manager,\n)\nfrom dagster._core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster._core.types.dagster_type import DagsterType\nfrom 
dagster._core.utils import str_format_set\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.merger import merge_dicts\n\nfrom .asset_layer import AssetLayer, build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .dependency import (\n    DependencyMapping,\n    DependencyStructure,\n    OpNode,\n)\nfrom .executor_definition import ExecutorDefinition, multi_or_in_process_executor\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .hook_definition import HookDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .metadata import MetadataValue, RawMetadataValue, normalize_metadata\nfrom .partition import PartitionedConfig, PartitionsDefinition\nfrom .resource_definition import ResourceDefinition\nfrom .run_request import RunRequest\nfrom .utils import DEFAULT_IO_MANAGER_KEY, validate_tags\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster._config.snap import ConfigSchemaSnapshot\n    from dagster._core.definitions.run_config import RunConfig\n    from dagster._core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster._core.execution.resources_init import InitResourceContext\n    from dagster._core.host_representation.job_index import JobIndex\n    from dagster._core.instance import DagsterInstance, DynamicPartitionsStore\n    from dagster._core.snap import JobSnapshot\n\n    from .run_config_schema import RunConfigSchema\n\nDEFAULT_EXECUTOR_DEF = multi_or_in_process_executor\n\n\n
[docs]@experimental_param(param="version_strategy")\nclass JobDefinition(IHasInternalInit):\n """Defines a Dagster job."""\n\n _name: str\n _graph_def: GraphDefinition\n _description: Optional[str]\n _tags: Mapping[str, str]\n _metadata: Mapping[str, MetadataValue]\n _current_level_node_defs: Sequence[NodeDefinition]\n _hook_defs: AbstractSet[HookDefinition]\n _op_retry_policy: Optional[RetryPolicy]\n _asset_layer: AssetLayer\n _resource_requirements: Mapping[str, AbstractSet[str]]\n _all_node_defs: Mapping[str, NodeDefinition]\n _cached_run_config_schemas: Dict[str, "RunConfigSchema"]\n _version_strategy: VersionStrategy\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]]\n input_values: Mapping[str, object]\n\n def __init__(\n self,\n *,\n graph_def: GraphDefinition,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n name: Optional[str] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, object], PartitionedConfig, "RunConfig"]\n ] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]] = None,\n asset_layer: Optional[AssetLayer] = None,\n input_values: Optional[Mapping[str, object]] = None,\n _was_explicitly_provided_resources: Optional[bool] = None,\n ):\n from dagster._core.definitions.run_config import RunConfig, convert_config_input\n\n self._graph_def = graph_def\n self._current_level_node_defs = self._graph_def.node_defs\n # Recursively explore all nodes in the this job\n self._all_node_defs = 
_build_all_node_defs(self._current_level_node_defs)\n self._asset_layer = check.opt_inst_param(\n asset_layer, "asset_layer", AssetLayer\n ) or _infer_asset_layer_from_source_asset_deps(graph_def)\n\n # validates\n self._graph_def.get_inputs_must_be_resolved_top_level(self._asset_layer)\n\n self._name = check_valid_name(check.str_param(name, "name")) if name else graph_def.name\n self._executor_def = check.opt_inst_param(executor_def, "executor_def", ExecutorDefinition)\n self._loggers = check.opt_nullable_mapping_param(\n logger_defs,\n "logger_defs",\n key_type=str,\n value_type=LoggerDefinition,\n )\n\n config = check.opt_inst_param(\n config, "config", (Mapping, ConfigMapping, PartitionedConfig, RunConfig)\n )\n config = convert_config_input(config)\n\n partitions_def = check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n )\n # tags and description can exist on graph as well, but since\n # same graph may be in multiple jobs, keep separate layer\n self._description = check.opt_str_param(description, "description")\n self._tags = validate_tags(tags)\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n )\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs")\n self._op_retry_policy = check.opt_inst_param(\n op_retry_policy, "op_retry_policy", RetryPolicy\n )\n self.version_strategy = check.opt_inst_param(\n version_strategy, "version_strategy", VersionStrategy\n )\n\n _subset_selection_data = check.opt_inst_param(\n _subset_selection_data, "_subset_selection_data", (OpSelectionData, AssetSelectionData)\n )\n input_values = check.opt_mapping_param(input_values, "input_values", key_type=str)\n\n resource_defs = check.opt_mapping_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n for key in resource_defs.keys():\n if not key.isidentifier():\n check.failed(f"Resource key '{key}' must be a valid Python identifier.")\n was_provided_resources = 
(\n bool(resource_defs)\n if _was_explicitly_provided_resources is None\n else _was_explicitly_provided_resources\n )\n self._resource_defs = {\n DEFAULT_IO_MANAGER_KEY: default_job_io_manager,\n **resource_defs,\n }\n self._required_resource_keys = self._get_required_resource_keys(was_provided_resources)\n\n self._config_mapping = None\n self._partitioned_config = None\n self._run_config = None\n self._run_config_schema = None\n self._original_config_argument = config\n\n if partitions_def:\n self._partitioned_config = PartitionedConfig.from_flexible_config(\n config, partitions_def\n )\n else:\n if isinstance(config, ConfigMapping):\n self._config_mapping = config\n elif isinstance(config, PartitionedConfig):\n self._partitioned_config = config\n elif isinstance(config, dict):\n self._run_config = config\n # Using config mapping here is a trick to make it so that the preset will be used even\n # when no config is supplied for the job.\n self._config_mapping = _config_mapping_with_default_value(\n get_run_config_schema_for_job(\n graph_def,\n self.resource_defs,\n self.executor_def,\n self.loggers,\n asset_layer,\n was_explicitly_provided_resources=was_provided_resources,\n ),\n config,\n self.name,\n )\n elif config is not None:\n check.failed(\n "config param must be a ConfigMapping, a PartitionedConfig, or a dictionary,"\n f" but is an object of type {type(config)}"\n )\n\n self._subset_selection_data = _subset_selection_data\n self.input_values = input_values\n for input_name in sorted(list(self.input_values.keys())):\n if not graph_def.has_input(input_name):\n raise DagsterInvalidDefinitionError(\n f"Error when constructing JobDefinition '{self.name}': Input value provided for"\n f" key '{input_name}', but job has no top-level input with that name."\n )\n\n def dagster_internal_init(\n *,\n graph_def: GraphDefinition,\n resource_defs: Optional[Mapping[str, ResourceDefinition]],\n executor_def: Optional[ExecutorDefinition],\n logger_defs: Optional[Mapping[str, 
LoggerDefinition]],\n name: Optional[str],\n config: Optional[\n Union[ConfigMapping, Mapping[str, object], PartitionedConfig, "RunConfig"]\n ],\n description: Optional[str],\n partitions_def: Optional[PartitionsDefinition],\n tags: Optional[Mapping[str, Any]],\n metadata: Optional[Mapping[str, RawMetadataValue]],\n hook_defs: Optional[AbstractSet[HookDefinition]],\n op_retry_policy: Optional[RetryPolicy],\n version_strategy: Optional[VersionStrategy],\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]],\n asset_layer: Optional[AssetLayer],\n input_values: Optional[Mapping[str, object]],\n _was_explicitly_provided_resources: Optional[bool],\n ) -> "JobDefinition":\n return JobDefinition(\n graph_def=graph_def,\n resource_defs=resource_defs,\n executor_def=executor_def,\n logger_defs=logger_defs,\n name=name,\n config=config,\n description=description,\n partitions_def=partitions_def,\n tags=tags,\n metadata=metadata,\n hook_defs=hook_defs,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n _subset_selection_data=_subset_selection_data,\n asset_layer=asset_layer,\n input_values=input_values,\n _was_explicitly_provided_resources=_was_explicitly_provided_resources,\n )\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def tags(self) -> Mapping[str, str]:\n return merge_dicts(self._graph_def.tags, self._tags)\n\n @property\n def metadata(self) -> Mapping[str, MetadataValue]:\n return self._metadata\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def graph(self) -> GraphDefinition:\n return self._graph_def\n\n @property\n def dependency_structure(self) -> DependencyStructure:\n return self._graph_def.dependency_structure\n\n @property\n def dependencies(self) -> DependencyMapping[NodeInvocation]:\n return self._graph_def.dependencies\n\n @public\n @property\n def executor_def(self) -> ExecutorDefinition:\n """Returns the default 
:py:class:`ExecutorDefinition` for the job.\n\n If the user has not specified an executor definition, then this will default to the :py:func:`multi_or_in_process_executor`. If a default is specified on the :py:class:`Definitions` object the job was provided to, then that will be used instead.\n """\n return self._executor_def or DEFAULT_EXECUTOR_DEF\n\n @public\n @property\n def has_specified_executor(self) -> bool:\n """Returns True if this job has explicitly specified an executor, and False if the executor was inherited through defaults or the :py:class:`Definitions` object the job was provided to."""\n return self._executor_def is not None\n\n @public\n @property\n def resource_defs(self) -> Mapping[str, ResourceDefinition]:\n """Returns the set of ResourceDefinition objects specified on the job.\n\n This may not be the complete set of resources required by the job, since those can also be provided on the :py:class:`Definitions` object the job may be provided to.\n """\n return self._resource_defs\n\n @public\n @property\n def partitioned_config(self) -> Optional[PartitionedConfig]:\n """The partitioned config for the job, if it has one.\n\n A partitioned config defines a way to map partition keys to run config for the job.\n """\n return self._partitioned_config\n\n @public\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n """The config mapping for the job, if it has one.\n\n A config mapping defines a way to map a top-level config schema to run config for the job.\n """\n return self._config_mapping\n\n @public\n @property\n def loggers(self) -> Mapping[str, LoggerDefinition]:\n """Returns the set of LoggerDefinition objects specified on the job.\n\n If the user has not specified a mapping of :py:class:`LoggerDefinition` objects, then this will default to the :py:func:`colored_console_logger` under the key `console`. 
If a default is specified on the :py:class:`Definitions` object the job was provided to, then that will be used instead.\n """\n from dagster._loggers import default_loggers\n\n return self._loggers or default_loggers()\n\n @public\n @property\n def has_specified_loggers(self) -> bool:\n """Returns true if the job explicitly set loggers, and False if loggers were inherited through defaults or the :py:class:`Definitions` object the job was provided to."""\n return self._loggers is not None\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def run_config(self) -> Optional[Mapping[str, Any]]:\n return self._run_config\n\n @property\n def run_config_schema(self) -> "RunConfigSchema":\n if self._run_config_schema is None:\n self._run_config_schema = _create_run_config_schema(self, self.required_resource_keys)\n return self._run_config_schema\n\n @public\n @property\n def partitions_def(self) -> Optional[PartitionsDefinition]:\n """Returns the :py:class:`PartitionsDefinition` for the job, if it has one.\n\n A partitions definition defines the set of partition keys the job operates on.\n """\n return None if not self.partitioned_config else self.partitioned_config.partitions_def\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def asset_layer(self) -> AssetLayer:\n return self._asset_layer\n\n @property\n def all_node_defs(self) -> Sequence[NodeDefinition]:\n return list(self._all_node_defs.values())\n\n @property\n def top_level_node_defs(self) -> Sequence[NodeDefinition]:\n return self._current_level_node_defs\n\n def node_def_named(self, name: str) -> NodeDefinition:\n check.str_param(name, "name")\n\n check.invariant(name in self._all_node_defs, f"{name} not found")\n return self._all_node_defs[name]\n\n def has_node(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._all_node_defs\n\n def get_node(self, 
handle: NodeHandle) -> Node:\n return self._graph_def.get_node(handle)\n\n def get_op(self, handle: NodeHandle) -> OpNode:\n node = self.get_node(handle)\n assert isinstance(\n node, OpNode\n ), f"Tried to retrieve node {handle} as op, but it represents a nested graph."\n return node\n\n def has_node_named(self, name: str) -> bool:\n return self._graph_def.has_node_named(name)\n\n def get_node_named(self, name: str) -> Node:\n return self._graph_def.node_named(name)\n\n @property\n def nodes(self) -> Sequence[Node]:\n return self._graph_def.nodes\n\n @property\n def nodes_in_topological_order(self) -> Sequence[Node]:\n return self._graph_def.nodes_in_topological_order\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._graph_def.all_dagster_types()\n\n def has_dagster_type(self, name: str) -> bool:\n return self._graph_def.has_dagster_type(name)\n\n def dagster_type_named(self, name: str) -> DagsterType:\n return self._graph_def.dagster_type_named(name)\n\n def describe_target(self) -> str:\n return f"job '{self.name}'"\n\n def is_using_memoization(self, run_tags: Mapping[str, str]) -> bool:\n tags = merge_dicts(self.tags, run_tags)\n # If someone provides a false value for memoized run tag, then they are intentionally\n # switching off memoization.\n if tags.get(MEMOIZED_RUN_TAG) == "false":\n return False\n return (\n MEMOIZED_RUN_TAG in tags and tags.get(MEMOIZED_RUN_TAG) == "true"\n ) or self.version_strategy is not None\n\n def get_required_resource_defs(self) -> Mapping[str, ResourceDefinition]:\n return {\n resource_key: resource\n for resource_key, resource in self.resource_defs.items()\n if resource_key in self.required_resource_keys\n }\n\n def _get_required_resource_keys(self, validate_requirements: bool = False) -> AbstractSet[str]:\n from ..execution.resources_init import get_transitive_required_resource_keys\n\n requirements = self._get_resource_requirements()\n if validate_requirements:\n 
ensure_requirements_satisfied(self.resource_defs, requirements)\n required_keys = {req.key for req in requirements}\n if validate_requirements:\n return required_keys.union(\n get_transitive_required_resource_keys(required_keys, self.resource_defs)\n )\n else:\n return required_keys\n\n def _get_resource_requirements(self) -> Sequence[ResourceRequirement]:\n return [\n *self._graph_def.get_resource_requirements(self.asset_layer),\n *[\n req\n for hook_def in self._hook_defs\n for req in hook_def.get_resource_requirements(outer_context=f"job '{self._name}'")\n ],\n ]\n\n def validate_resource_requirements_satisfied(self) -> None:\n resource_requirements = self._get_resource_requirements()\n ensure_requirements_satisfied(self.resource_defs, resource_requirements)\n\n def is_missing_required_resources(self) -> bool:\n requirements = self._get_resource_requirements()\n for requirement in requirements:\n if not requirement.resources_contain_key(self.resource_defs):\n return True\n return False\n\n def get_all_hooks_for_handle(self, handle: NodeHandle) -> AbstractSet[HookDefinition]:\n """Gather all the hooks for the given node from all places possibly attached with a hook.\n\n A hook can be attached to any of the following objects\n * Node (node invocation)\n * JobDefinition\n\n Args:\n handle (NodeHandle): The node's handle\n\n Returns:\n FrozenSet[HookDefinition]\n """\n check.inst_param(handle, "handle", NodeHandle)\n hook_defs: Set[HookDefinition] = set()\n\n current = handle\n lineage = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n # hooks on top-level node\n name = lineage.pop()\n node = self._graph_def.node_named(name)\n hook_defs = hook_defs.union(node.hook_defs)\n\n # hooks on non-top-level nodes\n while lineage:\n name = lineage.pop()\n # While lineage is non-empty, definition is guaranteed to be a graph\n definition = cast(GraphDefinition, node.definition)\n node = definition.node_named(name)\n hook_defs = 
hook_defs.union(node.hook_defs)\n\n # hooks applied to a job definition will run on every node\n hook_defs = hook_defs.union(self.hook_defs)\n\n return frozenset(hook_defs)\n\n def get_retry_policy_for_handle(self, handle: NodeHandle) -> Optional[RetryPolicy]:\n node = self.get_node(handle)\n definition = node.definition\n\n if node.retry_policy:\n return node.retry_policy\n elif isinstance(definition, OpDefinition) and definition.retry_policy:\n return definition.retry_policy\n\n # could be expanded to look in graph containers\n else:\n return self._op_retry_policy\n\n # make Callable for decorator reference updates\n def __call__(self, *args, **kwargs):\n raise DagsterInvariantViolationError(\n f"Attempted to call job '{self.name}' directly. Jobs should be invoked by "\n "using an execution API function (e.g. `job.execute_in_process`)."\n )\n\n
[docs] @public\n def execute_in_process(\n self,\n run_config: Optional[Union[Mapping[str, Any], "RunConfig"]] = None,\n instance: Optional["DagsterInstance"] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n op_selection: Optional[Sequence[str]] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n tags: Optional[Mapping[str, str]] = None,\n resources: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """Execute the Job in-process, gathering results in-memory.\n\n The `executor_def` on the Job will be ignored, and replaced with the in-process executor.\n If using the default `io_manager`, it will switch from filesystem to in-memory.\n\n\n Args:\n run_config (Optional[Mapping[str, Any]]:\n The configuration for the run\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for jobs with partitioned config.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[Sequence[str]]): A list of op selection queries (including single op\n names) to execute. For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the job. 
Input values provided here will override input values that have been provided to the job directly.\n resources (Optional[Mapping[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n\n """\n from dagster._core.definitions.executor_definition import execute_in_process_executor\n from dagster._core.definitions.run_config import convert_config_input\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n from dagster._core.execution.execute_in_process import core_execute_in_process\n\n run_config = check.opt_mapping_param(convert_config_input(run_config), "run_config")\n op_selection = check.opt_sequence_param(op_selection, "op_selection", str)\n asset_selection = check.opt_sequence_param(asset_selection, "asset_selection", AssetKey)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n\n resource_defs = wrap_resources_for_execution(resources)\n\n check.invariant(\n not (op_selection and asset_selection),\n "op_selection and asset_selection cannot both be provided as args to"\n " execute_in_process",\n )\n\n partition_key = check.opt_str_param(partition_key, "partition_key")\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n # Combine provided input values at execute_in_process with input values\n # provided to the definition. 
Input values provided at\n # execute_in_process will override those provided on the definition.\n input_values = merge_dicts(self.input_values, input_values)\n\n bound_resource_defs = dict(self.resource_defs)\n ephemeral_job = JobDefinition.dagster_internal_init(\n name=self._name,\n graph_def=self._graph_def,\n resource_defs={**_swap_default_io_man(bound_resource_defs, self), **resource_defs},\n executor_def=execute_in_process_executor,\n logger_defs=self._loggers,\n hook_defs=self.hook_defs,\n config=self.config_mapping or self.partitioned_config or self.run_config,\n tags=self.tags,\n op_retry_policy=self._op_retry_policy,\n version_strategy=self.version_strategy,\n asset_layer=self.asset_layer,\n input_values=input_values,\n description=self.description,\n partitions_def=self.partitions_def,\n metadata=self.metadata,\n _subset_selection_data=None, # this is added below\n _was_explicitly_provided_resources=True,\n )\n\n ephemeral_job = ephemeral_job.get_subset(\n op_selection=op_selection,\n asset_selection=frozenset(asset_selection) if asset_selection else None,\n )\n\n merged_tags = merge_dicts(self.tags, tags or {})\n if partition_key:\n if not (self.partitions_def and self.partitioned_config):\n check.failed("Attempted to execute a partitioned run for a non-partitioned job")\n self.partitions_def.validate_partition_key(\n partition_key, dynamic_partitions_store=instance\n )\n\n run_config = (\n run_config\n if run_config\n else self.partitioned_config.get_run_config_for_partition_key(partition_key)\n )\n merged_tags.update(\n self.partitioned_config.get_tags_for_partition_key(\n partition_key, job_name=self.name\n )\n )\n\n return core_execute_in_process(\n ephemeral_job=ephemeral_job,\n run_config=run_config,\n instance=instance,\n output_capturing_enabled=True,\n raise_on_error=raise_on_error,\n run_tags=merged_tags,\n run_id=run_id,\n asset_selection=frozenset(asset_selection),\n )
\n\n @property\n def op_selection_data(self) -> Optional[OpSelectionData]:\n return (\n self._subset_selection_data\n if isinstance(self._subset_selection_data, OpSelectionData)\n else None\n )\n\n @property\n def asset_selection_data(self) -> Optional[AssetSelectionData]:\n return (\n self._subset_selection_data\n if isinstance(self._subset_selection_data, AssetSelectionData)\n else None\n )\n\n @property\n def is_subset(self) -> bool:\n return bool(self._subset_selection_data)\n\n def get_subset(\n self,\n *,\n op_selection: Optional[Iterable[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ) -> Self:\n check.invariant(\n not (op_selection and (asset_selection or asset_check_selection)),\n "op_selection cannot be provided with asset_selection or asset_check_selection to"\n " execute_in_process",\n )\n if op_selection:\n return self._get_job_def_for_op_selection(op_selection)\n if asset_selection or asset_check_selection:\n return self._get_job_def_for_asset_selection(\n asset_selection=asset_selection, asset_check_selection=asset_check_selection\n )\n else:\n return self\n\n def _get_job_def_for_asset_selection(\n self,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ) -> Self:\n asset_selection = check.opt_set_param(asset_selection, "asset_selection", AssetKey)\n check.opt_set_param(asset_check_selection, "asset_check_selection", AssetCheckKey)\n\n nonexistent_assets = [\n asset\n for asset in asset_selection\n if asset not in self.asset_layer.asset_keys\n and asset not in self.asset_layer.source_assets_by_key\n ]\n nonexistent_asset_strings = [\n asset_str\n for asset_str in (asset.to_string() for asset in nonexistent_assets)\n if asset_str\n ]\n if nonexistent_assets:\n raise DagsterInvalidSubsetError(\n "Assets provided in asset_selection argument "\n f"{', 
'.join(nonexistent_asset_strings)} do not exist in parent asset group or job."\n )\n\n # Test that selected asset checks exist\n all_check_keys = self.asset_layer.node_output_handles_by_asset_check_key.keys()\n\n nonexistent_asset_checks = [\n asset_check\n for asset_check in asset_check_selection or set()\n if asset_check not in all_check_keys\n ]\n nonexistent_asset_check_strings = [\n str(asset_check) for asset_check in nonexistent_asset_checks\n ]\n if nonexistent_asset_checks:\n raise DagsterInvalidSubsetError(\n "Asset checks provided in asset_check_selection argument"\n f" {', '.join(nonexistent_asset_check_strings)} do not exist in parent asset group"\n " or job."\n )\n\n asset_selection_data = AssetSelectionData(\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n parent_job_def=self,\n )\n\n check.invariant(\n self.asset_layer.assets_defs_by_key is not None,\n "Asset layer must have _asset_defs argument defined",\n )\n\n new_job = build_asset_selection_job(\n name=self.name,\n assets=set(self.asset_layer.assets_defs_by_key.values()),\n source_assets=self.asset_layer.source_assets_by_key.values(),\n executor_def=self.executor_def,\n resource_defs=self.resource_defs,\n description=self.description,\n tags=self.tags,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n asset_selection_data=asset_selection_data,\n config=self.config_mapping or self.partitioned_config,\n asset_checks=self.asset_layer.asset_checks_defs,\n )\n return new_job\n\n def _get_job_def_for_op_selection(self, op_selection: Iterable[str]) -> Self:\n try:\n sub_graph = get_graph_subset(self.graph, op_selection)\n\n # if explicit config was passed the config_mapping that resolves the defaults implicitly is\n # very unlikely to work. 
The job will still present the default config in the Dagster UI.\n config = (\n None\n if self.run_config is not None\n else self.config_mapping or self.partitioned_config\n )\n\n return self._copy(\n config=config,\n graph_def=sub_graph,\n _subset_selection_data=OpSelectionData(\n op_selection=list(op_selection),\n resolved_op_selection=OpSelection(op_selection).resolve(self.graph),\n parent_job_def=self, # used by job snapshot lineage\n ),\n # TODO: subset this structure.\n # https://github.com/dagster-io/dagster/issues/7541\n asset_layer=self.asset_layer,\n )\n except DagsterInvalidDefinitionError as exc:\n # This handles the case when you construct a subset such that an unsatisfied\n # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,\n # we re-raise a DagsterInvalidSubsetError.\n node_paths = OpSelection(op_selection).resolve(self.graph)\n raise DagsterInvalidSubsetError(\n f"The attempted subset {str_format_set(node_paths)} for graph "\n f"{self.graph.name} results in an invalid graph."\n ) from exc\n\n
[docs] @public\n @deprecated(\n breaking_version="2.0.0",\n additional_warn_text="Directly instantiate `RunRequest(partition_key=...)` instead.",\n )\n def run_request_for_partition(\n self,\n partition_key: str,\n run_key: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n run_config: Optional[Mapping[str, Any]] = None,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional["DynamicPartitionsStore"] = None,\n ) -> RunRequest:\n """Creates a RunRequest object for a run that processes the given partition.\n\n Args:\n partition_key: The key of the partition to request a run for.\n run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n a :py:class:`PartitionedConfig`, this value will override replace the config\n provided by it.\n current_time (Optional[datetime]): Used to determine which time-partitions exist.\n Defaults to now.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. 
Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n\n Returns:\n RunRequest: an object that requests a run to process the given partition.\n """\n if not (self.partitions_def and self.partitioned_config):\n check.failed("Called run_request_for_partition on a non-partitioned job")\n\n if (\n isinstance(self.partitions_def, DynamicPartitionsDefinition)\n and self.partitions_def.name\n ):\n # Do not support using run_request_for_partition with dynamic partitions,\n # since this requires querying the instance once per run request for the\n # existent dynamic partitions\n check.failed(\n "run_request_for_partition is not supported for dynamic partitions. Instead, use"\n " RunRequest(partition_key=...)"\n )\n\n self.partitions_def.validate_partition_key(\n partition_key,\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n run_config = (\n run_config\n if run_config is not None\n else self.partitioned_config.get_run_config_for_partition_key(partition_key)\n )\n run_request_tags = {\n **(tags or {}),\n **self.partitioned_config.get_tags_for_partition_key(\n partition_key,\n job_name=self.name,\n ),\n }\n\n return RunRequest(\n run_key=run_key,\n run_config=run_config,\n tags=run_request_tags,\n job_name=self.name,\n asset_selection=asset_selection,\n partition_key=partition_key,\n )
\n\n def get_config_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n return self.get_job_snapshot().config_schema_snapshot\n\n def get_job_snapshot(self) -> "JobSnapshot":\n return self.get_job_index().job_snapshot\n\n def get_job_index(self) -> "JobIndex":\n from dagster._core.host_representation import JobIndex\n from dagster._core.snap import JobSnapshot\n\n return JobIndex(JobSnapshot.from_job_def(self), self.get_parent_job_snapshot())\n\n def get_job_snapshot_id(self) -> str:\n return self.get_job_index().job_snapshot_id\n\n def get_parent_job_snapshot(self) -> Optional["JobSnapshot"]:\n if self.op_selection_data:\n return self.op_selection_data.parent_job_def.get_job_snapshot()\n elif self.asset_selection_data:\n return self.asset_selection_data.parent_job_def.get_job_snapshot()\n else:\n return None\n\n def has_direct_input_value(self, input_name: str) -> bool:\n return input_name in self.input_values\n\n def get_direct_input_value(self, input_name: str) -> object:\n if input_name not in self.input_values:\n raise DagsterInvalidInvocationError(\n f"On job '{self.name}', attempted to retrieve input value for input named"\n f" '{input_name}', but no value was provided. 
Provided input values:"\n f" {sorted(list(self.input_values.keys()))}"\n )\n return self.input_values[input_name]\n\n def _copy(self, **kwargs: Any) -> "JobDefinition":\n # dict() calls copy dict props\n base_kwargs = dict(\n graph_def=self.graph,\n resource_defs=dict(self.resource_defs),\n executor_def=self._executor_def,\n logger_defs=self._loggers,\n config=self._original_config_argument,\n name=self._name,\n description=self.description,\n tags=self.tags,\n metadata=self._metadata,\n hook_defs=self.hook_defs,\n op_retry_policy=self._op_retry_policy,\n version_strategy=self.version_strategy,\n _subset_selection_data=self._subset_selection_data,\n asset_layer=self.asset_layer,\n input_values=self.input_values,\n partitions_def=self.partitions_def,\n _was_explicitly_provided_resources=None,\n )\n resolved_kwargs = {**base_kwargs, **kwargs} # base kwargs overwritten for conflicts\n job_def = JobDefinition.dagster_internal_init(**resolved_kwargs)\n update_wrapper(job_def, self, updated=())\n return job_def\n\n
[docs] @public\n def with_top_level_resources(\n self, resource_defs: Mapping[str, ResourceDefinition]\n ) -> "JobDefinition":\n """Apply a set of resources to all op instances within the job."""\n resource_defs = check.mapping_param(resource_defs, "resource_defs", key_type=str)\n return self._copy(resource_defs=resource_defs)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "JobDefinition":\n """Apply a set of hooks to all op instances within the job."""\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n return self._copy(hook_defs=(hook_defs | self.hook_defs))
\n\n def with_executor_def(self, executor_def: ExecutorDefinition) -> "JobDefinition":\n return self._copy(executor_def=executor_def)\n\n def with_logger_defs(self, logger_defs: Mapping[str, LoggerDefinition]) -> "JobDefinition":\n return self._copy(logger_defs=logger_defs)\n\n @property\n def op_selection(self) -> Optional[AbstractSet[str]]:\n return set(self.op_selection_data.op_selection) if self.op_selection_data else None\n\n @property\n def asset_selection(self) -> Optional[AbstractSet[AssetKey]]:\n return self.asset_selection_data.asset_selection if self.asset_selection_data else None\n\n @property\n def asset_check_selection(self) -> Optional[AbstractSet[AssetCheckKey]]:\n return (\n self.asset_selection_data.asset_check_selection if self.asset_selection_data else None\n )\n\n @property\n def resolved_op_selection(self) -> Optional[AbstractSet[str]]:\n return self.op_selection_data.resolved_op_selection if self.op_selection_data else None
\n\n\ndef _swap_default_io_man(resources: Mapping[str, ResourceDefinition], job: JobDefinition):\n """Used to create the user facing experience of the default io_manager\n switching to in-memory when using execute_in_process.\n """\n from dagster._core.storage.mem_io_manager import mem_io_manager\n\n if (\n resources.get(DEFAULT_IO_MANAGER_KEY) in [default_job_io_manager]\n and job.version_strategy is None\n ):\n updated_resources = dict(resources)\n updated_resources[DEFAULT_IO_MANAGER_KEY] = mem_io_manager\n return updated_resources\n\n return resources\n\n\n@dagster_maintained_io_manager\n@io_manager(\n description="Built-in filesystem IO manager that stores and retrieves values using pickling."\n)\ndef default_job_io_manager(init_context: "InitResourceContext"):\n # support overriding the default io manager via environment variables\n module_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_MODULE")\n attribute_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE")\n silence_failures = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_SILENCE_FAILURES")\n\n if module_name and attribute_name:\n from dagster._core.execution.build_resources import build_resources\n\n try:\n module = importlib.import_module(module_name)\n attr = getattr(module, attribute_name)\n check.invariant(\n isinstance(attr, IOManagerDefinition),\n "DAGSTER_DEFAULT_IO_MANAGER_MODULE and DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE"\n " must specify an IOManagerDefinition",\n )\n with build_resources({"io_manager": attr}, instance=init_context.instance) as resources:\n return resources.io_manager\n except Exception as e:\n if not silence_failures:\n raise\n else:\n warnings.warn(\n f"Failed to load io manager override with module: {module_name} attribute:"\n f" {attribute_name}: {e}\\nFalling back to default io manager."\n )\n\n # normally, default to the fs_io_manager\n from dagster._core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n instance = check.not_none(init_context.instance)\n return 
PickledObjectFilesystemIOManager(base_dir=instance.storage_directory())\n\n\n@dagster_maintained_io_manager\n@io_manager(\n description="Built-in filesystem IO manager that stores and retrieves values using pickling.",\n config_schema={"base_dir": Field(StringSource, is_required=False)},\n)\ndef default_job_io_manager_with_fs_io_manager_schema(init_context: "InitResourceContext"):\n # support overriding the default io manager via environment variables\n module_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_MODULE")\n attribute_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE")\n silence_failures = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_SILENCE_FAILURES")\n\n if module_name and attribute_name:\n from dagster._core.execution.build_resources import build_resources\n\n try:\n module = importlib.import_module(module_name)\n attr = getattr(module, attribute_name)\n check.invariant(\n isinstance(attr, IOManagerDefinition),\n "DAGSTER_DEFAULT_IO_MANAGER_MODULE and DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE"\n " must specify an IOManagerDefinition",\n )\n with build_resources({"io_manager": attr}, instance=init_context.instance) as resources:\n return resources.io_manager\n except Exception as e:\n if not silence_failures:\n raise\n else:\n warnings.warn(\n f"Failed to load io manager override with module: {module_name} attribute:"\n f" {attribute_name}: {e}\\nFalling back to default io manager."\n )\n from dagster._core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n # normally, default to the fs_io_manager\n base_dir = init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory() if init_context.instance else None\n )\n\n return PickledObjectFilesystemIOManager(base_dir=base_dir)\n\n\ndef _config_mapping_with_default_value(\n inner_schema: ConfigType,\n default_config: Mapping[str, Any],\n job_name: str,\n) -> ConfigMapping:\n if not isinstance(inner_schema, Shape):\n check.failed("Only Shape (dictionary) config_schema allowed on 
Job ConfigMapping")\n\n def config_fn(x):\n return x\n\n updated_fields = {}\n field_aliases = inner_schema.field_aliases\n for name, field in inner_schema.fields.items():\n if name in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[name],\n description=field.description,\n )\n elif name in field_aliases and field_aliases[name] in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[field_aliases[name]],\n description=field.description,\n )\n else:\n updated_fields[name] = field\n\n config_schema = Shape(\n fields=updated_fields,\n description=(\n "This run config schema was automatically populated with default values "\n "from `default_config`."\n ),\n field_aliases=inner_schema.field_aliases,\n )\n\n config_evr = validate_config(config_schema, default_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error in config when building job '{job_name}' ",\n config_evr.errors,\n default_config,\n )\n\n return ConfigMapping(\n config_fn=config_fn, config_schema=config_schema, receive_processed_config_values=False\n )\n\n\ndef get_run_config_schema_for_job(\n graph_def: GraphDefinition,\n resource_defs: Mapping[str, ResourceDefinition],\n executor_def: "ExecutorDefinition",\n logger_defs: Mapping[str, LoggerDefinition],\n asset_layer: Optional[AssetLayer],\n was_explicitly_provided_resources: bool = False,\n) -> ConfigType:\n return JobDefinition(\n name=graph_def.name,\n graph_def=graph_def,\n resource_defs=resource_defs,\n executor_def=executor_def,\n logger_defs=logger_defs,\n asset_layer=asset_layer,\n _was_explicitly_provided_resources=was_explicitly_provided_resources,\n ).run_config_schema.run_config_schema_type\n\n\ndef _infer_asset_layer_from_source_asset_deps(job_graph_def: GraphDefinition) -> AssetLayer:\n """For non-asset jobs that have some inputs that are fed from SourceAssets, constructs an\n AssetLayer that includes 
those SourceAssets.\n """\n asset_keys_by_node_input_handle: Dict[NodeInputHandle, AssetKey] = {}\n source_assets_list = []\n source_asset_keys_set = set()\n io_manager_keys_by_asset_key: Mapping[AssetKey, str] = {}\n\n # each entry is a graph definition and its handle relative to the job root\n stack: List[Tuple[GraphDefinition, Optional[NodeHandle]]] = [(job_graph_def, None)]\n\n while stack:\n graph_def, parent_node_handle = stack.pop()\n\n for node_name, input_source_assets in graph_def.node_input_source_assets.items():\n node_handle = NodeHandle(node_name, parent_node_handle)\n for input_name, source_asset in input_source_assets.items():\n if source_asset.key not in source_asset_keys_set:\n source_asset_keys_set.add(source_asset.key)\n source_assets_list.append(source_asset)\n\n input_handle = NodeInputHandle(node_handle, input_name)\n asset_keys_by_node_input_handle[input_handle] = source_asset.key\n for resolved_input_handle in graph_def.node_dict[\n node_name\n ].definition.resolve_input_to_destinations(input_handle):\n asset_keys_by_node_input_handle[resolved_input_handle] = source_asset.key\n\n if source_asset.io_manager_key:\n io_manager_keys_by_asset_key[source_asset.key] = source_asset.io_manager_key\n\n for node_name, node in graph_def.node_dict.items():\n if isinstance(node.definition, GraphDefinition):\n stack.append((node.definition, NodeHandle(node_name, parent_node_handle)))\n\n return AssetLayer(\n assets_defs_by_node_handle={},\n asset_keys_by_node_input_handle=asset_keys_by_node_input_handle,\n asset_info_by_node_output_handle={},\n asset_deps={},\n dependency_node_handles_by_asset_key={},\n assets_defs_by_key={},\n source_assets_by_key={\n source_asset.key: source_asset for source_asset in source_assets_list\n },\n io_manager_keys_by_asset_key=io_manager_keys_by_asset_key,\n dep_asset_keys_by_node_output_handle={},\n partition_mappings_by_asset_dep={},\n asset_checks_defs_by_node_handle={},\n node_output_handles_by_asset_check_key={},\n 
check_names_by_asset_key_by_node_handle={},\n check_key_by_node_output_handle={},\n )\n\n\ndef _build_all_node_defs(node_defs: Sequence[NodeDefinition]) -> Mapping[str, NodeDefinition]:\n all_defs: Dict[str, NodeDefinition] = {}\n for current_level_node_def in node_defs:\n for node_def in current_level_node_def.iterate_node_defs():\n if node_def.name in all_defs:\n if all_defs[node_def.name] != node_def:\n raise DagsterInvalidDefinitionError(\n 'Detected conflicting node definitions with the same name "{name}"'.format(\n name=node_def.name\n )\n )\n else:\n all_defs[node_def.name] = node_def\n\n return all_defs\n\n\ndef _create_run_config_schema(\n job_def: JobDefinition,\n required_resources: AbstractSet[str],\n) -> "RunConfigSchema":\n from .run_config import (\n RunConfigSchemaCreationData,\n construct_config_type_dictionary,\n define_run_config_schema_type,\n )\n from .run_config_schema import RunConfigSchema\n\n # When executing with a subset job, include the missing nodes\n # from the original job as ignored to allow execution with\n # run config that is valid for the original\n ignored_nodes: Sequence[Node] = []\n if job_def.is_subset:\n if isinstance(job_def.graph, SubselectedGraphDefinition): # op selection provided\n ignored_nodes = job_def.graph.get_top_level_omitted_nodes()\n elif job_def.asset_selection_data:\n parent_job = job_def\n while parent_job.asset_selection_data:\n parent_job = parent_job.asset_selection_data.parent_job_def\n\n ignored_nodes = [\n node for node in parent_job.graph.nodes if not job_def.has_node_named(node.name)\n ]\n else:\n ignored_nodes = []\n\n run_config_schema_type = define_run_config_schema_type(\n RunConfigSchemaCreationData(\n job_name=job_def.name,\n nodes=job_def.graph.nodes,\n graph_def=job_def.graph,\n dependency_structure=job_def.graph.dependency_structure,\n executor_def=job_def.executor_def,\n resource_defs=job_def.resource_defs,\n logger_defs=job_def.loggers,\n ignored_nodes=ignored_nodes,\n 
required_resources=required_resources,\n direct_inputs=job_def.input_values,\n asset_layer=job_def.asset_layer,\n )\n )\n\n if job_def.config_mapping:\n outer_config_type = job_def.config_mapping.config_schema.config_type\n else:\n outer_config_type = run_config_schema_type\n\n if outer_config_type is None:\n check.failed("Unexpected outer_config_type value of None")\n\n config_type_dict_by_name, config_type_dict_by_key = construct_config_type_dictionary(\n job_def.all_node_defs,\n outer_config_type,\n )\n\n return RunConfigSchema(\n run_config_schema_type=run_config_schema_type,\n config_type_dict_by_name=config_type_dict_by_name,\n config_type_dict_by_key=config_type_dict_by_key,\n config_mapping=job_def.config_mapping,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/job_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.job_definition"}, "load_assets_from_modules": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.load_assets_from_modules

\nimport inspect\nimport os\nimport pkgutil\nfrom importlib import import_module\nfrom types import ModuleType\nfrom typing import Dict, Generator, Iterable, List, Optional, Sequence, Set, Tuple, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .assets import AssetsDefinition\nfrom .cacheable_assets import CacheableAssetsDefinition\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKeyPrefix,\n    check_opt_coercible_to_asset_key_prefix_param,\n)\nfrom .source_asset import SourceAsset\n\n\ndef _find_assets_in_module(\n    module: ModuleType,\n) -> Generator[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition], None, None]:\n    """Finds assets in the given module and adds them to the given sets of assets and source assets."""\n    for attr in dir(module):\n        value = getattr(module, attr)\n        if isinstance(value, (AssetsDefinition, SourceAsset, CacheableAssetsDefinition)):\n            yield value\n        elif isinstance(value, list) and all(\n            isinstance(el, (AssetsDefinition, SourceAsset, CacheableAssetsDefinition))\n            for el in value\n        ):\n            yield from value\n\n\ndef assets_from_modules(\n    modules: Iterable[ModuleType], extra_source_assets: Optional[Sequence[SourceAsset]] = None\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n    """Constructs three lists, a list of assets, a list of source assets, and a list of cacheable\n    assets from the given modules.\n\n    Args:\n        modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n        extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n     
       group in addition to the source assets found in the modules.\n\n    Returns:\n        Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]]:\n            A tuple containing a list of assets, a list of source assets, and a list of\n            cacheable assets defined in the given modules.\n    """\n    asset_ids: Set[int] = set()\n    asset_keys: Dict[AssetKey, ModuleType] = dict()\n    source_assets: List[SourceAsset] = list(\n        check.opt_sequence_param(extra_source_assets, "extra_source_assets", of_type=SourceAsset)\n    )\n    cacheable_assets: List[CacheableAssetsDefinition] = []\n    assets: Dict[AssetKey, AssetsDefinition] = {}\n    for module in modules:\n        for asset in _find_assets_in_module(module):\n            if id(asset) not in asset_ids:\n                asset_ids.add(id(asset))\n                if isinstance(asset, CacheableAssetsDefinition):\n                    cacheable_assets.append(asset)\n                else:\n                    keys = asset.keys if isinstance(asset, AssetsDefinition) else [asset.key]\n                    for key in keys:\n                        if key in asset_keys:\n                            modules_str = ", ".join(\n                                set([asset_keys[key].__name__, module.__name__])\n                            )\n                            error_str = (\n                                f"Asset key {key} is defined multiple times. Definitions found in"\n                                f" modules: {modules_str}. 
"\n                            )\n\n                            if key in assets and isinstance(asset, AssetsDefinition):\n                                if assets[key].node_def == asset.node_def:\n                                    error_str += (\n                                        "One possible cause of this bug is a call to with_resources"\n                                        " outside of a repository definition, causing a duplicate"\n                                        " asset definition."\n                                    )\n\n                            raise DagsterInvalidDefinitionError(error_str)\n                        else:\n                            asset_keys[key] = module\n                            if isinstance(asset, AssetsDefinition):\n                                assets[key] = asset\n                    if isinstance(asset, SourceAsset):\n                        source_assets.append(asset)\n    return list(set(assets.values())), source_assets, cacheable_assets\n\n\n
[docs]def load_assets_from_modules(\n modules: Iterable[ModuleType],\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets and source assets from the given modules.\n\n Args:\n modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset]]:\n A list containing assets and source assets defined in the given modules.\n """\n group_name = check.opt_str_param(group_name, "group_name")\n key_prefix = check_opt_coercible_to_asset_key_prefix_param(key_prefix, "key_prefix")\n freshness_policy = check.opt_inst_param(freshness_policy, "freshness_policy", FreshnessPolicy)\n auto_materialize_policy = check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n )\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n (\n assets,\n source_assets,\n cacheable_assets,\n ) = assets_from_modules(modules)\n\n return assets_with_attributes(\n assets,\n source_assets,\n cacheable_assets,\n key_prefix=key_prefix,\n group_name=group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n source_key_prefix=source_key_prefix,\n )
\n\n\n
[docs]def load_assets_from_current_module(\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets, source assets, and cacheable assets from the module where\n this function is called.\n\n Args:\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CachableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n caller = inspect.stack()[1]\n module = inspect.getmodule(caller[0])\n if module is None:\n check.failed("Could not find a module for the caller")\n\n return load_assets_from_modules(\n [module],\n group_name=group_name,\n key_prefix=key_prefix,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )
\n\n\ndef assets_from_package_module(\n package_module: ModuleType,\n extra_source_assets: Optional[Sequence[SourceAsset]] = None,\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n """Constructs three lists, a list of assets, a list of source assets, and a list of cacheable assets\n from the given package module.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n group in addition to the source assets found in the modules.\n\n Returns:\n Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n A tuple containing a list of assets, a list of source assets, and a list of cacheable assets\n defined in the given modules.\n """\n return assets_from_modules(\n _find_modules_in_package(package_module), extra_source_assets=extra_source_assets\n )\n\n\n
[docs]def load_assets_from_package_module(\n package_module: ModuleType,\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets and source assets that includes all asset\n definitions, source assets, and cacheable assets in all sub-modules of the given package module.\n\n A package module is the result of importing a package.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n group_name = check.opt_str_param(group_name, "group_name")\n key_prefix = check_opt_coercible_to_asset_key_prefix_param(key_prefix, "key_prefix")\n freshness_policy = check.opt_inst_param(freshness_policy, "freshness_policy", FreshnessPolicy)\n auto_materialize_policy = check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n )\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n (\n assets,\n source_assets,\n cacheable_assets,\n ) = assets_from_package_module(package_module)\n return assets_with_attributes(\n assets,\n source_assets,\n cacheable_assets,\n key_prefix=key_prefix,\n group_name=group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n source_key_prefix=source_key_prefix,\n )
\n\n\n
[docs]def load_assets_from_package_name(\n package_name: str,\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets, source assets, and cacheable assets that includes all asset\n definitions and source assets in all sub-modules of the given package.\n\n Args:\n package_name (str): The name of a Python package to look for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n package_module = import_module(package_name)\n return load_assets_from_package_module(\n package_module,\n group_name=group_name,\n key_prefix=key_prefix,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )
\n\n\ndef _find_modules_in_package(package_module: ModuleType) -> Iterable[ModuleType]:\n yield package_module\n package_path = package_module.__file__\n if package_path:\n for _, modname, is_pkg in pkgutil.walk_packages([os.path.dirname(package_path)]):\n submodule = import_module(f"{package_module.__name__}.{modname}")\n if is_pkg:\n yield from _find_modules_in_package(submodule)\n else:\n yield submodule\n else:\n raise ValueError(\n f"Tried to find modules in package {package_module}, but its __file__ is None"\n )\n\n\ndef prefix_assets(\n assets_defs: Sequence[AssetsDefinition],\n key_prefix: CoercibleToAssetKeyPrefix,\n source_assets: Sequence[SourceAsset],\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix],\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset]]:\n """Given a list of assets, prefix the input and output asset keys with key_prefix.\n The prefix is not added to source assets.\n\n Input asset keys that reference other assets within assets_defs are "brought along" -\n i.e. prefixed as well.\n\n Example with a single asset:\n\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n result = prefixed_asset_key_replacements([asset_1], "my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n\n Example with dependencies within the list of assets:\n\n .. 
code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n result = prefixed_asset_key_replacements([asset1, asset2], "my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n assert result.assets[1].asset_key == AssetKey(["my_prefix", "asset2"])\n assert result.assets[1].dependency_keys == {AssetKey(["my_prefix", "asset1"])}\n\n """\n asset_keys = {asset_key for assets_def in assets_defs for asset_key in assets_def.keys}\n source_asset_keys = {source_asset.key for source_asset in source_assets}\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.is_list(key_prefix, of_type=str)\n\n result_assets: List[AssetsDefinition] = []\n for assets_def in assets_defs:\n output_asset_key_replacements = {\n asset_key: AssetKey([*key_prefix, *asset_key.path]) for asset_key in assets_def.keys\n }\n input_asset_key_replacements = {}\n for dep_asset_key in assets_def.dependency_keys:\n if dep_asset_key in asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n [*key_prefix, *dep_asset_key.path]\n )\n elif source_key_prefix and dep_asset_key in source_asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n [*source_key_prefix, *dep_asset_key.path]\n )\n\n result_assets.append(\n assets_def.with_attributes(\n output_asset_key_replacements=output_asset_key_replacements,\n input_asset_key_replacements=input_asset_key_replacements,\n )\n )\n\n if source_key_prefix:\n result_source_assets = [\n source_asset.with_attributes(key=AssetKey([*source_key_prefix, *source_asset.key.path]))\n for source_asset in source_assets\n ]\n else:\n result_source_assets = source_assets\n\n return result_assets, result_source_assets\n\n\ndef assets_with_attributes(\n assets_defs: Sequence[AssetsDefinition],\n source_assets: Sequence[SourceAsset],\n cacheable_assets: Sequence[CacheableAssetsDefinition],\n key_prefix: Optional[Sequence[str]],\n group_name: Optional[str],\n 
freshness_policy: Optional[FreshnessPolicy],\n auto_materialize_policy: Optional[AutoMaterializePolicy],\n backfill_policy: Optional[BackfillPolicy],\n source_key_prefix: Optional[Sequence[str]],\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n # There is a tricky edge case here where if a non-cacheable asset depends on a cacheable asset,\n # and the assets are prefixed, the non-cacheable asset's dependency will not be prefixed since\n # at prefix-time it is not known that its dependency is one of the cacheable assets.\n # https://github.com/dagster-io/dagster/pull/10389#pullrequestreview-1170913271\n if key_prefix:\n assets_defs, source_assets = prefix_assets(\n assets_defs, key_prefix, source_assets, source_key_prefix\n )\n cacheable_assets = [\n cached_asset.with_prefix_for_all(key_prefix) for cached_asset in cacheable_assets\n ]\n\n if group_name or freshness_policy or auto_materialize_policy or backfill_policy:\n assets_defs = [\n asset.with_attributes(\n group_names_by_key=(\n {asset_key: group_name for asset_key in asset.keys} if group_name else None\n ),\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )\n for asset in assets_defs\n ]\n if group_name:\n source_assets = [\n source_asset.with_attributes(group_name=group_name)\n for source_asset in source_assets\n ]\n cacheable_assets = [\n cached_asset.with_attributes_for_all(\n group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )\n for cached_asset in cacheable_assets\n ]\n\n return [*assets_defs, *source_assets, *cacheable_assets]\n
", "current_page_name": "_modules/dagster/_core/definitions/load_assets_from_modules", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.load_assets_from_modules"}, "logger_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.logger_definition

\nfrom typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidInvocationError\n\nfrom ..decorator_utils import get_function_params\nfrom .config import is_callable_valid_config_arg\nfrom .configurable import AnonymousConfigurableDefinition\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    import logging\n\n    from dagster._core.definitions import JobDefinition\n    from dagster._core.execution.context.logger import InitLoggerContext, UnboundInitLoggerContext\n\n    InitLoggerFunction = Callable[[InitLoggerContext], logging.Logger]\n\n\n
[docs]class LoggerDefinition(AnonymousConfigurableDefinition):\n """Core class for defining loggers.\n\n Loggers are job-scoped logging handlers, which will be automatically invoked whenever\n dagster messages are logged from within a job.\n\n Args:\n logger_fn (Callable[[InitLoggerContext], logging.Logger]): User-provided function to\n instantiate the logger. This logger will be automatically invoked whenever the methods\n on ``context.log`` are called from within job compute logic.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of this logger.\n """\n\n def __init__(\n self,\n logger_fn: "InitLoggerFunction",\n config_schema: Any = None,\n description: Optional[str] = None,\n ):\n self._logger_fn = check.callable_param(logger_fn, "logger_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n\n def __call__(self, *args, **kwargs):\n from dagster._core.execution.context.logger import UnboundInitLoggerContext\n\n from .logger_invocation import logger_invocation_result\n\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Logger initialization function has context argument, but no context argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of logger received multiple arguments. 
Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.logger_fn)[0].name\n\n if args:\n context = check.opt_inst_param(\n args[0],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, job_def=None),\n )\n return logger_invocation_result(self, context)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Logger initialization expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, job_def=None),\n )\n\n return logger_invocation_result(self, context)\n\n @public\n @property\n def logger_fn(self) -> "InitLoggerFunction":\n """Callable[[InitLoggerContext], logging.Logger]: The function that will be invoked to\n instantiate the logger.\n """\n return self._logger_fn\n\n @public\n @property\n def config_schema(self) -> Any:\n """Any: The schema for the logger's config. Configuration data available in `init_context.logger_config`."""\n return self._config_schema\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A human-readable description of the logger."""\n return self._description\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: Any,\n ) -> "LoggerDefinition":\n return LoggerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n logger_fn=self.logger_fn,\n )
\n\n\n@overload\ndef logger(\n config_schema: CoercableToConfigSchema, description: Optional[str] = ...\n) -> Callable[["InitLoggerFunction"], "LoggerDefinition"]: ...\n\n\n@overload\ndef logger(\n config_schema: "InitLoggerFunction", description: Optional[str] = ...\n) -> "LoggerDefinition": ...\n\n\n
[docs]def logger(\n config_schema: Union[CoercableToConfigSchema, "InitLoggerFunction"] = None,\n description: Optional[str] = None,\n) -> Union["LoggerDefinition", Callable[["InitLoggerFunction"], "LoggerDefinition"]]:\n """Define a logger.\n\n The decorated function should accept an :py:class:`InitLoggerContext` and return an instance of\n :py:class:`python:logging.Logger`. This function will become the ``logger_fn`` of an underlying\n :py:class:`LoggerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the logger.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. @logger versus @logger()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return LoggerDefinition(logger_fn=cast("InitLoggerFunction", config_schema))\n\n def _wrap(logger_fn: "InitLoggerFunction") -> "LoggerDefinition":\n return LoggerDefinition(\n logger_fn=logger_fn,\n config_schema=config_schema,\n description=description,\n )\n\n return _wrap
\n\n\n
[docs]def build_init_logger_context(\n logger_config: Any = None,\n job_def: Optional["JobDefinition"] = None,\n) -> "UnboundInitLoggerContext":\n """Builds logger initialization context from provided parameters.\n\n This function can be used to provide the context argument to the invocation of a logger\n definition.\n\n Note that you may only specify one of pipeline_def and job_def.\n\n Args:\n logger_config (Any): The config to provide during initialization of logger.\n job_def (Optional[JobDefinition]): The job definition that the logger will be used with.\n\n Examples:\n .. code-block:: python\n\n context = build_init_logger_context()\n logger_to_init(context)\n """\n from dagster._core.definitions import JobDefinition\n from dagster._core.execution.context.logger import UnboundInitLoggerContext\n\n check.opt_inst_param(job_def, "job_def", JobDefinition)\n\n return UnboundInitLoggerContext(logger_config=logger_config, job_def=job_def)
\n
", "current_page_name": "_modules/dagster/_core/definitions/logger_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.logger_definition"}, "materialize": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.materialize

\nfrom typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Set, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions.unresolved_asset_job_definition import define_asset_job\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..errors import DagsterInvariantViolationError\nfrom ..instance import DagsterInstance\nfrom ..storage.io_manager import IOManagerDefinition\nfrom ..storage.mem_io_manager import mem_io_manager\nfrom .assets import AssetsDefinition\nfrom .source_asset import SourceAsset\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_selection import CoercibleToAssetSelection\n    from dagster._core.definitions.events import AssetKey\n\n    from ..execution.execute_in_process_result import ExecuteInProcessResult\n\nEPHEMERAL_JOB_NAME = "__ephemeral_asset_job__"\n\n\n
[docs]def materialize(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]],\n run_config: Any = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n tags: Optional[Mapping[str, str]] = None,\n selection: Optional["CoercibleToAssetSelection"] = None,\n) -> "ExecuteInProcessResult":\n """Executes a single-threaded, in-process run which materializes provided assets.\n\n By default, will materialize assets to the local filesystem.\n\n Args:\n assets (Sequence[Union[AssetsDefinition, SourceAsset]]):\n The assets to materialize.\n\n Unless you're using `deps` or `non_argument_deps`, you must also include all assets that are\n upstream of the assets that you want to materialize. This is because those upstream\n asset definitions have information that is needed to load their contents while\n materializing the downstream assets.\n\n You can use the `selection` argument to distinguish between assets that you want to\n materialize and assets that are just present for loading.\n resources (Optional[Mapping[str, object]]):\n The resources needed for execution. Can provide resource instances\n directly, or resource definitions. Note that if provided resources\n conflict with resources directly on assets, an error will be thrown.\n run_config (Optional[Any]): The run config to use for the run that materializes the assets.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. 
Can only be used\n to select run config for assets with partitioned config.\n tags (Optional[Mapping[str, str]]): Tags for the run.\n selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]):\n A sub-selection of assets to materialize.\n\n If not provided, then all assets will be materialized.\n\n If providing a string or sequence of strings,\n https://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\n syntax.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n\n Examples:\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n # executes a run that materializes asset1 and then asset2\n materialize([asset1, asset2])\n\n # executes a run that materializes just asset2, loading its input from asset1\n materialize([asset1, asset2], selection=[asset2])\n """\n from dagster._core.definitions.definitions_class import Definitions\n\n assets = check.sequence_param(assets, "assets", of_type=(AssetsDefinition, SourceAsset))\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n partition_key = check.opt_str_param(partition_key, "partition_key")\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n\n all_executable_keys: Set[AssetKey] = set()\n for asset in assets:\n if isinstance(asset, AssetsDefinition):\n all_executable_keys = all_executable_keys.union(set(asset.keys))\n\n defs = Definitions(\n jobs=[define_asset_job(name=EPHEMERAL_JOB_NAME, selection=selection)],\n assets=assets,\n resources=resources,\n )\n return check.not_none(\n defs.get_job_def(EPHEMERAL_JOB_NAME),\n "This should always return a job",\n ).execute_in_process(\n run_config=run_config,\n instance=instance,\n partition_key=partition_key,\n raise_on_error=raise_on_error,\n tags=tags,\n )
\n\n\n
[docs]def materialize_to_memory(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]],\n run_config: Any = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n tags: Optional[Mapping[str, str]] = None,\n selection: Optional["CoercibleToAssetSelection"] = None,\n) -> "ExecuteInProcessResult":\n """Executes a single-threaded, in-process run which materializes provided assets in memory.\n\n Will explicitly use :py:func:`mem_io_manager` for all required io manager\n keys. If any io managers are directly provided using the `resources`\n argument, a :py:class:`DagsterInvariantViolationError` will be thrown.\n\n Args:\n assets (Sequence[Union[AssetsDefinition, SourceAsset]]):\n The assets to materialize. Can also provide :py:class:`SourceAsset` objects to fill dependencies for asset defs.\n run_config (Optional[Any]): The run config to use for the run that materializes the assets.\n resources (Optional[Mapping[str, object]]):\n The resources needed for execution. Can provide resource instances\n directly, or resource definitions. If provided resources\n conflict with resources directly on assets, an error will be thrown.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for assets with partitioned config.\n tags (Optional[Mapping[str, str]]): Tags for the run.\n selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]):\n A sub-selection of assets to materialize.\n\n If not provided, then all assets will be materialized.\n\n If providing a string or sequence of strings,\n https://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\n syntax.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n\n Examples:\n .. 
code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n # executes a run that materializes asset1 and then asset2\n materialize([asset1, asset2])\n\n # executes a run that materializes just asset1\n materialize([asset1, asset2], selection=[asset1])\n """\n assets = check.sequence_param(assets, "assets", of_type=(AssetsDefinition, SourceAsset))\n\n # Gather all resource defs for the purpose of checking io managers.\n resources_dict = resources or {}\n all_resource_keys = set(resources_dict.keys())\n for asset in assets:\n all_resource_keys = all_resource_keys.union(asset.resource_defs.keys())\n\n io_manager_keys = _get_required_io_manager_keys(assets)\n for io_manager_key in io_manager_keys:\n if io_manager_key in all_resource_keys:\n raise DagsterInvariantViolationError(\n "Attempted to call `materialize_to_memory` with a resource "\n f"provided for io manager key '{io_manager_key}'. Do not "\n "provide resources for io manager keys when calling "\n "`materialize_to_memory`, as it will override io management "\n "behavior for all keys."\n )\n\n resource_defs = merge_dicts({key: mem_io_manager for key in io_manager_keys}, resources_dict)\n\n return materialize(\n assets=assets,\n run_config=run_config,\n resources=resource_defs,\n instance=instance,\n partition_key=partition_key,\n raise_on_error=raise_on_error,\n tags=tags,\n selection=selection,\n )
\n\n\ndef _get_required_io_manager_keys(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]]\n) -> Set[str]:\n io_manager_keys = set()\n for asset in assets:\n for requirement in asset.get_resource_requirements():\n if requirement.expected_type == IOManagerDefinition:\n io_manager_keys.add(requirement.key)\n return io_manager_keys\n
", "current_page_name": "_modules/dagster/_core/definitions/materialize", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.materialize"}, "metadata": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.metadata

\nimport os\nfrom abc import ABC, abstractmethod\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import Self, TypeAlias, TypeVar\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import PublicAttr, deprecated, deprecated_param, experimental, public\nfrom dagster._core.errors import DagsterInvalidMetadata\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._serdes.serdes import (\n    FieldSerializer,\n    PackableValue,\n    UnpackContext,\n    WhitelistMap,\n    pack_value,\n)\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    normalize_renamed_param,\n)\n\nfrom .table import (  # re-exported\n    TableColumn as TableColumn,\n    TableColumnConstraints as TableColumnConstraints,\n    TableConstraints as TableConstraints,\n    TableRecord as TableRecord,\n    TableSchema as TableSchema,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.events import AssetKey\n\nArbitraryMetadataMapping: TypeAlias = Mapping[str, Any]\n\nRawMetadataValue = Union[\n    "MetadataValue",\n    TableSchema,\n    "AssetKey",\n    os.PathLike,\n    Dict[Any, Any],\n    float,\n    int,\n    List[Any],\n    str,\n    None,\n]\n\nMetadataMapping: TypeAlias = Mapping[str, "MetadataValue"]\nMetadataUserInput: TypeAlias = Mapping[str, RawMetadataValue]\n\nT_Packable = TypeVar("T_Packable", bound=PackableValue, default=PackableValue, covariant=True)\n\n# ########################\n# ##### NORMALIZATION\n# ########################\n\n\ndef normalize_metadata(\n    metadata: Mapping[str, RawMetadataValue],\n    allow_invalid: bool = False,\n) -> Mapping[str, "MetadataValue"]:\n    # This is a stopgap measure to deal with unsupported metadata values, which occur when we try\n    # to convert arbitrary metadata (on e.g. 
OutputDefinition) to a MetadataValue, which is required\n    # for serialization. This will cause unsupported values to be silently replaced with a\n    # string placeholder.\n    normalized_metadata: Dict[str, MetadataValue] = {}\n    for k, v in metadata.items():\n        try:\n            normalized_value = normalize_metadata_value(v)\n        except DagsterInvalidMetadata as e:\n            if allow_invalid:\n                deprecation_warning(\n                    "Support for arbitrary metadata values",\n                    "2.0.0",\n                    additional_warn_text=(\n                        "In the future, all user-supplied metadata values must be one of"\n                        f" {RawMetadataValue}"\n                    ),\n                    stacklevel=4,  # to get the caller of `normalize_metadata`\n                )\n                normalized_value = TextMetadataValue(f"[{v.__class__.__name__}] (unserializable)")\n            else:\n                raise DagsterInvalidMetadata(\n                    f'Could not resolve the metadata value for "{k}" to a known type. 
{e}'\n                ) from None\n        normalized_metadata[k] = normalized_value\n\n    return normalized_metadata\n\n\ndef normalize_metadata_value(raw_value: RawMetadataValue) -> "MetadataValue[Any]":\n    from dagster._core.definitions.events import AssetKey\n\n    if isinstance(raw_value, MetadataValue):\n        return raw_value\n    elif isinstance(raw_value, str):\n        return MetadataValue.text(raw_value)\n    elif isinstance(raw_value, float):\n        return MetadataValue.float(raw_value)\n    elif isinstance(raw_value, bool):\n        return MetadataValue.bool(raw_value)\n    elif isinstance(raw_value, int):\n        return MetadataValue.int(raw_value)\n    elif isinstance(raw_value, (list, dict)):\n        return MetadataValue.json(raw_value)\n    elif isinstance(raw_value, os.PathLike):\n        return MetadataValue.path(raw_value)\n    elif isinstance(raw_value, AssetKey):\n        return MetadataValue.asset(raw_value)\n    elif isinstance(raw_value, TableSchema):\n        return MetadataValue.table_schema(raw_value)\n    elif raw_value is None:\n        return MetadataValue.null()\n\n    raise DagsterInvalidMetadata(\n        f"Its type was {type(raw_value)}. Consider wrapping the value with the appropriate "\n        "MetadataValue type."\n    )\n\n\n# ########################\n# ##### METADATA VALUE\n# ########################\n\n\n
[docs]class MetadataValue(ABC, Generic[T_Packable]):\n """Utility class to wrap metadata values passed into Dagster events so that they can be\n displayed in the Dagster UI and other tooling.\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": "hello",\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n "num_rows": 0,\n },\n )\n """\n\n @public\n @property\n @abstractmethod\n def value(self) -> T_Packable:\n """The wrapped value."""\n raise NotImplementedError()\n\n
[docs] @public\n @staticmethod\n def text(text: str) -> "TextMetadataValue":\n """Static constructor for a metadata value wrapping text as\n :py:class:`TextMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": MetadataValue.text("hello")\n },\n )\n\n Args:\n text (str): The text string for a metadata entry.\n """\n return TextMetadataValue(text)
\n\n
[docs] @public\n @staticmethod\n def url(url: str) -> "UrlMetadataValue":\n """Static constructor for a metadata value wrapping a URL as\n :py:class:`UrlMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata={\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n }\n )\n\n Args:\n url (str): The URL for a metadata entry.\n """\n return UrlMetadataValue(url)
\n\n
[docs] @public\n @staticmethod\n def path(path: Union[str, os.PathLike]) -> "PathMetadataValue":\n """Static constructor for a metadata value wrapping a path as\n :py:class:`PathMetadataValue`.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "filepath": MetadataValue.path("path/to/file"),\n }\n )\n\n Args:\n path (str): The path for a metadata entry.\n """\n return PathMetadataValue(path)
\n\n
[docs] @public\n @staticmethod\n def notebook(path: Union[str, os.PathLike]) -> "NotebookMetadataValue":\n """Static constructor for a metadata value wrapping a notebook path as\n :py:class:`NotebookMetadataValue`.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "notebook_path": MetadataValue.notebook("path/to/notebook.ipynb"),\n }\n )\n\n Args:\n path (str): The path to a notebook for a metadata entry.\n """\n return NotebookMetadataValue(path)
\n\n
[docs] @public\n @staticmethod\n def json(data: Union[Sequence[Any], Mapping[str, Any]]) -> "JsonMetadataValue":\n """Static constructor for a metadata value wrapping a json-serializable list or dict\n as :py:class:`JsonMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata={\n "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n },\n )\n\n Args:\n data (Union[Sequence[Any], Mapping[str, Any]]): The JSON data for a metadata entry.\n """\n return JsonMetadataValue(data)
\n\n
[docs] @public\n @staticmethod\n def md(data: str) -> "MarkdownMetadataValue":\n """Static constructor for a metadata value wrapping markdown data as\n :py:class:`MarkdownMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata={\n 'Details': MetadataValue.md(md_str)\n },\n )\n\n Args:\n md_str (str): The markdown for a metadata entry.\n """\n return MarkdownMetadataValue(data)
\n\n
[docs] @public\n @staticmethod\n def python_artifact(python_artifact: Callable) -> "PythonArtifactMetadataValue":\n """Static constructor for a metadata value wrapping a python artifact as\n :py:class:`PythonArtifactMetadataValue`. Can be used as the value type for the\n `metadata` parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "class": MetadataValue.python_artifact(MyClass),\n "function": MetadataValue.python_artifact(my_function),\n }\n )\n\n Args:\n value (Callable): The python class or function for a metadata entry.\n """\n check.callable_param(python_artifact, "python_artifact")\n return PythonArtifactMetadataValue(python_artifact.__module__, python_artifact.__name__)
\n\n
[docs] @public\n @staticmethod\n def float(value: float) -> "FloatMetadataValue":\n """Static constructor for a metadata value wrapping a float as\n :py:class:`FloatMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n }\n )\n\n Args:\n value (float): The float value for a metadata entry.\n """\n return FloatMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def int(value: int) -> "IntMetadataValue":\n """Static constructor for a metadata value wrapping an int as\n :py:class:`IntMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "number of rows": MetadataValue.int(len(df)),\n },\n )\n\n Args:\n value (int): The int value for a metadata entry.\n """\n return IntMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def bool(value: bool) -> "BoolMetadataValue":\n """Static constructor for a metadata value wrapping a bool as\n :py:class:`BoolMetadataValuye`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n },\n )\n\n Args:\n value (bool): The bool value for a metadata entry.\n """\n return BoolMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def dagster_run(run_id: str) -> "DagsterRunMetadataValue":\n """Static constructor for a metadata value wrapping a reference to a Dagster run.\n\n Args:\n run_id (str): The ID of the run.\n """\n return DagsterRunMetadataValue(run_id)
\n\n
[docs] @public\n @staticmethod\n def asset(asset_key: "AssetKey") -> "DagsterAssetMetadataValue":\n """Static constructor for a metadata value referencing a Dagster asset, by key.\n\n For example:\n\n .. code-block:: python\n\n @op\n def validate_table(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey("my_table"),\n metadata={\n "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n },\n )\n\n Args:\n asset_key (AssetKey): The asset key referencing the asset.\n """\n from dagster._core.definitions.events import AssetKey\n\n check.inst_param(asset_key, "asset_key", AssetKey)\n return DagsterAssetMetadataValue(asset_key)
\n\n
[docs] @public\n @staticmethod\n @experimental\n def table(\n records: Sequence[TableRecord], schema: Optional[TableSchema] = None\n ) -> "TableMetadataValue":\n """Static constructor for a metadata value wrapping arbitrary tabular data as\n :py:class:`TableMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not has_errors,\n label="is_valid",\n metadata={\n "errors": MetadataValue.table(\n records=[\n TableRecord(code="invalid-data-type", row=2, col="name"),\n ],\n schema=TableSchema(\n columns=[\n TableColumn(name="code", type="string"),\n TableColumn(name="row", type="int"),\n TableColumn(name="col", type="string"),\n ]\n )\n ),\n },\n )\n """\n return TableMetadataValue(records, schema)
\n\n
[docs] @public\n @staticmethod\n def table_schema(\n schema: TableSchema,\n ) -> "TableSchemaMetadataValue":\n """Static constructor for a metadata value wrapping a table schema as\n :py:class:`TableSchemaMetadataValue`. Can be used as the value type\n for the `metadata` parameter for supported events.\n\n Example:\n .. code-block:: python\n\n schema = TableSchema(\n columns = [\n TableColumn(name="id", type="int"),\n TableColumn(name="status", type="bool"),\n ]\n )\n\n DagsterType(\n type_check_fn=some_validation_fn,\n name='MyTable',\n metadata={\n 'my_table_schema': MetadataValue.table_schema(schema),\n }\n )\n\n Args:\n schema (TableSchema): The table schema for a metadata entry.\n """\n return TableSchemaMetadataValue(schema)
\n\n
[docs] @public\n @staticmethod\n def null() -> "NullMetadataValue":\n """Static constructor for a metadata value representing null. Can be used as the value type\n for the `metadata` parameter for supported events.\n """\n return NullMetadataValue()
\n\n\n# ########################\n# ##### METADATA VALUE TYPES\n# ########################\n\n# NOTE: We have `type: ignore` in a few places below because mypy complains about an instance method\n# (e.g. `text`) overriding a static method on the superclass of the same name. This is not a concern\n# for us because these static methods should never be called on instances.\n\n# NOTE: `XMetadataValue` classes are serialized with a storage name of `XMetadataEntryData` to\n# maintain backward compatibility. See docstring of `whitelist_for_serdes` for more info.\n\n\n
[docs]@whitelist_for_serdes(storage_name="TextMetadataEntryData")\nclass TextMetadataValue(\n NamedTuple(\n "_TextMetadataValue",\n [\n ("text", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for text metadata entry data.\n\n Args:\n text (Optional[str]): The text data.\n """\n\n def __new__(cls, text: Optional[str]):\n return super(TextMetadataValue, cls).__new__(\n cls, check.opt_str_param(text, "text", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped text data."""\n return self.text
\n\n\n
[docs]@whitelist_for_serdes(storage_name="UrlMetadataEntryData")\nclass UrlMetadataValue(\n NamedTuple(\n "_UrlMetadataValue",\n [\n ("url", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for URL metadata entry data.\n\n Args:\n url (Optional[str]): The URL as a string.\n """\n\n def __new__(cls, url: Optional[str]):\n return super(UrlMetadataValue, cls).__new__(\n cls, check.opt_str_param(url, "url", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped URL."""\n return self.url
\n\n\n
[docs]@whitelist_for_serdes(storage_name="PathMetadataEntryData")\nclass PathMetadataValue(\n NamedTuple("_PathMetadataValue", [("path", PublicAttr[Optional[str]])]), MetadataValue[str]\n):\n """Container class for path metadata entry data.\n\n Args:\n path (Optional[str]): The path as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(PathMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped path."""\n return self.path
\n\n\n
[docs]@whitelist_for_serdes(storage_name="NotebookMetadataEntryData")\nclass NotebookMetadataValue(\n NamedTuple("_NotebookMetadataValue", [("path", PublicAttr[Optional[str]])]), MetadataValue[str]\n):\n """Container class for notebook metadata entry data.\n\n Args:\n path (Optional[str]): The path to the notebook as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(NotebookMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped path to the notebook as a string."""\n return self.path
\n\n\n
[docs]@whitelist_for_serdes(storage_name="JsonMetadataEntryData")\nclass JsonMetadataValue(\n NamedTuple(\n "_JsonMetadataValue",\n [\n ("data", PublicAttr[Optional[Union[Sequence[Any], Mapping[str, Any]]]]),\n ],\n ),\n MetadataValue[Union[Sequence[Any], Mapping[str, Any]]],\n):\n """Container class for JSON metadata entry data.\n\n Args:\n data (Union[Sequence[Any], Dict[str, Any]]): The JSON data.\n """\n\n def __new__(cls, data: Optional[Union[Sequence[Any], Mapping[str, Any]]]):\n data = check.opt_inst_param(data, "data", (Sequence, Mapping))\n try:\n # check that the value is JSON serializable\n seven.dumps(data)\n except TypeError:\n raise DagsterInvalidMetadata("Value is not JSON serializable.")\n return super(JsonMetadataValue, cls).__new__(cls, data)\n\n @public\n @property\n def value(self) -> Optional[Union[Sequence[Any], Mapping[str, Any]]]:\n """Optional[Union[Sequence[Any], Dict[str, Any]]]: The wrapped JSON data."""\n return self.data
\n\n\n
[docs]@whitelist_for_serdes(storage_name="MarkdownMetadataEntryData")\nclass MarkdownMetadataValue(\n NamedTuple(\n "_MarkdownMetadataValue",\n [\n ("md_str", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for markdown metadata entry data.\n\n Args:\n md_str (Optional[str]): The markdown as a string.\n """\n\n def __new__(cls, md_str: Optional[str]):\n return super(MarkdownMetadataValue, cls).__new__(\n cls, check.opt_str_param(md_str, "md_str", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped markdown as a string."""\n return self.md_str
\n\n\n# This should be deprecated or fixed so that `value` does not return itself.\n
[docs]@whitelist_for_serdes(storage_name="PythonArtifactMetadataEntryData")\nclass PythonArtifactMetadataValue(\n NamedTuple(\n "_PythonArtifactMetadataValue",\n [\n ("module", PublicAttr[str]),\n ("name", PublicAttr[str]),\n ],\n ),\n MetadataValue["PythonArtifactMetadataValue"],\n):\n """Container class for python artifact metadata entry data.\n\n Args:\n module (str): The module where the python artifact can be found\n name (str): The name of the python artifact\n """\n\n def __new__(cls, module: str, name: str):\n return super(PythonArtifactMetadataValue, cls).__new__(\n cls, check.str_param(module, "module"), check.str_param(name, "name")\n )\n\n @public\n @property\n def value(self) -> Self:\n """PythonArtifactMetadataValue: Identity function."""\n return self
\n\n\n
[docs]@whitelist_for_serdes(storage_name="FloatMetadataEntryData")\nclass FloatMetadataValue(\n NamedTuple(\n "_FloatMetadataValue",\n [\n ("value", PublicAttr[Optional[float]]),\n ],\n ),\n MetadataValue[float],\n):\n """Container class for float metadata entry data.\n\n Args:\n value (Optional[float]): The float value.\n """\n\n def __new__(cls, value: Optional[float]):\n return super(FloatMetadataValue, cls).__new__(cls, check.opt_float_param(value, "value"))
\n\n\n
[docs]@whitelist_for_serdes(storage_name="IntMetadataEntryData")\nclass IntMetadataValue(\n NamedTuple(\n "_IntMetadataValue",\n [\n ("value", PublicAttr[Optional[int]]),\n ],\n ),\n MetadataValue[int],\n):\n """Container class for int metadata entry data.\n\n Args:\n value (Optional[int]): The int value.\n """\n\n def __new__(cls, value: Optional[int]):\n return super(IntMetadataValue, cls).__new__(cls, check.opt_int_param(value, "value"))
\n\n\n@whitelist_for_serdes(storage_name="BoolMetadataEntryData")\nclass BoolMetadataValue(\n NamedTuple("_BoolMetadataValue", [("value", PublicAttr[Optional[bool]])]),\n MetadataValue[bool],\n):\n """Container class for bool metadata entry data.\n\n Args:\n value (Optional[bool]): The bool value.\n """\n\n def __new__(cls, value: Optional[bool]):\n return super(BoolMetadataValue, cls).__new__(cls, check.opt_bool_param(value, "value"))\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterPipelineRunMetadataEntryData")\nclass DagsterRunMetadataValue(\n NamedTuple(\n "_DagsterRunMetadataValue",\n [\n ("run_id", PublicAttr[str]),\n ],\n ),\n MetadataValue[str],\n):\n """Representation of a dagster run.\n\n Args:\n run_id (str): The run id\n """\n\n def __new__(cls, run_id: str):\n return super(DagsterRunMetadataValue, cls).__new__(cls, check.str_param(run_id, "run_id"))\n\n @public\n @property\n def value(self) -> str:\n """str: The wrapped run id."""\n return self.run_id
\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterAssetMetadataEntryData")\nclass DagsterAssetMetadataValue(\n NamedTuple("_DagsterAssetMetadataValue", [("asset_key", PublicAttr["AssetKey"])]),\n MetadataValue["AssetKey"],\n):\n """Representation of a dagster asset.\n\n Args:\n asset_key (AssetKey): The dagster asset key\n """\n\n def __new__(cls, asset_key: "AssetKey"):\n from dagster._core.definitions.events import AssetKey\n\n return super(DagsterAssetMetadataValue, cls).__new__(\n cls, check.inst_param(asset_key, "asset_key", AssetKey)\n )\n\n @public\n @property\n def value(self) -> "AssetKey":\n """AssetKey: The wrapped :py:class:`AssetKey`."""\n return self.asset_key
\n\n\n# This should be deprecated or fixed so that `value` does not return itself.\n
[docs]@experimental\n@whitelist_for_serdes(storage_name="TableMetadataEntryData")\nclass TableMetadataValue(\n NamedTuple(\n "_TableMetadataValue",\n [\n ("records", PublicAttr[Sequence[TableRecord]]),\n ("schema", PublicAttr[TableSchema]),\n ],\n ),\n MetadataValue["TableMetadataValue"],\n):\n """Container class for table metadata entry data.\n\n Args:\n records (TableRecord): The data as a list of records (i.e. rows).\n schema (Optional[TableSchema]): A schema for the table.\n """\n\n
[docs] @public\n @staticmethod\n def infer_column_type(value: object) -> str:\n """str: Infer the :py:class:`TableSchema` column type that will be used for a value."""\n if isinstance(value, bool):\n return "bool"\n elif isinstance(value, int):\n return "int"\n elif isinstance(value, float):\n return "float"\n else:\n return "string"
\n\n def __new__(cls, records: Sequence[TableRecord], schema: Optional[TableSchema]):\n check.sequence_param(records, "records", of_type=TableRecord)\n check.opt_inst_param(schema, "schema", TableSchema)\n\n if len(records) == 0:\n schema = check.not_none(schema, "schema must be provided if records is empty")\n else:\n columns = set(records[0].data.keys())\n for record in records[1:]:\n check.invariant(\n set(record.data.keys()) == columns, "All records must have the same fields"\n )\n schema = schema or TableSchema(\n columns=[\n TableColumn(name=k, type=TableMetadataValue.infer_column_type(v))\n for k, v in records[0].data.items()\n ]\n )\n\n return super(TableMetadataValue, cls).__new__(\n cls,\n records,\n schema,\n )\n\n @public\n @property\n def value(self) -> Self:\n """TableMetadataValue: Identity function."""\n return self
\n\n\n
[docs]@whitelist_for_serdes(storage_name="TableSchemaMetadataEntryData")\nclass TableSchemaMetadataValue(\n NamedTuple("_TableSchemaMetadataValue", [("schema", PublicAttr[TableSchema])]),\n MetadataValue[TableSchema],\n):\n """Representation of a schema for arbitrary tabular data.\n\n Args:\n schema (TableSchema): The dictionary containing the schema representation.\n """\n\n def __new__(cls, schema: TableSchema):\n return super(TableSchemaMetadataValue, cls).__new__(\n cls, check.inst_param(schema, "schema", TableSchema)\n )\n\n @public\n @property\n def value(self) -> TableSchema:\n """TableSchema: The wrapped :py:class:`TableSchema`."""\n return self.schema
\n\n\n@whitelist_for_serdes(storage_name="NullMetadataEntryData")\nclass NullMetadataValue(NamedTuple("_NullMetadataValue", []), MetadataValue[None]):\n """Representation of null."""\n\n @public\n @property\n def value(self) -> None:\n """None: The wrapped null value."""\n return None\n\n\n# ########################\n# ##### METADATA BACKCOMPAT\n# ########################\n\n# Metadata used to be represented as a `List[MetadataEntry]`, but that class has been deleted. But\n# we still serialize metadata dicts to the serialized representation of `List[MetadataEntry]` for\n# backcompat purposes.\n\n\nclass MetadataFieldSerializer(FieldSerializer):\n """Converts between metadata dict (new) and metadata entries list (old)."""\n\n storage_name = "metadata_entries"\n loaded_name = "metadata"\n\n def pack(\n self,\n metadata_dict: Mapping[str, MetadataValue],\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Sequence[Mapping[str, Any]]:\n return [\n {\n "__class__": "EventMetadataEntry",\n "label": k,\n # MetadataValue itself can't inherit from NamedTuple and so isn't a PackableValue,\n # but one of its subclasses will always be returned here.\n "entry_data": pack_value(v, whitelist_map, descent_path), # type: ignore\n "description": None,\n }\n for k, v in metadata_dict.items()\n ]\n\n def unpack(\n self,\n metadata_entries: List["MetadataEntry"],\n whitelist_map: WhitelistMap,\n context: UnpackContext,\n ) -> Mapping[str, MetadataValue]:\n return {e.label: e.entry_data for e in metadata_entries}\n\n\nT_MetadataValue = TypeVar("T_MetadataValue", bound=MetadataValue, covariant=True)\n\n\n# NOTE: MetadataEntry is no longer accessible via the public API-- all metadata APIs use metadata\n# dicts. This clas shas only been preserved to adhere strictly to our backcompat guarantees. It is\n# still instantiated in the above `MetadataFieldSerializer` but that can easily be changed.\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use a dict with `MetadataValue` values instead.",\n)\n@deprecated_param(\n param="entry_data", breaking_version="2.0", additional_warn_text="Use `value` instead."\n)\n@whitelist_for_serdes(storage_name="EventMetadataEntry")\nclass MetadataEntry(\n NamedTuple(\n "_MetadataEntry",\n [\n ("label", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("entry_data", PublicAttr[MetadataValue]),\n ],\n ),\n Generic[T_MetadataValue],\n):\n """A structure for describing metadata for Dagster events.\n\n .. note:: This class is no longer usable in any Dagster API, and will be completely removed in 2.0.\n\n Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\n in the Dagster UI and other tooling.\n\n Should be yielded from within an IO manager to append metadata for a given input/output event.\n For other event types, passing a dict with `MetadataValue` values to the `metadata` argument\n is preferred.\n\n Args:\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n value (MetadataValue): Typed metadata entry data. The different types allow\n for customized display in tools like the Dagster UI.\n """\n\n def __new__(\n cls,\n label: str,\n description: Optional[str] = None,\n entry_data: Optional["RawMetadataValue"] = None,\n value: Optional["RawMetadataValue"] = None,\n ):\n value = cast(\n RawMetadataValue,\n normalize_renamed_param(\n new_val=value,\n new_arg="value",\n old_val=entry_data,\n old_arg="entry_data",\n ),\n )\n value = normalize_metadata_value(value)\n\n return super(MetadataEntry, cls).__new__(\n cls,\n check.str_param(label, "label"),\n check.opt_str_param(description, "description"),\n check.inst_param(value, "value", MetadataValue),\n )\n\n @property\n def value(self):\n """Alias of `entry_data`."""\n return self.entry_data
\n
", "current_page_name": "_modules/dagster/_core/definitions/metadata", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "table": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.metadata.table

\nfrom typing import Mapping, NamedTuple, Optional, Sequence, Union, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental, public\nfrom dagster._serdes.serdes import (\n    whitelist_for_serdes,\n)\n\n# ########################\n# ##### TABLE RECORD\n# ########################\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass TableRecord(\n NamedTuple("TableRecord", [("data", PublicAttr[Mapping[str, Union[str, int, float, bool]]])])\n):\n """Represents one record in a table. Field keys are arbitrary strings-- field values must be\n strings, integers, floats, or bools.\n """\n\n def __new__(cls, data: Mapping[str, Union[str, int, float, bool]]):\n check.dict_param(\n data,\n "data",\n value_type=(str, float, int, bool, type(None)),\n additional_message="Record fields must be one of types: (str, float, int, bool)",\n )\n return super(TableRecord, cls).__new__(cls, data=data)
\n\n\n# ########################\n# ##### TABLE SCHEMA\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableSchema(\n NamedTuple(\n "TableSchema",\n [\n ("columns", PublicAttr[Sequence["TableColumn"]]),\n ("constraints", PublicAttr["TableConstraints"]),\n ],\n )\n):\n """Representation of a schema for tabular data.\n\n Schema is composed of two parts:\n\n - A required list of columns (`TableColumn`). Each column specifies a\n `name`, `type`, set of `constraints`, and (optional) `description`. `type`\n defaults to `string` if unspecified. Column constraints\n (`TableColumnConstraints`) consist of boolean properties `unique` and\n `nullable`, as well as a list of strings `other` containing string\n descriptions of all additional constraints (e.g. `"<= 5"`).\n - An optional list of table-level constraints (`TableConstraints`). A\n table-level constraint cannot be expressed in terms of a single column,\n e.g. col a > col b. Presently, all table-level constraints must be\n expressed as strings under the `other` attribute of a `TableConstraints`\n object.\n\n .. 
code-block:: python\n\n # example schema\n TableSchema(\n constraints = TableConstraints(\n other = [\n "foo > bar",\n ],\n ),\n columns = [\n TableColumn(\n name = "foo",\n type = "string",\n description = "Foo description",\n constraints = TableColumnConstraints(\n required = True,\n other = [\n "starts with the letter 'a'",\n ],\n ),\n ),\n TableColumn(\n name = "bar",\n type = "string",\n ),\n TableColumn(\n name = "baz",\n type = "custom_type",\n constraints = TableColumnConstraints(\n unique = True,\n )\n ),\n ],\n )\n\n Args:\n columns (List[TableColumn]): The columns of the table.\n constraints (Optional[TableConstraints]): The constraints of the table.\n """\n\n def __new__(\n cls,\n columns: Sequence["TableColumn"],\n constraints: Optional["TableConstraints"] = None,\n ):\n return super(TableSchema, cls).__new__(\n cls,\n columns=check.sequence_param(columns, "columns", of_type=TableColumn),\n constraints=check.opt_inst_param(\n constraints, "constraints", TableConstraints, default=_DEFAULT_TABLE_CONSTRAINTS\n ),\n )\n\n
[docs] @public\n @staticmethod\n def from_name_type_dict(name_type_dict: Mapping[str, str]):\n """Constructs a TableSchema from a dictionary whose keys are column names and values are the\n names of data types of those columns.\n """\n return TableSchema(\n columns=[\n TableColumn(name=name, type=type_str) for name, type_str in name_type_dict.items()\n ]\n )
\n\n\n# ########################\n# ##### TABLE CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableConstraints(\n NamedTuple(\n "TableConstraints",\n [\n ("other", PublicAttr[Sequence[str]]),\n ],\n )\n):\n """Descriptor for "table-level" constraints. Presently only one property,\n `other` is supported. This contains strings describing arbitrary\n table-level constraints. A table-level constraint is a constraint defined\n in terms of multiple columns (e.g. col_A > col_B) or in terms of rows.\n\n Args:\n other (List[str]): Descriptions of arbitrary table-level constraints.\n """\n\n def __new__(\n cls,\n other: Sequence[str],\n ):\n return super(TableConstraints, cls).__new__(\n cls,\n other=check.sequence_param(other, "other", of_type=str),\n )
\n\n\n_DEFAULT_TABLE_CONSTRAINTS = TableConstraints(other=[])\n\n# ########################\n# ##### TABLE COLUMN\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumn(\n NamedTuple(\n "TableColumn",\n [\n ("name", PublicAttr[str]),\n ("type", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("constraints", PublicAttr["TableColumnConstraints"]),\n ],\n )\n):\n """Descriptor for a table column. The only property that must be specified\n by the user is `name`. If no `type` is specified, `string` is assumed. If\n no `constraints` are specified, the column is assumed to be nullable\n (i.e. `required = False`) and have no other constraints beyond the data type.\n\n Args:\n name (List[str]): Descriptions of arbitrary table-level constraints.\n type (Optional[str]): The type of the column. Can be an arbitrary\n string. Defaults to `"string"`.\n description (Optional[str]): Description of this column. Defaults to `None`.\n constraints (Optional[TableColumnConstraints]): Column-level constraints.\n If unspecified, column is nullable with no constraints.\n """\n\n def __new__(\n cls,\n name: str,\n type: str = "string", # noqa: A002\n description: Optional[str] = None,\n constraints: Optional["TableColumnConstraints"] = None,\n ):\n return super(TableColumn, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n type=check.str_param(type, "type"),\n description=check.opt_str_param(description, "description"),\n constraints=cast(\n "TableColumnConstraints",\n check.opt_inst_param(\n constraints,\n "constraints",\n TableColumnConstraints,\n default=_DEFAULT_TABLE_COLUMN_CONSTRAINTS,\n ),\n ),\n )
\n\n\n# ########################\n# ##### TABLE COLUMN CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumnConstraints(\n NamedTuple(\n "TableColumnConstraints",\n [\n ("nullable", PublicAttr[bool]),\n ("unique", PublicAttr[bool]),\n ("other", PublicAttr[Optional[Sequence[str]]]),\n ],\n )\n):\n """Descriptor for a table column's constraints. Nullability and uniqueness are specified with\n boolean properties. All other constraints are described using arbitrary strings under the\n `other` property.\n\n Args:\n nullable (Optional[bool]): If true, this column can hold null values.\n unique (Optional[bool]): If true, all values in this column must be unique.\n other (List[str]): Descriptions of arbitrary column-level constraints\n not expressible by the predefined properties.\n """\n\n def __new__(\n cls,\n nullable: bool = True,\n unique: bool = False,\n other: Optional[Sequence[str]] = None,\n ):\n return super(TableColumnConstraints, cls).__new__(\n cls,\n nullable=check.bool_param(nullable, "nullable"),\n unique=check.bool_param(unique, "unique"),\n other=check.opt_sequence_param(other, "other"),\n )
\n\n\n_DEFAULT_TABLE_COLUMN_CONSTRAINTS = TableColumnConstraints()\n
", "current_page_name": "_modules/dagster/_core/definitions/metadata/table", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.definitions.metadata"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.metadata.table"}, "title": "dagster._core.definitions.metadata"}, "multi_asset_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.multi_asset_sensor_definition

\nimport inspect\nimport json\nfrom collections import OrderedDict, defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_selection import AssetSelection\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.scoped_resources_builder import ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._utils import normalize_to_repository\n\nfrom .events import AssetKey\nfrom .run_request import RunRequest, SensorResult, SkipReason\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorType,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\n    from dagster._core.storage.event_log.base import EventLogRecord\n\nMAX_NUM_UNCONSUMED_EVENTS = 25\n\n\nclass MultiAssetSensorAssetCursorComponent(\n    NamedTuple(\n        "_MultiAssetSensorAssetCursorComponent",\n        [\n            
("latest_consumed_event_partition", Optional[str]),\n            ("latest_consumed_event_id", Optional[int]),\n            ("trailing_unconsumed_partitioned_event_ids", Dict[str, int]),\n        ],\n    )\n):\n    """A cursor component that is used to track the cursor for a particular asset in a multi-asset\n    sensor.\n\n    Here's an illustration to help explain how this representation works:\n\n    partition_1  ---|----------a----\n    partition_2  -t-----|-x---------\n    partition_3  ----t------|---a---\n\n\n    The "|", "a", "t", and "x" characters represent materialization events.\n    The x-axis is storage_id, which is basically time. The cursor has been advanced to the "|" event\n    for each partition. latest_evaluated_event_partition would be "partition_3", and\n    "latest_evaluated_event_id" would be the storage_id of the "|" event for partition_3.\n\n    The "t" events aren't directly represented in the cursor, because they trail the event that the\n    the cursor for their partition has advanced to. The "a" events aren't directly represented\n    in the cursor, because they occurred after the "latest_evaluated_event_id".  
The "x" event is\n    included in "unevaluated_partitioned_event_ids", because it's after the event that the cursor\n    for its partition has advanced to, but trails "latest_evaluated_event_id".\n\n    Attributes:\n        latest_consumed_event_partition (Optional[str]): The partition of the latest consumed event\n            for this asset.\n        latest_consumed_event_id (Optional[int]): The event ID of the latest consumed event for\n            this asset.\n        trailing_unconsumed_partitioned_event_ids (Dict[str, int]): A mapping containing\n            the partition key mapped to the latest unconsumed materialization event for this\n            partition with an ID less than latest_consumed_event_id.\n    """\n\n    def __new__(\n        cls,\n        latest_consumed_event_partition,\n        latest_consumed_event_id,\n        trailing_unconsumed_partitioned_event_ids,\n    ):\n        return super(MultiAssetSensorAssetCursorComponent, cls).__new__(\n            cls,\n            latest_consumed_event_partition=check.opt_str_param(\n                latest_consumed_event_partition, "latest_consumed_event_partition"\n            ),\n            latest_consumed_event_id=check.opt_int_param(\n                latest_consumed_event_id, "latest_consumed_event_id"\n            ),\n            trailing_unconsumed_partitioned_event_ids=check.dict_param(\n                trailing_unconsumed_partitioned_event_ids,\n                "trailing_unconsumed_partitioned_event_ids",\n                key_type=str,\n                value_type=int,\n            ),\n        )\n\n\nclass MultiAssetSensorContextCursor:\n    # Tracks the state of the cursor within the tick, created for utility purposes.\n    # Must call MultiAssetSensorEvaluationContext._update_cursor_after_evaluation at end of tick\n    # to serialize the cursor.\n    def __init__(self, cursor: Optional[str], context: "MultiAssetSensorEvaluationContext"):\n        loaded_cursor = json.loads(cursor) if cursor else 
{}\n        self._cursor_component_by_asset_key: Dict[str, MultiAssetSensorAssetCursorComponent] = {}\n\n        # The initial latest consumed event ID at the beginning of the tick\n        self.initial_latest_consumed_event_ids_by_asset_key: Dict[str, Optional[int]] = {}\n\n        for str_asset_key, cursor_list in loaded_cursor.items():\n            if len(cursor_list) != 3:\n                # In this case, the cursor object is not a multi asset sensor asset cursor\n                # component. This cursor is maintained by the asset reconciliation sensor.\n                break\n            else:\n                partition_key, event_id, trailing_unconsumed_partitioned_event_ids = cursor_list\n                self._cursor_component_by_asset_key[str_asset_key] = (\n                    MultiAssetSensorAssetCursorComponent(\n                        latest_consumed_event_partition=partition_key,\n                        latest_consumed_event_id=event_id,\n                        trailing_unconsumed_partitioned_event_ids=trailing_unconsumed_partitioned_event_ids,\n                    )\n                )\n\n                self.initial_latest_consumed_event_ids_by_asset_key[str_asset_key] = event_id\n\n        check.dict_param(self._cursor_component_by_asset_key, "unpacked_cursor", key_type=str)\n        self._context = context\n\n    def get_cursor_for_asset(self, asset_key: AssetKey) -> MultiAssetSensorAssetCursorComponent:\n        return self._cursor_component_by_asset_key.get(\n            str(asset_key), MultiAssetSensorAssetCursorComponent(None, None, {})\n        )\n\n    def get_stringified_cursor(self) -> str:\n        return json.dumps(self._cursor_component_by_asset_key)\n\n\n
[docs]@experimental\nclass MultiAssetSensorEvaluationContext(SensorEvaluationContext):\n """The context object available as the argument to the evaluation function of a\n :py:class:`dagster.MultiAssetSensorDefinition`.\n\n Users should not instantiate this object directly. To construct a\n `MultiAssetSensorEvaluationContext` for testing purposes, use :py:func:`dagster.\n build_multi_asset_sensor_context`.\n\n The `MultiAssetSensorEvaluationContext` contains a cursor object that tracks the state of\n consumed event logs for each monitored asset. For each asset, the cursor stores the storage ID\n of the latest materialization that has been marked as "consumed" (via a call to `advance_cursor`)\n in a `latest_consumed_event_id` field.\n\n For each monitored asset, the cursor will store the latest unconsumed event ID for up to 25\n partitions. Each event ID must be before the `latest_consumed_event_id` field for the asset.\n\n Events marked as consumed via `advance_cursor` will be returned in future ticks until they\n are marked as consumed.\n\n To update the cursor to the latest materialization and clear the unconsumed events, call\n `advance_all_cursors`.\n\n Attributes:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets monitored\n by the sensor. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n repository_def (Optional[RepositoryDefinition]): The repository that the sensor belongs to.\n If needed by the sensor top-level resource definitions will be pulled from this repository.\n You can provide either this or `definitions`.\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest. 
Must be a dictionary of asset key\n strings to a stringified tuple of (latest_event_partition, latest_event_storage_id,\n trailing_unconsumed_partitioned_event_ids).\n last_completion_time (float): DEPRECATED The last time that the sensor was consumed (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n\n Example:\n .. code-block:: python\n\n from dagster import multi_asset_sensor, MultiAssetSensorEvaluationContext\n\n @multi_asset_sensor(monitored_assets=[AssetKey("asset_1), AssetKey("asset_2)])\n def the_sensor(context: MultiAssetSensorEvaluationContext):\n ...\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n repository_def: Optional["RepositoryDefinition"],\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n instance: Optional[DagsterInstance] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n definitions: Optional["Definitions"] = None,\n ):\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n )\n 
self._monitored_asset_keys: Sequence[AssetKey]\n if isinstance(monitored_assets, AssetSelection):\n repo_assets = self._repository_def.assets_defs_by_key.values()\n repo_source_assets = self._repository_def.source_assets_by_key.values()\n self._monitored_asset_keys = list(\n monitored_assets.resolve([*repo_assets, *repo_source_assets])\n )\n else:\n self._monitored_asset_keys = monitored_assets\n\n self._assets_by_key: Dict[AssetKey, Optional[AssetsDefinition]] = {}\n self._partitions_def_by_asset_key: Dict[AssetKey, Optional[PartitionsDefinition]] = {}\n for asset_key in self._monitored_asset_keys:\n assets_def = self._repository_def.assets_defs_by_key.get(asset_key)\n self._assets_by_key[asset_key] = assets_def\n\n source_asset_def = self._repository_def.source_assets_by_key.get(asset_key)\n self._partitions_def_by_asset_key[asset_key] = (\n assets_def.partitions_def\n if assets_def\n else source_asset_def.partitions_def if source_asset_def else None\n )\n\n # Cursor object with utility methods for updating and retrieving cursor information.\n # At the end of each tick, must call update_cursor_after_evaluation to update the serialized\n # cursor.\n self._unpacked_cursor = MultiAssetSensorContextCursor(cursor, self)\n self._cursor_advance_state_mutation = MultiAssetSensorCursorAdvances()\n\n self._initial_unconsumed_events_by_id: Dict[int, EventLogRecord] = {}\n self._fetched_initial_unconsumed_events = False\n\n super(MultiAssetSensorEvaluationContext, self).__init__(\n instance_ref=instance_ref,\n last_completion_time=last_completion_time,\n last_run_key=last_run_key,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n repository_def=repository_def,\n resources=resource_defs,\n )\n\n def _cache_initial_unconsumed_events(self) -> None:\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n # This method caches the initial unconsumed events for each asset key. 
To generate the\n # current unconsumed events, call get_trailing_unconsumed_events instead.\n if self._fetched_initial_unconsumed_events:\n return\n\n for asset_key in self._monitored_asset_keys:\n unconsumed_event_ids = list(\n self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values()\n )\n if unconsumed_event_ids:\n event_records = self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n storage_ids=unconsumed_event_ids,\n )\n )\n self._initial_unconsumed_events_by_id.update(\n {event_record.storage_id: event_record for event_record in event_records}\n )\n\n self._fetched_initial_unconsumed_events = True\n\n def _get_unconsumed_events_with_ids(\n self, event_ids: Sequence[int]\n ) -> Sequence["EventLogRecord"]:\n self._cache_initial_unconsumed_events()\n unconsumed_events = []\n for event_id in sorted(event_ids):\n event = self._initial_unconsumed_events_by_id.get(event_id)\n unconsumed_events.extend([event] if event else [])\n\n return unconsumed_events\n\n
[docs] @public\n def get_trailing_unconsumed_events(self, asset_key: AssetKey) -> Sequence["EventLogRecord"]:\n """Fetches the unconsumed events for a given asset key. Returns only events\n before the latest consumed event ID for the given asset. To mark an event as consumed,\n pass the event to `advance_cursor`. Returns events in ascending order by storage ID.\n\n Args:\n asset_key (AssetKey): The asset key to get unconsumed events for.\n\n Returns:\n Sequence[EventLogRecord]: The unconsumed events for the given asset key.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n return self._get_unconsumed_events_with_ids(\n list(self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values())\n )
\n\n def _get_partitions_after_cursor(self, asset_key: AssetKey) -> Sequence[str]:\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partition_key = self._get_cursor(asset_key).latest_consumed_event_partition\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n\n if not isinstance(partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(f"No partitions defined for asset key {asset_key}")\n\n partitions_to_fetch = list(\n partitions_def.get_partition_keys(dynamic_partitions_store=self.instance)\n )\n\n if partition_key is not None:\n # Return partitions after the cursor partition, not including the cursor partition\n partitions_to_fetch = partitions_to_fetch[\n partitions_to_fetch.index(partition_key) + 1 :\n ]\n return partitions_to_fetch\n\n def update_cursor_after_evaluation(self) -> None:\n """Updates the cursor after the sensor evaluation function has been called. This method\n should be called at most once per evaluation.\n """\n new_cursor = self._cursor_advance_state_mutation.get_cursor_with_advances(\n self, self._unpacked_cursor\n )\n\n if new_cursor is not None:\n # Cursor was not updated by this context object, so we do not need to update it\n self._cursor = new_cursor\n self._unpacked_cursor = MultiAssetSensorContextCursor(new_cursor, self)\n self._cursor_advance_state_mutation = MultiAssetSensorCursorAdvances()\n self._fetched_initial_unconsumed_events = False\n\n
[docs] @public\n def latest_materialization_records_by_key(\n self,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n ) -> Mapping[AssetKey, Optional["EventLogRecord"]]:\n """Fetches the most recent materialization event record for each asset in asset_keys.\n Only fetches events after the latest consumed event ID for the given asset key.\n\n Args:\n asset_keys (Optional[Sequence[AssetKey]]): list of asset keys to fetch events for. If\n not specified, the latest materialization will be fetched for all assets the\n multi_asset_sensor monitors.\n\n Returns: Mapping of AssetKey to EventLogRecord where the EventLogRecord is the latest\n materialization event for the asset. If there is no materialization event for the asset,\n the value in the mapping will be None.\n """\n # Do not evaluate unconsumed events, only events newer than the cursor\n # if there are no new events after the cursor, the cursor points to the most\n # recent event.\n\n if asset_keys is None:\n asset_keys = self._monitored_asset_keys\n else:\n asset_keys = check.opt_sequence_param(asset_keys, "asset_keys", of_type=AssetKey)\n\n asset_records = self.instance.get_asset_records(asset_keys)\n\n asset_event_records: Dict[AssetKey, Optional[EventLogRecord]] = {\n asset_key: None for asset_key in asset_keys\n }\n for record in asset_records:\n if (\n record.asset_entry.last_materialization_record\n and record.asset_entry.last_materialization_record.storage_id\n > (self._get_cursor(record.asset_entry.asset_key).latest_consumed_event_id or 0)\n ):\n asset_event_records[record.asset_entry.asset_key] = (\n record.asset_entry.last_materialization_record\n )\n\n return asset_event_records
\n\n
[docs] @public\n def materialization_records_for_key(\n self, asset_key: AssetKey, limit: Optional[int] = None\n ) -> Iterable["EventLogRecord"]:\n """Fetches asset materialization event records for asset_key, with the earliest event first.\n\n Only fetches events after the latest consumed event ID for the given asset key.\n\n Args:\n asset_key (AssetKey): The asset to fetch materialization events for\n limit (Optional[int]): The number of events to fetch\n """\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n if asset_key not in self._assets_by_key:\n raise DagsterInvalidInvocationError(f"Asset key {asset_key} not monitored by sensor.")\n\n events = list(\n self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n after_cursor=self._get_cursor(asset_key).latest_consumed_event_id,\n ),\n ascending=True,\n limit=limit,\n )\n )\n\n return events
\n\n def _get_cursor(self, asset_key: AssetKey) -> MultiAssetSensorAssetCursorComponent:\n """Returns the MultiAssetSensorAssetCursorComponent for the asset key.\n\n For more information, view the docstring for the MultiAssetSensorAssetCursorComponent class.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n return self._unpacked_cursor.get_cursor_for_asset(asset_key)\n\n
[docs] @public\n def latest_materialization_records_by_partition(\n self,\n asset_key: AssetKey,\n after_cursor_partition: Optional[bool] = False,\n ) -> Mapping[str, "EventLogRecord"]:\n """Given an asset, returns a mapping of partition key to the latest materialization event\n for that partition. Fetches only materializations that have not been marked as "consumed"\n via a call to `advance_cursor`.\n\n Args:\n asset_key (AssetKey): The asset to fetch events for.\n after_cursor_partition (Optional[bool]): If True, only materializations with partitions\n after the cursor's current partition will be returned. By default, set to False.\n\n Returns:\n Mapping[str, EventLogRecord]:\n Mapping of AssetKey to a mapping of partitions to EventLogRecords where the\n EventLogRecord is the most recent materialization event for the partition.\n The mapping preserves the order that the materializations occurred.\n\n Example:\n .. code-block:: python\n\n @asset(partitions_def=DailyPartitionsDefinition("2022-07-01"))\n def july_asset():\n return 1\n\n @multi_asset_sensor(asset_keys=[july_asset.key])\n def my_sensor(context):\n context.latest_materialization_records_by_partition(july_asset.key)\n\n # After materializing july_asset for 2022-07-05, latest_materialization_by_partition\n # returns {"2022-07-05": EventLogRecord(...)}\n\n """\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventLogRecord, EventRecordsFilter\n\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n if asset_key not in self._assets_by_key:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} not monitored in sensor definition"\n )\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n if not isinstance(partitions_def, PartitionsDefinition):\n raise DagsterInvariantViolationError(\n "Cannot get latest materialization by partition for assets with no partitions"\n )\n\n partitions_to_fetch = (\n 
self._get_partitions_after_cursor(asset_key)\n if after_cursor_partition\n else list(partitions_def.get_partition_keys(dynamic_partitions_store=self.instance))\n )\n\n # Retain ordering of materializations\n materialization_by_partition: Dict[str, EventLogRecord] = OrderedDict()\n\n # Add unconsumed events to the materialization by partition dictionary\n # These events came before the cursor, so should be inserted in storage ID ascending order\n for unconsumed_event in sorted(\n self._get_unconsumed_events_with_ids(\n list(self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values())\n )\n ):\n partition = unconsumed_event.partition_key\n if isinstance(partition, str) and partition in partitions_to_fetch:\n if partition in materialization_by_partition:\n # Remove partition to ensure materialization_by_partition preserves\n # the order of materializations\n materialization_by_partition.pop(partition)\n # Add partition and materialization to the end of the OrderedDict\n materialization_by_partition[partition] = unconsumed_event\n\n partition_materializations = self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n asset_partitions=partitions_to_fetch,\n after_cursor=self._get_cursor(asset_key).latest_consumed_event_id,\n ),\n ascending=True,\n )\n for materialization in partition_materializations:\n partition = materialization.partition_key\n\n if isinstance(partition, str):\n if partition in materialization_by_partition:\n # Remove partition to ensure materialization_by_partition preserves\n # the order of materializations\n materialization_by_partition.pop(partition)\n # Add partition and materialization to the end of the OrderedDict\n materialization_by_partition[partition] = materialization\n\n return materialization_by_partition
\n\n
[docs] @public\n def latest_materialization_records_by_partition_and_asset(\n self,\n ) -> Mapping[str, Mapping[AssetKey, "EventLogRecord"]]:\n """Finds the most recent unconsumed materialization for each partition for each asset\n monitored by the sensor. Aggregates all materializations into a mapping of partition key\n to a mapping of asset key to the materialization event for that partition.\n\n For example, if the sensor monitors two partitioned assets A and B that are materialized\n for partition_x after the cursor, this function returns:\n\n .. code-block:: python\n\n {\n "partition_x": {asset_a.key: EventLogRecord(...), asset_b.key: EventLogRecord(...)}\n }\n\n This method can only be called when all monitored assets are partitioned and share\n the same partition definition.\n """\n partitions_defs = list(self._partitions_def_by_asset_key.values())\n if not partitions_defs or not all(x == partitions_defs[0] for x in partitions_defs):\n raise DagsterInvalidInvocationError(\n "All assets must be partitioned and share the same partitions definition"\n )\n\n asset_and_materialization_tuple_by_partition: Dict[\n str, Dict[AssetKey, "EventLogRecord"]\n ] = defaultdict(dict)\n\n for asset_key in self._monitored_asset_keys:\n materialization_by_partition = self.latest_materialization_records_by_partition(\n asset_key\n )\n for partition, materialization in materialization_by_partition.items():\n asset_and_materialization_tuple_by_partition[partition][asset_key] = materialization\n\n return asset_and_materialization_tuple_by_partition
\n\n
[docs] @public\n def get_cursor_partition(self, asset_key: Optional[AssetKey]) -> Optional[str]:\n """A utility method to get the current partition the cursor is on."""\n asset_key = check.opt_inst_param(asset_key, "asset_key", AssetKey)\n if asset_key not in self._monitored_asset_keys:\n raise DagsterInvalidInvocationError(\n "Provided asset key must correspond to a provided asset"\n )\n if asset_key:\n partition_key = self._get_cursor(asset_key).latest_consumed_event_partition\n elif self._monitored_asset_keys is not None and len(self._monitored_asset_keys) == 1:\n partition_key = self._get_cursor(\n self._monitored_asset_keys[0]\n ).latest_consumed_event_partition\n else:\n raise DagsterInvalidInvocationError(\n "Asset key must be provided when multiple assets are defined"\n )\n\n return partition_key
\n\n
[docs] @public\n def all_partitions_materialized(\n self, asset_key: AssetKey, partitions: Optional[Sequence[str]] = None\n ) -> bool:\n """A utility method to check if a provided list of partitions have been materialized\n for a particular asset. This method ignores the cursor and checks all materializations\n for the asset.\n\n Args:\n asset_key (AssetKey): The asset to check partitions for.\n partitions (Optional[Sequence[str]]): A list of partitions to check. If not provided,\n all partitions for the asset will be checked.\n\n Returns:\n bool: True if all selected partitions have been materialized, False otherwise.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n if partitions is not None:\n check.sequence_param(partitions, "partitions", of_type=str)\n if len(partitions) == 0:\n raise DagsterInvalidInvocationError("Must provide at least one partition in list")\n\n materialized_partitions = self.instance.get_materialized_partitions(asset_key)\n if not partitions:\n if asset_key not in self._monitored_asset_keys:\n raise DagsterInvariantViolationError(\n f"Asset key {asset_key} not monitored by sensor"\n )\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n if not partitions_def:\n raise DagsterInvariantViolationError(\n f"Asset key {asset_key} is not partitioned. Cannot check if partitions have"\n " been materialized."\n )\n partitions = partitions_def.get_partition_keys(dynamic_partitions_store=self.instance)\n\n return all([partition in materialized_partitions for partition in partitions])
\n\n def _get_asset(self, asset_key: AssetKey, fn_name: str) -> AssetsDefinition:\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n repo_def = cast(RepositoryDefinition, self._repository_def)\n repository_assets = repo_def.assets_defs_by_key\n if asset_key in self._assets_by_key:\n asset_def = self._assets_by_key[asset_key]\n if asset_def is None:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} does not have an AssetDefinition in this repository"\n f" (likely because it is a SourceAsset). fn context.{fn_name} can only be"\n " called for assets with AssetDefinitions in the repository."\n )\n else:\n return asset_def\n elif asset_key in repository_assets:\n return repository_assets[asset_key]\n else:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} not monitored in sensor and does not exist in target jobs"\n )\n\n
[docs] @public\n def get_downstream_partition_keys(\n self, partition_key: str, from_asset_key: AssetKey, to_asset_key: AssetKey\n ) -> Sequence[str]:\n """Converts a partition key from one asset to the corresponding partition key in a downstream\n asset. Uses the existing partition mapping between the upstream asset and the downstream\n asset if it exists, otherwise, uses the default partition mapping.\n\n Args:\n partition_key (str): The partition key to convert.\n from_asset_key (AssetKey): The asset key of the upstream asset, which the provided\n partition key belongs to.\n to_asset_key (AssetKey): The asset key of the downstream asset. The provided partition\n key will be mapped to partitions within this asset.\n\n Returns:\n Sequence[str]: A list of the corresponding downstream partitions in to_asset_key that\n partition_key maps to.\n """\n partition_key = check.str_param(partition_key, "partition_key")\n\n to_asset = self._get_asset(to_asset_key, fn_name="get_downstream_partition_keys")\n from_asset = self._get_asset(from_asset_key, fn_name="get_downstream_partition_keys")\n\n to_partitions_def = to_asset.partitions_def\n\n if not isinstance(to_partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(\n f"Asset key {to_asset_key} is not partitioned. Cannot get partition keys."\n )\n if not isinstance(from_asset.partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(\n f"Asset key {from_asset_key} is not partitioned. Cannot get partition keys."\n )\n\n partition_mapping = to_asset.infer_partition_mapping(\n from_asset_key, from_asset.partitions_def\n )\n downstream_partition_key_subset = (\n partition_mapping.get_downstream_partitions_for_partitions(\n from_asset.partitions_def.empty_subset().with_partition_keys([partition_key]),\n downstream_partitions_def=to_partitions_def,\n dynamic_partitions_store=self.instance,\n )\n )\n\n return list(downstream_partition_key_subset.get_partition_keys())
\n\n
[docs] @public\n def advance_cursor(\n self, materialization_records_by_key: Mapping[AssetKey, Optional["EventLogRecord"]]\n ):\n """Marks the provided materialization records as having been consumed by the sensor.\n\n At the end of the tick, the cursor will be updated to advance past all materializations\n records provided via `advance_cursor`. In the next tick, records that have been consumed\n will no longer be returned.\n\n Passing a partitioned materialization record into this function will mark prior materializations\n with the same asset key and partition as having been consumed.\n\n Args:\n materialization_records_by_key (Mapping[AssetKey, Optional[EventLogRecord]]): Mapping of\n AssetKeys to EventLogRecord or None. If an EventLogRecord is provided, the cursor\n for the AssetKey will be updated and future calls to fetch asset materialization events\n will not fetch this event again. If None is provided, the cursor for the AssetKey\n will not be updated.\n """\n self._cursor_advance_state_mutation.add_advanced_records(materialization_records_by_key)\n self._cursor_updated = True
\n\n
[docs] @public\n def advance_all_cursors(self):\n """Updates the cursor to the most recent materialization event for all assets monitored by\n the multi_asset_sensor.\n\n Marks all materialization events as consumed by the sensor, including unconsumed events.\n """\n materializations_by_key = self.latest_materialization_records_by_key()\n\n self._cursor_advance_state_mutation.add_advanced_records(materializations_by_key)\n self._cursor_advance_state_mutation.advance_all_cursors_called = True\n self._cursor_updated = True
\n\n @public\n @property\n def assets_defs_by_key(self) -> Mapping[AssetKey, Optional[AssetsDefinition]]:\n """Mapping[AssetKey, Optional[AssetsDefinition]]: A mapping from AssetKey to the\n AssetsDefinition object which produces it. If a given asset is monitored by this sensor, but\n is not produced within the same code location as this sensor, then the value will be None.\n """\n return self._assets_by_key\n\n @public\n @property\n def asset_keys(self) -> Sequence[AssetKey]:\n """Sequence[AssetKey]: The asset keys which are monitored by this sensor."""\n return self._monitored_asset_keys
\n\n\nclass MultiAssetSensorCursorAdvances:\n _advanced_record_ids_by_key: Dict[AssetKey, Set[int]]\n _partition_key_by_record_id: Dict[int, Optional[str]]\n advance_all_cursors_called: bool\n\n def __init__(self):\n self._advanced_record_ids_by_key = defaultdict(set)\n self._partition_key_by_record_id = {}\n self.advance_all_cursors_called = False\n\n def add_advanced_records(\n self, materialization_records_by_key: Mapping[AssetKey, Optional["EventLogRecord"]]\n ):\n for asset_key, materialization in materialization_records_by_key.items():\n if materialization:\n self._advanced_record_ids_by_key[asset_key].add(materialization.storage_id)\n\n self._partition_key_by_record_id[materialization.storage_id] = (\n materialization.partition_key\n )\n\n def get_cursor_with_advances(\n self,\n context: MultiAssetSensorEvaluationContext,\n initial_cursor: MultiAssetSensorContextCursor,\n ) -> Optional[str]:\n """Given the multi asset sensor context and the cursor at the start of the tick,\n returns the cursor that should be used in the next tick.\n\n If the cursor has not been updated, returns None\n """\n if len(self._advanced_record_ids_by_key) == 0:\n # No events marked as advanced\n return None\n\n return json.dumps(\n {\n str(asset_key): self.get_asset_cursor_with_advances(\n asset_key, context, initial_cursor\n )\n for asset_key in context.asset_keys\n }\n )\n\n def get_asset_cursor_with_advances(\n self,\n asset_key: AssetKey,\n context: MultiAssetSensorEvaluationContext,\n initial_cursor: MultiAssetSensorContextCursor,\n ) -> MultiAssetSensorAssetCursorComponent:\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n advanced_records: Set[int] = self._advanced_record_ids_by_key.get(asset_key, set())\n if len(advanced_records) == 0:\n # No events marked as advanced for this asset key\n return initial_cursor.get_cursor_for_asset(asset_key)\n\n initial_asset_cursor = 
initial_cursor.get_cursor_for_asset(asset_key)\n\n latest_consumed_event_id_at_tick_start = initial_asset_cursor.latest_consumed_event_id\n\n greatest_consumed_event_id_in_tick = max(advanced_records)\n latest_consumed_partition_in_tick = self._partition_key_by_record_id[\n greatest_consumed_event_id_in_tick\n ]\n latest_unconsumed_record_by_partition: Dict[str, int] = {}\n\n if not self.advance_all_cursors_called:\n latest_unconsumed_record_by_partition = (\n initial_asset_cursor.trailing_unconsumed_partitioned_event_ids\n )\n unconsumed_events = list(context.get_trailing_unconsumed_events(asset_key)) + list(\n context.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n after_cursor=latest_consumed_event_id_at_tick_start,\n before_cursor=greatest_consumed_event_id_in_tick,\n ),\n ascending=True,\n )\n if greatest_consumed_event_id_in_tick\n > (latest_consumed_event_id_at_tick_start or 0)\n else []\n )\n\n # Iterate through events in ascending order, storing the latest unconsumed\n # event for each partition. 
If an advanced event exists for a partition, clear\n # the prior unconsumed event for that partition.\n for event in unconsumed_events:\n partition = event.partition_key\n if partition is not None: # Ignore unpartitioned events\n if event.storage_id not in advanced_records:\n latest_unconsumed_record_by_partition[partition] = event.storage_id\n elif partition in latest_unconsumed_record_by_partition:\n latest_unconsumed_record_by_partition.pop(partition)\n\n if (\n latest_consumed_partition_in_tick is not None\n and latest_consumed_partition_in_tick in latest_unconsumed_record_by_partition\n ):\n latest_unconsumed_record_by_partition.pop(latest_consumed_partition_in_tick)\n\n if len(latest_unconsumed_record_by_partition.keys()) >= MAX_NUM_UNCONSUMED_EVENTS:\n raise DagsterInvariantViolationError(f"""\n You have reached the maximum number of trailing unconsumed events\n ({MAX_NUM_UNCONSUMED_EVENTS}) for asset {asset_key} and no more events can be\n added. You can access the unconsumed events by calling the\n `get_trailing_unconsumed_events` method on the sensor context, and\n mark events as consumed by passing them to `advance_cursor`.\n\n Otherwise, you can clear all unconsumed events and reset the cursor to the latest\n materialization for each asset by calling `advance_all_cursors`.\n """)\n\n return MultiAssetSensorAssetCursorComponent(\n latest_consumed_event_partition=(\n latest_consumed_partition_in_tick\n if greatest_consumed_event_id_in_tick\n > (latest_consumed_event_id_at_tick_start or 0)\n else initial_asset_cursor.latest_consumed_event_partition\n ),\n latest_consumed_event_id=(\n greatest_consumed_event_id_in_tick\n if greatest_consumed_event_id_in_tick\n > (latest_consumed_event_id_at_tick_start or 0)\n else latest_consumed_event_id_at_tick_start\n ),\n trailing_unconsumed_partitioned_event_ids=latest_unconsumed_record_by_partition,\n )\n\n\ndef get_cursor_from_latest_materializations(\n asset_keys: Sequence[AssetKey], instance: DagsterInstance\n) -> 
str:\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n cursor_dict: Dict[str, MultiAssetSensorAssetCursorComponent] = {}\n\n for asset_key in asset_keys:\n materializations = instance.get_event_records(\n EventRecordsFilter(\n DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n ),\n limit=1,\n )\n if materializations:\n last_materialization = list(materializations)[-1]\n\n cursor_dict[str(asset_key)] = MultiAssetSensorAssetCursorComponent(\n last_materialization.partition_key,\n last_materialization.storage_id,\n {},\n )\n\n cursor_str = json.dumps(cursor_dict)\n return cursor_str\n\n\n
[docs]@experimental\ndef build_multi_asset_sensor_context(\n *,\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n repository_def: Optional["RepositoryDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n cursor: Optional[str] = None,\n repository_name: Optional[str] = None,\n cursor_from_latest_materializations: bool = False,\n resources: Optional[Mapping[str, object]] = None,\n definitions: Optional["Definitions"] = None,\n) -> MultiAssetSensorEvaluationContext:\n """Builds multi asset sensor execution context for testing purposes using the provided parameters.\n\n This function can be used to provide a context to the invocation of a multi asset sensor definition. If\n provided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\n error.\n\n Args:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets monitored\n by the sensor. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n repository_def (RepositoryDefinition): `RepositoryDefinition` object that\n the sensor is defined in. Must provide `definitions` if this is not provided.\n instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.\n cursor (Optional[str]): A string cursor to provide to the evaluation of the sensor. Must be\n a dictionary of asset key strings to ints that has been converted to a json string\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n cursor_from_latest_materializations (bool): If True, the cursor will be set to the latest\n materialization for each monitored asset. By default, set to False.\n resources (Optional[Mapping[str, object]]): The resource definitions\n to provide to the sensor.\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n Must provide `repository_def` if this is not provided.\n\n Examples:\n .. 
code-block:: python\n\n with instance_for_test() as instance:\n context = build_multi_asset_sensor_context(\n monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")],\n instance=instance,\n )\n my_asset_sensor(context)\n\n """\n from dagster._core.definitions import RepositoryDefinition\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n check.opt_str_param(cursor, "cursor")\n check.opt_str_param(repository_name, "repository_name")\n repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n )\n\n check.bool_param(cursor_from_latest_materializations, "cursor_from_latest_materializations")\n\n if cursor_from_latest_materializations:\n if cursor:\n raise DagsterInvalidInvocationError(\n "Cannot provide both cursor and cursor_from_latest_materializations objects."\n " Dagster will override the provided cursor based on the"\n " cursor_from_latest_materializations object."\n )\n if not instance:\n raise DagsterInvalidInvocationError(\n "Cannot provide cursor_from_latest_materializations object without a Dagster"\n " instance."\n )\n\n asset_keys: Sequence[AssetKey]\n if isinstance(monitored_assets, AssetSelection):\n asset_keys = cast(\n List[AssetKey],\n list(\n monitored_assets.resolve(list(set(repository_def.assets_defs_by_key.values())))\n ),\n )\n else:\n asset_keys = monitored_assets\n\n cursor = get_cursor_from_latest_materializations(asset_keys, instance)\n\n return MultiAssetSensorEvaluationContext(\n instance_ref=None,\n last_completion_time=None,\n last_run_key=None,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n monitored_assets=monitored_assets,\n repository_def=repository_def,\n 
resource_defs=wrap_resources_for_execution(resources),\n )
\n\n\nAssetMaterializationFunctionReturn = Union[\n Iterator[Union[RunRequest, SkipReason, SensorResult]],\n Sequence[RunRequest],\n RunRequest,\n SkipReason,\n None,\n SensorResult,\n]\nAssetMaterializationFunction = Callable[\n ...,\n AssetMaterializationFunctionReturn,\n]\n\nMultiAssetMaterializationFunction = Callable[\n ...,\n AssetMaterializationFunctionReturn,\n]\n\n\n
[docs]@experimental\nclass MultiAssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a list of\n assets.\n\n Users should not instantiate this object directly. To construct a\n `MultiAssetSensorDefinition`, use :py:func:`dagster.\n multi_asset_sensor`.\n\n Args:\n name (str): The name of the sensor to create.\n asset_keys (Sequence[AssetKey]): The asset_keys this sensor monitors.\n asset_materialization_fn (Callable[[MultiAssetSensorEvaluationContext], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.MultiAssetSensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_assets (Optional[AssetSelection]): (Experimental) an asset selection to launch a run\n for if the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n\n def __init__(\n self,\n name: str,\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n job_name: Optional[str],\n asset_materialization_fn: MultiAssetMaterializationFunction,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_assets: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(asset_materialization_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def _wrap_asset_fn(materialization_fn):\n def _fn(context):\n def _check_cursor_not_set(sensor_result: SensorResult):\n if sensor_result.cursor:\n raise DagsterInvariantViolationError(\n "Cannot set cursor in a multi_asset_sensor. 
Cursor is set automatically"\n " based on the latest materialization for each monitored asset."\n )\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n with MultiAssetSensorEvaluationContext(\n instance_ref=context.instance_ref,\n last_completion_time=context.last_completion_time,\n last_run_key=context.last_run_key,\n cursor=context.cursor,\n repository_name=context.repository_def.name,\n repository_def=context.repository_def,\n monitored_assets=monitored_assets,\n instance=context.instance,\n resource_defs=context.resource_defs,\n ) as multi_asset_sensor_context:\n context_param_name = get_context_param_name(materialization_fn)\n context_param = (\n {context_param_name: multi_asset_sensor_context}\n if context_param_name\n else {}\n )\n result = materialization_fn(\n **context_param,\n **resource_args_populated,\n )\n if result is None:\n return\n\n # because the materialization_fn can yield results (see _wrapped_fn in multi_asset_sensor decorator),\n # even if you return None in a sensor, it will still cause in inspect.isgenerator(result) to be True.\n # So keep track to see if we actually return any values and should update the cursor\n runs_yielded = False\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n if isinstance(item, RunRequest):\n runs_yielded = True\n if isinstance(item, SensorResult):\n raise DagsterInvariantViolationError(\n "Cannot yield a SensorResult from a multi_asset_sensor. 
Instead"\n " return the SensorResult."\n )\n yield item\n elif isinstance(result, RunRequest):\n runs_yielded = True\n yield result\n elif isinstance(result, SkipReason):\n # if result is a SkipReason, we don't update the cursor, so don't set runs_yielded = True\n yield result\n elif isinstance(result, SensorResult):\n _check_cursor_not_set(result)\n if result.run_requests:\n runs_yielded = True\n yield result\n\n if runs_yielded and not multi_asset_sensor_context.cursor_updated:\n raise DagsterInvalidDefinitionError(\n "Asset materializations have been handled in this sensor, but the cursor"\n " was not updated. This means the same materialization events will be"\n " handled in the next sensor tick. Use context.advance_cursor or"\n " context.advance_all_cursors to update the cursor."\n )\n\n multi_asset_sensor_context.update_cursor_after_evaluation()\n context.update_cursor(multi_asset_sensor_context.cursor)\n\n return _fn\n\n self._raw_asset_materialization_fn = asset_materialization_fn\n\n super(MultiAssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n job_name=job_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn")\n ),\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=request_assets,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> AssetMaterializationFunctionReturn:\n context_param_name = get_context_param_name(self._raw_asset_materialization_fn)\n context = get_sensor_context_from_args_or_kwargs(\n self._raw_asset_materialization_fn,\n args,\n kwargs,\n context_type=MultiAssetSensorEvaluationContext,\n )\n\n resources = validate_and_get_resource_dict(\n context.resources if context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n\n context_param = {context_param_name: 
context} if context_param_name and context else {}\n result = self._raw_asset_materialization_fn(**context_param, **resources)\n\n if context:\n context.update_cursor_after_evaluation()\n return result\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.MULTI_ASSET
\n
", "current_page_name": "_modules/dagster/_core/definitions/multi_asset_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.multi_asset_sensor_definition"}, "multi_dimensional_partitions": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.multi_dimensional_partitions

\nimport hashlib\nimport itertools\nfrom datetime import datetime\nfrom functools import lru_cache, reduce\nfrom typing import (\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownPartitionError,\n)\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._core.storage.tags import (\n    MULTIDIMENSIONAL_PARTITION_PREFIX,\n    get_multidimensional_partition_tag,\n)\n\nfrom .partition import (\n    DefaultPartitionsSubset,\n    DynamicPartitionsDefinition,\n    PartitionsDefinition,\n    PartitionsSubset,\n    StaticPartitionsDefinition,\n)\nfrom .time_window_partitions import TimeWindow, TimeWindowPartitionsDefinition\n\nINVALID_STATIC_PARTITIONS_KEY_CHARACTERS = set(["|", ",", "[", "]"])\n\nMULTIPARTITION_KEY_DELIMITER = "|"\n\n\nclass PartitionDimensionKey(\n    NamedTuple("_PartitionDimensionKey", [("dimension_name", str), ("partition_key", str)])\n):\n    """Representation of a single dimension of a multi-dimensional partition key."""\n\n    def __new__(cls, dimension_name: str, partition_key: str):\n        return super(PartitionDimensionKey, cls).__new__(\n            cls,\n            dimension_name=check.str_param(dimension_name, "dimension_name"),\n            partition_key=check.str_param(partition_key, "partition_key"),\n        )\n\n\n
[docs]class MultiPartitionKey(str):\n """A multi-dimensional partition key stores the partition key for each dimension.\n Subclasses the string class to keep partition key type as a string.\n\n Contains additional methods to access the partition key for each dimension.\n Creates a string representation of the partition key for each dimension, separated by a pipe (|).\n Orders the dimensions by name, to ensure consistent string representation.\n """\n\n dimension_keys: List[PartitionDimensionKey] = []\n\n def __new__(cls, keys_by_dimension: Mapping[str, str]):\n check.mapping_param(\n keys_by_dimension, "partitions_by_dimension", key_type=str, value_type=str\n )\n\n dimension_keys: List[PartitionDimensionKey] = [\n PartitionDimensionKey(dimension, keys_by_dimension[dimension])\n for dimension in sorted(list(keys_by_dimension.keys()))\n ]\n\n str_key = super(MultiPartitionKey, cls).__new__(\n cls,\n MULTIPARTITION_KEY_DELIMITER.join(\n [dim_key.partition_key for dim_key in dimension_keys]\n ),\n )\n\n str_key.dimension_keys = dimension_keys\n\n return str_key\n\n def __getnewargs__(self):\n # When this instance is pickled, replace the argument to __new__ with the\n # dimension key mapping instead of the string representation.\n return ({dim_key.dimension_name: dim_key.partition_key for dim_key in self.dimension_keys},)\n\n @property\n def keys_by_dimension(self) -> Mapping[str, str]:\n return {dim_key.dimension_name: dim_key.partition_key for dim_key in self.dimension_keys}
\n\n\nclass PartitionDimensionDefinition(\n NamedTuple(\n "_PartitionDimensionDefinition",\n [\n ("name", str),\n ("partitions_def", PartitionsDefinition),\n ],\n )\n):\n def __new__(\n cls,\n name: str,\n partitions_def: PartitionsDefinition,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n partitions_def=check.inst_param(partitions_def, "partitions_def", PartitionsDefinition),\n )\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, PartitionDimensionDefinition)\n and self.name == other.name\n and self.partitions_def == other.partitions_def\n )\n\n\nALLOWED_PARTITION_DIMENSION_TYPES = (\n StaticPartitionsDefinition,\n TimeWindowPartitionsDefinition,\n DynamicPartitionsDefinition,\n)\n\n\ndef _check_valid_partitions_dimensions(\n partitions_dimensions: Mapping[str, PartitionsDefinition]\n) -> None:\n for dim_name, partitions_def in partitions_dimensions.items():\n if not any(isinstance(partitions_def, t) for t in ALLOWED_PARTITION_DIMENSION_TYPES):\n raise DagsterInvalidDefinitionError(\n f"Invalid partitions definition type {type(partitions_def)}. "\n "Only the following partitions definition types are supported: "\n f"{ALLOWED_PARTITION_DIMENSION_TYPES}."\n )\n if isinstance(partitions_def, DynamicPartitionsDefinition) and partitions_def.name is None:\n raise DagsterInvalidDefinitionError(\n "DynamicPartitionsDefinition must have a name to be used in a"\n " MultiPartitionsDefinition."\n )\n\n if isinstance(partitions_def, StaticPartitionsDefinition):\n if any(\n [\n INVALID_STATIC_PARTITIONS_KEY_CHARACTERS & set(key)\n for key in partitions_def.get_partition_keys()\n ]\n ):\n raise DagsterInvalidDefinitionError(\n f"Invalid character in partition key for dimension {dim_name}. "\n "A multi-partitions definition cannot contain partition keys with "\n "the following characters: |, [, ], ,"\n )\n\n\n
[docs]class MultiPartitionsDefinition(PartitionsDefinition[MultiPartitionKey]):\n """Takes the cross-product of partitions from two partitions definitions.\n\n For example, with a static partitions definition where the partitions are ["a", "b", "c"]\n and a daily partitions definition, this partitions definition will have the following\n partitions:\n\n 2020-01-01|a\n 2020-01-01|b\n 2020-01-01|c\n 2020-01-02|a\n 2020-01-02|b\n ...\n\n Args:\n partitions_defs (Mapping[str, PartitionsDefinition]):\n A mapping of dimension name to partitions definition. The total set of partitions will\n be the cross-product of the partitions from each PartitionsDefinition.\n\n Attributes:\n partitions_defs (Sequence[PartitionDimensionDefinition]):\n A sequence of PartitionDimensionDefinition objects, each of which contains a dimension\n name and a PartitionsDefinition. The total set of partitions will be the cross-product\n of the partitions from each PartitionsDefinition. This sequence is ordered by\n dimension name, to ensure consistent ordering of the partitions.\n """\n\n def __init__(self, partitions_defs: Mapping[str, PartitionsDefinition]):\n if not len(partitions_defs.keys()) == 2:\n raise DagsterInvalidInvocationError(\n "Dagster currently only supports multi-partitions definitions with 2 partitions"\n " definitions. 
Your multi-partitions definition has"\n f" {len(partitions_defs.keys())} partitions definitions."\n )\n check.mapping_param(\n partitions_defs, "partitions_defs", key_type=str, value_type=PartitionsDefinition\n )\n\n _check_valid_partitions_dimensions(partitions_defs)\n\n self._partitions_defs: List[PartitionDimensionDefinition] = sorted(\n [\n PartitionDimensionDefinition(name, partitions_def)\n for name, partitions_def in partitions_defs.items()\n ],\n key=lambda x: x.name,\n )\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset"]:\n return MultiPartitionsSubset\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(\n str(\n {\n dim_def.name: dim_def.partitions_def.get_serializable_unique_identifier(\n dynamic_partitions_store\n )\n for dim_def in self.partitions_defs\n }\n ).encode("utf-8")\n ).hexdigest()\n\n @property\n def partition_dimension_names(self) -> List[str]:\n return [dim_def.name for dim_def in self._partitions_defs]\n\n @property\n def partitions_defs(self) -> Sequence[PartitionDimensionDefinition]:\n return self._partitions_defs\n\n def get_partitions_def_for_dimension(self, dimension_name: str) -> PartitionsDefinition:\n for dim_def in self._partitions_defs:\n if dim_def.name == dimension_name:\n return dim_def.partitions_def\n check.failed(f"Invalid dimension name {dimension_name}")\n\n # We override the default implementation of `has_partition_key` for performance.\n def has_partition_key(\n self,\n partition_key: Union[MultiPartitionKey, str],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n partition_key = (\n partition_key\n if isinstance(partition_key, MultiPartitionKey)\n else self.get_partition_key_from_str(partition_key)\n )\n if partition_key.keys_by_dimension.keys() != set(self.partition_dimension_names):\n raise 
DagsterUnknownPartitionError(\n f"Invalid partition key {partition_key}. The dimensions of the partition key are"\n " not the dimensions of the partitions definition."\n )\n\n for dimension in self.partitions_defs:\n if not dimension.partitions_def.has_partition_key(\n partition_key.keys_by_dimension[dimension.name],\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n ):\n return False\n return True\n\n # store results for repeated calls with the same current_time\n @lru_cache(maxsize=1)\n def _get_partition_keys(\n self, current_time: datetime, dynamic_partitions_store: Optional[DynamicPartitionsStore]\n ) -> Sequence[MultiPartitionKey]:\n partition_key_sequences = [\n partition_dim.partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for partition_dim in self._partitions_defs\n ]\n\n return [\n MultiPartitionKey(\n {self._partitions_defs[i].name: key for i, key in enumerate(partition_key_tuple)}\n )\n for partition_key_tuple in itertools.product(*partition_key_sequences)\n ]\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[MultiPartitionKey]:\n """Returns a list of MultiPartitionKeys representing the partition keys of the\n PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partition dimensions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when a\n dimension is a DynamicPartitionsDefinition with a name defined. Users can pass the\n DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[MultiPartitionKey]\n """\n return self._get_partition_keys(\n current_time or pendulum.now("UTC"), dynamic_partitions_store\n )
\n\n def filter_valid_partition_keys(\n self, partition_keys: Set[str], dynamic_partitions_store: DynamicPartitionsStore\n ) -> Set[MultiPartitionKey]:\n partition_keys_by_dimension = {\n dim.name: dim.partitions_def.get_partition_keys(\n dynamic_partitions_store=dynamic_partitions_store\n )\n for dim in self.partitions_defs\n }\n validated_partitions = set()\n for partition_key in partition_keys:\n partition_key_strs = partition_key.split(MULTIPARTITION_KEY_DELIMITER)\n if len(partition_key_strs) != len(self.partitions_defs):\n continue\n\n multipartition_key = MultiPartitionKey(\n {dim.name: partition_key_strs[i] for i, dim in enumerate(self._partitions_defs)}\n )\n\n if all(\n key in partition_keys_by_dimension.get(dim, [])\n for dim, key in multipartition_key.keys_by_dimension.items()\n ):\n validated_partitions.add(partition_key)\n\n return validated_partitions\n\n def __eq__(self, other):\n return (\n isinstance(other, MultiPartitionsDefinition)\n and self.partitions_defs == other.partitions_defs\n )\n\n def __hash__(self):\n return hash(\n tuple(\n [\n (partitions_def.name, partitions_def.__repr__())\n for partitions_def in self.partitions_defs\n ]\n )\n )\n\n def __str__(self) -> str:\n dimension_1 = self._partitions_defs[0]\n dimension_2 = self._partitions_defs[1]\n partition_str = (\n "Multi-partitioned, with dimensions: \\n"\n f"{dimension_1.name.capitalize()}: {dimension_1.partitions_def} \\n"\n f"{dimension_2.name.capitalize()}: {dimension_2.partitions_def}"\n )\n return partition_str\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(dimensions={[str(dim) for dim in self.partitions_defs]}"\n\n def get_partition_key_from_str(self, partition_key_str: str) -> MultiPartitionKey:\n """Given a string representation of a partition key, returns a MultiPartitionKey object."""\n check.str_param(partition_key_str, "partition_key_str")\n\n partition_key_strs = partition_key_str.split(MULTIPARTITION_KEY_DELIMITER)\n check.invariant(\n 
len(partition_key_strs) == len(self.partitions_defs),\n f"Expected {len(self.partitions_defs)} partition keys in partition key string"\n f" {partition_key_str}, but got {len(partition_key_strs)}",\n )\n\n return MultiPartitionKey(\n {dim.name: partition_key_strs[i] for i, dim in enumerate(self._partitions_defs)}\n )\n\n def _get_primary_and_secondary_dimension(\n self,\n ) -> Tuple[PartitionDimensionDefinition, PartitionDimensionDefinition]:\n # Multipartitions subsets are serialized by primary dimension. If changing\n # the selection of primary/secondary dimension, will need to also update the\n # serialization of MultiPartitionsSubsets\n\n time_dimensions = [\n dim\n for dim in self.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n if len(time_dimensions) == 1:\n primary_dimension, secondary_dimension = time_dimensions[0], next(\n iter([dim for dim in self.partitions_defs if dim != time_dimensions[0]])\n )\n else:\n primary_dimension, secondary_dimension = (\n self.partitions_defs[0],\n self.partitions_defs[1],\n )\n\n return primary_dimension, secondary_dimension\n\n @property\n def primary_dimension(self) -> PartitionDimensionDefinition:\n return self._get_primary_and_secondary_dimension()[0]\n\n @property\n def secondary_dimension(self) -> PartitionDimensionDefinition:\n return self._get_primary_and_secondary_dimension()[1]\n\n def get_tags_for_partition_key(self, partition_key: str) -> Mapping[str, str]:\n partition_key = cast(MultiPartitionKey, self.get_partition_key_from_str(partition_key))\n tags = {**super().get_tags_for_partition_key(partition_key)}\n tags.update(get_tags_from_multi_partition_key(partition_key))\n return tags\n\n @property\n def time_window_dimension(self) -> PartitionDimensionDefinition:\n time_window_dims = [\n dim\n for dim in self.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n check.invariant(\n len(time_window_dims) == 1, "Expected exactly one time 
window partitioned dimension"\n )\n return next(iter(time_window_dims))\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n if not isinstance(partition_key, MultiPartitionKey):\n partition_key = self.get_partition_key_from_str(partition_key)\n\n time_window_dimension = self.time_window_dimension\n return cast(\n TimeWindowPartitionsDefinition, time_window_dimension.partitions_def\n ).time_window_for_partition_key(\n cast(MultiPartitionKey, partition_key).keys_by_dimension[time_window_dimension.name]\n )\n\n def get_multipartition_keys_with_dimension_value(\n self,\n dimension_name: str,\n dimension_partition_key: str,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n current_time: Optional[datetime] = None,\n ) -> Sequence[MultiPartitionKey]:\n check.str_param(dimension_name, "dimension_name")\n check.str_param(dimension_partition_key, "dimension_partition_key")\n\n matching_dimensions = [\n dimension for dimension in self.partitions_defs if dimension.name == dimension_name\n ]\n other_dimensions = [\n dimension for dimension in self.partitions_defs if dimension.name != dimension_name\n ]\n\n check.invariant(\n len(matching_dimensions) == 1,\n f"Dimension {dimension_name} not found in MultiPartitionsDefinition with dimensions"\n f" {[dim.name for dim in self.partitions_defs]}",\n )\n\n partition_sequences = [\n partition_dim.partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for partition_dim in other_dimensions\n ] + [[dimension_partition_key]]\n\n # Names of partitions dimensions in the same order as partition_sequences\n partition_dim_names = [dim.name for dim in other_dimensions] + [dimension_name]\n\n return [\n MultiPartitionKey(\n {\n partition_dim_names[i]: partition_key\n for i, partition_key in enumerate(partitions_tuple)\n }\n )\n for partitions_tuple in itertools.product(*partition_sequences)\n ]\n\n def get_num_partitions(\n self,\n 
current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # Static partitions definitions can contain duplicate keys (will throw error in 1.3.0)\n # In the meantime, relying on get_num_partitions to handle duplicates to display\n # correct counts in the Dagster UI.\n dimension_counts = [\n dim.partitions_def.get_num_partitions(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for dim in self.partitions_defs\n ]\n return reduce(lambda x, y: x * y, dimension_counts, 1)
\n\n\nclass MultiPartitionsSubset(DefaultPartitionsSubset):\n def __init__(\n self,\n partitions_def: MultiPartitionsDefinition,\n subset: Optional[Set[str]] = None,\n ):\n check.inst_param(partitions_def, "partitions_def", MultiPartitionsDefinition)\n subset = (\n set(\n [\n partitions_def.get_partition_key_from_str(key)\n for key in subset\n if MULTIPARTITION_KEY_DELIMITER in key\n ]\n )\n if subset\n else set()\n )\n super(MultiPartitionsSubset, self).__init__(partitions_def, subset)\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "MultiPartitionsSubset":\n return MultiPartitionsSubset(\n cast(MultiPartitionsDefinition, self._partitions_def),\n self._subset | set(partition_keys),\n )\n\n\ndef get_tags_from_multi_partition_key(multi_partition_key: MultiPartitionKey) -> Mapping[str, str]:\n check.inst_param(multi_partition_key, "multi_partition_key", MultiPartitionKey)\n\n return {\n get_multidimensional_partition_tag(dimension.dimension_name): dimension.partition_key\n for dimension in multi_partition_key.dimension_keys\n }\n\n\ndef get_multipartition_key_from_tags(tags: Mapping[str, str]) -> str:\n partitions_by_dimension: Dict[str, str] = {}\n for tag in tags:\n if tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX):\n dimension = tag[len(MULTIDIMENSIONAL_PARTITION_PREFIX) :]\n partitions_by_dimension[dimension] = tags[tag]\n\n return MultiPartitionKey(partitions_by_dimension)\n
", "current_page_name": "_modules/dagster/_core/definitions/multi_dimensional_partitions", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.multi_dimensional_partitions"}, "op_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.op_definition

\nimport inspect\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, get_args, get_origin\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, deprecated_param, public\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.definitions.dependency import NodeHandle, NodeInputHandle\nfrom dagster._core.definitions.node_definition import NodeDefinition\nfrom dagster._core.definitions.op_invocation import direct_invocation_result\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.resource_requirement import (\n    InputManagerRequirement,\n    OpDefinitionResourceRequirement,\n    OutputManagerRequirement,\n    ResourceRequirement,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.types.dagster_type import DagsterType, DagsterTypeKind\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .hook_definition import HookDefinition\nfrom .inference import infer_output_props\nfrom .input import In, InputDefinition\nfrom .output import Out, OutputDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_layer import AssetLayer\n\n    from .composition import PendingNodeInvocation\n    from .decorators.op_decorator import DecoratedOpFunction\n\nOpComputeFunction: TypeAlias = Callable[..., Any]\n\n\n
[docs]@deprecated_param(\n param="version", breaking_version="2.0", additional_warn_text="Use `code_version` instead."\n)\nclass OpDefinition(NodeDefinition, IHasInternalInit):\n """Defines an op, the functional unit of user-defined computation.\n\n For more details on what a op is, refer to the\n `Ops Overview <../../concepts/ops-jobs-graphs/ops>`_ .\n\n End users should prefer the :func:`@op <op>` decorator. OpDefinition is generally intended to be\n used by framework authors or for programatically generated ops.\n\n Args:\n name (str): Name of the op. Must be unique within any :py:class:`GraphDefinition` or\n :py:class:`JobDefinition` that contains the op.\n input_defs (List[InputDefinition]): Inputs of the op.\n compute_fn (Callable): The core of the op, the function that performs the actual\n computation. The signature of this function is determined by ``input_defs``, and\n optionally, an injected first argument, ``context``, a collection of information\n provided by the system.\n\n This function will be coerced into a generator or an async generator, which must yield\n one :py:class:`Output` for each of the op's ``output_defs``, and additionally may\n yield other types of Dagster events, including :py:class:`AssetMaterialization` and\n :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the op.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that the config provided for the op matches this schema and will fail if it does not. If\n not set, Dagster will accept any config provided for the op.\n description (Optional[str]): Human-readable description of the op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resources handles required by this op.\n code_version (Optional[str]): (Experimental) Version of the code encapsulated by the op. If set,\n this is used as a default code version for all outputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n\n Examples:\n .. code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n OpDefinition(\n name="add_one",\n ins={"num": In(int)},\n outs={"result": Out(int)},\n compute_fn=_add_one,\n )\n """\n\n _compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"]\n _config_schema: IDefinitionConfigSchema\n _required_resource_keys: AbstractSet[str]\n _version: Optional[str]\n _retry_policy: Optional[RetryPolicy]\n\n def __init__(\n self,\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n name: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n description: Optional[str] = None,\n config_schema: Optional[Union[UserConfigSchema, IDefinitionConfigSchema]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n ):\n from .decorators.op_decorator import DecoratedOpFunction, resolve_checked_op_fn_inputs\n\n ins = check.opt_mapping_param(ins, "ins")\n input_defs = [\n inp.to_definition(name) for name, inp in sorted(ins.items(), key=lambda inp: inp[0])\n ] # sort so that input definition order is deterministic\n\n if isinstance(compute_fn, DecoratedOpFunction):\n resolved_input_defs: Sequence[InputDefinition] = resolve_checked_op_fn_inputs(\n decorator_name="@op",\n fn_name=name,\n compute_fn=cast(DecoratedOpFunction, compute_fn),\n explicit_input_defs=input_defs,\n 
exclude_nothing=True,\n )\n self._compute_fn = compute_fn\n _validate_context_type_hint(self._compute_fn.decorated_fn)\n else:\n resolved_input_defs = input_defs\n self._compute_fn = check.callable_param(compute_fn, "compute_fn")\n _validate_context_type_hint(self._compute_fn)\n\n code_version = normalize_renamed_param(\n code_version,\n "code_version",\n version,\n "version",\n )\n self._version = code_version\n\n check.opt_mapping_param(outs, "outs")\n output_defs = _resolve_output_defs_from_outs(\n compute_fn=compute_fn, outs=outs, default_code_version=code_version\n )\n\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._required_resource_keys = frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n positional_inputs = (\n self._compute_fn.positional_inputs()\n if isinstance(self._compute_fn, DecoratedOpFunction)\n else None\n )\n\n super(OpDefinition, self).__init__(\n name=name,\n input_defs=check.sequence_param(resolved_input_defs, "input_defs", InputDefinition),\n output_defs=check.sequence_param(output_defs, "output_defs", OutputDefinition),\n description=description,\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n positional_inputs=positional_inputs,\n )\n\n def dagster_internal_init(\n *,\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n name: str,\n ins: Optional[Mapping[str, In]],\n outs: Optional[Mapping[str, Out]],\n description: Optional[str],\n config_schema: Optional[Union[UserConfigSchema, IDefinitionConfigSchema]],\n required_resource_keys: Optional[AbstractSet[str]],\n tags: Optional[Mapping[str, Any]],\n version: Optional[str],\n retry_policy: Optional[RetryPolicy],\n code_version: Optional[str],\n ) -> "OpDefinition":\n return OpDefinition(\n compute_fn=compute_fn,\n name=name,\n ins=ins,\n outs=outs,\n description=description,\n 
config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n retry_policy=retry_policy,\n code_version=code_version,\n )\n\n @property\n def node_type_str(self) -> str:\n return "op"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @public\n @property\n def name(self) -> str:\n """str: The name of this op."""\n return super(OpDefinition, self).name\n\n @public\n @property\n def ins(self) -> Mapping[str, In]:\n """Mapping[str, In]: A mapping from input name to the In object that represents that input."""\n return {input_def.name: In.from_definition(input_def) for input_def in self.input_defs}\n\n @public\n @property\n def outs(self) -> Mapping[str, Out]:\n """Mapping[str, Out]: A mapping from output name to the Out object that represents that output."""\n return {output_def.name: Out.from_definition(output_def) for output_def in self.output_defs}\n\n @property\n def compute_fn(self) -> Union[Callable[..., Any], "DecoratedOpFunction"]:\n return self._compute_fn\n\n @public\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n """IDefinitionConfigSchema: The config schema for this op."""\n return self._config_schema\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """AbstractSet[str]: A set of keys for resources that must be provided to this OpDefinition."""\n return frozenset(self._required_resource_keys)\n\n @public\n @deprecated(breaking_version="2.0", additional_warn_text="Use `code_version` instead.")\n @property\n def version(self) -> Optional[str]:\n """str: Version of the code encapsulated by the op. 
If set, this is used as a\n default code version for all outputs.\n """\n return self._version\n\n @public\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n """Optional[RetryPolicy]: The RetryPolicy for this op."""\n return self._retry_policy\n\n @public\n @property\n def tags(self) -> Mapping[str, str]:\n """Mapping[str, str]: The tags for this op."""\n return super(OpDefinition, self).tags\n\n
[docs] @public\n def alias(self, name: str) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given name."""\n return super(OpDefinition, self).alias(name)
\n\n
[docs] @public\n def tag(self, tags: Optional[Mapping[str, str]]) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given tags."""\n return super(OpDefinition, self).tag(tags)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given hook definitions."""\n return super(OpDefinition, self).with_hooks(hook_defs)
\n\n
[docs] @public\n def with_retry_policy(self, retry_policy: RetryPolicy) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given retry policy."""\n return super(OpDefinition, self).with_retry_policy(retry_policy)
\n\n def is_from_decorator(self) -> bool:\n from .decorators.op_decorator import DecoratedOpFunction\n\n return isinstance(self._compute_fn, DecoratedOpFunction)\n\n def get_output_annotation(self) -> Any:\n if not self.is_from_decorator():\n raise DagsterInvalidInvocationError(\n f"Attempted to get output annotation for {self.node_type_str} '{self.name}', "\n "which was not constructed from a decorated function."\n )\n return cast("DecoratedOpFunction", self.compute_fn).get_output_annotation()\n\n def all_dagster_types(self) -> Iterator[DagsterType]:\n yield from self.all_input_output_types()\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n\n def iterate_op_defs(self) -> Iterator["OpDefinition"]:\n yield self\n\n T_Handle = TypeVar("T_Handle", bound=Optional[NodeHandle])\n\n def resolve_output_to_origin(\n self, output_name: str, handle: T_Handle\n ) -> Tuple[OutputDefinition, T_Handle]:\n return self.output_def_named(output_name), handle\n\n def resolve_output_to_origin_op_def(self, output_name: str) -> "OpDefinition":\n return self\n\n def get_inputs_must_be_resolved_top_level(\n self, asset_layer: "AssetLayer", handle: Optional[NodeHandle] = None\n ) -> Sequence[InputDefinition]:\n handle = cast(NodeHandle, check.inst_param(handle, "handle", NodeHandle))\n unresolveable_input_defs = []\n for input_def in self.input_defs:\n if (\n not input_def.dagster_type.loader\n and not input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n and not input_def.has_default_value\n and not input_def.input_manager_key\n ):\n input_asset_key = asset_layer.asset_key_for_input(handle, input_def.name)\n # If input_asset_key is present, this input can be resolved\n # by a source asset, so input does not need to be resolved\n # at the top level.\n if input_asset_key:\n continue\n unresolveable_input_defs.append(input_def)\n return unresolveable_input_defs\n\n def input_has_default(self, input_name: str) -> bool:\n return 
self.input_def_named(input_name).has_default_value\n\n def default_value_for_input(self, input_name: str) -> InputDefinition:\n return self.input_def_named(input_name).default_value\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n return True\n\n def with_replaced_properties(\n self,\n name: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n config_schema: Optional[IDefinitionConfigSchema] = None,\n description: Optional[str] = None,\n ) -> "OpDefinition":\n return OpDefinition.dagster_internal_init(\n name=name,\n ins=ins\n or {input_def.name: In.from_definition(input_def) for input_def in self.input_defs},\n outs=outs\n or {\n output_def.name: Out.from_definition(output_def) for output_def in self.output_defs\n },\n compute_fn=self.compute_fn,\n config_schema=config_schema or self.config_schema,\n description=description or self.description,\n tags=self.tags,\n required_resource_keys=self.required_resource_keys,\n code_version=self._version,\n retry_policy=self.retry_policy,\n version=None, # code_version replaces version\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n ) -> "OpDefinition":\n return self.with_replaced_properties(\n name=name,\n description=description,\n config_schema=config_schema,\n )\n\n def get_resource_requirements(\n self,\n outer_context: Optional[object] = None,\n ) -> Iterator[ResourceRequirement]:\n # Outer requiree in this context is the outer-calling node handle. 
If not provided, then\n # just use the op name.\n outer_context = cast(Optional[Tuple[NodeHandle, Optional["AssetLayer"]]], outer_context)\n if not outer_context:\n handle = None\n asset_layer = None\n else:\n handle, asset_layer = outer_context\n node_description = f"{self.node_type_str} '{handle or self.name}'"\n for resource_key in sorted(list(self.required_resource_keys)):\n yield OpDefinitionResourceRequirement(\n key=resource_key, node_description=node_description\n )\n for input_def in self.input_defs:\n if input_def.input_manager_key:\n yield InputManagerRequirement(\n key=input_def.input_manager_key,\n node_description=node_description,\n input_name=input_def.name,\n root_input=False,\n )\n elif asset_layer and handle:\n input_asset_key = asset_layer.asset_key_for_input(handle, input_def.name)\n if input_asset_key:\n io_manager_key = asset_layer.io_manager_key_for_asset(input_asset_key)\n yield InputManagerRequirement(\n key=io_manager_key,\n node_description=node_description,\n input_name=input_def.name,\n root_input=False,\n )\n\n for output_def in self.output_defs:\n yield OutputManagerRequirement(\n key=output_def.io_manager_key,\n node_description=node_description,\n output_name=output_def.name,\n )\n\n def resolve_input_to_destinations(\n self, input_handle: NodeInputHandle\n ) -> Sequence[NodeInputHandle]:\n return [input_handle]\n\n def __call__(self, *args, **kwargs) -> Any:\n from .composition import is_in_composition\n\n if is_in_composition():\n return super(OpDefinition, self).__call__(*args, **kwargs)\n\n return direct_invocation_result(self, *args, **kwargs)
\n\n\ndef _resolve_output_defs_from_outs(\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n outs: Optional[Mapping[str, Out]],\n default_code_version: Optional[str],\n) -> Sequence[OutputDefinition]:\n from .decorators.op_decorator import DecoratedOpFunction\n\n if isinstance(compute_fn, DecoratedOpFunction):\n inferred_output_props = infer_output_props(compute_fn.decorated_fn)\n annotation = inferred_output_props.annotation\n description = inferred_output_props.description\n else:\n inferred_output_props = None\n annotation = inspect.Parameter.empty\n description = None\n\n if outs is None:\n return [OutputDefinition.create_from_inferred(inferred_output_props, default_code_version)]\n\n # If only a single entry has been provided to the out dict, then slurp the\n # annotation into the entry.\n if len(outs) == 1:\n name = next(iter(outs.keys()))\n only_out = outs[name]\n return [only_out.to_definition(annotation, name, description, default_code_version)]\n\n output_defs: List[OutputDefinition] = []\n\n # Introspection on type annotations is experimental, so checking\n # metaclass is the best we can do.\n if annotation != inspect.Parameter.empty and not get_origin(annotation) == tuple:\n raise DagsterInvariantViolationError(\n "Expected Tuple annotation for multiple outputs, but received non-tuple annotation."\n )\n if annotation != inspect.Parameter.empty and not len(get_args(annotation)) == len(outs):\n raise DagsterInvariantViolationError(\n "Expected Tuple annotation to have number of entries matching the "\n f"number of outputs for more than one output. Expected {len(outs)} "\n f"outputs but annotation has {len(get_args(annotation))}."\n )\n for idx, (name, cur_out) in enumerate(outs.items()):\n annotation_type = (\n get_args(annotation)[idx]\n if annotation != inspect.Parameter.empty\n else inspect.Parameter.empty\n )\n # Don't provide description when using multiple outputs. 
Introspection\n # is challenging when faced with multiple inputs.\n output_defs.append(\n cur_out.to_definition(\n annotation_type, name=name, description=None, code_version=default_code_version\n )\n )\n\n return output_defs\n\n\ndef _validate_context_type_hint(fn):\n from inspect import _empty as EmptyAnnotation\n\n from dagster._core.decorator_utils import get_function_params\n from dagster._core.definitions.decorators.op_decorator import is_context_provided\n from dagster._core.execution.context.compute import AssetExecutionContext, OpExecutionContext\n\n params = get_function_params(fn)\n if is_context_provided(params):\n if (\n params[0].annotation is not AssetExecutionContext\n and params[0].annotation is not OpExecutionContext\n and params[0].annotation is not EmptyAnnotation\n ):\n raise DagsterInvalidDefinitionError(\n f"Cannot annotate `context` parameter with type {params[0].annotation}. `context`"\n " must be annotated with AssetExecutionContext, OpExecutionContext, or left blank."\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/op_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.op_definition"}, "output": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.output

\nimport inspect\nfrom typing import (\n    Any,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated_param\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataUserInput,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import (\n    DagsterType,\n    is_dynamic_output_annotation,\n    resolve_dagster_type,\n)\n\nfrom .inference import InferredOutputProps\nfrom .input import NoValueSentinel\nfrom .utils import DEFAULT_IO_MANAGER_KEY, DEFAULT_OUTPUT, check_valid_name\n\nTOutputDefinition = TypeVar("TOutputDefinition", bound="OutputDefinition")\nTOut = TypeVar("TOut", bound="Out")\n\n\nclass OutputDefinition:\n    """Defines an output from an op's compute function.\n\n    Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n    Many ops have only one output, in which case the user can provide a single output definition\n    that will be given the default name, "result".\n\n    Output definitions may be typed using the Dagster type system.\n\n    Args:\n        dagster_type (Optional[Union[Type, DagsterType]]]): The type of this output.\n            Users should provide the Python type of the objects that they expect the op to yield\n            for this output, or a :py:class:`DagsterType` that defines a runtime check that they\n            want to be run on this output. Defaults to :py:class:`Any`.\n        name (Optional[str]): Name of the output. (default: "result")\n        description (Optional[str]): Human-readable description of the output.\n        is_required (Optional[bool]): Whether the presence of this field is required. 
(default: True)\n        io_manager_key (Optional[str]): The resource key of the IOManager used for storing this\n            output and loading it in downstream steps (default: "io_manager").\n        metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n            For example, users can provide a file path if the data object will be stored in a\n            filesystem, or provide information of a database table when it is going to load the data\n            into the table.\n        code_version (Optional[str]): (Experimental) Version of the code that generates this output. In\n            general, versions should be set only for code that deterministically produces the same\n            output when given the same inputs.\n\n    """\n\n    def __init__(\n        self,\n        dagster_type=None,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        is_required: bool = True,\n        io_manager_key: Optional[str] = None,\n        metadata: Optional[ArbitraryMetadataMapping] = None,\n        code_version: Optional[str] = None,\n        # make sure new parameters are updated in combine_with_inferred below\n    ):\n        self._name = check_valid_name(check.opt_str_param(name, "name", DEFAULT_OUTPUT))\n        self._type_not_set = dagster_type is None\n        self._dagster_type = resolve_dagster_type(dagster_type)\n        self._description = check.opt_str_param(description, "description")\n        self._is_required = check.bool_param(is_required, "is_required")\n        self._io_manager_key = check.opt_str_param(\n            io_manager_key,\n            "io_manager_key",\n            default=DEFAULT_IO_MANAGER_KEY,\n        )\n        self._code_version = check.opt_str_param(code_version, "code_version")\n        self._raw_metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n        self._metadata = normalize_metadata(self._raw_metadata, allow_invalid=True)\n\n    @property\n    def 
name(self) -> str:\n        return self._name\n\n    @property\n    def dagster_type(self) -> DagsterType:\n        return self._dagster_type\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @property\n    def is_required(self) -> bool:\n        return self._is_required\n\n    @property\n    def io_manager_key(self) -> str:\n        return self._io_manager_key\n\n    @property\n    def code_version(self) -> Optional[str]:\n        return self._code_version\n\n    @property\n    def optional(self) -> bool:\n        return not self.is_required\n\n    @property\n    def metadata(self) -> ArbitraryMetadataMapping:\n        return self._raw_metadata\n\n    @property\n    def is_dynamic(self) -> bool:\n        return False\n\n    def mapping_from(\n        self, node_name: str, output_name: Optional[str] = None, from_dynamic_mapping: bool = False\n    ) -> "OutputMapping":\n        """Create an output mapping from an output of a child node.\n\n        In a GraphDefinition, you can use this helper function to construct\n        an :py:class:`OutputMapping` from the output of a child node.\n\n        Args:\n            node_name (str): The name of the child node from which to map this output.\n            output_name (str): The name of the child node's output from which to map this output.\n\n        Examples:\n            .. 
code-block:: python\n\n                output_mapping = OutputDefinition(Int).mapping_from('child_node')\n        """\n        return OutputMapping(\n            graph_output_name=self.name,\n            mapped_node_name=node_name,\n            mapped_node_output_name=output_name or DEFAULT_OUTPUT,\n            graph_output_description=self.description,\n            dagster_type=self.dagster_type,\n            from_dynamic_mapping=from_dynamic_mapping or self.is_dynamic,\n        )\n\n    @staticmethod\n    def create_from_inferred(\n        inferred: Optional[InferredOutputProps], code_version: Optional[str] = None\n    ) -> "OutputDefinition":\n        if not inferred:\n            return OutputDefinition(code_version=code_version)\n        if is_dynamic_output_annotation(inferred.annotation):\n            return DynamicOutputDefinition(\n                dagster_type=_checked_inferred_type(inferred.annotation),\n                description=inferred.description,\n                code_version=code_version,\n            )\n        else:\n            return OutputDefinition(\n                dagster_type=_checked_inferred_type(inferred.annotation),\n                description=inferred.description,\n                code_version=code_version,\n            )\n\n    def combine_with_inferred(\n        self: TOutputDefinition, inferred: InferredOutputProps\n    ) -> TOutputDefinition:\n        dagster_type = self.dagster_type\n        if self._type_not_set:\n            dagster_type = _checked_inferred_type(inferred.annotation)\n        if self.description is None:\n            description = inferred.description\n        else:\n            description = self.description\n\n        return self.__class__(\n            name=self.name,\n            dagster_type=dagster_type,\n            description=description,\n            is_required=self.is_required,\n            io_manager_key=self.io_manager_key,\n            metadata=self._metadata,\n        )\n\n\ndef 
_checked_inferred_type(inferred: Any) -> DagsterType:\n    try:\n        if inferred == inspect.Parameter.empty:\n            return resolve_dagster_type(None)\n        elif inferred is None:\n            # When inferred.annotation is None, it means someone explicitly put "None" as the\n            # annotation, so want to map it to a DagsterType that checks for the None type\n            return resolve_dagster_type(type(None))\n        else:\n            return resolve_dagster_type(inferred)\n\n    except DagsterError as e:\n        raise DagsterInvalidDefinitionError(\n            f"Problem using type '{inferred}' from return type annotation, correct the issue "\n            "or explicitly set the dagster_type via Out()."\n        ) from e\n\n\nclass DynamicOutputDefinition(OutputDefinition):\n    """Variant of :py:class:`OutputDefinition <dagster.OutputDefinition>` for an\n    output that will dynamically alter the graph at runtime.\n\n    When using in a composition function such as :py:func:`@job <dagster.job>`,\n    dynamic outputs must be used with either:\n\n    * ``map`` - clone downstream nodes for each separate :py:class:`DynamicOutput`\n    * ``collect`` - gather across all :py:class:`DynamicOutput` in to a list\n\n    Uses the same constructor as :py:class:`OutputDefinition <dagster.OutputDefinition>`\n\n        .. 
code-block:: python\n\n            @op(\n                config_schema={\n                    "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n                },\n                output_defs=[DynamicOutputDefinition(str)],\n            )\n            def files_in_directory(context):\n                path = context.op_config["path"]\n                dirname, _, filenames = next(os.walk(path))\n                for file in filenames:\n                    yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n            @job\n            def process_directory():\n                files = files_in_directory()\n\n                # use map to invoke an op on each dynamic output\n                file_results = files.map(process_file)\n\n                # use collect to gather the results in to a list\n                summarize_directory(file_results.collect())\n    """\n\n    @property\n    def is_dynamic(self) -> bool:\n        return True\n\n\nclass OutputPointer(NamedTuple("_OutputPointer", [("node_name", str), ("output_name", str)])):\n    def __new__(cls, node_name: str, output_name: Optional[str] = None):\n        return super(OutputPointer, cls).__new__(\n            cls,\n            check.str_param(node_name, "node_name"),\n            check.opt_str_param(output_name, "output_name", DEFAULT_OUTPUT),\n        )\n\n\n
[docs]@deprecated_param(\n param="dagster_type",\n breaking_version="2.0",\n additional_warn_text="Any defined `dagster_type` should come from the underlying op `Output`.",\n # Disabling warning here since we're passing this internally and I'm not sure whether it is\n # actually used or discarded.\n emit_runtime_warning=False,\n)\nclass OutputMapping(NamedTuple):\n """Defines an output mapping for a graph.\n\n Args:\n graph_output_name (str): Name of the output in the graph being mapped to.\n mapped_node_name (str): Named of the node (op/graph) that the output is being mapped from.\n mapped_node_output_name (str): Name of the output in the node (op/graph) that is being mapped from.\n graph_output_description (Optional[str]): A description of the output in the graph being mapped from.\n from_dynamic_mapping (bool): Set to true if the node being mapped to is a mapped dynamic node.\n dagster_type (Optional[DagsterType]): The dagster type of the graph's output being mapped to.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import OutputMapping, GraphDefinition, op, graph, GraphOut\n\n @op\n def emit_five(x):\n return 5\n\n # The following two graph definitions are equivalent\n GraphDefinition(\n name="the_graph",\n node_defs=[emit_five],\n output_mappings=[\n OutputMapping(\n graph_output_name="result", # Default output name\n mapped_node_name="emit_five",\n mapped_node_output_name="result"\n )\n ]\n )\n\n @graph(out=GraphOut())\n def the_graph:\n return emit_five()\n """\n\n graph_output_name: str\n mapped_node_name: str\n mapped_node_output_name: str\n graph_output_description: Optional[str] = None\n dagster_type: Optional[DagsterType] = None\n from_dynamic_mapping: bool = False\n\n @property\n def maps_from(self) -> OutputPointer:\n return OutputPointer(self.mapped_node_name, self.mapped_node_output_name)\n\n def get_definition(self, is_dynamic: bool) -> "OutputDefinition":\n check.invariant(not is_dynamic or self.from_dynamic_mapping)\n is_dynamic = is_dynamic or self.from_dynamic_mapping\n klass = DynamicOutputDefinition if is_dynamic else OutputDefinition\n return klass(\n name=self.graph_output_name,\n description=self.graph_output_description,\n dagster_type=self.dagster_type,\n )
\n\n\n
[docs]class Out(\n NamedTuple(\n "_Out",\n [\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("description", PublicAttr[Optional[str]]),\n ("is_required", PublicAttr[bool]),\n ("io_manager_key", PublicAttr[str]),\n ("metadata", PublicAttr[Optional[MetadataUserInput]]),\n ("code_version", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines an output from an op's compute function.\n\n Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many ops have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Outs may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the output manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n code_version (Optional[str]): (Experimental) Version of the code that generates this output. 
In\n general, versions should be set only for code that deterministically produces the same\n output when given the same inputs.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n code_version: Optional[str] = None,\n # make sure new parameters are updated in combine_with_inferred below\n ):\n return super(Out, cls).__new__(\n cls,\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=description,\n is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default=DEFAULT_IO_MANAGER_KEY\n ),\n metadata=metadata,\n code_version=code_version,\n )\n\n @classmethod\n def from_definition(cls, output_def: "OutputDefinition"):\n klass = Out if not output_def.is_dynamic else DynamicOut\n return klass(\n dagster_type=output_def.dagster_type,\n description=output_def.description,\n is_required=output_def.is_required,\n io_manager_key=output_def.io_manager_key,\n metadata=output_def.metadata,\n code_version=output_def.code_version,\n )\n\n def to_definition(\n self,\n annotation_type: type,\n name: Optional[str],\n description: Optional[str],\n code_version: Optional[str],\n ) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type\n if self.dagster_type is not NoValueSentinel\n else _checked_inferred_type(annotation_type)\n )\n\n klass = OutputDefinition if not self.is_dynamic else DynamicOutputDefinition\n\n return klass(\n dagster_type=dagster_type,\n name=name,\n description=self.description or description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n code_version=self.code_version or code_version,\n )\n\n @property\n def is_dynamic(self) -> bool:\n return False
\n\n\n
[docs]class DynamicOut(Out):\n """Variant of :py:class:`Out <dagster.Out>` for an output that will dynamically alter the graph at\n runtime.\n\n When using in a composition function such as :py:func:`@graph <dagster.graph>`,\n dynamic outputs must be used with either\n\n * ``map`` - clone downstream ops for each separate :py:class:`DynamicOut`\n * ``collect`` - gather across all :py:class:`DynamicOut` in to a list\n\n Uses the same constructor as :py:class:`Out <dagster.Out>`\n\n .. code-block:: python\n\n @op(\n config_schema={\n "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n },\n out=DynamicOut(str),\n )\n def files_in_directory(context):\n path = context.op_config["path"]\n dirname, _, filenames = next(os.walk(path))\n for file in filenames:\n yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n @job\n def process_directory():\n files = files_in_directory()\n\n # use map to invoke an op on each dynamic output\n file_results = files.map(process_file)\n\n # use collect to gather the results in to a list\n summarize_directory(file_results.collect())\n """\n\n def to_definition(\n self,\n annotation_type: type,\n name: Optional[str],\n description: Optional[str],\n code_version: Optional[str],\n ) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type\n if self.dagster_type is not NoValueSentinel\n else _checked_inferred_type(annotation_type)\n )\n\n return DynamicOutputDefinition(\n dagster_type=dagster_type,\n name=name,\n description=self.description or description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n code_version=self.code_version or code_version,\n )\n\n @property\n def is_dynamic(self) -> bool:\n return True
\n\n\n
[docs]class GraphOut(NamedTuple("_GraphOut", [("description", PublicAttr[Optional[str]])])):\n """Represents information about the outputs that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the output.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphOut, cls).__new__(cls, description=description)\n\n def to_definition(self, name: Optional[str]) -> "OutputDefinition":\n return OutputDefinition(name=name, description=self.description)
\n
", "current_page_name": "_modules/dagster/_core/definitions/output", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.output"}, "partition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition

\nimport copy\nimport hashlib\nimport json\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom datetime import (\n    datetime,\n    timedelta,\n)\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Type,\n    Union,\n    cast,\n)\n\nfrom dateutil.relativedelta import relativedelta\nfrom typing_extensions import TypeVar\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated, deprecated_param, public\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.run_request import (\n    AddDynamicPartitionsRequest,\n    DeleteDynamicPartitionsRequest,\n)\nfrom dagster._core.instance import DagsterInstance, DynamicPartitionsStore\nfrom dagster._core.storage.tags import PARTITION_NAME_TAG, PARTITION_SET_TAG\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import xor\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.warnings import (\n    normalize_renamed_param,\n)\n\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidDeserializationVersionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownPartitionError,\n)\nfrom .config import ConfigMapping\nfrom .utils import validate_tags\n\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\n\nT_cov = TypeVar("T_cov", default=Any, covariant=True)\nT_str = TypeVar("T_str", bound=str, default=str, covariant=True)\nT_PartitionsDefinition = TypeVar(\n    "T_PartitionsDefinition",\n    bound="PartitionsDefinition",\n    default="PartitionsDefinition",\n    covariant=True,\n)\n\n# In the Dagster UI users can select partition ranges following the format '2022-01-13...2022-01-14'\n# "..." 
is an invalid substring in partition keys\n# The other escape characters are characters that may not display in the Dagster UI.\nINVALID_PARTITION_SUBSTRINGS = ["...", "\\a", "\\b", "\\f", "\\n", "\\r", "\\t", "\\v", "\\0"]\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use string partition keys instead.")\nclass Partition(Generic[T_cov]):\n    """A Partition represents a single slice of the entire set of a job's possible work. It consists\n    of a value, which is an object that represents that partition, and an optional name, which is\n    used to label the partition in a human-readable way.\n\n    Args:\n        value (Any): The object for this partition\n        name (str): Name for this partition\n    """\n\n    def __init__(self, value: Any, name: Optional[str] = None):\n        self._value = value\n        self._name = check.str_param(name or str(value), "name")\n\n    @property\n    def value(self) -> T_cov:\n        return self._value\n\n    @property\n    def name(self) -> str:\n        return self._name\n\n    def __eq__(self, other: object) -> bool:\n        if not isinstance(other, Partition):\n            return False\n        else:\n            return self.value == other.value and self.name == other.name\n\n\n@whitelist_for_serdes\nclass ScheduleType(Enum):\n    HOURLY = "HOURLY"\n    DAILY = "DAILY"\n    WEEKLY = "WEEKLY"\n    MONTHLY = "MONTHLY"\n\n    @property\n    def ordinal(self):\n        return {"HOURLY": 1, "DAILY": 2, "WEEKLY": 3, "MONTHLY": 4}[self.value]\n\n    @property\n    def delta(self):\n        if self == ScheduleType.HOURLY:\n            return timedelta(hours=1)\n        elif self == ScheduleType.DAILY:\n            return timedelta(days=1)\n        elif self == ScheduleType.WEEKLY:\n            return timedelta(weeks=1)\n        elif self == ScheduleType.MONTHLY:\n            return relativedelta(months=1)\n        else:\n            check.failed(f"Unexpected ScheduleType {self}")\n\n    def __gt__(self, other: 
"ScheduleType") -> bool:\n        check.inst(other, ScheduleType, "Cannot compare ScheduleType with non-ScheduleType")\n        return self.ordinal > other.ordinal\n\n    def __lt__(self, other: "ScheduleType") -> bool:\n        check.inst(other, ScheduleType, "Cannot compare ScheduleType with non-ScheduleType")\n        return self.ordinal < other.ordinal\n\n\n
[docs]class PartitionsDefinition(ABC, Generic[T_str]):\n """Defines a set of partitions, which can be attached to a software-defined asset or job.\n\n Abstract class with implementations for different kinds of partitions.\n """\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset[T_str]"]:\n return DefaultPartitionsSubset[T_str]\n\n
[docs] @abstractmethod\n @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[T_str]:\n """Returns a list of strings representing the partition keys of the PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[str]\n """\n ...
\n\n def __str__(self) -> str:\n joined_keys = ", ".join([f"'{key}'" for key in self.get_partition_keys()])\n return joined_keys\n\n def get_last_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[T_str]:\n partition_keys = self.get_partition_keys(current_time, dynamic_partitions_store)\n return partition_keys[-1] if partition_keys else None\n\n def get_first_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[T_str]:\n partition_keys = self.get_partition_keys(current_time, dynamic_partitions_store)\n return partition_keys[0] if partition_keys else None\n\n def get_partition_keys_in_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[T_str]:\n keys_exist = {\n partition_key_range.start: self.has_partition_key(\n partition_key_range.start, dynamic_partitions_store=dynamic_partitions_store\n ),\n partition_key_range.end: self.has_partition_key(\n partition_key_range.end, dynamic_partitions_store=dynamic_partitions_store\n ),\n }\n if not all(keys_exist.values()):\n raise DagsterInvalidInvocationError(\n f"""Partition range {partition_key_range.start} to {partition_key_range.end} is\n not a valid range. 
Nonexistent partition keys:\n {list(key for key in keys_exist if keys_exist[key] is False)}"""\n )\n\n # in the simple case, simply return the single key in the range\n if partition_key_range.start == partition_key_range.end:\n return [cast(T_str, partition_key_range.start)]\n\n # defer this call as it is potentially expensive\n partition_keys = self.get_partition_keys(dynamic_partitions_store=dynamic_partitions_store)\n return partition_keys[\n partition_keys.index(partition_key_range.start) : partition_keys.index(\n partition_key_range.end\n )\n + 1\n ]\n\n def empty_subset(self) -> "PartitionsSubset[T_str]":\n return self.partitions_subset_class.empty_subset(self)\n\n def subset_with_partition_keys(\n self, partition_keys: Iterable[str]\n ) -> "PartitionsSubset[T_str]":\n return self.empty_subset().with_partition_keys(partition_keys)\n\n def subset_with_all_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset[T_str]":\n return self.subset_with_partition_keys(\n self.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n def deserialize_subset(self, serialized: str) -> "PartitionsSubset[T_str]":\n return self.partitions_subset_class.from_serialized(self, serialized)\n\n def can_deserialize_subset(\n self,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n return self.partitions_subset_class.can_deserialize(\n self,\n serialized,\n serialized_partitions_def_unique_id,\n serialized_partitions_def_class_name,\n )\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(\n json.dumps(\n self.get_partition_keys(dynamic_partitions_store=dynamic_partitions_store)\n ).encode("utf-8")\n ).hexdigest()\n\n def 
get_tags_for_partition_key(self, partition_key: str) -> Mapping[str, str]:\n tags = {PARTITION_NAME_TAG: partition_key}\n return tags\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n return len(self.get_partition_keys(current_time, dynamic_partitions_store))\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n return partition_key in self.get_partition_keys(\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n def validate_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> None:\n if not self.has_partition_key(partition_key, current_time, dynamic_partitions_store):\n raise DagsterUnknownPartitionError(\n f"Could not find a partition with key `{partition_key}`."\n )
\n\n\ndef raise_error_on_invalid_partition_key_substring(partition_keys: Sequence[str]) -> None:\n for partition_key in partition_keys:\n found_invalid_substrs = [\n invalid_substr\n for invalid_substr in INVALID_PARTITION_SUBSTRINGS\n if invalid_substr in partition_key\n ]\n if found_invalid_substrs:\n raise DagsterInvalidDefinitionError(\n f"{found_invalid_substrs} are invalid substrings in a partition key"\n )\n\n\ndef raise_error_on_duplicate_partition_keys(partition_keys: Sequence[str]) -> None:\n counts: Dict[str, int] = defaultdict(lambda: 0)\n for partition_key in partition_keys:\n counts[partition_key] += 1\n found_duplicates = [key for key in counts.keys() if counts[key] > 1]\n if found_duplicates:\n raise DagsterInvalidDefinitionError(\n "Partition keys must be unique. Duplicate instances of partition keys:"\n f" {found_duplicates}."\n )\n\n\n
[docs]class StaticPartitionsDefinition(PartitionsDefinition[str]):\n """A statically-defined set of partitions.\n\n Example:\n .. code-block:: python\n\n from dagster import StaticPartitionsDefinition, asset\n\n oceans_partitions_def = StaticPartitionsDefinition(\n ["arctic", "atlantic", "indian", "pacific", "southern"]\n )\n\n @asset(partitions_def=oceans_partitions_defs)\n def ml_model_for_each_ocean():\n ...\n """\n\n def __init__(self, partition_keys: Sequence[str]):\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n\n raise_error_on_invalid_partition_key_substring(partition_keys)\n raise_error_on_duplicate_partition_keys(partition_keys)\n\n self._partition_keys = partition_keys\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n """Returns a list of strings representing the partition keys of the PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Only applicable to\n DynamicPartitionsDefinitions.\n\n Returns:\n Sequence[str]\n\n """\n return self._partition_keys
\n\n def __hash__(self):\n return hash(self.__repr__())\n\n def __eq__(self, other) -> bool:\n return isinstance(other, StaticPartitionsDefinition) and (\n self is other or self._partition_keys == other.get_partition_keys()\n )\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(partition_keys={self._partition_keys})"\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # We don't currently throw an error when a duplicate partition key is defined\n # in a static partitions definition, though we will at 1.3.0.\n # This ensures that partition counts are correct in the Dagster UI.\n return len(set(self.get_partition_keys(current_time, dynamic_partitions_store)))
\n\n\nclass CachingDynamicPartitionsLoader(DynamicPartitionsStore):\n """A batch loader that caches the partition keys for a given dynamic partitions definition,\n to avoid repeated calls to the database for the same partitions definition.\n """\n\n def __init__(self, instance: DagsterInstance):\n self._instance = instance\n\n @cached_method\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n return self._instance.get_dynamic_partitions(partitions_def_name)\n\n @cached_method\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n return self._instance.has_dynamic_partition(partitions_def_name, partition_key)\n\n\n
[docs]@deprecated_param(\n param="partition_fn",\n breaking_version="2.0",\n additional_warn_text="Provide partition definition name instead.",\n)\nclass DynamicPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_DynamicPartitionsDefinition",\n [\n (\n "partition_fn",\n PublicAttr[\n Optional[\n Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]\n ]\n ],\n ),\n ("name", PublicAttr[Optional[str]]),\n ],\n ),\n):\n """A partitions definition whose partition keys can be dynamically added and removed.\n\n This is useful for cases where the set of partitions is not known at definition time,\n but is instead determined at runtime.\n\n Partitions can be added and removed using `instance.add_dynamic_partitions` and\n `instance.delete_dynamic_partition` methods.\n\n Args:\n name (Optional[str]): The name of the partitions definition.\n partition_fn (Optional[Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]]):\n A function that returns the current set of partitions. This argument is deprecated and\n will be removed in 2.0.0.\n\n Examples:\n .. 
code-block:: python\n\n fruits = DynamicPartitionsDefinition(name="fruits")\n\n @sensor(job=my_job)\n def my_sensor(context):\n return SensorResult(\n run_requests=[RunRequest(partition_key="apple")],\n dynamic_partitions_requests=[fruits.build_add_request(["apple"])]\n )\n """\n\n def __new__(\n cls,\n partition_fn: Optional[\n Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]\n ] = None,\n name: Optional[str] = None,\n ):\n partition_fn = check.opt_callable_param(partition_fn, "partition_fn")\n name = check.opt_str_param(name, "name")\n\n if partition_fn is None and name is None:\n raise DagsterInvalidDefinitionError(\n "Must provide either partition_fn or name to DynamicPartitionsDefinition."\n )\n\n if partition_fn and name:\n raise DagsterInvalidDefinitionError(\n "Cannot provide both partition_fn and name to DynamicPartitionsDefinition."\n )\n\n return super(DynamicPartitionsDefinition, cls).__new__(\n cls,\n partition_fn=check.opt_callable_param(partition_fn, "partition_fn"),\n name=check.opt_str_param(name, "name"),\n )\n\n def _validated_name(self) -> str:\n if self.name is None:\n check.failed(\n "Dynamic partitions definition must have a name to fetch dynamic partitions"\n )\n return self.name\n\n def __eq__(self, other):\n return (\n isinstance(other, DynamicPartitionsDefinition)\n and self.name == other.name\n and self.partition_fn == other.partition_fn\n )\n\n def __hash__(self):\n return hash(tuple(self.__repr__()))\n\n def __str__(self) -> str:\n if self.name:\n return f'Dynamic partitions: "{self._validated_name()}"'\n else:\n return super().__str__()\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n """Returns a list of strings representing the partition keys of the\n PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[str]\n """\n if self.partition_fn:\n partitions = self.partition_fn(current_time)\n if all(isinstance(partition, Partition) for partition in partitions):\n return [partition.name for partition in partitions] # type: ignore # (illegible conditional)\n else:\n return partitions # type: ignore # (illegible conditional)\n else:\n check.opt_inst_param(\n dynamic_partitions_store, "dynamic_partitions_store", DynamicPartitionsStore\n )\n\n if dynamic_partitions_store is None:\n check.failed(\n "The instance is not available to load partitions. You may be seeing this error"\n " when using dynamic partitions with a version of dagster-webserver or"\n " dagster-cloud that is older than 1.1.18."\n )\n\n return dynamic_partitions_store.get_dynamic_partitions(\n partitions_def_name=self._validated_name()\n )
\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n if self.partition_fn:\n return partition_key in self.get_partition_keys(current_time)\n else:\n if dynamic_partitions_store is None:\n check.failed(\n "The instance is not available to load partitions. You may be seeing this error"\n " when using dynamic partitions with a version of dagster-webserver or"\n " dagster-cloud that is older than 1.1.18."\n )\n\n return dynamic_partitions_store.has_dynamic_partition(\n partitions_def_name=self._validated_name(), partition_key=partition_key\n )\n\n def build_add_request(self, partition_keys: Sequence[str]) -> AddDynamicPartitionsRequest:\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n validated_name = self._validated_name()\n return AddDynamicPartitionsRequest(validated_name, partition_keys)\n\n def build_delete_request(self, partition_keys: Sequence[str]) -> DeleteDynamicPartitionsRequest:\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n validated_name = self._validated_name()\n return DeleteDynamicPartitionsRequest(validated_name, partition_keys)
\n\n\n
[docs]@deprecated_param(\n param="run_config_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use `run_config_for_partition_key_fn` instead.",\n)\n@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use `tags_for_partition_key_fn` instead.",\n)\nclass PartitionedConfig(Generic[T_PartitionsDefinition]):\n """Defines a way of configuring a job where the job can be run on one of a discrete set of\n partitions, and each partition corresponds to run configuration for the job.\n\n Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\n and view the run history across partitions.\n """\n\n def __init__(\n self,\n partitions_def: T_PartitionsDefinition,\n run_config_for_partition_fn: Optional[Callable[[Partition], Mapping[str, Any]]] = None,\n decorated_fn: Optional[Callable[..., Mapping[str, Any]]] = None,\n tags_for_partition_fn: Optional[Callable[[Partition[Any]], Mapping[str, str]]] = None,\n run_config_for_partition_key_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n ):\n self._partitions = check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n self._decorated_fn = decorated_fn\n\n check.invariant(\n xor(run_config_for_partition_fn, run_config_for_partition_key_fn),\n "Must provide exactly one of run_config_for_partition_fn or"\n " run_config_for_partition_key_fn",\n )\n check.invariant(\n not (tags_for_partition_fn and tags_for_partition_key_fn),\n "Cannot provide both of tags_for_partition_fn or tags_for_partition_key_fn",\n )\n\n self._run_config_for_partition_fn = check.opt_callable_param(\n run_config_for_partition_fn, "run_config_for_partition_fn"\n )\n self._run_config_for_partition_key_fn = check.opt_callable_param(\n run_config_for_partition_key_fn, "run_config_for_partition_key_fn"\n )\n self._tags_for_partition_fn = 
check.opt_callable_param(\n tags_for_partition_fn, "tags_for_partition_fn"\n )\n self._tags_for_partition_key_fn = check.opt_callable_param(\n tags_for_partition_key_fn, "tags_for_partition_key_fn"\n )\n\n @public\n @property\n def partitions_def(\n self,\n ) -> T_PartitionsDefinition:\n """T_PartitionsDefinition: The partitions definition associated with this PartitionedConfig."""\n return self._partitions\n\n @deprecated(\n breaking_version="2.0",\n additional_warn_text="Use `run_config_for_partition_key_fn` instead.",\n )\n @public\n @property\n def run_config_for_partition_fn(\n self,\n ) -> Optional[Callable[[Partition], Mapping[str, Any]]]:\n """Optional[Callable[[Partition], Mapping[str, Any]]]: A function that accepts a partition\n and returns a dictionary representing the config to attach to runs for that partition.\n Deprecated as of 1.3.3.\n """\n return self._run_config_for_partition_fn\n\n @public\n @property\n def run_config_for_partition_key_fn(\n self,\n ) -> Optional[Callable[[str], Mapping[str, Any]]]:\n """Optional[Callable[[str], Mapping[str, Any]]]: A function that accepts a partition key\n and returns a dictionary representing the config to attach to runs for that partition.\n """\n\n @deprecated(\n breaking_version="2.0", additional_warn_text="Use `tags_for_partition_key_fn` instead."\n )\n @public\n @property\n def tags_for_partition_fn(self) -> Optional[Callable[[Partition], Mapping[str, str]]]:\n """Optional[Callable[[Partition], Mapping[str, str]]]: A function that\n accepts a partition and returns a dictionary of tags to attach to runs for\n that partition. 
Deprecated as of 1.3.3.\n """\n return self._tags_for_partition_fn\n\n @public\n @property\n def tags_for_partition_key_fn(\n self,\n ) -> Optional[Callable[[str], Mapping[str, str]]]:\n """Optional[Callable[[str], Mapping[str, str]]]: A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for\n that partition.\n """\n return self._tags_for_partition_key_fn\n\n
[docs] @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Sequence[str]:\n """Returns a list of partition keys, representing the full set of partitions that\n config can be applied to.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time. Only\n applicable to time-based partitions definitions.\n\n Returns:\n Sequence[str]\n """\n return self.partitions_def.get_partition_keys(current_time)
\n\n # Assumes partition key already validated\n def get_run_config_for_partition_key(\n self,\n partition_key: str,\n ) -> Mapping[str, Any]:\n """Generates the run config corresponding to a partition key.\n\n Args:\n partition_key (str): the key for a partition that should be used to generate a run config.\n """\n # _run_config_for_partition_fn is deprecated, we can remove this branching logic in 2.0\n if self._run_config_for_partition_fn:\n run_config = self._run_config_for_partition_fn(Partition(partition_key))\n elif self._run_config_for_partition_key_fn:\n run_config = self._run_config_for_partition_key_fn(partition_key)\n else:\n check.failed("Unreachable.") # one of the above funcs always defined\n return copy.deepcopy(run_config)\n\n # Assumes partition key already validated\n def get_tags_for_partition_key(\n self,\n partition_key: str,\n job_name: Optional[str] = None,\n ) -> Mapping[str, str]:\n from dagster._core.host_representation.external_data import (\n external_partition_set_name_for_job_name,\n )\n\n # _tags_for_partition_fn is deprecated, we can remove this branching logic in 2.0\n if self._tags_for_partition_fn:\n user_tags = self._tags_for_partition_fn(Partition(partition_key))\n elif self._tags_for_partition_key_fn:\n user_tags = self._tags_for_partition_key_fn(partition_key)\n else:\n user_tags = {}\n user_tags = validate_tags(user_tags, allow_reserved_tags=False)\n\n system_tags = {\n **self.partitions_def.get_tags_for_partition_key(partition_key),\n **(\n # `PartitionSetDefinition` has been deleted but we still need to attach this special tag in\n # order for reexecution against partitions to work properly.\n {PARTITION_SET_TAG: external_partition_set_name_for_job_name(job_name)}\n if job_name\n else {}\n ),\n }\n\n return {**user_tags, **system_tags}\n\n @classmethod\n def from_flexible_config(\n cls,\n config: Optional[Union[ConfigMapping, Mapping[str, object], "PartitionedConfig"]],\n partitions_def: PartitionsDefinition,\n ) -> 
"PartitionedConfig":\n check.invariant(\n not isinstance(config, ConfigMapping),\n "Can't supply a ConfigMapping for 'config' when 'partitions_def' is supplied.",\n )\n\n if isinstance(config, PartitionedConfig):\n check.invariant(\n config.partitions_def == partitions_def,\n "Can't supply a PartitionedConfig for 'config' with a different "\n "PartitionsDefinition than supplied for 'partitions_def'.",\n )\n return config\n else:\n hardcoded_config = config if config else {}\n return cls(\n partitions_def,\n run_config_for_partition_key_fn=lambda _: cast(Mapping, hardcoded_config),\n )\n\n def __call__(self, *args, **kwargs):\n if self._decorated_fn is None:\n raise DagsterInvalidInvocationError(\n "Only PartitionedConfig objects created using one of the partitioned config "\n "decorators can be directly invoked."\n )\n else:\n return self._decorated_fn(*args, **kwargs)
\n\n\n
[docs]@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use tags_for_partition_key_fn instead.",\n)\ndef static_partitioned_config(\n partition_keys: Sequence[str],\n tags_for_partition_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig[StaticPartitionsDefinition]]:\n """Creates a static partitioned config for a job.\n\n The provided partition_keys is a static list of strings identifying the set of partitions. The\n list of partitions is static, so while the run config returned by the decorated function may\n change over time, the list of valid partition keys does not.\n\n This has performance advantages over `dynamic_partitioned_config` in terms of loading different\n partition views in the Dagster UI.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_keys (Sequence[str]): A list of valid partition keys, which serve as the range of\n values that can be provided to the decorated run config function.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.sequence_param(partition_keys, "partition_keys", str)\n\n tags_for_partition_key_fn = normalize_renamed_param(\n tags_for_partition_key_fn,\n "tags_for_partition_key_fn",\n tags_for_partition_fn,\n "tags_for_partition_fn",\n )\n\n def inner(\n fn: Callable[[str], Mapping[str, Any]]\n ) -> PartitionedConfig[StaticPartitionsDefinition]:\n return 
PartitionedConfig(\n partitions_def=StaticPartitionsDefinition(partition_keys),\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner
\n\n\ndef partitioned_config(\n partitions_def: PartitionsDefinition,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig]:\n """Creates a partitioned config for a job given a PartitionsDefinition.\n\n The partitions_def provides the set of partitions, which may change over time\n (for example, when using a DynamicPartitionsDefinition).\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partitions_def: (Optional[DynamicPartitionsDefinition]): PartitionsDefinition for the job\n tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.opt_callable_param(tags_for_partition_key_fn, "tags_for_partition_key_fn")\n\n def inner(fn: Callable[[str], Mapping[str, Any]]) -> PartitionedConfig:\n return PartitionedConfig(\n partitions_def=partitions_def,\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner\n\n\n
[docs]@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use tags_for_partition_key_fn instead.",\n)\ndef dynamic_partitioned_config(\n partition_fn: Callable[[Optional[datetime]], Sequence[str]],\n tags_for_partition_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig]:\n """Creates a dynamic partitioned config for a job.\n\n The provided partition_fn returns a list of strings identifying the set of partitions, given\n an optional datetime argument (representing the current time). The list of partitions returned\n may change over time.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_fn (Callable[[datetime.datetime], Sequence[str]]): A function that generates a\n list of valid partition keys, which serve as the range of values that can be provided\n to the decorated run config function.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.callable_param(partition_fn, "partition_fn")\n\n tags_for_partition_key_fn = normalize_renamed_param(\n tags_for_partition_key_fn,\n "tags_for_partition_key_fn",\n tags_for_partition_fn,\n "tags_for_partition_fn",\n )\n\n def inner(fn: Callable[[str], Mapping[str, Any]]) -> PartitionedConfig:\n return PartitionedConfig(\n partitions_def=DynamicPartitionsDefinition(partition_fn),\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner
\n\n\ndef cron_schedule_from_schedule_type_and_offsets(\n schedule_type: ScheduleType,\n minute_offset: int,\n hour_offset: int,\n day_offset: Optional[int],\n) -> str:\n if schedule_type is ScheduleType.HOURLY:\n return f"{minute_offset} * * * *"\n elif schedule_type is ScheduleType.DAILY:\n return f"{minute_offset} {hour_offset} * * *"\n elif schedule_type is ScheduleType.WEEKLY:\n return f"{minute_offset} {hour_offset} * * {day_offset if day_offset is not None else 0}"\n elif schedule_type is ScheduleType.MONTHLY:\n return f"{minute_offset} {hour_offset} {day_offset if day_offset is not None else 1} * *"\n else:\n check.assert_never(schedule_type)\n\n\nclass PartitionsSubset(ABC, Generic[T_str]):\n """Represents a subset of the partitions within a PartitionsDefinition."""\n\n @abstractmethod\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[T_str]: ...\n\n @abstractmethod\n @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[T_str]: ...\n\n @abstractmethod\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]: ...\n\n @abstractmethod\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "PartitionsSubset[T_str]": ...\n\n def with_partition_key_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset[T_str]":\n return self.with_partition_keys(\n self.partitions_def.get_partition_keys_in_range(\n partition_key_range, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n def __or__(self, other: "PartitionsSubset") -> "PartitionsSubset[T_str]":\n if self is other:\n return self\n return self.with_partition_keys(other.get_partition_keys())\n\n def __sub__(self, 
other: "PartitionsSubset") -> "PartitionsSubset[T_str]":\n if self is other:\n return self.partitions_def.empty_subset()\n return self.partitions_def.empty_subset().with_partition_keys(\n set(self.get_partition_keys()).difference(set(other.get_partition_keys()))\n )\n\n def __and__(self, other: "PartitionsSubset") -> "PartitionsSubset[T_str]":\n if self is other:\n return self\n return self.partitions_def.empty_subset().with_partition_keys(\n set(self.get_partition_keys()) & set(other.get_partition_keys())\n )\n\n @abstractmethod\n def serialize(self) -> str: ...\n\n @classmethod\n @abstractmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition[T_str], serialized: str\n ) -> "PartitionsSubset[T_str]": ...\n\n @classmethod\n @abstractmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool: ...\n\n @property\n @abstractmethod\n def partitions_def(self) -> PartitionsDefinition[T_str]: ...\n\n @abstractmethod\n def __len__(self) -> int: ...\n\n @abstractmethod\n def __contains__(self, value) -> bool: ...\n\n @classmethod\n @abstractmethod\n def empty_subset(\n cls, partitions_def: PartitionsDefinition[T_str]\n ) -> "PartitionsSubset[T_str]": ...\n\n\n@whitelist_for_serdes\nclass SerializedPartitionsSubset(NamedTuple):\n serialized_subset: str\n serialized_partitions_def_unique_id: str\n serialized_partitions_def_class_name: str\n\n @classmethod\n def from_subset(\n cls,\n subset: PartitionsSubset,\n partitions_def: PartitionsDefinition,\n dynamic_partitions_store: DynamicPartitionsStore,\n ):\n return cls(\n serialized_subset=subset.serialize(),\n serialized_partitions_def_unique_id=partitions_def.get_serializable_unique_identifier(\n dynamic_partitions_store\n ),\n serialized_partitions_def_class_name=partitions_def.__class__.__name__,\n )\n\n def can_deserialize(self, partitions_def: 
Optional[PartitionsDefinition]) -> bool:\n if not partitions_def:\n # Asset had a partitions definition at storage time, but no longer does\n return False\n\n return partitions_def.can_deserialize_subset(\n self.serialized_subset,\n serialized_partitions_def_unique_id=self.serialized_partitions_def_unique_id,\n serialized_partitions_def_class_name=self.serialized_partitions_def_class_name,\n )\n\n def deserialize(self, partitions_def: PartitionsDefinition) -> PartitionsSubset:\n return partitions_def.deserialize_subset(self.serialized_subset)\n\n\nclass DefaultPartitionsSubset(PartitionsSubset[T_str]):\n # Every time we change the serialization format, we should increment the version number.\n # This will ensure that we can gracefully degrade when deserializing old data.\n SERIALIZATION_VERSION = 1\n\n def __init__(\n self, partitions_def: PartitionsDefinition[T_str], subset: Optional[Set[T_str]] = None\n ):\n check.opt_set_param(subset, "subset")\n self._partitions_def = partitions_def\n self._subset = subset or set()\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n return (\n set(\n self._partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n - self._subset\n )\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n return self._subset\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n partition_keys = self._partitions_def.get_partition_keys(\n current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n cur_range_start = None\n cur_range_end = None\n result = []\n for partition_key in partition_keys:\n if partition_key in self._subset:\n if cur_range_start is None:\n 
cur_range_start = partition_key\n cur_range_end = partition_key\n else:\n if cur_range_start is not None and cur_range_end is not None:\n result.append(PartitionKeyRange(cur_range_start, cur_range_end))\n cur_range_start = cur_range_end = None\n\n if cur_range_start is not None and cur_range_end is not None:\n result.append(PartitionKeyRange(cur_range_start, cur_range_end))\n\n return result\n\n def with_partition_keys(\n self, partition_keys: Iterable[T_str]\n ) -> "DefaultPartitionsSubset[T_str]":\n return DefaultPartitionsSubset(\n self._partitions_def,\n self._subset | set(partition_keys),\n )\n\n def serialize(self) -> str:\n # Serialize version number, so attempting to deserialize old versions can be handled gracefully.\n # Any time the serialization format changes, we should increment the version number.\n return json.dumps({"version": self.SERIALIZATION_VERSION, "subset": list(self._subset)})\n\n @classmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition[T_str], serialized: str\n ) -> "PartitionsSubset[T_str]":\n # Check the version number, so only valid versions can be deserialized.\n data = json.loads(serialized)\n\n if isinstance(data, list):\n # backwards compatibility\n return cls(subset=set(data), partitions_def=partitions_def)\n else:\n if data.get("version") != cls.SERIALIZATION_VERSION:\n raise DagsterInvalidDeserializationVersionError(\n f"Attempted to deserialize partition subset with version {data.get('version')},"\n f" but only version {cls.SERIALIZATION_VERSION} is supported."\n )\n return cls(subset=set(data.get("subset")), partitions_def=partitions_def)\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition[T_str],\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n if serialized_partitions_def_class_name is not None:\n return serialized_partitions_def_class_name == 
partitions_def.__class__.__name__\n\n data = json.loads(serialized)\n return isinstance(data, list) or (\n data.get("subset") is not None and data.get("version") == cls.SERIALIZATION_VERSION\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition[T_str]:\n return self._partitions_def\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, DefaultPartitionsSubset)\n and self._partitions_def == other._partitions_def\n and self._subset == other._subset\n )\n\n def __len__(self) -> int:\n return len(self._subset)\n\n def __contains__(self, value) -> bool:\n return value in self._subset\n\n def __repr__(self) -> str:\n return (\n f"DefaultPartitionsSubset(subset={self._subset}, partitions_def={self._partitions_def})"\n )\n\n @classmethod\n def empty_subset(cls, partitions_def: PartitionsDefinition[T_str]) -> "PartitionsSubset[T_str]":\n return cls(partitions_def=partitions_def)\n
", "current_page_name": "_modules/dagster/_core/definitions/partition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition"}, "partition_key_range": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition_key_range

\nfrom typing import NamedTuple\n\nfrom dagster._annotations import PublicAttr\n\n\n
[docs]class PartitionKeyRange(NamedTuple):\n """Defines a range of partitions.\n\n Attributes:\n start (str): The starting partition key in the range (inclusive).\n end (str): The ending partition key in the range (inclusive).\n\n Examples:\n .. code-block:: python\n\n partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])\n partition_key_range = PartitionKeyRange(start="a", end="c") # Represents ["a", "b", "c"]\n """\n\n # Inclusive on both sides\n start: PublicAttr[str]\n end: PublicAttr[str]
\n
", "current_page_name": "_modules/dagster/_core/definitions/partition_key_range", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition_key_range"}, "partition_mapping": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition_mapping

\nimport collections.abc\nimport itertools\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import (\n    Collection,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental, public\nfrom dagster._core.definitions.multi_dimensional_partitions import (\n    MultiPartitionKey,\n    MultiPartitionsDefinition,\n)\nfrom dagster._core.definitions.partition import (\n    PartitionsDefinition,\n    PartitionsSubset,\n    StaticPartitionsDefinition,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindowPartitionsDefinition\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.warnings import disable_dagster_warnings\n\n\nclass UpstreamPartitionsResult(NamedTuple):\n    """Represents the result of mapping a PartitionsSubset to the corresponding\n    partitions in another PartitionsDefinition.\n\n    partitions_subset (PartitionsSubset): The resulting partitions subset that was\n        mapped to. Only contains partitions for existent partitions, filtering out nonexistent partitions.\n    required_but_nonexistent_partition_keys (Sequence[str]): A list containing invalid partition keys in to_partitions_def\n        that partitions in from_partitions_subset were mapped to.\n    """\n\n    partitions_subset: PartitionsSubset\n    required_but_nonexistent_partition_keys: Sequence[str]\n\n\n
[docs]class PartitionMapping(ABC):\n """Defines a correspondence between the partitions in an asset and the partitions in an asset\n that it depends on.\n\n Overriding PartitionMapping outside of Dagster is not supported. The abstract methods of this\n class may change at any time.\n """\n\n
[docs] @public\n @abstractmethod\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n """Returns the subset of partition keys in the downstream asset that use the data in the given\n partition key subset of the upstream asset.\n\n Args:\n upstream_partitions_subset (Union[PartitionKeyRange, PartitionsSubset]): The\n subset of partition keys in the upstream asset.\n downstream_partitions_def (PartitionsDefinition): The partitions definition for the\n downstream asset.\n """
\n\n
[docs] @public\n @abstractmethod\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n """Returns a UpstreamPartitionsResult object containing the partition keys the downstream\n partitions subset was mapped to in the upstream partitions definition.\n\n Valid upstream partitions will be included in UpstreamPartitionsResult.partitions_subset.\n Invalid upstream partitions will be included in UpstreamPartitionsResult.required_but_nonexistent_partition_keys.\n\n For example, if an upstream asset is time-partitioned and starts in June 2023, and the\n downstream asset is time-partitioned and starts in May 2023, this function would return a\n UpstreamPartitionsResult(PartitionsSubset("2023-06-01"), required_but_nonexistent_partition_keys=["2023-05-01"])\n when downstream_partitions_subset contains 2023-05-01 and 2023-06-01.\n """
\n\n\n
[docs]@whitelist_for_serdes\nclass IdentityPartitionMapping(PartitionMapping, NamedTuple("_IdentityPartitionMapping", [])):\n """Expects that the upstream and downstream assets are partitioned in the same way, and maps\n partitions in the downstream asset to the same partition in the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n if downstream_partitions_subset.partitions_def == upstream_partitions_def:\n return UpstreamPartitionsResult(downstream_partitions_subset, [])\n\n upstream_partition_keys = set(\n upstream_partitions_def.get_partition_keys(\n dynamic_partitions_store=dynamic_partitions_store\n )\n )\n downstream_partition_keys = set(downstream_partitions_subset.get_partition_keys())\n\n return UpstreamPartitionsResult(\n upstream_partitions_def.subset_with_partition_keys(\n list(upstream_partition_keys & downstream_partition_keys)\n ),\n list(downstream_partition_keys - upstream_partition_keys),\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if upstream_partitions_subset is None:\n check.failed("upstream asset is not partitioned")\n\n if upstream_partitions_subset.partitions_def == downstream_partitions_def:\n return upstream_partitions_subset\n\n upstream_partition_keys = set(upstream_partitions_subset.get_partition_keys())\n downstream_partition_keys = set(\n downstream_partitions_def.get_partition_keys(\n 
dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n return downstream_partitions_def.empty_subset().with_partition_keys(\n list(downstream_partition_keys & upstream_partition_keys)\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass AllPartitionMapping(PartitionMapping, NamedTuple("_AllPartitionMapping", [])):\n """Maps every partition in the downstream asset to every partition in the upstream asset.\n\n Commonly used in the case when the downstream asset is not partitioned, in which the entire\n downstream asset depends on all partitions of the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n upstream_subset = upstream_partitions_def.subset_with_all_partitions(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n return UpstreamPartitionsResult(upstream_subset, [])\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n raise NotImplementedError()
\n\n\n
[docs]@whitelist_for_serdes\nclass LastPartitionMapping(PartitionMapping, NamedTuple("_LastPartitionMapping", [])):\n """Maps all dependencies to the last partition in the upstream asset.\n\n Commonly used in the case when the downstream asset is not partitioned, in which the entire\n downstream asset depends on the last partition of the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n last = upstream_partitions_def.get_last_partition_key(\n current_time=None, dynamic_partitions_store=dynamic_partitions_store\n )\n\n upstream_subset = upstream_partitions_def.empty_subset()\n if last is not None:\n upstream_subset = upstream_subset.with_partition_keys([last])\n\n return UpstreamPartitionsResult(upstream_subset, [])\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n raise NotImplementedError()
\n\n\n
[docs]@whitelist_for_serdes\nclass SpecificPartitionsPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_SpecificPartitionsPartitionMapping", [("partition_keys", PublicAttr[Sequence[str]])]\n ),\n):\n """Maps to a specific subset of partitions in the upstream asset.\n\n Example:\n .. code-block:: python\n\n from dagster import SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset\n\n @asset(partitions_def=StaticPartitionsDefinition(["a", "b", "c"]))\n def upstream():\n ...\n\n @asset(\n ins={\n "upstream": AssetIn(partition_mapping=SpecificPartitionsPartitionMapping(["a"]))\n }\n )\n def a_downstream(upstream):\n ...\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n return UpstreamPartitionsResult(\n upstream_partitions_def.subset_with_partition_keys(self.partition_keys), []\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n # if any of the partition keys in this partition mapping are contained within the upstream\n # partitions subset, then all partitions of the downstream asset are dependencies\n if any(key in upstream_partitions_subset for key in self.partition_keys):\n return downstream_partitions_def.subset_with_all_partitions(\n dynamic_partitions_store=dynamic_partitions_store\n )\n return downstream_partitions_def.empty_subset()
\n\n\nclass DimensionDependency(NamedTuple):\n partition_mapping: PartitionMapping\n upstream_dimension_name: Optional[str] = None\n downstream_dimension_name: Optional[str] = None\n\n\nclass BaseMultiPartitionMapping(ABC):\n @abstractmethod\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]: ...\n\n def get_partitions_def(\n self, partitions_def: PartitionsDefinition, dimension_name: Optional[str]\n ) -> PartitionsDefinition:\n if isinstance(partitions_def, MultiPartitionsDefinition):\n if not isinstance(dimension_name, str):\n check.failed("Expected dimension_name to be a string")\n return partitions_def.get_partitions_def_for_dimension(dimension_name)\n return partitions_def\n\n def _get_dependency_partitions_subset(\n self,\n a_partitions_def: PartitionsDefinition,\n a_partitions_subset: PartitionsSubset,\n b_partitions_def: PartitionsDefinition,\n a_upstream_of_b: bool,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n current_time: Optional[datetime] = None,\n ) -> Union[UpstreamPartitionsResult, PartitionsSubset]:\n """Given two partitions definitions a_partitions_def and b_partitions_def that have a dependency\n relationship (a_upstream_of_b is True if a_partitions_def is upstream of b_partitions_def),\n and a_partition_keys, a list of partition keys in a_partitions_def, returns a list of\n partition keys in the partitions definition b_partitions_def that are\n dependencies of the partition keys in a_partition_keys.\n """\n a_partition_keys_by_dimension = defaultdict(set)\n if isinstance(a_partitions_def, MultiPartitionsDefinition):\n for partition_key in a_partitions_subset.get_partition_keys():\n for dimension_name, key in cast(\n MultiPartitionKey, partition_key\n ).keys_by_dimension.items():\n a_partition_keys_by_dimension[dimension_name].add(key)\n else:\n for partition_key in 
a_partitions_subset.get_partition_keys():\n a_partition_keys_by_dimension[None].add(partition_key)\n\n # Maps the dimension name and key of a partition in a_partitions_def to the list of\n # partition keys in b_partitions_def that are dependencies of that partition\n dep_b_keys_by_a_dim_and_key: Dict[Optional[str], Dict[Optional[str], List[str]]] = (\n defaultdict(lambda: defaultdict(list))\n )\n required_but_nonexistent_upstream_partitions = set()\n\n b_dimension_partitions_def_by_name: Dict[Optional[str], PartitionsDefinition] = (\n {\n dimension.name: dimension.partitions_def\n for dimension in b_partitions_def.partitions_defs\n }\n if isinstance(b_partitions_def, MultiPartitionsDefinition)\n else {None: b_partitions_def}\n )\n\n if a_upstream_of_b:\n # a_partitions_def is upstream of b_partitions_def, so we need to map the\n # dimension names of a_partitions_def to the corresponding dependent dimensions of\n # b_partitions_def\n a_dim_to_dependency_b_dim = {\n dimension_mapping.upstream_dimension_name: (\n dimension_mapping.downstream_dimension_name,\n dimension_mapping.partition_mapping,\n )\n for dimension_mapping in self.get_dimension_dependencies(\n a_partitions_def, b_partitions_def\n )\n }\n\n for a_dim_name, keys in a_partition_keys_by_dimension.items():\n if a_dim_name in a_dim_to_dependency_b_dim:\n (\n b_dim_name,\n dimension_mapping,\n ) = a_dim_to_dependency_b_dim[a_dim_name]\n a_dimension_partitions_def = self.get_partitions_def(\n a_partitions_def, a_dim_name\n )\n b_dimension_partitions_def = self.get_partitions_def(\n b_partitions_def, b_dim_name\n )\n for key in keys:\n # if downstream dimension mapping exists, for a given key, get the list of\n # downstream partition keys that are dependencies of that key\n dep_b_keys_by_a_dim_and_key[a_dim_name][key] = list(\n dimension_mapping.get_downstream_partitions_for_partitions(\n a_dimension_partitions_def.empty_subset().with_partition_keys(\n [key]\n ),\n b_dimension_partitions_def,\n 
current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n ).get_partition_keys()\n )\n\n else:\n # a_partitions_def is downstream of b_partitions_def, so we need to map the\n # dimension names of a_partitions_def to the corresponding dependency dimensions of\n # b_partitions_def\n a_dim_to_dependency_b_dim = {\n dimension_mapping.downstream_dimension_name: (\n dimension_mapping.upstream_dimension_name,\n dimension_mapping.partition_mapping,\n )\n for dimension_mapping in self.get_dimension_dependencies(\n b_partitions_def, a_partitions_def\n )\n }\n\n for a_dim_name, keys in a_partition_keys_by_dimension.items():\n if a_dim_name in a_dim_to_dependency_b_dim:\n (\n b_dim_name,\n partition_mapping,\n ) = a_dim_to_dependency_b_dim[a_dim_name]\n a_dimension_partitions_def = self.get_partitions_def(\n a_partitions_def, a_dim_name\n )\n b_dimension_partitions_def = self.get_partitions_def(\n b_partitions_def, b_dim_name\n )\n for key in keys:\n mapped_partitions_result = (\n partition_mapping.get_upstream_mapped_partitions_result_for_partitions(\n a_dimension_partitions_def.empty_subset().with_partition_keys(\n [key]\n ),\n b_dimension_partitions_def,\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n )\n dep_b_keys_by_a_dim_and_key[a_dim_name][key] = list(\n mapped_partitions_result.partitions_subset.get_partition_keys()\n )\n required_but_nonexistent_upstream_partitions.update(\n set(mapped_partitions_result.required_but_nonexistent_partition_keys)\n )\n\n b_partition_keys = set()\n\n mapped_a_dim_names = a_dim_to_dependency_b_dim.keys()\n mapped_b_dim_names = [mapping[0] for mapping in a_dim_to_dependency_b_dim.values()]\n unmapped_b_dim_names = list(\n set(b_dimension_partitions_def_by_name.keys()) - set(mapped_b_dim_names)\n )\n\n for key in a_partitions_subset.get_partition_keys():\n for b_key_values in itertools.product(\n *(\n [\n dep_b_keys_by_a_dim_and_key[dim_name][\n (\n cast(MultiPartitionKey, 
key).keys_by_dimension[dim_name]\n if dim_name\n else key\n )\n ]\n for dim_name in mapped_a_dim_names\n ]\n ),\n *[\n b_dimension_partitions_def_by_name[dim_name].get_partition_keys()\n for dim_name in unmapped_b_dim_names\n ],\n ):\n b_partition_keys.add(\n MultiPartitionKey(\n {\n cast(str, (mapped_b_dim_names + unmapped_b_dim_names)[i]): key\n for i, key in enumerate(b_key_values)\n }\n )\n if len(b_key_values) > 1\n else b_key_values[0]\n )\n\n mapped_subset = b_partitions_def.empty_subset().with_partition_keys(b_partition_keys)\n if a_upstream_of_b:\n return mapped_subset\n else:\n return UpstreamPartitionsResult(\n mapped_subset,\n required_but_nonexistent_partition_keys=list(\n required_but_nonexistent_upstream_partitions\n ),\n )\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n result = self._get_dependency_partitions_subset(\n cast(MultiPartitionsDefinition, downstream_partitions_subset.partitions_def),\n downstream_partitions_subset,\n cast(MultiPartitionsDefinition, upstream_partitions_def),\n a_upstream_of_b=False,\n dynamic_partitions_store=dynamic_partitions_store,\n current_time=current_time,\n )\n\n if not isinstance(result, UpstreamPartitionsResult):\n check.failed("Expected UpstreamPartitionsResult")\n\n return result\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if upstream_partitions_subset is None:\n check.failed("upstream asset 
is not partitioned")\n\n result = self._get_dependency_partitions_subset(\n cast(MultiPartitionsDefinition, upstream_partitions_subset.partitions_def),\n upstream_partitions_subset,\n cast(MultiPartitionsDefinition, downstream_partitions_def),\n a_upstream_of_b=True,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n if isinstance(result, UpstreamPartitionsResult):\n check.failed("Expected PartitionsSubset")\n\n return result\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass MultiToSingleDimensionPartitionMapping(\n BaseMultiPartitionMapping,\n PartitionMapping,\n NamedTuple(\n "_MultiToSingleDimensionPartitionMapping", [("partition_dimension_name", Optional[str])]\n ),\n):\n """Defines a correspondence between a single-dimensional partitions definition\n and a MultiPartitionsDefinition. The single-dimensional partitions definition must be\n a dimension of the MultiPartitionsDefinition.\n\n This class handles the case where the upstream asset is multipartitioned and the\n downstream asset is single dimensional, and vice versa.\n\n For a partition key X, this partition mapping assumes that any multi-partition key with\n X in the selected dimension is a dependency.\n\n Args:\n partition_dimension_name (Optional[str]): The name of the partition dimension in the\n MultiPartitionsDefinition that matches the single-dimension partitions definition.\n """\n\n def __new__(cls, partition_dimension_name: Optional[str] = None):\n return super(MultiToSingleDimensionPartitionMapping, cls).__new__(\n cls,\n partition_dimension_name=check.opt_str_param(\n partition_dimension_name, "partition_dimension_name"\n ),\n )\n\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]:\n infer_mapping_result = _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def, downstream_partitions_def\n )\n\n if not infer_mapping_result.can_infer:\n check.invariant(isinstance(infer_mapping_result.inference_failure_reason, str))\n check.failed(cast(str, infer_mapping_result.inference_failure_reason))\n\n return [cast(DimensionDependency, infer_mapping_result.dimension_dependency)]
\n\n\n@whitelist_for_serdes\nclass DimensionPartitionMapping(\n NamedTuple(\n "_DimensionPartitionMapping",\n [\n ("dimension_name", str),\n ("partition_mapping", PartitionMapping),\n ],\n )\n):\n """A helper class for MultiPartitionMapping that defines a partition mapping used to calculate\n the dependent partition keys in the selected downstream MultiPartitions definition dimension.\n\n Args:\n dimension_name (str): The name of the dimension in the downstream MultiPartitionsDefinition.\n partition_mapping (PartitionMapping): The partition mapping object used to calculate\n the downstream dimension partitions from the upstream dimension partitions and vice versa.\n """\n\n def __new__(\n cls,\n dimension_name: str,\n partition_mapping: PartitionMapping,\n ):\n return super(DimensionPartitionMapping, cls).__new__(\n cls,\n dimension_name=check.str_param(dimension_name, "dimension_name"),\n partition_mapping=check.inst_param(\n partition_mapping, "partition_mapping", PartitionMapping\n ),\n )\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass MultiPartitionMapping(\n BaseMultiPartitionMapping,\n PartitionMapping,\n NamedTuple(\n "_MultiPartitionMapping",\n [("downstream_mappings_by_upstream_dimension", Mapping[str, DimensionPartitionMapping])],\n ),\n):\n """Defines a correspondence between two MultiPartitionsDefinitions.\n\n Accepts a mapping of upstream dimension name to downstream DimensionPartitionMapping, representing\n the explicit correspondence between the upstream and downstream MultiPartitions dimensions\n and the partition mapping used to calculate the downstream partitions.\n\n Examples:\n .. code-block:: python\n\n weekly_abc = MultiPartitionsDefinition(\n {\n "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n "weekly": WeeklyPartitionsDefinition("2023-01-01"),\n }\n )\n daily_123 = MultiPartitionsDefinition(\n {\n "123": StaticPartitionsDefinition(["1", "2", "3"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n\n MultiPartitionsMapping(\n {\n "abc": DimensionPartitionMapping(\n dimension_name="123",\n partition_mapping=StaticPartitionMapping({"a": "1", "b": "2", "c": "3"}),\n ),\n "weekly": DimensionPartitionMapping(\n dimension_name="daily",\n partition_mapping=TimeWindowPartitionMapping(),\n )\n }\n )\n\n For upstream or downstream dimensions not explicitly defined in the mapping, Dagster will\n assume an `AllPartitionsMapping`, meaning that all upstream partitions in those dimensions\n will be mapped to all downstream partitions in those dimensions.\n\n Examples:\n .. 
code-block:: python\n\n weekly_abc = MultiPartitionsDefinition(\n {\n "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n daily_123 = MultiPartitionsDefinition(\n {\n "123": StaticPartitionsDefinition(["1", "2", "3"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n\n MultiPartitionsMapping(\n {\n "daily": DimensionPartitionMapping(\n dimension_name="daily",\n partition_mapping=IdentityPartitionMapping(),\n )\n }\n )\n\n # Will map `daily_123` partition key {"123": "1", "daily": "2023-01-01"} to the upstream:\n # {"abc": "a", "daily": "2023-01-01"}\n # {"abc": "b", "daily": "2023-01-01"}\n # {"abc": "c", "daily": "2023-01-01"}\n\n Args:\n downstream_mappings_by_upstream_dimension (Mapping[str, DimensionPartitionMapping]): A\n mapping that defines an explicit correspondence between one dimension of the upstream\n MultiPartitionsDefinition and one dimension of the downstream MultiPartitionsDefinition.\n Maps a string representing upstream dimension name to downstream DimensionPartitionMapping,\n containing the downstream dimension name and partition mapping.\n """\n\n def __new__(\n cls, downstream_mappings_by_upstream_dimension: Mapping[str, DimensionPartitionMapping]\n ):\n return super(MultiPartitionMapping, cls).__new__(\n cls,\n downstream_mappings_by_upstream_dimension=check.mapping_param(\n downstream_mappings_by_upstream_dimension,\n "downstream_mappings_by_upstream_dimension",\n key_type=str,\n value_type=DimensionPartitionMapping,\n ),\n )\n\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]:\n self._check_all_dimensions_accounted_for(\n upstream_partitions_def,\n downstream_partitions_def,\n )\n\n return [\n DimensionDependency(\n mapping.partition_mapping,\n upstream_dimension_name=upstream_dimension,\n downstream_dimension_name=mapping.dimension_name,\n 
)\n for upstream_dimension, mapping in self.downstream_mappings_by_upstream_dimension.items()\n ]\n\n def _check_all_dimensions_accounted_for(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> None:\n if any(\n not isinstance(partitions_def, MultiPartitionsDefinition)\n for partitions_def in (upstream_partitions_def, downstream_partitions_def)\n ):\n check.failed(\n "Both partitions defs provided to a MultiPartitionMapping must be multi-partitioned"\n )\n\n upstream_dimension_names = {\n dim.name\n for dim in cast(MultiPartitionsDefinition, upstream_partitions_def).partitions_defs\n }\n dimension_names = {\n dim.name\n for dim in cast(MultiPartitionsDefinition, downstream_partitions_def).partitions_defs\n }\n\n for (\n upstream_dimension_name,\n dimension_mapping,\n ) in self.downstream_mappings_by_upstream_dimension.items():\n if upstream_dimension_name not in upstream_dimension_names:\n check.failed(\n "Dimension mapping has an upstream dimension name that is not in the upstream "\n "partitions def"\n )\n if dimension_mapping.dimension_name not in dimension_names:\n check.failed(\n "Dimension mapping has a downstream dimension name that is not in the"\n " downstream partitions def"\n )\n\n upstream_dimension_names.remove(upstream_dimension_name)\n dimension_names.remove(dimension_mapping.dimension_name)
\n\n\n
[docs]@whitelist_for_serdes\nclass StaticPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_StaticPartitionMapping",\n [\n (\n "downstream_partition_keys_by_upstream_partition_key",\n PublicAttr[Mapping[str, Union[str, Collection[str]]]],\n )\n ],\n ),\n):\n """Define an explicit correspondence between two StaticPartitionsDefinitions.\n\n Args:\n downstream_partition_keys_by_upstream_partition_key (Dict[str, str | Collection[str]]):\n The single or multi-valued correspondence from upstream keys to downstream keys.\n """\n\n def __init__(\n self,\n downstream_partition_keys_by_upstream_partition_key: Mapping[\n str, Union[str, Collection[str]]\n ],\n ):\n check.mapping_param(\n downstream_partition_keys_by_upstream_partition_key,\n "downstream_partition_keys_by_upstream_partition_key",\n key_type=str,\n value_type=(str, collections.abc.Collection),\n )\n\n # cache forward and reverse mappings\n self._mapping = defaultdict(set)\n for (\n upstream_key,\n downstream_keys,\n ) in downstream_partition_keys_by_upstream_partition_key.items():\n self._mapping[upstream_key] = (\n {downstream_keys} if isinstance(downstream_keys, str) else set(downstream_keys)\n )\n\n self._inverse_mapping = defaultdict(set)\n for upstream_key, downstream_keys in self._mapping.items():\n for downstream_key in downstream_keys:\n self._inverse_mapping[downstream_key].add(upstream_key)\n\n @cached_method\n def _check_upstream(self, *, upstream_partitions_def: PartitionsDefinition):\n """Validate that the mapping from upstream to downstream is only defined on upstream keys."""\n check.inst(\n upstream_partitions_def,\n StaticPartitionsDefinition,\n "StaticPartitionMapping can only be defined between two StaticPartitionsDefinitions",\n )\n upstream_keys = upstream_partitions_def.get_partition_keys()\n extra_keys = set(self._mapping.keys()).difference(upstream_keys)\n if extra_keys:\n raise ValueError(\n f"mapping source partitions not in the upstream partitions definition: {extra_keys}"\n 
)\n\n @cached_method\n def _check_downstream(self, *, downstream_partitions_def: PartitionsDefinition):\n """Validate that the mapping from upstream to downstream only maps to downstream keys."""\n check.inst(\n downstream_partitions_def,\n StaticPartitionsDefinition,\n "StaticPartitionMapping can only be defined between two StaticPartitionsDefinitions",\n )\n downstream_keys = downstream_partitions_def.get_partition_keys()\n extra_keys = set(self._inverse_mapping.keys()).difference(downstream_keys)\n if extra_keys:\n raise ValueError(\n "mapping target partitions not in the downstream partitions definition:"\n f" {extra_keys}"\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n self._check_downstream(downstream_partitions_def=downstream_partitions_def)\n\n downstream_subset = downstream_partitions_def.empty_subset()\n downstream_keys = set()\n for key in upstream_partitions_subset.get_partition_keys():\n downstream_keys.update(self._mapping[key])\n return downstream_subset.with_partition_keys(downstream_keys)\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n self._check_upstream(upstream_partitions_def=upstream_partitions_def)\n\n upstream_subset = upstream_partitions_def.empty_subset()\n if downstream_partitions_subset is None:\n return UpstreamPartitionsResult(upstream_subset, [])\n\n upstream_keys = set()\n for key in downstream_partitions_subset.get_partition_keys():\n upstream_keys.update(self._inverse_mapping[key])\n\n return 
UpstreamPartitionsResult(upstream_subset.with_partition_keys(upstream_keys), [])
\n\n\nclass InferSingleToMultiDimensionDepsResult(\n NamedTuple(\n "_InferSingleToMultiDimensionDepsResult",\n [\n ("can_infer", bool),\n ("inference_failure_reason", Optional[str]),\n ("dimension_dependency", Optional[DimensionDependency]),\n ],\n )\n):\n def __new__(\n cls,\n can_infer: bool,\n inference_failure_reason: Optional[str] = None,\n dimension_dependency: Optional[DimensionDependency] = None,\n ):\n if can_infer and dimension_dependency is None:\n check.failed("dimension_dependency must be provided if can_infer is True")\n if not can_infer and inference_failure_reason is None:\n check.failed("inference_failure_reason must be provided if can_infer is False")\n\n return super(InferSingleToMultiDimensionDepsResult, cls).__new__(\n cls,\n can_infer,\n inference_failure_reason,\n dimension_dependency,\n )\n\n\ndef _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n partition_dimension_name: Optional[str] = None,\n) -> InferSingleToMultiDimensionDepsResult:\n from dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\n\n upstream_is_multipartitioned = isinstance(upstream_partitions_def, MultiPartitionsDefinition)\n\n multipartitions_defs = [\n partitions_def\n for partitions_def in [upstream_partitions_def, downstream_partitions_def]\n if isinstance(partitions_def, MultiPartitionsDefinition)\n ]\n if len(multipartitions_defs) != 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n "Can only use MultiToSingleDimensionPartitionMapping when upstream asset is"\n " multipartitioned and the downstream asset is single dimensional, or vice versa."\n f" Instead received {len(multipartitions_defs)} multi-partitioned assets.",\n )\n\n multipartitions_def = cast(MultiPartitionsDefinition, next(iter(multipartitions_defs)))\n\n single_dimension_partitions_def = next(\n iter(\n {\n upstream_partitions_def,\n 
downstream_partitions_def,\n }\n - set(multipartitions_defs)\n )\n )\n\n filtered_multipartition_dims = (\n multipartitions_def.partitions_defs\n if partition_dimension_name is None\n else [\n dim\n for dim in multipartitions_def.partitions_defs\n if dim.name == partition_dimension_name\n ]\n )\n\n if partition_dimension_name:\n if len(filtered_multipartition_dims) != 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n f"Provided partition dimension name {partition_dimension_name} not found in"\n f" multipartitions definition {multipartitions_def}.",\n )\n\n matching_dimension_defs = [\n dimension_def\n for dimension_def in filtered_multipartition_dims\n if dimension_def.partitions_def == single_dimension_partitions_def\n ]\n\n if len(matching_dimension_defs) == 1:\n return InferSingleToMultiDimensionDepsResult(\n True,\n dimension_dependency=DimensionDependency(\n IdentityPartitionMapping(),\n upstream_dimension_name=(\n matching_dimension_defs[0].name if upstream_is_multipartitioned else None\n ),\n downstream_dimension_name=(\n matching_dimension_defs[0].name if not upstream_is_multipartitioned else None\n ),\n ),\n )\n elif len(matching_dimension_defs) > 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n "partition dimension name must be specified when multiple dimensions of the"\n " MultiPartitionsDefinition match the single dimension partitions def",\n )\n\n time_dimensions = [\n dimension_def\n for dimension_def in filtered_multipartition_dims\n if isinstance(dimension_def.partitions_def, TimeWindowPartitionsDefinition)\n ]\n\n if len(time_dimensions) == 1 and isinstance(\n single_dimension_partitions_def, TimeWindowPartitionsDefinition\n ):\n return InferSingleToMultiDimensionDepsResult(\n True,\n dimension_dependency=DimensionDependency(\n TimeWindowPartitionMapping(),\n upstream_dimension_name=(\n time_dimensions[0].name if upstream_is_multipartitioned else None\n ),\n downstream_dimension_name=(\n time_dimensions[0].name if not 
upstream_is_multipartitioned else None\n ),\n ),\n )\n\n return InferSingleToMultiDimensionDepsResult(\n False,\n "MultiToSingleDimensionPartitionMapping can only be used when: \\n(a) The single dimensional"\n " partitions definition is a dimension of the MultiPartitionsDefinition.\\n(b) The single"\n " dimensional partitions definition is a TimeWindowPartitionsDefinition and the"\n " MultiPartitionsDefinition has a single time dimension.",\n )\n\n\ndef infer_partition_mapping(\n partition_mapping: Optional[PartitionMapping],\n downstream_partitions_def: Optional[PartitionsDefinition],\n upstream_partitions_def: Optional[PartitionsDefinition],\n) -> PartitionMapping:\n from .time_window_partition_mapping import TimeWindowPartitionMapping\n\n if partition_mapping is not None:\n return partition_mapping\n elif upstream_partitions_def and downstream_partitions_def:\n if _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def, downstream_partitions_def\n ).can_infer:\n with disable_dagster_warnings():\n return MultiToSingleDimensionPartitionMapping()\n elif isinstance(upstream_partitions_def, TimeWindowPartitionsDefinition) and isinstance(\n downstream_partitions_def, TimeWindowPartitionsDefinition\n ):\n return TimeWindowPartitionMapping()\n else:\n return IdentityPartitionMapping()\n else:\n return AllPartitionMapping()\n\n\ndef get_builtin_partition_mapping_types() -> Tuple[Type[PartitionMapping], ...]:\n from dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\n\n return (\n AllPartitionMapping,\n IdentityPartitionMapping,\n LastPartitionMapping,\n SpecificPartitionsPartitionMapping,\n StaticPartitionMapping,\n TimeWindowPartitionMapping,\n MultiToSingleDimensionPartitionMapping,\n MultiPartitionMapping,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/partition_mapping", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition_mapping"}, "partitioned_schedule": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partitioned_schedule

\nfrom typing import Callable, Mapping, NamedTuple, Optional, Union, cast\n\nimport dagster._check as check\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .decorators.schedule_decorator import schedule\nfrom .job_definition import JobDefinition\nfrom .multi_dimensional_partitions import MultiPartitionsDefinition\nfrom .partition import PartitionsDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .schedule_definition import (\n    DefaultScheduleStatus,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n)\nfrom .time_window_partitions import (\n    TimeWindowPartitionsDefinition,\n    get_time_partitions_def,\n    has_one_dimension_time_window_partitioning,\n)\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\n\nclass UnresolvedPartitionedAssetScheduleDefinition(NamedTuple):\n    """Points to an unresolved asset job. The asset selection isn't resolved yet, so we can't resolve\n    the PartitionsDefinition, so we can't resolve the schedule cadence.\n    """\n\n    name: str\n    job: UnresolvedAssetJobDefinition\n    description: Optional[str]\n    default_status: DefaultScheduleStatus\n    minute_of_hour: Optional[int]\n    hour_of_day: Optional[int]\n    day_of_week: Optional[int]\n    day_of_month: Optional[int]\n    tags: Optional[Mapping[str, str]]\n\n    def resolve(self, resolved_job: JobDefinition) -> ScheduleDefinition:\n        partitions_def = resolved_job.partitions_def\n        if partitions_def is None:\n            check.failed(\n                f"Job '{resolved_job.name}' provided to build_schedule_from_partitioned_job must"\n                " contain partitioned assets or a partitions definition."\n            )\n\n        partitions_def = _check_valid_schedule_partitions_def(partitions_def)\n        time_partitions_def = check.not_none(get_time_partitions_def(partitions_def))\n\n        return ScheduleDefinition(\n            job=resolved_job,\n        
    name=self.name,\n            execution_fn=_get_schedule_evaluation_fn(partitions_def, resolved_job, self.tags),\n            execution_timezone=time_partitions_def.timezone,\n            cron_schedule=time_partitions_def.get_cron_schedule(\n                self.minute_of_hour, self.hour_of_day, self.day_of_week, self.day_of_month\n            ),\n        )\n\n\n
[docs]def build_schedule_from_partitioned_job(\n job: Union[JobDefinition, UnresolvedAssetJobDefinition],\n description: Optional[str] = None,\n name: Optional[str] = None,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n tags: Optional[Mapping[str, str]] = None,\n) -> Union[UnresolvedPartitionedAssetScheduleDefinition, ScheduleDefinition]:\n """Creates a schedule from a time window-partitioned job or a job that targets\n time window-partitioned assets. The job can also be multipartitioned, as long as one\n of the partitions dimensions is time-partitioned.\n\n The schedule executes at the cadence specified by the time partitioning of the job or assets.\n\n Examples:\n .. code-block:: python\n\n ######################################\n # Job that targets partitioned assets\n ######################################\n\n from dagster import (\n DailyPartitionsDefinition,\n asset,\n build_schedule_from_partitioned_job,\n define_asset_job,\n )\n\n @asset(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\n def asset1():\n ...\n\n asset1_job = define_asset_job("asset1_job", selection=[asset1])\n\n # The created schedule will fire daily\n asset1_job_schedule = build_schedule_from_partitioned_job(asset1_job)\n\n defs = Definitions(assets=[asset1], schedules=[asset1_job_schedule])\n\n ################\n # Non-asset job\n ################\n\n from dagster import DailyPartitionsDefinition, build_schedule_from_partitioned_job, jog\n\n\n @job(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\n def do_stuff_partitioned():\n ...\n\n # The created schedule will fire daily\n do_stuff_partitioned_schedule = build_schedule_from_partitioned_job(\n do_stuff_partitioned,\n )\n\n defs = Definitions(schedules=[do_stuff_partitioned_schedule])\n """\n check.invariant(\n not 
(day_of_week and day_of_month),\n "Cannot provide both day_of_month and day_of_week parameter to"\n " build_schedule_from_partitioned_job.",\n )\n\n if isinstance(job, UnresolvedAssetJobDefinition) and job.partitions_def is None:\n return UnresolvedPartitionedAssetScheduleDefinition(\n job=job,\n default_status=default_status,\n name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n description=check.opt_str_param(description, "description"),\n minute_of_hour=minute_of_hour,\n hour_of_day=hour_of_day,\n day_of_week=day_of_week,\n day_of_month=day_of_month,\n tags=tags,\n )\n else:\n partitions_def = job.partitions_def\n if partitions_def is None:\n check.failed("The provided job is not partitioned")\n\n partitions_def = _check_valid_schedule_partitions_def(partitions_def)\n time_partitions_def = check.not_none(get_time_partitions_def(partitions_def))\n\n return schedule(\n cron_schedule=time_partitions_def.get_cron_schedule(\n minute_of_hour, hour_of_day, day_of_week, day_of_month\n ),\n job=job,\n default_status=default_status,\n execution_timezone=time_partitions_def.timezone,\n name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n description=check.opt_str_param(description, "description"),\n )(_get_schedule_evaluation_fn(partitions_def, job, tags))
\n\n\ndef _get_schedule_evaluation_fn(\n partitions_def: PartitionsDefinition,\n job: Union[JobDefinition, UnresolvedAssetJobDefinition],\n tags: Optional[Mapping[str, str]] = None,\n) -> Callable[[ScheduleEvaluationContext], Union[SkipReason, RunRequest, RunRequestIterator]]:\n def schedule_fn(context):\n # Run for the latest partition. Prior partitions will have been handled by prior ticks.\n if isinstance(partitions_def, TimeWindowPartitionsDefinition):\n partition_key = partitions_def.get_last_partition_key(context.scheduled_execution_time)\n if partition_key is None:\n return SkipReason("The job's PartitionsDefinition has no partitions")\n\n return job.run_request_for_partition(\n partition_key=partition_key,\n run_key=partition_key,\n tags=tags,\n current_time=context.scheduled_execution_time,\n )\n else:\n check.invariant(isinstance(partitions_def, MultiPartitionsDefinition))\n time_window_dimension = partitions_def.time_window_dimension\n partition_key = time_window_dimension.partitions_def.get_last_partition_key(\n context.scheduled_execution_time\n )\n if partition_key is None:\n return SkipReason("The job's PartitionsDefinition has no partitions")\n\n return [\n job.run_request_for_partition(\n partition_key=key,\n run_key=key,\n tags=tags,\n current_time=context.scheduled_execution_time,\n dynamic_partitions_store=context.instance if context.instance_ref else None,\n )\n for key in partitions_def.get_multipartition_keys_with_dimension_value(\n time_window_dimension.name,\n partition_key,\n dynamic_partitions_store=context.instance if context.instance_ref else None,\n )\n ]\n\n return schedule_fn\n\n\ndef _check_valid_schedule_partitions_def(\n partitions_def: PartitionsDefinition,\n) -> Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition]:\n if not has_one_dimension_time_window_partitioning(partitions_def):\n raise DagsterInvalidDefinitionError(\n "Tried to build a partitioned schedule from an asset job, but received an invalid"\n " 
partitions definition. The permitted partitions definitions are: \\n1."\n " TimeWindowPartitionsDefinition\\n2. MultiPartitionsDefinition with a single"\n " TimeWindowPartitionsDefinition dimension"\n )\n\n return cast(Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def)\n\n\nschedule_from_partitions = build_schedule_from_partitioned_job\n
", "current_page_name": "_modules/dagster/_core/definitions/partitioned_schedule", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partitioned_schedule"}, "policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.policy

\nfrom enum import Enum\nfrom random import random\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\n\n
[docs]class Backoff(Enum):\n """A modifier for delay as a function of attempt number.\n\n LINEAR: `attempt_num * delay`\n EXPONENTIAL: `((2 ^ attempt_num) - 1) * delay`\n """\n\n LINEAR = "LINEAR"\n EXPONENTIAL = "EXPONENTIAL"
\n\n\n
[docs]class Jitter(Enum):\n """A randomizing modifier for delay, applied after backoff calculation.\n\n FULL: between 0 and the calculated delay based on backoff: `random() * backoff_delay`\n PLUS_MINUS: +/- the delay: `backoff_delay + ((2 * (random() * delay)) - delay)`\n """\n\n FULL = "FULL"\n PLUS_MINUS = "PLUS_MINUS"
\n\n\n
[docs]class RetryPolicy(\n NamedTuple(\n "_RetryPolicy",\n [\n ("max_retries", PublicAttr[int]),\n ("delay", PublicAttr[Optional[check.Numeric]]),\n # declarative time modulation to allow calc witout running user function\n ("backoff", PublicAttr[Optional[Backoff]]),\n ("jitter", PublicAttr[Optional[Jitter]]),\n ],\n ),\n):\n """A declarative policy for when to request retries when an exception occurs during op execution.\n\n Args:\n max_retries (int):\n The maximum number of retries to attempt. Defaults to 1.\n delay (Optional[Union[int,float]]):\n The time in seconds to wait between the retry being requested and the next attempt\n being started. This unit of time can be modulated as a function of attempt number\n with backoff and randomly with jitter.\n backoff (Optional[Backoff]):\n A modifier for delay as a function of retry attempt number.\n jitter (Optional[Jitter]):\n A randomizing modifier for delay, applied after backoff calculation.\n """\n\n def __new__(\n cls,\n max_retries: int = 1,\n delay: Optional[check.Numeric] = None,\n backoff: Optional[Backoff] = None,\n jitter: Optional[Jitter] = None,\n ):\n if backoff is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set jitter on RetryPolicy without also setting delay"\n )\n\n if jitter is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set backoff on RetryPolicy without also setting delay"\n )\n\n return super().__new__(\n cls,\n max_retries=check.int_param(max_retries, "max_retries"),\n delay=check.opt_numeric_param(delay, "delay"),\n backoff=check.opt_inst_param(backoff, "backoff", Backoff),\n jitter=check.opt_inst_param(jitter, "jitter", Jitter),\n )\n\n def calculate_delay(self, attempt_num: int) -> check.Numeric:\n return calculate_delay(\n attempt_num=attempt_num,\n backoff=self.backoff,\n jitter=self.jitter,\n base_delay=self.delay or 0,\n )
\n\n\ndef calculate_delay(\n attempt_num: int, backoff: Optional[Backoff], jitter: Optional[Jitter], base_delay: float\n) -> float:\n if backoff is Backoff.EXPONENTIAL:\n calc_delay = ((2**attempt_num) - 1) * base_delay\n elif backoff is Backoff.LINEAR:\n calc_delay = base_delay * attempt_num\n elif backoff is None:\n calc_delay = base_delay\n else:\n check.assert_never(backoff)\n\n if jitter is Jitter.FULL:\n calc_delay = random() * calc_delay\n elif jitter is Jitter.PLUS_MINUS:\n calc_delay = calc_delay + ((2 * (random() * base_delay)) - base_delay)\n elif jitter is None:\n pass\n else:\n check.assert_never(jitter)\n\n return calc_delay\n
", "current_page_name": "_modules/dagster/_core/definitions/policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.policy"}, "reconstruct": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.reconstruct

\nimport inspect\nimport json\nimport os\nimport sys\nfrom functools import lru_cache\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import Self, TypeAlias\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import experimental\nfrom dagster._core.code_pointer import (\n    CodePointer,\n    CustomPointer,\n    FileCodePointer,\n    ModuleCodePointer,\n    get_python_file_from_target,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.origin import (\n    DEFAULT_DAGSTER_ENTRY_POINT,\n    JobPythonOrigin,\n    RepositoryPythonOrigin,\n)\nfrom dagster._serdes import pack_value, unpack_value, whitelist_for_serdes\nfrom dagster._serdes.serdes import NamedTupleSerializer\nfrom dagster._utils import hash_collection\n\nfrom .events import AssetKey\nfrom .job_base import IJob\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.repository_definition import (\n        PendingRepositoryDefinition,\n        RepositoryLoadData,\n    )\n    from dagster._core.definitions.source_asset import SourceAsset\n\n    from .graph_definition import GraphDefinition\n    from .repository_definition import RepositoryDefinition\n\n\ndef get_ephemeral_repository_name(job_name: str) -> str:\n    check.str_param(job_name, "job_name")\n    return f"__repository__{job_name}"\n\n\n@whitelist_for_serdes\nclass ReconstructableRepository(\n    NamedTuple(\n        "_ReconstructableRepository",\n        [\n            ("pointer", CodePointer),\n            ("container_image", Optional[str]),\n         
   ("executable_path", Optional[str]),\n            ("entry_point", Sequence[str]),\n            ("container_context", Optional[Mapping[str, Any]]),\n            ("repository_load_data", Optional["RepositoryLoadData"]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        pointer: CodePointer,\n        container_image: Optional[str] = None,\n        executable_path: Optional[str] = None,\n        entry_point: Optional[Sequence[str]] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n        repository_load_data: Optional["RepositoryLoadData"] = None,\n    ):\n        from dagster._core.definitions.repository_definition import RepositoryLoadData\n\n        return super(ReconstructableRepository, cls).__new__(\n            cls,\n            pointer=check.inst_param(pointer, "pointer", CodePointer),\n            container_image=check.opt_str_param(container_image, "container_image"),\n            executable_path=check.opt_str_param(executable_path, "executable_path"),\n            entry_point=(\n                check.sequence_param(entry_point, "entry_point", of_type=str)\n                if entry_point is not None\n                else DEFAULT_DAGSTER_ENTRY_POINT\n            ),\n            container_context=(\n                check.mapping_param(container_context, "container_context")\n                if container_context is not None\n                else None\n            ),\n            repository_load_data=check.opt_inst_param(\n                repository_load_data, "repository_load_data", RepositoryLoadData\n            ),\n        )\n\n    def with_repository_load_data(\n        self, metadata: Optional["RepositoryLoadData"]\n    ) -> "ReconstructableRepository":\n        return self._replace(repository_load_data=metadata)\n\n    def get_definition(self) -> "RepositoryDefinition":\n        return repository_def_from_pointer(self.pointer, self.repository_load_data)\n\n    def get_reconstructable_job(self, name: str) -> 
"ReconstructableJob":\n        return ReconstructableJob(self, name)\n\n    @classmethod\n    def for_file(\n        cls,\n        file: str,\n        fn_name: str,\n        working_directory: Optional[str] = None,\n        container_image: Optional[str] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n    ) -> "ReconstructableRepository":\n        if not working_directory:\n            working_directory = os.getcwd()\n        return cls(\n            FileCodePointer(file, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    @classmethod\n    def for_module(\n        cls,\n        module: str,\n        fn_name: str,\n        working_directory: Optional[str] = None,\n        container_image: Optional[str] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n    ) -> "ReconstructableRepository":\n        return cls(\n            ModuleCodePointer(module, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    def get_python_origin(self) -> RepositoryPythonOrigin:\n        return RepositoryPythonOrigin(\n            executable_path=self.executable_path if self.executable_path else sys.executable,\n            code_pointer=self.pointer,\n            container_image=self.container_image,\n            entry_point=self.entry_point,\n            container_context=self.container_context,\n        )\n\n    def get_python_origin_id(self) -> str:\n        return self.get_python_origin().get_id()\n\n    # Allow this to be hashed for use in `lru_cache`. 
This is needed because:\n    # - `ReconstructableJob` uses `lru_cache`\n    # - `ReconstructableJob` has a `ReconstructableRepository` attribute\n    # - `ReconstructableRepository` has `Sequence` attributes that are unhashable by default\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\nclass ReconstructableJobSerializer(NamedTupleSerializer):\n    def before_unpack(self, _, unpacked_dict: Dict[str, Any]) -> Dict[str, Any]:\n        solid_selection_str = unpacked_dict.get("solid_selection_str")\n        solids_to_execute = unpacked_dict.get("solids_to_execute")\n        if solid_selection_str:\n            unpacked_dict["op_selection"] = json.loads(solid_selection_str)\n        elif solids_to_execute:\n            unpacked_dict["op_selection"] = solids_to_execute\n        return unpacked_dict\n\n    def after_pack(self, **packed_dict: Any) -> Dict[str, Any]:\n        if packed_dict["op_selection"]:\n            packed_dict["solid_selection_str"] = json.dumps(packed_dict["op_selection"]["__set__"])\n        else:\n            packed_dict["solid_selection_str"] = None\n        del packed_dict["op_selection"]\n        return packed_dict\n\n\n@whitelist_for_serdes(\n    serializer=ReconstructableJobSerializer,\n    storage_name="ReconstructablePipeline",\n    storage_field_names={\n        "job_name": "pipeline_name",\n    },\n)\nclass ReconstructableJob(\n    NamedTuple(\n        "_ReconstructableJob",\n        [\n            ("repository", ReconstructableRepository),\n            ("job_name", str),\n            ("op_selection", Optional[AbstractSet[str]]),\n            ("asset_selection", Optional[AbstractSet[AssetKey]]),\n            ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n        ],\n    ),\n    IJob,\n):\n    """Defines a reconstructable job. 
When your job must cross process boundaries, Dagster must know\n    how to reconstruct the job on the other side of the process boundary.\n\n    Args:\n        repository (ReconstructableRepository): The reconstructable representation of the repository\n            the job belongs to.\n        job_name (str): The name of the job.\n        op_selection (Optional[AbstractSet[str]]): A set of op query strings. Ops matching any of\n            these queries will be selected. None if no selection is specified.\n        asset_selection (Optional[AbstractSet[AssetKey]]) A set of assets to execute. None if no selection\n            is specified, i.e. the entire job will be run.\n    """\n\n    def __new__(\n        cls,\n        repository: ReconstructableRepository,\n        job_name: str,\n        op_selection: Optional[Iterable[str]] = None,\n        asset_selection: Optional[AbstractSet[AssetKey]] = None,\n        asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n    ):\n        op_selection = set(op_selection) if op_selection else None\n        return super(ReconstructableJob, cls).__new__(\n            cls,\n            repository=check.inst_param(repository, "repository", ReconstructableRepository),\n            job_name=check.str_param(job_name, "job_name"),\n            op_selection=check.opt_nullable_set_param(op_selection, "op_selection", of_type=str),\n            asset_selection=check.opt_nullable_set_param(\n                asset_selection, "asset_selection", AssetKey\n            ),\n            asset_check_selection=check.opt_nullable_set_param(\n                asset_check_selection, "asset_check_selection", AssetCheckKey\n            ),\n        )\n\n    def with_repository_load_data(\n        self, metadata: Optional["RepositoryLoadData"]\n    ) -> "ReconstructableJob":\n        return self._replace(repository=self.repository.with_repository_load_data(metadata))\n\n    # Keep the most recent 1 definition (globally since this is a 
NamedTuple method)\n    # This allows repeated calls to get_definition in execution paths to not reload the job\n    @lru_cache(maxsize=1)\n    def get_definition(self) -> "JobDefinition":\n        return self.repository.get_definition().get_maybe_subset_job_def(\n            self.job_name,\n            self.op_selection,\n            self.asset_selection,\n            self.asset_check_selection,\n        )\n\n    def get_reconstructable_repository(self) -> ReconstructableRepository:\n        return self.repository\n\n    def get_subset(\n        self,\n        *,\n        op_selection: Optional[Iterable[str]] = None,\n        asset_selection: Optional[AbstractSet[AssetKey]] = None,\n        asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n    ) -> Self:\n        if op_selection and (asset_selection or asset_check_selection):\n            check.failed(\n                "op_selection and asset_selection or asset_check_selection cannot both be provided"\n                " as arguments",\n            )\n        op_selection = set(op_selection) if op_selection else None\n        return ReconstructableJob(\n            repository=self.repository,\n            job_name=self.job_name,\n            op_selection=op_selection,\n            asset_selection=asset_selection,\n            asset_check_selection=asset_check_selection,\n        )\n\n    def describe(self) -> str:\n        return f'"{self.job_name}" in repository ({self.repository.pointer.describe})'\n\n    @staticmethod\n    def for_file(python_file: str, fn_name: str) -> "ReconstructableJob":\n        return bootstrap_standalone_recon_job(FileCodePointer(python_file, fn_name, os.getcwd()))\n\n    @staticmethod\n    def for_module(module: str, fn_name: str) -> "ReconstructableJob":\n        return bootstrap_standalone_recon_job(ModuleCodePointer(module, fn_name, os.getcwd()))\n\n    def to_dict(self) -> Mapping[str, object]:\n        return pack_value(self)\n\n    @staticmethod\n    def 
from_dict(val: Mapping[str, Any]) -> "ReconstructableJob":\n        check.mapping_param(val, "val")\n\n        inst = unpack_value(val)\n        check.invariant(\n            isinstance(inst, ReconstructableJob),\n            f"Deserialized object is not instance of ReconstructableJob, got {type(inst)}",\n        )\n        return inst  # type: ignore  # (illegible runtime check)\n\n    def get_python_origin(self) -> JobPythonOrigin:\n        return JobPythonOrigin(self.job_name, self.repository.get_python_origin())\n\n    def get_python_origin_id(self) -> str:\n        return self.get_python_origin().get_id()\n\n    def get_module(self) -> Optional[str]:\n        """Return the module the job is found in, the origin is a module code pointer."""\n        pointer = self.get_python_origin().get_repo_pointer()\n        if isinstance(pointer, ModuleCodePointer):\n            return pointer.module\n\n        return None\n\n    # Allow this to be hashed for `lru_cache` in `get_definition`\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\n
[docs]def reconstructable(target: Callable[..., "JobDefinition"]) -> ReconstructableJob:\n """Create a :py:class:`~dagster._core.definitions.reconstructable.ReconstructableJob` from a\n function that returns a :py:class:`~dagster.JobDefinition`/:py:class:`~dagster.JobDefinition`,\n or a function decorated with :py:func:`@job <dagster.job>`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or\n in different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n Passing a job created with ``~dagster.GraphDefinition.to_job`` to ``reconstructable()``,\n requires you to wrap that job's definition in a module-scoped function, and pass that function\n instead:\n\n .. code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def my_graph():\n ...\n\n def define_my_job():\n return my_graph.to_job()\n\n reconstructable(define_my_job)\n\n This function implements a very conservative strategy for reconstruction, so that its behavior\n is easy to predict, but as a consequence it is not able to reconstruct certain kinds of jobs\n or jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\n call), or in interactive environments such as the Python REPL or Jupyter notebooks.\n\n If you need to reconstruct objects constructed in these ways, you should use\n :py:func:`~dagster.reconstructable.build_reconstructable_job` instead, which allows you to\n specify your own reconstruction strategy.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import job, reconstructable\n\n @job\n def foo_job():\n ...\n\n reconstructable_foo_job = reconstructable(foo_job)\n\n\n @graph\n def foo():\n ...\n\n def make_bar_job():\n return foo.to_job()\n\n reconstructable_bar_job = reconstructable(make_bar_job)\n """\n from dagster._core.definitions import JobDefinition\n\n if not seven.is_function_or_decorator_instance_of(target, JobDefinition):\n if isinstance(target, JobDefinition):\n raise DagsterInvariantViolationError(\n "Reconstructable target was not a function returning a job definition, or a job "\n "definition produced by a decorated function. If your job was constructed using "\n "``GraphDefinition.to_job``, you must wrap the ``to_job`` call in a function at "\n "module scope, ie not within any other functions. "\n "To learn more, check out the docs on ``reconstructable``: "\n "https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n )\n raise DagsterInvariantViolationError(\n "Reconstructable target should be a function or definition produced "\n f"by a decorated function, got {type(target)}.",\n )\n\n if seven.is_lambda(target):\n raise DagsterInvariantViolationError(\n "Reconstructable target can not be a lambda. Use a function or "\n "decorated function defined at module scope instead, or use "\n "build_reconstructable_job."\n )\n\n if seven.qualname_differs(target):\n raise DagsterInvariantViolationError(\n f'Reconstructable target "{target.__name__}" has a different '\n f'__qualname__ "{target.__qualname__}" indicating it is not '\n "defined at module scope. 
Use a function or decorated function "\n "defined at module scope instead, or use build_reconstructable_job."\n )\n\n try:\n if (\n hasattr(target, "__module__")\n and hasattr(target, "__name__")\n and getattr(inspect.getmodule(target), "__name__", None) != "__main__"\n ):\n return ReconstructableJob.for_module(target.__module__, target.__name__)\n except:\n pass\n\n python_file = get_python_file_from_target(target)\n if not python_file:\n raise DagsterInvariantViolationError(\n "reconstructable() can not reconstruct jobs defined in interactive "\n "environments like <stdin>, IPython, or Jupyter notebooks. "\n "Use a job defined in a module or file instead, or use build_reconstructable_job."\n )\n\n pointer = FileCodePointer(\n python_file=python_file, fn_name=target.__name__, working_directory=os.getcwd()\n )\n\n return bootstrap_standalone_recon_job(pointer)
\n\n\n
[docs]@experimental\ndef build_reconstructable_job(\n reconstructor_module_name: str,\n reconstructor_function_name: str,\n reconstructable_args: Optional[Tuple[object]] = None,\n reconstructable_kwargs: Optional[Mapping[str, object]] = None,\n reconstructor_working_directory: Optional[str] = None,\n) -> ReconstructableJob:\n """Create a :py:class:`dagster._core.definitions.reconstructable.ReconstructableJob`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or in\n different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n This function allows you to use the strategy of your choice for reconstructing jobs, so\n that you can reconstruct certain kinds of jobs that are not supported by\n :py:func:`~dagster.reconstructable`, such as those defined by lambdas, in nested scopes (e.g.,\n dynamically within a method call), or in interactive environments such as the Python REPL or\n Jupyter notebooks.\n\n If you need to reconstruct jobs constructed in these ways, use this function instead of\n :py:func:`~dagster.reconstructable`.\n\n Args:\n reconstructor_module_name (str): The name of the module containing the function to use to\n reconstruct the job.\n reconstructor_function_name (str): The name of the function to use to reconstruct the\n job.\n reconstructable_args (Tuple): Args to the function to use to reconstruct the job.\n Values of the tuple must be JSON serializable.\n reconstructable_kwargs (Dict[str, Any]): Kwargs to the function to use to reconstruct the\n job. Values of the dict must be JSON serializable.\n\n Examples:\n .. 
code-block:: python\n\n # module: mymodule\n\n from dagster import JobDefinition, job, build_reconstructable_job\n\n class JobFactory:\n def make_job(*args, **kwargs):\n\n @job\n def _job(...):\n ...\n\n return _job\n\n def reconstruct_job(*args):\n factory = JobFactory()\n return factory.make_job(*args)\n\n factory = JobFactory()\n\n foo_job_args = (...,...)\n\n foo_job_kwargs = {...:...}\n\n foo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\n reconstructable_foo_job = build_reconstructable_job(\n 'mymodule',\n 'reconstruct_job',\n foo_job_args,\n foo_job_kwargs,\n )\n """\n check.str_param(reconstructor_module_name, "reconstructor_module_name")\n check.str_param(reconstructor_function_name, "reconstructor_function_name")\n check.opt_str_param(\n reconstructor_working_directory, "reconstructor_working_directory", os.getcwd()\n )\n\n _reconstructable_args: List[object] = list(\n check.opt_tuple_param(reconstructable_args, "reconstructable_args")\n )\n _reconstructable_kwargs: List[List[Union[str, object]]] = list(\n (\n [key, value]\n for key, value in check.opt_mapping_param(\n reconstructable_kwargs, "reconstructable_kwargs", key_type=str\n ).items()\n )\n )\n\n reconstructor_pointer = ModuleCodePointer(\n reconstructor_module_name,\n reconstructor_function_name,\n working_directory=reconstructor_working_directory,\n )\n\n pointer = CustomPointer(reconstructor_pointer, _reconstructable_args, _reconstructable_kwargs)\n\n job_def = job_def_from_pointer(pointer)\n\n return ReconstructableJob(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n job_name=job_def.name,\n )
\n\n\ndef bootstrap_standalone_recon_job(pointer: CodePointer) -> ReconstructableJob:\n # So this actually straps the the job for the sole\n # purpose of getting the job name. If we changed ReconstructableJob\n # to get the job on demand in order to get name, we could avoid this.\n job_def = job_def_from_pointer(pointer)\n return ReconstructableJob(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n job_name=job_def.name,\n )\n\n\nLoadableDefinition: TypeAlias = Union[\n "JobDefinition",\n "RepositoryDefinition",\n "PendingRepositoryDefinition",\n "GraphDefinition",\n "Sequence[Union[AssetsDefinition, SourceAsset]]",\n]\n\nT_LoadableDefinition = TypeVar("T_LoadableDefinition", bound=LoadableDefinition)\n\n\ndef _is_list_of_assets(\n definition: LoadableDefinition,\n) -> bool:\n from dagster._core.definitions.assets import AssetsDefinition\n from dagster._core.definitions.source_asset import SourceAsset\n\n return isinstance(definition, list) and all(\n isinstance(item, (AssetsDefinition, SourceAsset)) for item in definition\n )\n\n\ndef _check_is_loadable(definition: T_LoadableDefinition) -> T_LoadableDefinition:\n from .definitions_class import Definitions\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import PendingRepositoryDefinition, RepositoryDefinition\n\n if not (\n isinstance(\n definition,\n (\n JobDefinition,\n RepositoryDefinition,\n PendingRepositoryDefinition,\n GraphDefinition,\n Definitions,\n ),\n )\n or _is_list_of_assets(definition)\n ):\n raise DagsterInvariantViolationError(\n "Loadable attributes must be either a JobDefinition, GraphDefinition, "\n f"or RepositoryDefinition. 
Got {definition!r}."\n )\n return definition\n\n\ndef load_def_in_module(\n module_name: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(CodePointer.from_module(module_name, attribute, working_directory))\n\n\ndef load_def_in_package(\n package_name: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(\n CodePointer.from_python_package(package_name, attribute, working_directory)\n )\n\n\ndef load_def_in_python_file(\n python_file: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(CodePointer.from_python_file(python_file, attribute, working_directory))\n\n\ndef def_from_pointer(\n pointer: CodePointer,\n) -> LoadableDefinition:\n target = pointer.load_target()\n\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import PendingRepositoryDefinition, RepositoryDefinition\n\n if isinstance(\n target,\n (\n GraphDefinition,\n JobDefinition,\n PendingRepositoryDefinition,\n RepositoryDefinition,\n ),\n ) or not callable(target):\n return _check_is_loadable(target) # type: ignore\n\n # if its a function invoke it - otherwise we are pointing to a\n # artifact in module scope, likely decorator output\n\n if seven.get_arg_names(target):\n raise DagsterInvariantViolationError(\n f"Error invoking function at {pointer.describe()} with no arguments. "\n "Reconstructable target must be callable with no arguments"\n )\n\n return _check_is_loadable(target())\n\n\ndef job_def_from_pointer(pointer: CodePointer) -> "JobDefinition":\n from .job_definition import JobDefinition\n\n target = def_from_pointer(pointer)\n\n if isinstance(target, JobDefinition):\n return target\n\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a JobDefinition (or JobDefinition for legacy"\n " code). 
Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n\n\n@overload\ndef repository_def_from_target_def(\n target: Union["RepositoryDefinition", "JobDefinition", "GraphDefinition"],\n repository_load_data: Optional["RepositoryLoadData"] = None,\n) -> "RepositoryDefinition": ...\n\n\n@overload\ndef repository_def_from_target_def(\n target: object, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> None: ...\n\n\ndef repository_def_from_target_def(\n target: object, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> Optional["RepositoryDefinition"]:\n from .assets import AssetsDefinition\n from .definitions_class import Definitions\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import (\n SINGLETON_REPOSITORY_NAME,\n CachingRepositoryData,\n PendingRepositoryDefinition,\n RepositoryDefinition,\n )\n from .source_asset import SourceAsset\n\n if isinstance(target, Definitions):\n # reassign to handle both repository and pending repo case\n target = target.get_inner_repository_for_loading_process()\n\n # special case - we can wrap a single job in a repository\n if isinstance(target, (JobDefinition, GraphDefinition)):\n # consider including job name in generated repo name\n return RepositoryDefinition(\n name=get_ephemeral_repository_name(target.name),\n repository_data=CachingRepositoryData.from_list([target]),\n )\n elif isinstance(target, list) and all(\n isinstance(item, (AssetsDefinition, SourceAsset)) for item in target\n ):\n return RepositoryDefinition(\n name=SINGLETON_REPOSITORY_NAME,\n repository_data=CachingRepositoryData.from_list(target),\n )\n elif isinstance(target, RepositoryDefinition):\n return target\n elif isinstance(target, PendingRepositoryDefinition):\n # must load repository from scratch\n if repository_load_data is None:\n return target.compute_repository_definition()\n # can use the cached data to more efficiently load 
data\n return target.reconstruct_repository_definition(repository_load_data)\n else:\n return None\n\n\ndef repository_def_from_pointer(\n pointer: CodePointer, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> "RepositoryDefinition":\n target = def_from_pointer(pointer)\n repo_def = repository_def_from_target_def(target, repository_load_data)\n if not repo_def:\n raise DagsterInvariantViolationError(\n f"CodePointer ({pointer.describe()}) must resolve to a "\n "RepositoryDefinition, JobDefinition, or JobDefinition. "\n f"Received a {type(target)}"\n )\n return repo_def\n
", "current_page_name": "_modules/dagster/_core/definitions/reconstruct", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.reconstruct"}, "repository_definition": {"repository_data": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.repository_definition.repository_data

\nfrom abc import ABC, abstractmethod\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Mapping,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.graph_definition import SubselectedGraphDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.schedule_definition import ScheduleDefinition\nfrom dagster._core.definitions.sensor_definition import SensorDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\n\nfrom .caching_index import CacheingDefinitionIndex\nfrom .valid_definitions import RepositoryListDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import AssetsDefinition\n\n\nT = TypeVar("T")\nResolvable = Callable[[], T]\n\n\n
[docs]class RepositoryData(ABC):\n """Users should usually rely on the :py:func:`@repository <repository>` decorator to create new\n repositories, which will in turn call the static constructors on this class. However, users may\n subclass :py:class:`RepositoryData` for fine-grained control over access to and lazy creation\n of repository members.\n """\n\n @abstractmethod\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n pass\n\n @abstractmethod\n def get_top_level_resources(self) -> Mapping[str, ResourceDefinition]:\n """Return all top-level resources in the repository as a list,\n such as those provided to the Definitions constructor.\n\n Returns:\n List[ResourceDefinition]: All top-level resources in the repository.\n """\n\n @abstractmethod\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n pass\n\n
[docs] @abstractmethod\n @public\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """
\n\n
[docs] @public\n def get_job_names(self) -> Sequence[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return [job_def.name for job_def in self.get_all_jobs()]
\n\n
[docs] @public\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n return job_name in self.get_job_names()
\n\n
[docs] @public\n def get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n match = next(job for job in self.get_all_jobs() if job.name == job_name)\n if match is None:\n raise DagsterInvariantViolationError(f"Could not find job {job_name} in repository")\n return match
\n\n
[docs] @public\n def get_schedule_names(self) -> Sequence[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return [schedule.name for schedule in self.get_all_schedules()]
\n\n
[docs] @public\n def get_all_schedules(self) -> Sequence[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Returns:\n List[ScheduleDefinition]: All jobs in the repository.\n """\n return []
\n\n
[docs] @public\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n Args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n schedules_with_name = [\n schedule for schedule in self.get_all_schedules() if schedule.name == schedule_name\n ]\n if not schedules_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find schedule {schedule_name} in repository"\n )\n return schedules_with_name[0]
\n\n
[docs] @public\n def has_schedule(self, schedule_name: str) -> bool:\n """Check if a schedule with a given name is present in the repository."""\n return schedule_name in self.get_schedule_names()
\n\n
[docs] @public\n def get_all_sensors(self) -> Sequence[SensorDefinition]:\n """Sequence[SensorDefinition]: Return all sensors in the repository as a list."""\n return []
\n\n
[docs] @public\n def get_sensor_names(self) -> Sequence[str]:\n """Sequence[str]: Get the names of all sensors in the repository."""\n return [sensor.name for sensor in self.get_all_sensors()]
\n\n
[docs] @public\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n """Get a sensor by name.\n\n Args:\n sensor_name (str): name of the sensor to retrieve.\n\n Returns:\n SensorDefinition: The sensor definition corresponding to the given name.\n """\n sensors_with_name = [\n sensor for sensor in self.get_all_sensors() if sensor.name == sensor_name\n ]\n if not sensors_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find sensor {sensor_name} in repository"\n )\n return sensors_with_name[0]
\n\n
[docs] @public\n def has_sensor(self, sensor_name: str) -> bool:\n """Check if a sensor with a given name is present in the repository."""\n return sensor_name in self.get_sensor_names()
\n\n
[docs] @public\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n """Mapping[AssetKey, SourceAsset]: Get the source assets for the repository."""\n return {}
\n\n
[docs] @public\n def get_assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n """Mapping[AssetKey, AssetsDefinition]: Get the asset definitions for the repository."""\n return {}
\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self.get_all_jobs()\n self.get_all_schedules()\n self.get_all_sensors()\n self.get_source_assets_by_key()
\n\n\nclass CachingRepositoryData(RepositoryData):\n """Default implementation of RepositoryData used by the :py:func:`@repository <repository>` decorator."""\n\n _all_jobs: Optional[Sequence[JobDefinition]]\n _all_pipelines: Optional[Sequence[JobDefinition]]\n\n def __init__(\n self,\n jobs: Mapping[str, Union[JobDefinition, Resolvable[JobDefinition]]],\n schedules: Mapping[str, Union[ScheduleDefinition, Resolvable[ScheduleDefinition]]],\n sensors: Mapping[str, Union[SensorDefinition, Resolvable[SensorDefinition]]],\n source_assets_by_key: Mapping[AssetKey, SourceAsset],\n assets_defs_by_key: Mapping[AssetKey, "AssetsDefinition"],\n top_level_resources: Mapping[str, ResourceDefinition],\n utilized_env_vars: Mapping[str, AbstractSet[str]],\n resource_key_mapping: Mapping[int, str],\n ):\n """Constructs a new CachingRepositoryData object.\n\n You may pass pipeline, job, and schedule definitions directly, or you may pass callables\n with no arguments that will be invoked to lazily construct definitions when accessed by\n name. 
This can be helpful for performance when there are many definitions in a repository,\n or when constructing the definitions is costly.\n\n Note that when lazily constructing a definition, the name of the definition must match its\n key in its dictionary index, or a :py:class:`DagsterInvariantViolationError` will be thrown\n at retrieval time.\n\n Args:\n jobs (Mapping[str, Union[JobDefinition, Callable[[], JobDefinition]]]):\n The job definitions belonging to the repository.\n schedules (Mapping[str, Union[ScheduleDefinition, Callable[[], ScheduleDefinition]]]):\n The schedules belonging to the repository.\n sensors (Mapping[str, Union[SensorDefinition, Callable[[], SensorDefinition]]]):\n The sensors belonging to a repository.\n source_assets_by_key (Mapping[AssetKey, SourceAsset]): The source assets belonging to a repository.\n assets_defs_by_key (Mapping[AssetKey, AssetsDefinition]): The assets definitions\n belonging to a repository.\n top_level_resources (Mapping[str, ResourceDefinition]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n """\n from dagster._core.definitions import AssetsDefinition\n\n check.mapping_param(jobs, "jobs", key_type=str, value_type=(JobDefinition, FunctionType))\n check.mapping_param(\n schedules, "schedules", key_type=str, value_type=(ScheduleDefinition, FunctionType)\n )\n check.mapping_param(\n sensors, "sensors", key_type=str, value_type=(SensorDefinition, FunctionType)\n )\n check.mapping_param(\n source_assets_by_key, "source_assets_by_key", key_type=AssetKey, value_type=SourceAsset\n )\n check.mapping_param(\n assets_defs_by_key, "assets_defs_by_key", key_type=AssetKey, value_type=AssetsDefinition\n )\n check.mapping_param(\n top_level_resources, "top_level_resources", key_type=str, value_type=ResourceDefinition\n )\n check.mapping_param(\n utilized_env_vars,\n "utilized_resources",\n key_type=str,\n )\n check.mapping_param(\n resource_key_mapping, "resource_key_mapping", 
key_type=int, value_type=str\n )\n\n self._jobs = CacheingDefinitionIndex(\n JobDefinition,\n "JobDefinition",\n "job",\n jobs,\n self._validate_job,\n )\n\n self._schedules = CacheingDefinitionIndex(\n ScheduleDefinition,\n "ScheduleDefinition",\n "schedule",\n schedules,\n self._validate_schedule,\n )\n # load all schedules to force validation\n self._schedules.get_all_definitions()\n\n self._source_assets_by_key = source_assets_by_key\n self._assets_defs_by_key = assets_defs_by_key\n self._top_level_resources = top_level_resources\n self._utilized_env_vars = utilized_env_vars\n self._resource_key_mapping = resource_key_mapping\n\n self._sensors = CacheingDefinitionIndex(\n SensorDefinition,\n "SensorDefinition",\n "sensor",\n sensors,\n self._validate_sensor,\n )\n # load all sensors to force validation\n self._sensors.get_all_definitions()\n\n self._all_jobs = None\n\n @staticmethod\n def from_dict(repository_definitions: Dict[str, Dict[str, Any]]) -> "CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definition (Dict[str, Dict[str, ...]]): A dict of the form:\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n """\n from .repository_data_builder import build_caching_repository_data_from_dict\n\n return build_caching_repository_data_from_dict(repository_definitions)\n\n @classmethod\n def from_list(\n cls,\n repository_definitions: Sequence[RepositoryListDefinition],\n default_executor_def: Optional[ExecutorDefinition] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n resource_key_mapping: Optional[Mapping[int, str]] = None,\n ) -> 
"CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definitions (List[Union[JobDefinition, ScheduleDefinition, SensorDefinition, GraphDefinition]]):\n Use this constructor when you have no need to lazy load jobs or other definitions.\n top_level_resources (Optional[Mapping[str, ResourceDefinition]]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n """\n from .repository_data_builder import build_caching_repository_data_from_list\n\n return build_caching_repository_data_from_list(\n repository_definitions=repository_definitions,\n default_executor_def=default_executor_def,\n default_logger_defs=default_logger_defs,\n top_level_resources=top_level_resources,\n resource_key_mapping=resource_key_mapping,\n )\n\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n return self._utilized_env_vars\n\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n return self._resource_key_mapping\n\n def get_job_names(self) -> Sequence[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return self._jobs.get_definition_names()\n\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n check.str_param(job_name, "job_name")\n return self._jobs.has_definition(job_name)\n\n def get_top_level_resources(self) -> Mapping[str, ResourceDefinition]:\n return self._top_level_resources\n\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job that has not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n if self._all_jobs is not None:\n return self._all_jobs\n\n self._all_jobs = self._jobs.get_all_definitions()\n self._check_node_defs(self._all_jobs)\n return self._all_jobs\n\n def 
get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job has not yet been constructed, only this job is constructed, and will\n be cached for future calls.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n check.str_param(job_name, "job_name")\n return self._jobs.get_definition(job_name)\n\n def get_schedule_names(self) -> Sequence[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return self._schedules.get_definition_names()\n\n def get_all_schedules(self) -> Sequence[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Note that this will construct any schedule that has not yet been constructed.\n\n Returns:\n List[ScheduleDefinition]: All schedules in the repository.\n """\n return self._schedules.get_all_definitions()\n\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n if this schedule has not yet been constructed, only this schedule is constructed, and will\n be cached for future calls.\n\n Args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.get_definition(schedule_name)\n\n def has_schedule(self, schedule_name: str) -> bool:\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.has_definition(schedule_name)\n\n def get_all_sensors(self) -> Sequence[SensorDefinition]:\n return self._sensors.get_all_definitions()\n\n def get_sensor_names(self) -> Sequence[str]:\n return self._sensors.get_definition_names()\n\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n return self._sensors.get_definition(sensor_name)\n\n def has_sensor(self, sensor_name: str) -> bool:\n return 
self._sensors.has_definition(sensor_name)\n\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._source_assets_by_key\n\n def get_assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n return self._assets_defs_by_key\n\n def _check_node_defs(self, job_defs: Sequence[JobDefinition]) -> None:\n node_defs = {}\n node_to_job = {}\n for job_def in job_defs:\n for node_def in [*job_def.all_node_defs, job_def.graph]:\n # skip checks for subselected graphs because they don't have their own names\n if isinstance(node_def, SubselectedGraphDefinition):\n break\n\n if node_def.name not in node_defs:\n node_defs[node_def.name] = node_def\n node_to_job[node_def.name] = job_def.name\n\n if node_defs[node_def.name] is not node_def:\n first_name, second_name = sorted([node_to_job[node_def.name], job_def.name])\n raise DagsterInvalidDefinitionError(\n f"Conflicting definitions found in repository with name '{node_def.name}'."\n " Op/Graph definition names must be unique within a repository."\n f" {node_def.__class__.__name__} is defined in"\n f" job '{first_name}' and in"\n f" job '{second_name}'."\n )\n\n def _validate_job(self, job: JobDefinition) -> JobDefinition:\n return job\n\n def _validate_schedule(self, schedule: ScheduleDefinition) -> ScheduleDefinition:\n job_names = self.get_job_names()\n\n if schedule.job_name not in job_names:\n raise DagsterInvalidDefinitionError(\n f'ScheduleDefinition "{schedule.name}" targets job "{schedule.job_name}" '\n "which was not found in this repository."\n )\n\n return schedule\n\n def _validate_sensor(self, sensor: SensorDefinition) -> SensorDefinition:\n job_names = self.get_job_names()\n if len(sensor.targets) == 0:\n # skip validation when the sensor does not target a job\n return sensor\n\n for target in sensor.targets:\n if target.job_name not in job_names:\n raise DagsterInvalidDefinitionError(\n f'SensorDefinition "{sensor.name}" targets job "{sensor.job_name}" '\n "which was not 
found in this repository."\n )\n\n return sensor\n
", "current_page_name": "_modules/dagster/_core/definitions/repository_definition/repository_data", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.repository_definition.repository_data"}, "repository_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.repository_definition.repository_definition

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Type,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.asset_graph import AssetGraph, InternalAssetGraph\nfrom dagster._core.definitions.assets_job import (\n    ASSET_BASE_JOB_PREFIX,\n)\nfrom dagster._core.definitions.cacheable_assets import AssetsDefinitionCacheableData\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.definitions.metadata import MetadataMapping\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.schedule_definition import ScheduleDefinition\nfrom dagster._core.definitions.sensor_definition import SensorDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import check_valid_name\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import hash_collection\n\nfrom .repository_data import CachingRepositoryData, RepositoryData\nfrom .valid_definitions import (\n    RepositoryListDefinition as RepositoryListDefinition,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import AssetsDefinition\n    from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n    from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n\n@whitelist_for_serdes\nclass RepositoryLoadData(\n    NamedTuple(\n  
      "_RepositoryLoadData",\n        [\n            ("cached_data_by_key", Mapping[str, Sequence[AssetsDefinitionCacheableData]]),\n        ],\n    )\n):\n    def __new__(cls, cached_data_by_key: Mapping[str, Sequence[AssetsDefinitionCacheableData]]):\n        return super(RepositoryLoadData, cls).__new__(\n            cls,\n            cached_data_by_key=(\n                check.mapping_param(\n                    cached_data_by_key,\n                    "cached_data_by_key",\n                    key_type=str,\n                    value_type=list,\n                )\n            ),\n        )\n\n    # Allow this to be hashed for use in `lru_cache`. This is needed because:\n    # - `ReconstructableJob` uses `lru_cache`\n    # - `ReconstructableJob` has a `ReconstructableRepository` attribute\n    # - `ReconstructableRepository` has a `RepositoryLoadData` attribute\n    # - `RepositoryLoadData` has collection attributes that are unhashable by default\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\n
[docs]class RepositoryDefinition:\n """Define a repository that contains a group of definitions.\n\n Users should typically not create objects of this class directly. Instead, use the\n :py:func:`@repository` decorator.\n\n Args:\n name (str): The name of the repository.\n repository_data (RepositoryData): Contains the definitions making up the repository.\n description (Optional[str]): A string description of the repository.\n metadata (Optional[MetadataMapping]): A map of arbitrary metadata for the repository.\n """\n\n def __init__(\n self,\n name,\n *,\n repository_data,\n description=None,\n metadata=None,\n repository_load_data=None,\n ):\n self._name = check_valid_name(name)\n self._description = check.opt_str_param(description, "description")\n self._repository_data: RepositoryData = check.inst_param(\n repository_data, "repository_data", RepositoryData\n )\n self._metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n self._repository_load_data = check.opt_inst_param(\n repository_load_data, "repository_load_data", RepositoryLoadData\n )\n\n @property\n def repository_load_data(self) -> Optional[RepositoryLoadData]:\n return self._repository_load_data\n\n @public\n @property\n def name(self) -> str:\n """str: The name of the repository."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A human-readable description of the repository."""\n return self._description\n\n @public\n @property\n def metadata(self) -> Optional[MetadataMapping]:\n """Optional[MetadataMapping]: Arbitrary metadata for the repository."""\n return self._metadata\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self._repository_data.load_all_definitions()\n\n @public\n @property\n def job_names(self) -> Sequence[str]:\n """List[str]: Names of all jobs in the repository."""\n return self._repository_data.get_job_names()\n\n def get_top_level_resources(self) -> 
Mapping[str, ResourceDefinition]:\n return self._repository_data.get_top_level_resources()\n\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n return self._repository_data.get_env_vars_by_top_level_resource()\n\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n return self._repository_data.get_resource_key_mapping()\n\n
[docs] @public\n def has_job(self, name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n name (str): The name of the job.\n\n Returns:\n bool\n """\n return self._repository_data.has_job(name)
\n\n
[docs] @public\n def get_job(self, name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job is present in the lazily evaluated dictionary passed to the\n constructor, but has not yet been constructed, only this job is constructed, and\n will be cached for future calls.\n\n Args:\n name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to\n the given name.\n """\n return self._repository_data.get_job(name)
\n\n
[docs] @public\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job in the lazily evaluated dictionary that has\n not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n return self._repository_data.get_all_jobs()
\n\n @public\n @property\n def schedule_defs(self) -> Sequence[ScheduleDefinition]:\n """List[ScheduleDefinition]: All schedules in the repository."""\n return self._repository_data.get_all_schedules()\n\n
[docs] @public\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n """Get a schedule definition by name.\n\n Args:\n name (str): The name of the schedule.\n\n Returns:\n ScheduleDefinition: The schedule definition.\n """\n return self._repository_data.get_schedule(name)
\n\n
[docs] @public\n def has_schedule_def(self, name: str) -> bool:\n """bool: Check if a schedule with a given name is present in the repository."""\n return self._repository_data.has_schedule(name)
\n\n @public\n @property\n def sensor_defs(self) -> Sequence[SensorDefinition]:\n """Sequence[SensorDefinition]: All sensors in the repository."""\n return self._repository_data.get_all_sensors()\n\n
[docs] @public\n def get_sensor_def(self, name: str) -> SensorDefinition:\n """Get a sensor definition by name.\n\n Args:\n name (str): The name of the sensor.\n\n Returns:\n SensorDefinition: The sensor definition.\n """\n return self._repository_data.get_sensor(name)
\n\n
[docs] @public\n def has_sensor_def(self, name: str) -> bool:\n """bool: Check if a sensor with a given name is present in the repository."""\n return self._repository_data.has_sensor(name)
\n\n @property\n def source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._repository_data.get_source_assets_by_key()\n\n @property\n def assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n return self._repository_data.get_assets_defs_by_key()\n\n def has_implicit_global_asset_job_def(self) -> bool:\n """Returns true is there is a single implicit asset job for all asset keys in a repository."""\n return self.has_job(ASSET_BASE_JOB_PREFIX)\n\n def get_implicit_global_asset_job_def(self) -> JobDefinition:\n """A useful conveninence method for repositories where there are a set of assets with\n the same partitioning schema and one wants to access their corresponding implicit job\n easily.\n """\n if not self.has_job(ASSET_BASE_JOB_PREFIX):\n raise DagsterInvariantViolationError(\n "There is no single global asset job, likely due to assets using "\n "different partitioning schemes via their partitions_def parameter. You must "\n "use get_implicit_job_def_for_assets in order to access the correct implicit job."\n )\n\n return self.get_job(ASSET_BASE_JOB_PREFIX)\n\n def get_implicit_asset_job_names(self) -> Sequence[str]:\n return [\n job_name for job_name in self.job_names if job_name.startswith(ASSET_BASE_JOB_PREFIX)\n ]\n\n def get_implicit_job_def_for_assets(\n self, asset_keys: Iterable[AssetKey]\n ) -> Optional[JobDefinition]:\n """Returns the asset base job that contains all the given assets, or None if there is no such\n job.\n """\n if self.has_job(ASSET_BASE_JOB_PREFIX):\n base_job = self.get_job(ASSET_BASE_JOB_PREFIX)\n if all(\n key in base_job.asset_layer.assets_defs_by_key\n or base_job.asset_layer.is_observable_for_asset(key)\n for key in asset_keys\n ):\n return base_job\n else:\n i = 0\n while self.has_job(f"{ASSET_BASE_JOB_PREFIX}_{i}"):\n base_job = self.get_job(f"{ASSET_BASE_JOB_PREFIX}_{i}")\n\n if all(\n key in base_job.asset_layer.assets_defs_by_key\n or base_job.asset_layer.is_observable_for_asset(key)\n for 
key in asset_keys\n ):\n return base_job\n\n i += 1\n\n return None\n\n def get_maybe_subset_job_def(\n self,\n job_name: str,\n op_selection: Optional[Iterable[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ):\n defn = self.get_job(job_name)\n return defn.get_subset(\n op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n )\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n resource_config: Optional[Any] = None,\n ) -> object:\n """Load the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n If you want to load the values of multiple assets, it's more efficient to use\n :py:meth:`~dagster.RepositoryDefinition.get_asset_value_loader`, which avoids spinning up\n resources separately for each asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n resource_config (Optional[Any]): A dictionary of resource configurations to be passed\n to the :py:class:`IOManager`.\n\n Returns:\n The contents of an asset as a Python object.\n """\n from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n with AssetValueLoader(\n self.assets_defs_by_key, self.source_assets_by_key, instance=instance\n ) as loader:\n return loader.load_asset_value(\n asset_key,\n python_type=python_type,\n partition_key=partition_key,\n metadata=metadata,\n resource_config=resource_config,\n )
\n\n
[docs] @public\n def get_asset_value_loader(\n self, instance: Optional[DagsterInstance] = None\n ) -> "AssetValueLoader":\n """Returns an object that can load the contents of assets as Python objects.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the assets. Avoids\n spinning up resources separately for each asset.\n\n Usage:\n\n .. code-block:: python\n\n with my_repo.get_asset_value_loader() as loader:\n asset1 = loader.load_asset_value("asset1")\n asset2 = loader.load_asset_value("asset2")\n\n """\n from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n return AssetValueLoader(\n self.assets_defs_by_key, self.source_assets_by_key, instance=instance\n )
\n\n @property\n def asset_graph(self) -> InternalAssetGraph:\n return AssetGraph.from_assets(\n [*set(self.assets_defs_by_key.values()), *self.source_assets_by_key.values()]\n )\n\n # If definition comes from the @repository decorator, then the __call__ method will be\n # overwritten. Therefore, we want to maintain the call-ability of repository definitions.\n def __call__(self, *args, **kwargs):\n return self
\n\n\nclass PendingRepositoryDefinition:\n def __init__(\n self,\n name: str,\n repository_definitions: Sequence[\n Union[RepositoryListDefinition, "CacheableAssetsDefinition"]\n ],\n description: Optional[str] = None,\n metadata: Optional[MetadataMapping] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n default_executor_def: Optional[ExecutorDefinition] = None,\n _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n _resource_key_mapping: Optional[Mapping[int, str]] = None,\n ):\n self._repository_definitions = check.list_param(\n repository_definitions,\n "repository_definition",\n additional_message=(\n "PendingRepositoryDefinition supports only list-based repository data at this time."\n ),\n )\n self._name = name\n self._description = description\n self._metadata = metadata\n self._default_logger_defs = default_logger_defs\n self._default_executor_def = default_executor_def\n self._top_level_resources = _top_level_resources\n self._resource_key_mapping = _resource_key_mapping\n\n @property\n def name(self) -> str:\n return self._name\n\n def _compute_repository_load_data(self) -> RepositoryLoadData:\n from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n return RepositoryLoadData(\n cached_data_by_key={\n defn.unique_id: defn.compute_cacheable_data()\n for defn in self._repository_definitions\n if isinstance(defn, CacheableAssetsDefinition)\n }\n )\n\n def _get_repository_definition(\n self, repository_load_data: RepositoryLoadData\n ) -> RepositoryDefinition:\n from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n resolved_definitions: List[RepositoryListDefinition] = []\n for defn in self._repository_definitions:\n if isinstance(defn, CacheableAssetsDefinition):\n # should always have metadata for each cached defn at this point\n check.invariant(\n defn.unique_id in repository_load_data.cached_data_by_key,\n "No metadata found for 
CacheableAssetsDefinition with unique_id"\n f" {defn.unique_id}.",\n )\n # use the emtadata to generate definitions\n resolved_definitions.extend(\n defn.build_definitions(\n data=repository_load_data.cached_data_by_key[defn.unique_id]\n )\n )\n else:\n resolved_definitions.append(defn)\n\n repository_data = CachingRepositoryData.from_list(\n resolved_definitions,\n default_executor_def=self._default_executor_def,\n default_logger_defs=self._default_logger_defs,\n top_level_resources=self._top_level_resources,\n resource_key_mapping=self._resource_key_mapping,\n )\n\n return RepositoryDefinition(\n self._name,\n repository_data=repository_data,\n description=self._description,\n metadata=self._metadata,\n repository_load_data=repository_load_data,\n )\n\n def reconstruct_repository_definition(\n self, repository_load_data: RepositoryLoadData\n ) -> RepositoryDefinition:\n """Use the provided RepositoryLoadData to construct and return a RepositoryDefinition."""\n check.inst_param(repository_load_data, "repository_load_data", RepositoryLoadData)\n return self._get_repository_definition(repository_load_data)\n\n def compute_repository_definition(self) -> RepositoryDefinition:\n """Compute the required RepositoryLoadData and use it to construct and return a RepositoryDefinition."""\n repository_load_data = self._compute_repository_load_data()\n return self._get_repository_definition(repository_load_data)\n
", "current_page_name": "_modules/dagster/_core/definitions/repository_definition/repository_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.repository_definition.repository_definition"}}, "resource_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.resource_definition

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    Union,\n    cast,\n    overload,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param, public\nfrom dagster._core.decorator_utils import format_docstring_for_description\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.configurable import AnonymousConfigurableDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster._utils import IHasInternalInit\n\nfrom ..decorator_utils import (\n    get_function_params,\n    has_at_least_one_parameter,\n    is_required_param,\n    positional_arg_name_list,\n    validate_expected_params,\n)\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .resource_invocation import resource_invocation_result\nfrom .resource_requirement import (\n    RequiresResources,\n    ResourceDependencyRequirement,\n    ResourceRequirement,\n)\nfrom .scoped_resources_builder import (  # re-exported\n    IContainsGenerator as IContainsGenerator,\n    Resources as Resources,\n    ScopedResourcesBuilder as ScopedResourcesBuilder,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.execution.resources_init import InitResourceContext\n\nResourceFunctionWithContext: TypeAlias = Callable[["InitResourceContext"], Any]\nResourceFunctionWithoutContext: TypeAlias = Callable[[], Any]\nResourceFunction: TypeAlias = Union[\n    ResourceFunctionWithContext,\n    ResourceFunctionWithoutContext,\n]\n\n\n
[docs]@experimental_param(param="version")\nclass ResourceDefinition(AnonymousConfigurableDefinition, RequiresResources, IHasInternalInit):\n """Core class for defining resources.\n\n Resources are scoped ways to make external resources (like database connections) available to\n ops and assets during job execution and to clean up after execution resolves.\n\n If resource_fn yields once rather than returning (in the manner of functions decorable with\n :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then the body of the\n function after the yield will be run after execution resolves, allowing users to write their\n own teardown/cleanup logic.\n\n Depending on your executor, resources may be instantiated and cleaned up more than once in a\n job execution.\n\n Args:\n resource_fn (Callable[[InitResourceContext], Any]): User-provided function to instantiate\n the resource, which will be made available to executions keyed on the\n ``context.resources`` object.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the resource matches this schema and fail if it does not. If\n not set, Dagster will accept any config provided for the resource.\n description (Optional[str]): A human-readable description of the resource.\n required_resource_keys: (Optional[Set[str]]) Keys for the resources required by this\n resource. A DagsterInvariantViolationError will be raised during initialization if\n dependencies are cyclic.\n version (Optional[str]): (Experimental) The version of the resource's definition fn. 
Two\n wrapped resource functions should only have the same version if they produce the same\n resource definition when provided with the same inputs.\n """\n\n def __init__(\n self,\n resource_fn: ResourceFunction,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._resource_fn = check.callable_param(resource_fn, "resource_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self._version = check.opt_str_param(version, "version")\n\n # this attribute will be updated by the @dagster_maintained_resource and @dagster_maintained_io_manager decorators\n self._dagster_maintained = False\n self._hardcoded_resource_type = None\n\n @staticmethod\n def dagster_internal_init(\n *,\n resource_fn: ResourceFunction,\n config_schema: CoercableToConfigSchema,\n description: Optional[str],\n required_resource_keys: Optional[AbstractSet[str]],\n version: Optional[str],\n ) -> "ResourceDefinition":\n return ResourceDefinition(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def resource_fn(self) -> ResourceFunction:\n return self._resource_fn\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n @public\n @property\n def description(self) -> Optional[str]:\n """A human-readable description of the resource."""\n return self._description\n\n @public\n @property\n def version(self) -> Optional[str]:\n """A string which can be used to identify a particular code version of a resource definition."""\n return self._version\n\n @public\n @property\n def 
required_resource_keys(self) -> AbstractSet[str]:\n """A set of the resource keys that this resource depends on. These keys will be made available\n to the resource's init context during execution, and the resource will not be instantiated\n until all required resources are available.\n """\n return self._required_resource_keys\n\n def _is_dagster_maintained(self) -> bool:\n return self._dagster_maintained\n\n
[docs] @public\n @staticmethod\n def none_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that returns a none resource.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that does nothing.\n """\n return ResourceDefinition.hardcoded_resource(value=None, description=description)
\n\n
[docs] @public\n @staticmethod\n def hardcoded_resource(value: Any, description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` with a hardcoded object.\n\n Args:\n value (Any): The value that will be accessible via context.resources.resource_name.\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A hardcoded resource.\n """\n resource_def = ResourceDefinition(\n resource_fn=lambda _init_context: value, description=description\n )\n # Make sure telemetry info gets passed in to hardcoded resources\n if hasattr(value, "_is_dagster_maintained"):\n resource_def._dagster_maintained = value._is_dagster_maintained() # noqa: SLF001\n resource_def._hardcoded_resource_type = type(value) # noqa: SLF001\n\n return resource_def
\n\n
[docs] @public\n @staticmethod\n def mock_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` which wraps a ``mock.MagicMock``.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that creates the magic methods automatically and helps\n you mock existing resources.\n """\n from unittest import mock\n\n return ResourceDefinition(\n resource_fn=lambda _init_context: mock.MagicMock(), description=description\n )
\n\n
[docs] @public\n @staticmethod\n def string_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """Creates a ``ResourceDefinition`` which takes in a single string as configuration\n and returns this configured string to any ops or assets which depend on it.\n\n Args:\n description ([Optional[str]]): The description of the string resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that takes in a single string as configuration and\n returns that string.\n """\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=str,\n description=description,\n )
\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "ResourceDefinition":\n resource_def = ResourceDefinition.dagster_internal_init(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n )\n\n resource_def._dagster_maintained = self._is_dagster_maintained() # noqa: SLF001\n\n return resource_def\n\n def __call__(self, *args, **kwargs):\n from dagster._core.execution.context.init import UnboundInitResourceContext\n\n if has_at_least_one_parameter(self.resource_fn):\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Resource initialization function has context argument, but no context was"\n " provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of resource received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.resource_fn)[0].name\n\n if args:\n check.opt_inst_param(args[0], context_param_name, UnboundInitResourceContext)\n return resource_invocation_result(\n self, cast(Optional[UnboundInitResourceContext], args[0])\n )\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Resource initialization expected argument '{context_param_name}'."\n )\n check.opt_inst_param(\n kwargs[context_param_name], context_param_name, UnboundInitResourceContext\n )\n\n return resource_invocation_result(\n self, cast(Optional[UnboundInitResourceContext], kwargs[context_param_name])\n )\n elif len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke resource with argument, but underlying function has no context"\n " argument. 
Either specify a context argument on the resource function, or remove"\n " the passed-in argument."\n )\n else:\n return resource_invocation_result(self, None)\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n source_key = cast(str, outer_context)\n for resource_key in sorted(list(self.required_resource_keys)):\n yield ResourceDependencyRequirement(key=resource_key, source_key=source_key)
\n\n\ndef dagster_maintained_resource(\n resource_def: ResourceDefinition,\n) -> ResourceDefinition:\n resource_def._dagster_maintained = True # noqa: SLF001\n return resource_def\n\n\nclass _ResourceDecoratorCallable:\n def __init__(\n self,\n config_schema: Optional[Mapping[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self.config_schema = config_schema # checked by underlying definition\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n def __call__(self, resource_fn: ResourceFunction) -> ResourceDefinition:\n check.callable_param(resource_fn, "resource_fn")\n\n any_name = ["*"] if has_at_least_one_parameter(resource_fn) else []\n\n params = get_function_params(resource_fn)\n\n missing_positional = validate_expected_params(params, any_name)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects a single "\n "positional argument."\n )\n\n extras = params[len(any_name) :]\n\n required_extras = list(filter(is_required_param, extras))\n if required_extras:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects only a single"\n " positional required argument. 
Got required extra params"\n f" {', '.join(positional_arg_name_list(required_extras))}"\n )\n\n resource_def = ResourceDefinition.dagster_internal_init(\n resource_fn=resource_fn,\n config_schema=self.config_schema,\n description=self.description or format_docstring_for_description(resource_fn),\n version=self.version,\n required_resource_keys=self.required_resource_keys,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(resource_def, wrapped=resource_fn) # type: ignore\n\n return resource_def\n\n\n@overload\ndef resource(config_schema: ResourceFunction) -> ResourceDefinition: ...\n\n\n@overload\ndef resource(\n config_schema: CoercableToConfigSchema = ...,\n description: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n version: Optional[str] = ...,\n) -> Callable[[ResourceFunction], "ResourceDefinition"]: ...\n\n\n
[docs]def resource(\n config_schema: Union[ResourceFunction, CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[Callable[[ResourceFunction], "ResourceDefinition"], "ResourceDefinition"]:\n """Define a resource.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an instance of\n the resource. This function will become the ``resource_fn`` of an underlying\n :py:class:`ResourceDefinition`.\n\n If the decorated function yields once rather than returning (in the manner of functions\n decorable with :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then\n the body of the function after the yield will be run after execution resolves, allowing users\n to write their own teardown/cleanup logic.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.resource_config`. If not set, Dagster will accept any config provided.\n description(Optional[str]): A human-readable description of the resource.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by this resource.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. 
@resource versus @resource()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _ResourceDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: ResourceFunction) -> "ResourceDefinition":\n return _ResourceDecoratorCallable(\n config_schema=cast(Optional[Dict[str, Any]], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )(resource_fn)\n\n return _wrap
\n\n\n
[docs]def make_values_resource(**kwargs: Any) -> ResourceDefinition:\n """A helper function that creates a ``ResourceDefinition`` to take in user-defined values.\n\n This is useful for sharing values between ops.\n\n Args:\n **kwargs: Arbitrary keyword arguments that will be passed to the config schema of the\n returned resource definition. If not set, Dagster will accept any config provided for\n the resource.\n\n For example:\n\n .. code-block:: python\n\n @op(required_resource_keys={"globals"})\n def my_op(context):\n print(context.resources.globals["my_str_var"])\n\n @job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\n def my_job():\n my_op()\n\n Returns:\n ResourceDefinition: A resource that passes in user-defined values.\n """\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=kwargs or Any,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/resource_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.resource_definition"}, "result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.result

\nfrom typing import NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.data_version import DataVersion\n\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\nfrom .metadata import MetadataUserInput\n\n\n
[docs]@experimental\nclass MaterializeResult(\n NamedTuple(\n "_MaterializeResult",\n [\n ("asset_key", PublicAttr[Optional[AssetKey]]),\n ("metadata", PublicAttr[Optional[MetadataUserInput]]),\n ("check_results", PublicAttr[Sequence[AssetCheckResult]]),\n ("data_version", PublicAttr[Optional[DataVersion]]),\n ],\n )\n):\n """An object representing a successful materialization of an asset. These can be returned from\n @asset and @multi_asset decorated functions to pass metadata or specify specific assets were\n materialized.\n\n Attributes:\n asset_key (Optional[AssetKey]): Optional in @asset, required in @multi_asset to discern which asset this refers to.\n metadata (Optional[MetadataUserInput]): Metadata to record with the corresponding AssetMaterialization event.\n """\n\n def __new__(\n cls,\n *, # enforce kwargs\n asset_key: Optional[CoercibleToAssetKey] = None,\n metadata: Optional[MetadataUserInput] = None,\n check_results: Optional[Sequence[AssetCheckResult]] = None,\n data_version: Optional[DataVersion] = None,\n ):\n asset_key = AssetKey.from_coercible(asset_key) if asset_key else None\n\n return super().__new__(\n cls,\n asset_key=asset_key,\n metadata=check.opt_nullable_mapping_param(\n metadata,\n "metadata",\n key_type=str,\n ),\n check_results=check.opt_sequence_param(\n check_results, "check_results", of_type=AssetCheckResult\n ),\n data_version=check.opt_inst_param(data_version, "data_version", DataVersion),\n )\n\n def check_result_named(self, check_name: str) -> AssetCheckResult:\n for check_result in self.check_results:\n if check_result.check_name == check_name:\n return check_result\n\n check.failed(f"Could not find check result named {check_name}")
\n
", "current_page_name": "_modules/dagster/_core/definitions/result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.result"}, "run_config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_config

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias\n\nfrom dagster._config import (\n    ALL_CONFIG_BUILTINS,\n    ConfigType,\n    Field,\n    Permissive,\n    Selector,\n    Shape,\n)\nfrom dagster._config.pythonic_config import Config\nfrom dagster._core.definitions.asset_layer import AssetLayer\nfrom dagster._core.definitions.executor_definition import (\n    ExecutorDefinition,\n    execute_in_process_executor,\n    in_process_executor,\n)\nfrom dagster._core.definitions.input import InputDefinition\nfrom dagster._core.definitions.output import OutputDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.storage.input_manager import IInputManagerDefinition\nfrom dagster._core.storage.output_manager import IOutputManagerDefinition\nfrom dagster._core.types.dagster_type import ALL_RUNTIME_BUILTINS, construct_dagster_type_dictionary\nfrom dagster._utils import check\n\nfrom .configurable import ConfigurableDefinition\nfrom .definition_config_schema import IDefinitionConfigSchema\nfrom .dependency import DependencyStructure, GraphNode, Node, NodeHandle, NodeInput, OpNode\nfrom .graph_definition import GraphDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .op_definition import NodeDefinition, OpDefinition\nfrom .resource_definition import ResourceDefinition\n\nif TYPE_CHECKING:\n    from .source_asset import SourceAsset\n\n\ndef define_resource_dictionary_cls(\n    resource_defs: Mapping[str, ResourceDefinition],\n    required_resources: AbstractSet[str],\n) -> Shape:\n    fields = {}\n    for resource_name, resource_def in resource_defs.items():\n        if resource_def.config_schema:\n            is_required = None\n            if resource_name not in required_resources:\n                # explicitly make 
section not required if resource is not required\n                # for the current mode\n                is_required = False\n\n            fields[resource_name] = def_config_field(\n                resource_def,\n                is_required=is_required,\n                description=resource_def.description,\n            )\n\n    return Shape(fields=fields)\n\n\ndef remove_none_entries(ddict: Mapping[Any, Any]) -> dict:\n    return {k: v for k, v in ddict.items() if v is not None}\n\n\ndef def_config_field(\n    configurable_def: ConfigurableDefinition,\n    is_required: Optional[bool] = None,\n    description: Optional[str] = None,\n) -> Field:\n    return Field(\n        Shape(\n            {"config": configurable_def.config_field} if configurable_def.has_config_field else {}\n        ),\n        is_required=is_required,\n        description=description,\n    )\n\n\nclass RunConfigSchemaCreationData(NamedTuple):\n    job_name: str\n    nodes: Sequence[Node]\n    graph_def: GraphDefinition\n    dependency_structure: DependencyStructure\n    executor_def: ExecutorDefinition\n    resource_defs: Mapping[str, ResourceDefinition]\n    logger_defs: Mapping[str, LoggerDefinition]\n    ignored_nodes: Sequence[Node]\n    required_resources: AbstractSet[str]\n    direct_inputs: Mapping[str, Any]\n    asset_layer: AssetLayer\n\n\ndef define_logger_dictionary_cls(creation_data: RunConfigSchemaCreationData) -> Shape:\n    return Shape(\n        {\n            logger_name: def_config_field(logger_definition, is_required=False)\n            for logger_name, logger_definition in creation_data.logger_defs.items()\n        }\n    )\n\n\ndef define_execution_field(executor_defs: Sequence[ExecutorDefinition], description: str) -> Field:\n    default_in_process = False\n    for executor_def in executor_defs:\n        if executor_def == in_process_executor:\n            default_in_process = True\n\n    selector = selector_for_named_defs(executor_defs)\n\n    if default_in_process:\n   
     return Field(\n            selector, default_value={in_process_executor.name: {}}, description=description\n        )\n\n    # If we are using the execute_in_process executor, then ignore all executor config.\n    if len(executor_defs) == 1 and executor_defs[0] == execute_in_process_executor:\n        return Field(Permissive(), is_required=False, default_value={}, description=description)\n\n    return Field(selector, description=description)\n\n\ndef define_single_execution_field(executor_def: ExecutorDefinition, description: str) -> Field:\n    return def_config_field(executor_def, description=description)\n\n\ndef define_run_config_schema_type(creation_data: RunConfigSchemaCreationData) -> ConfigType:\n    execution_field = define_single_execution_field(\n        creation_data.executor_def,\n        "Configure how steps are executed within a run.",\n    )\n\n    top_level_node = GraphNode(\n        name=creation_data.graph_def.name,\n        definition=creation_data.graph_def,\n        graph_definition=creation_data.graph_def,\n    )\n\n    fields = {\n        "execution": execution_field,\n        "loggers": Field(\n            define_logger_dictionary_cls(creation_data),\n            description="Configure how loggers emit messages within a run.",\n        ),\n        "resources": Field(\n            define_resource_dictionary_cls(\n                creation_data.resource_defs,\n                creation_data.required_resources,\n            ),\n            description="Configure how shared resources are implemented within a run.",\n        ),\n        "inputs": get_inputs_field(\n            node=top_level_node,\n            handle=NodeHandle(top_level_node.name, parent=None),\n            dependency_structure=creation_data.dependency_structure,\n            resource_defs=creation_data.resource_defs,\n            node_ignored=False,\n            direct_inputs=creation_data.direct_inputs,\n            input_source_assets={},\n            
asset_layer=creation_data.asset_layer,\n        ),\n    }\n\n    if creation_data.graph_def.has_config_mapping:\n        config_schema = cast(IDefinitionConfigSchema, creation_data.graph_def.config_schema)\n        nodes_field = Field(\n            {"config": config_schema.as_field()},\n            description="Configure runtime parameters for ops or assets.",\n        )\n    else:\n        nodes_field = Field(\n            define_node_shape(\n                nodes=creation_data.nodes,\n                ignored_nodes=creation_data.ignored_nodes,\n                dependency_structure=creation_data.dependency_structure,\n                resource_defs=creation_data.resource_defs,\n                asset_layer=creation_data.asset_layer,\n                node_input_source_assets=creation_data.graph_def.node_input_source_assets,\n            ),\n            description="Configure runtime parameters for ops or assets.",\n        )\n\n    fields["ops"] = nodes_field\n\n    return Shape(\n        fields=remove_none_entries(fields),\n    )\n\n\n# Common pattern for a set of named definitions (e.g. 
executors)\n# to build a selector so that one of them is selected\ndef selector_for_named_defs(named_defs) -> Selector:\n    return Selector({named_def.name: def_config_field(named_def) for named_def in named_defs})\n\n\ndef get_inputs_field(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    node_ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n    direct_inputs: Optional[Mapping[str, Any]] = None,\n) -> Optional[Field]:\n    direct_inputs = check.opt_mapping_param(direct_inputs, "direct_inputs")\n    inputs_field_fields = {}\n    for name, inp in node.definition.input_dict.items():\n        inp_handle = NodeInput(node, inp)\n        has_upstream = input_has_upstream(dependency_structure, inp_handle, node, name)\n        if inp.input_manager_key:\n            input_field = get_input_manager_input_field(node, inp, resource_defs)\n        elif (\n            # if you have asset definitions, input will be loaded from the source asset\n            asset_layer.has_assets_defs\n            or asset_layer.has_asset_check_defs\n            and asset_layer.asset_key_for_input(handle, name)\n            and not has_upstream\n        ):\n            input_field = None\n        elif name in direct_inputs and not has_upstream:\n            input_field = None\n        elif name in input_source_assets and not has_upstream:\n            input_field = None\n        elif inp.dagster_type.loader and not has_upstream:\n            input_field = get_type_loader_input_field(node, name, inp)\n        else:\n            input_field = None\n\n        if input_field:\n            inputs_field_fields[name] = input_field\n\n    if not inputs_field_fields:\n        return None\n    if node_ignored:\n        return Field(\n            Shape(inputs_field_fields),\n            is_required=False,\n            description=(\n                "This op 
is not present in the current op selection, "\n                "the input config values are allowed but ignored."\n            ),\n        )\n    else:\n        return Field(Shape(inputs_field_fields))\n\n\ndef input_has_upstream(\n    dependency_structure: DependencyStructure,\n    input_handle: NodeInput,\n    node: Node,\n    input_name: str,\n) -> bool:\n    return dependency_structure.has_deps(input_handle) or node.container_maps_input(input_name)\n\n\ndef get_input_manager_input_field(\n    node: Node,\n    input_def: InputDefinition,\n    resource_defs: Mapping[str, ResourceDefinition],\n) -> Optional[Field]:\n    if input_def.input_manager_key:\n        if input_def.input_manager_key not in resource_defs:\n            raise DagsterInvalidDefinitionError(\n                f"Input '{input_def.name}' for {node.describe_node()} requires input_manager_key"\n                f" '{input_def.input_manager_key}', but no resource has been provided. Please"\n                " include a resource definition for that key in the provided resource_defs."\n            )\n\n        input_manager = resource_defs[input_def.input_manager_key]\n        if not isinstance(input_manager, IInputManagerDefinition):\n            raise DagsterInvalidDefinitionError(\n                f"Input '{input_def.name}' for {node.describe_node()} requires input_manager_key "\n                f"'{input_def.input_manager_key}', but the resource definition provided is not an "\n                "IInputManagerDefinition"\n            )\n\n        input_config_schema = input_manager.input_config_schema\n        if input_config_schema:\n            return input_config_schema.as_field()\n        return None\n\n    return None\n\n\ndef get_type_loader_input_field(node: Node, input_name: str, input_def: InputDefinition) -> Field:\n    loader = check.not_none(input_def.dagster_type.loader)\n    return Field(\n        loader.schema_type,\n        is_required=(not 
node.definition.input_has_default(input_name)),\n    )\n\n\ndef get_outputs_field(\n    node: Node,\n    resource_defs: Mapping[str, ResourceDefinition],\n) -> Optional[Field]:\n    output_manager_fields = {}\n    for name, output_def in node.definition.output_dict.items():\n        output_manager_output_field = get_output_manager_output_field(\n            node, output_def, resource_defs\n        )\n        if output_manager_output_field:\n            output_manager_fields[name] = output_manager_output_field\n\n    return Field(Shape(output_manager_fields)) if output_manager_fields else None\n\n\ndef get_output_manager_output_field(\n    node: Node, output_def: OutputDefinition, resource_defs: Mapping[str, ResourceDefinition]\n) -> Optional[ConfigType]:\n    if output_def.io_manager_key not in resource_defs:\n        raise DagsterInvalidDefinitionError(\n            f'Output "{output_def.name}" for {node.describe_node()} requires io_manager_key '\n            f'"{output_def.io_manager_key}", but no resource has been provided. 
Please include a '\n            "resource definition for that key in the provided resource_defs."\n        )\n    if not isinstance(resource_defs[output_def.io_manager_key], IOutputManagerDefinition):\n        raise DagsterInvalidDefinitionError(\n            f'Output "{output_def.name}" for {node.describe_node()} requires io_manager_key '\n            f'"{output_def.io_manager_key}", but the resource definition provided is not an '\n            "IOutputManagerDefinition"\n        )\n    output_manager_def = resource_defs[output_def.io_manager_key]\n    if (\n        output_manager_def\n        and isinstance(output_manager_def, IOutputManagerDefinition)\n        and output_manager_def.output_config_schema\n    ):\n        return output_manager_def.output_config_schema.as_field()\n\n    return None\n\n\ndef node_config_field(fields: Mapping[str, Optional[Field]], ignored: bool) -> Optional[Field]:\n    trimmed_fields = remove_none_entries(fields)\n    if trimmed_fields:\n        if ignored:\n            return Field(\n                Shape(trimmed_fields),\n                is_required=False,\n                description=(\n                    "This op is not present in the current op selection, "\n                    "the config values are allowed but ignored."\n                ),\n            )\n        else:\n            return Field(Shape(trimmed_fields))\n    else:\n        return None\n\n\ndef construct_leaf_node_config(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    config_schema: Optional[IDefinitionConfigSchema],\n    resource_defs: Mapping[str, ResourceDefinition],\n    ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n) -> Optional[Field]:\n    return node_config_field(\n        {\n            "inputs": get_inputs_field(\n                node,\n                handle,\n                dependency_structure,\n                resource_defs,\n                
ignored,\n                asset_layer,\n                input_source_assets,\n            ),\n            "outputs": get_outputs_field(node, resource_defs),\n            "config": config_schema.as_field() if config_schema else None,\n        },\n        ignored=ignored,\n    )\n\n\ndef define_node_field(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n) -> Optional[Field]:\n    # All nodes regardless of compositing status get the same inputs and outputs\n    # config. The only thing the varies is on extra element of configuration\n    # 1) Vanilla op definition: a 'config' key with the config_schema as the value\n    # 2) Graph with field mapping: a 'config' key with the config_schema of\n    #    the config mapping (via GraphDefinition#config_schema)\n    # 3) Graph without field mapping: an 'ops' key with recursively defined\n    #    ops dictionary\n    # 4) `configured` graph with field mapping: a 'config' key with the config_schema that was\n    #    provided when `configured` was called (via GraphDefinition#config_schema)\n\n    assert isinstance(node, (OpNode, GraphNode)), f"Invalid node type: {type(node)}"\n\n    if isinstance(node, OpNode):\n        return construct_leaf_node_config(\n            node,\n            handle,\n            dependency_structure,\n            node.definition.config_schema,\n            resource_defs,\n            ignored,\n            asset_layer,\n            input_source_assets,\n        )\n\n    graph_def = node.definition\n\n    if graph_def.has_config_mapping:\n        # has_config_mapping covers cases 2 & 4 from above (only config mapped graphs can\n        # be `configured`)...\n        return construct_leaf_node_config(\n            node,\n            handle,\n            dependency_structure,\n            # ...and in both 
cases, the correct schema for 'config' key is exposed by this property:\n            graph_def.config_schema,\n            resource_defs,\n            ignored,\n            asset_layer,\n            input_source_assets,\n        )\n        # This case omits an 'ops' key, thus if a graph is `configured` or has a field\n        # mapping, the user cannot stub any config, inputs, or outputs for inner (child) nodes.\n    else:\n        fields = {\n            "inputs": get_inputs_field(\n                node,\n                handle,\n                dependency_structure,\n                resource_defs,\n                ignored,\n                asset_layer,\n                input_source_assets,\n            ),\n            "outputs": get_outputs_field(node, resource_defs),\n            "ops": Field(\n                define_node_shape(\n                    nodes=graph_def.nodes,\n                    ignored_nodes=None,\n                    dependency_structure=graph_def.dependency_structure,\n                    parent_handle=handle,\n                    resource_defs=resource_defs,\n                    asset_layer=asset_layer,\n                    node_input_source_assets=graph_def.node_input_source_assets,\n                )\n            ),\n        }\n\n        return node_config_field(fields, ignored=ignored)\n\n\ndef define_node_shape(\n    nodes: Sequence[Node],\n    ignored_nodes: Optional[Sequence[Node]],\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    asset_layer: AssetLayer,\n    node_input_source_assets: Mapping[str, Mapping[str, "SourceAsset"]],\n    parent_handle: Optional[NodeHandle] = None,\n) -> Shape:\n    """Examples of what this method is used to generate the schema for:\n    1.\n        inputs: ...\n        ops:\n      >    op1: ...\n      >    op2: ...\n\n    2.\n        inputs:\n        ops:\n          graph1: ...\n            inputs: ...\n            ops:\n      >       op1: ...\n      >  
     inner_graph: ...\n\n\n    """\n    ignored_nodes = check.opt_sequence_param(ignored_nodes, "ignored_nodes", of_type=Node)\n\n    fields = {}\n    for node in nodes:\n        node_field = define_node_field(\n            node,\n            NodeHandle(node.name, parent_handle),\n            dependency_structure,\n            resource_defs,\n            ignored=False,\n            asset_layer=asset_layer,\n            input_source_assets=node_input_source_assets.get(node.name, {}),\n        )\n\n        if node_field:\n            fields[node.name] = node_field\n\n    for node in ignored_nodes:\n        node_field = define_node_field(\n            node,\n            NodeHandle(node.name, parent_handle),\n            dependency_structure,\n            resource_defs,\n            ignored=True,\n            asset_layer=asset_layer,\n            input_source_assets=node_input_source_assets.get(node.name, {}),\n        )\n        if node_field:\n            fields[node.name] = node_field\n\n    return Shape(fields)\n\n\ndef iterate_node_def_config_types(node_def: NodeDefinition) -> Iterator[ConfigType]:\n    if isinstance(node_def, OpDefinition):\n        if node_def.has_config_field:\n            yield from node_def.get_config_field().config_type.type_iterator()\n    elif isinstance(node_def, GraphDefinition):\n        for node in node_def.nodes:\n            yield from iterate_node_def_config_types(node.definition)\n\n    else:\n        check.invariant(f"Unexpected NodeDefinition type {type(node_def)}")\n\n\ndef _gather_all_schemas(node_defs: Sequence[NodeDefinition]) -> Iterator[ConfigType]:\n    dagster_types = construct_dagster_type_dictionary(node_defs)\n    for dagster_type in list(dagster_types.values()) + list(ALL_RUNTIME_BUILTINS):\n        if dagster_type.loader:\n            yield from dagster_type.loader.schema_type.type_iterator()\n\n\ndef _gather_all_config_types(\n    node_defs: Sequence[NodeDefinition], run_config_schema_type: ConfigType\n) -> 
Iterator[ConfigType]:\n    for node_def in node_defs:\n        yield from iterate_node_def_config_types(node_def)\n\n    yield from run_config_schema_type.type_iterator()\n\n\ndef construct_config_type_dictionary(\n    node_defs: Sequence[NodeDefinition],\n    run_config_schema_type: ConfigType,\n) -> Tuple[Mapping[str, ConfigType], Mapping[str, ConfigType]]:\n    type_dict_by_name = {t.given_name: t for t in ALL_CONFIG_BUILTINS if t.given_name}\n    type_dict_by_key = {t.key: t for t in ALL_CONFIG_BUILTINS}\n    all_types = list(_gather_all_config_types(node_defs, run_config_schema_type)) + list(\n        _gather_all_schemas(node_defs)\n    )\n\n    for config_type in all_types:\n        name = config_type.given_name\n        if name and name in type_dict_by_name:\n            if type(config_type) is not type(type_dict_by_name[name]):\n                raise DagsterInvalidDefinitionError(\n                    "Type names must be unique. You have constructed two different "\n                    f'instances of types with the same name "{name}".'\n                )\n        elif name:\n            type_dict_by_name[name] = config_type\n\n        type_dict_by_key[config_type.key] = config_type\n\n    return type_dict_by_name, type_dict_by_key\n\n\ndef _convert_config_classes_inner(configs: Any) -> Any:\n    if not isinstance(configs, dict):\n        return configs\n\n    return {\n        k: (\n            {"config": v._convert_to_config_dictionary()}  # noqa: SLF001\n            if isinstance(v, Config)\n            else _convert_config_classes_inner(v)\n        )\n        for k, v in configs.items()\n    }\n\n\ndef _convert_config_classes(configs: Dict[str, Any]) -> Dict[str, Any]:\n    return _convert_config_classes_inner(configs)\n\n\n
[docs]class RunConfig:\n """Container for all the configuration that can be passed to a run. Accepts Pythonic definitions\n for op and asset config and resources and converts them under the hood to the appropriate config dictionaries.\n\n Example usage:\n\n .. code-block:: python\n\n class MyAssetConfig(Config):\n a_str: str\n\n @asset\n def my_asset(config: MyAssetConfig):\n assert config.a_str == "foo"\n\n materialize(\n [my_asset],\n run_config=RunConfig(\n ops={"my_asset": MyAssetConfig(a_str="foo")}\n )\n )\n\n """\n\n def __init__(\n self,\n ops: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n loggers: Optional[Dict[str, Any]] = None,\n execution: Optional[Dict[str, Any]] = None,\n ):\n self.ops = check.opt_dict_param(ops, "ops")\n self.resources = check.opt_dict_param(resources, "resources")\n self.loggers = check.opt_dict_param(loggers, "loggers")\n self.execution = check.opt_dict_param(execution, "execution")\n\n def to_config_dict(self):\n return {\n "loggers": self.loggers,\n "resources": _convert_config_classes(self.resources),\n "ops": _convert_config_classes(self.ops),\n "execution": self.execution,\n }
\n\n\nCoercibleToRunConfig: TypeAlias = Union[Dict[str, Any], RunConfig]\n\nT = TypeVar("T")\n\n\ndef convert_config_input(inp: Union[CoercibleToRunConfig, T]) -> Union[T, Mapping[str, Any]]:\n if isinstance(inp, RunConfig):\n return inp.to_config_dict()\n else:\n return inp\n
", "current_page_name": "_modules/dagster/_core/definitions/run_config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_config"}, "run_request": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_request

\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param\nfrom dagster._core.definitions.asset_check_evaluation import AssetCheckEvaluation\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization, AssetObservation\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.storage.tags import PARTITION_NAME_TAG\nfrom dagster._serdes.serdes import whitelist_for_serdes\nfrom dagster._utils.error import SerializableErrorInfo\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.partition import PartitionsDefinition\n    from dagster._core.definitions.run_config import RunConfig\n    from dagster._core.definitions.unresolved_asset_job_definition import (\n        UnresolvedAssetJobDefinition,\n    )\n\n\n@whitelist_for_serdes(old_storage_names={"JobType"})\nclass InstigatorType(Enum):\n    SCHEDULE = "SCHEDULE"\n    SENSOR = "SENSOR"\n    AUTO_MATERIALIZE = "AUTO_MATERIALIZE"\n\n\n
[docs]@whitelist_for_serdes\nclass SkipReason(NamedTuple("_SkipReason", [("skip_message", PublicAttr[Optional[str]])])):\n """Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\n why no runs were requested.\n\n Attributes:\n skip_message (Optional[str]): A message displayed in the Dagster UI for why this evaluation resulted\n in no requested runs.\n """\n\n def __new__(cls, skip_message: Optional[str] = None):\n return super(SkipReason, cls).__new__(\n cls,\n skip_message=check.opt_str_param(skip_message, "skip_message"),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass AddDynamicPartitionsRequest(\n NamedTuple(\n "_AddDynamicPartitionsRequest",\n [\n ("partitions_def_name", str),\n ("partition_keys", Sequence[str]),\n ],\n )\n):\n """A request to add partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule."""\n\n def __new__(\n cls,\n partitions_def_name: str,\n partition_keys: Sequence[str],\n ):\n return super(AddDynamicPartitionsRequest, cls).__new__(\n cls,\n partitions_def_name=check.str_param(partitions_def_name, "partitions_def_name"),\n partition_keys=check.list_param(partition_keys, "partition_keys", of_type=str),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass DeleteDynamicPartitionsRequest(\n NamedTuple(\n "_AddDynamicPartitionsRequest",\n [\n ("partitions_def_name", str),\n ("partition_keys", Sequence[str]),\n ],\n )\n):\n """A request to delete partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule."""\n\n def __new__(\n cls,\n partitions_def_name: str,\n partition_keys: Sequence[str],\n ):\n return super(DeleteDynamicPartitionsRequest, cls).__new__(\n cls,\n partitions_def_name=check.str_param(partitions_def_name, "partitions_def_name"),\n partition_keys=check.list_param(partition_keys, "partition_keys", of_type=str),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RunRequest(\n NamedTuple(\n "_RunRequest",\n [\n ("run_key", PublicAttr[Optional[str]]),\n ("run_config", PublicAttr[Mapping[str, Any]]),\n ("tags", PublicAttr[Mapping[str, str]]),\n ("job_name", PublicAttr[Optional[str]]),\n ("asset_selection", PublicAttr[Optional[Sequence[AssetKey]]]),\n ("stale_assets_only", PublicAttr[bool]),\n ("partition_key", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Represents all the information required to launch a single run. Must be returned by a\n SensorDefinition or ScheduleDefinition's evaluation function for a run to be launched.\n\n Attributes:\n run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n a :py:class:`PartitionedConfig`, this value will override replace the config\n provided by it.\n tags (Optional[Dict[str, Any]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n job_name (Optional[str]): (Experimental) The name of the job this run request will launch.\n Required for sensors that target multiple jobs.\n asset_selection (Optional[Sequence[AssetKey]]): A sequence of AssetKeys that should be\n launched with this run.\n stale_assets_only (bool): Set to true to further narrow the asset\n selection to stale assets. If passed without an asset selection, all stale assets in the\n job will be materialized. 
If the job does not materialize assets, this flag is ignored.\n partition_key (Optional[str]): The partition key for this run request.\n """\n\n def __new__(\n cls,\n run_key: Optional[str] = None,\n run_config: Optional[Union["RunConfig", Mapping[str, Any]]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n job_name: Optional[str] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n stale_assets_only: bool = False,\n partition_key: Optional[str] = None,\n ):\n from dagster._core.definitions.run_config import convert_config_input\n\n return super(RunRequest, cls).__new__(\n cls,\n run_key=check.opt_str_param(run_key, "run_key"),\n run_config=check.opt_mapping_param(\n convert_config_input(run_config), "run_config", key_type=str\n ),\n tags=validate_tags(check.opt_mapping_param(tags, "tags", key_type=str)),\n job_name=check.opt_str_param(job_name, "job_name"),\n asset_selection=check.opt_nullable_sequence_param(\n asset_selection, "asset_selection", of_type=AssetKey\n ),\n stale_assets_only=check.bool_param(stale_assets_only, "stale_assets_only"),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n )\n\n def with_replaced_attrs(self, **kwargs: Any) -> "RunRequest":\n fields = self._asdict()\n for k in fields.keys():\n if k in kwargs:\n fields[k] = kwargs[k]\n return RunRequest(**fields)\n\n def with_resolved_tags_and_config(\n self,\n target_definition: Union["JobDefinition", "UnresolvedAssetJobDefinition"],\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "RunRequest":\n from dagster._core.definitions.job_definition import JobDefinition\n from dagster._core.definitions.partition import (\n PartitionedConfig,\n PartitionsDefinition,\n )\n\n if self.partition_key is None:\n check.failed(\n "Cannot resolve partition for run request without partition 
key",\n )\n\n partitions_def = target_definition.partitions_def\n if partitions_def is None:\n check.failed(\n "Cannot resolve partition for run request when target job"\n f" '{target_definition.name}' is unpartitioned.",\n )\n partitions_def = cast(PartitionsDefinition, partitions_def)\n\n partitioned_config = (\n target_definition.partitioned_config\n if isinstance(target_definition, JobDefinition)\n else PartitionedConfig.from_flexible_config(target_definition.config, partitions_def)\n )\n if partitioned_config is None:\n check.failed(\n "Cannot resolve partition for run request on unpartitioned job",\n )\n\n _check_valid_partition_key_after_dynamic_partitions_requests(\n self.partition_key,\n partitions_def,\n dynamic_partitions_requests,\n current_time,\n dynamic_partitions_store,\n )\n\n tags = {\n **(self.tags or {}),\n **partitioned_config.get_tags_for_partition_key(\n self.partition_key,\n job_name=target_definition.name,\n ),\n }\n\n return self.with_replaced_attrs(\n run_config=(\n self.run_config\n if self.run_config\n else partitioned_config.get_run_config_for_partition_key(self.partition_key)\n ),\n tags=tags,\n )\n\n def has_resolved_partition(self) -> bool:\n # Backcompat run requests yielded via `run_request_for_partition` already have resolved\n # partitioning\n return self.tags.get(PARTITION_NAME_TAG) is not None if self.partition_key else True
\n\n\ndef _check_valid_partition_key_after_dynamic_partitions_requests(\n partition_key: str,\n partitions_def: "PartitionsDefinition",\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n):\n from dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\n from dagster._core.definitions.partition import (\n DynamicPartitionsDefinition,\n )\n\n if isinstance(partitions_def, MultiPartitionsDefinition):\n multipartition_key = partitions_def.get_partition_key_from_str(partition_key)\n\n for dimension in partitions_def.partitions_defs:\n _check_valid_partition_key_after_dynamic_partitions_requests(\n multipartition_key.keys_by_dimension[dimension.name],\n dimension.partitions_def,\n dynamic_partitions_requests,\n current_time,\n dynamic_partitions_store,\n )\n\n elif isinstance(partitions_def, DynamicPartitionsDefinition) and partitions_def.name:\n if not dynamic_partitions_store:\n check.failed(\n "Cannot resolve partition for run request on dynamic partitions without"\n " dynamic_partitions_store"\n )\n\n add_partition_keys: Set[str] = set()\n delete_partition_keys: Set[str] = set()\n for req in dynamic_partitions_requests:\n if isinstance(req, AddDynamicPartitionsRequest):\n if req.partitions_def_name == partitions_def.name:\n add_partition_keys.update(set(req.partition_keys))\n elif isinstance(req, DeleteDynamicPartitionsRequest):\n if req.partitions_def_name == partitions_def.name:\n delete_partition_keys.update(set(req.partition_keys))\n\n partition_keys_after_requests_resolved = (\n set(\n dynamic_partitions_store.get_dynamic_partitions(\n partitions_def_name=partitions_def.name\n )\n )\n | add_partition_keys\n ) - delete_partition_keys\n\n if partition_key not in partition_keys_after_requests_resolved:\n check.failed(\n f"Dynamic partition key {partition_key} 
for partitions def"\n f" '{partitions_def.name}' is invalid. After dynamic partitions requests are"\n " applied, it does not exist in the set of valid partition keys."\n )\n\n else:\n partitions_def.validate_partition_key(\n partition_key,\n dynamic_partitions_store=dynamic_partitions_store,\n current_time=current_time,\n )\n\n\n@whitelist_for_serdes(\n storage_name="PipelineRunReaction",\n storage_field_names={\n "dagster_run": "pipeline_run",\n },\n)\nclass DagsterRunReaction(\n NamedTuple(\n "_DagsterRunReaction",\n [\n ("dagster_run", Optional[DagsterRun]),\n ("error", Optional[SerializableErrorInfo]),\n ("run_status", Optional[DagsterRunStatus]),\n ],\n )\n):\n """Represents a request that reacts to an existing dagster run. If success, it will report logs\n back to the run.\n\n Attributes:\n dagster_run (Optional[DagsterRun]): The dagster run that originates this reaction.\n error (Optional[SerializableErrorInfo]): user code execution error.\n run_status: (Optional[DagsterRunStatus]): The run status that triggered the reaction.\n """\n\n def __new__(\n cls,\n dagster_run: Optional[DagsterRun],\n error: Optional[SerializableErrorInfo] = None,\n run_status: Optional[DagsterRunStatus] = None,\n ):\n return super(DagsterRunReaction, cls).__new__(\n cls,\n dagster_run=check.opt_inst_param(dagster_run, "dagster_run", DagsterRun),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n run_status=check.opt_inst_param(run_status, "run_status", DagsterRunStatus),\n )\n\n\n
[docs]@experimental_param(\n param="asset_events", additional_warn_text="Runless asset events are experimental"\n)\nclass SensorResult(\n NamedTuple(\n "_SensorResult",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_reason", Optional[SkipReason]),\n ("cursor", Optional[str]),\n (\n "dynamic_partitions_requests",\n Optional[\n Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]\n ],\n ),\n (\n "asset_events",\n List[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]],\n ),\n ],\n )\n):\n """The result of a sensor evaluation.\n\n Attributes:\n run_requests (Optional[Sequence[RunRequest]]): A list\n of run requests to be executed.\n skip_reason (Optional[Union[str, SkipReason]]): A skip message indicating why sensor\n evaluation was skipped.\n cursor (Optional[str]): The cursor value for this sensor, which will be provided on the\n context for the next sensor evaluation.\n dynamic_partitions_requests (Optional[Sequence[Union[DeleteDynamicPartitionsRequest,\n AddDynamicPartitionsRequest]]]): A list of dynamic partition requests to request dynamic\n partition addition and deletion. Run requests will be evaluated using the state of the\n partitions with these changes applied.\n asset_events (Optional[Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]]): (Experimental) A\n list of materializations, observations, and asset check evaluations that the system\n will persist on your behalf at the end of sensor evaluation. 
These events will be not\n be associated with any particular run, but will be queryable and viewable in the asset catalog.\n\n\n """\n\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_reason: Optional[Union[str, SkipReason]] = None,\n cursor: Optional[str] = None,\n dynamic_partitions_requests: Optional[\n Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]\n ] = None,\n asset_events: Optional[\n Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]\n ] = None,\n ):\n if skip_reason and len(run_requests if run_requests else []) > 0:\n check.failed(\n "Expected a single skip reason or one or more run requests: received values for "\n "both run_requests and skip_reason"\n )\n\n skip_reason = check.opt_inst_param(skip_reason, "skip_reason", (SkipReason, str))\n if isinstance(skip_reason, str):\n skip_reason = SkipReason(skip_reason)\n\n return super(SensorResult, cls).__new__(\n cls,\n run_requests=check.opt_sequence_param(run_requests, "run_requests", RunRequest),\n skip_reason=skip_reason,\n cursor=check.opt_str_param(cursor, "cursor"),\n dynamic_partitions_requests=check.opt_sequence_param(\n dynamic_partitions_requests,\n "dynamic_partitions_requests",\n (AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest),\n ),\n asset_events=list(\n check.opt_sequence_param(\n asset_events,\n "asset_check_evaluations",\n (AssetObservation, AssetMaterialization, AssetCheckEvaluation),\n )\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/run_request", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_request"}, "run_status_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_status_sensor_definition

\nimport functools\nimport logging\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n    overload,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param, public\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvariantViolationError,\n    RunStatusSensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._core.events import PIPELINE_RUN_STATUS_TO_EVENT_TYPE, DagsterEvent, DagsterEventType\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus, RunsFilter\nfrom dagster._serdes import (\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._seven import JSONDecodeError\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .sensor_definition import (\n    DagsterRunReaction,\n    DefaultSensorStatus,\n    RawSensorEvaluationFunctionReturn,\n    RunRequest,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorResult,\n    SensorType,\n    SkipReason,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom 
.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.resource_definition import ResourceDefinition\n    from dagster._core.definitions.selector import (\n        CodeLocationSelector,\n        JobSelector,\n        RepositorySelector,\n    )\n\nRunStatusSensorEvaluationFunction: TypeAlias = Union[\n    Callable[..., RawSensorEvaluationFunctionReturn],\n    Callable[..., RawSensorEvaluationFunctionReturn],\n]\nRunFailureSensorEvaluationFn: TypeAlias = Union[\n    Callable[..., RawSensorEvaluationFunctionReturn],\n    Callable[..., RawSensorEvaluationFunctionReturn],\n]\n\n\n@whitelist_for_serdes(old_storage_names={"PipelineSensorCursor"})\nclass RunStatusSensorCursor(\n    NamedTuple(\n        "_RunStatusSensorCursor",\n        [("record_id", int), ("update_timestamp", str)],\n    )\n):\n    def __new__(cls, record_id, update_timestamp):\n        return super(RunStatusSensorCursor, cls).__new__(\n            cls,\n            record_id=check.int_param(record_id, "record_id"),\n            update_timestamp=check.str_param(update_timestamp, "update_timestamp"),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            obj = deserialize_value(json_str, RunStatusSensorCursor)\n            return isinstance(obj, RunStatusSensorCursor)\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    def to_json(self) -> str:\n        return serialize_value(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> "RunStatusSensorCursor":\n        return deserialize_value(json_str, RunStatusSensorCursor)\n\n\n
[docs]class RunStatusSensorContext:\n """The ``context`` object available to a decorated function of ``run_status_sensor``."""\n\n def __init__(\n self,\n sensor_name,\n dagster_run,\n dagster_event,\n instance,\n context: Optional[\n SensorEvaluationContext\n ] = None, # deprecated arg, but we need to keep it for backcompat\n resource_defs: Optional[Mapping[str, "ResourceDefinition"]] = None,\n logger: Optional[logging.Logger] = None,\n partition_key: Optional[str] = None,\n _resources: Optional[Resources] = None,\n _cm_scope_entered: bool = False,\n ) -> None:\n self._exit_stack = ExitStack()\n self._sensor_name = check.str_param(sensor_name, "sensor_name")\n self._dagster_run = check.inst_param(dagster_run, "dagster_run", DagsterRun)\n self._dagster_event = check.inst_param(dagster_event, "dagster_event", DagsterEvent)\n self._instance = check.inst_param(instance, "instance", DagsterInstance)\n self._logger: Optional[logging.Logger] = logger or (context.log if context else None)\n self._partition_key = check.opt_str_param(partition_key, "partition_key")\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resource_defs\n self._resources = _resources\n self._cm_scope_entered = _cm_scope_entered\n\n def for_run_failure(self) -> "RunFailureSensorContext":\n """Converts RunStatusSensorContext to RunFailureSensorContext."""\n return RunFailureSensorContext(\n sensor_name=self._sensor_name,\n dagster_run=self._dagster_run,\n dagster_event=self._dagster_event,\n instance=self._instance,\n logger=self._logger,\n partition_key=self._partition_key,\n resource_defs=self._resource_defs,\n _resources=self._resources,\n _cm_scope_entered=self._cm_scope_entered,\n )\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n @property\n def resources(self) -> Resources:\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from 
dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n """\n This is similar to what we do in e.g. the op context - we set up a resource\n building context manager, and immediately enter it. This is so that in cases\n where a user is not using any context-manager based resources, they don't\n need to enter this SensorEvaluationContext themselves.\n\n For example:\n\n my_sensor(build_sensor_context(resources={"my_resource": my_non_cm_resource})\n\n will work ok, but for a CM resource we must do\n\n with build_sensor_context(resources={"my_resource": my_cm_resource}) as context:\n my_sensor(context)\n """\n\n instance = self.instance if self._instance else None\n\n resources_cm = build_resources(resources=self._resource_defs or {}, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. You can use the following syntax"\n " to open a context manager: `with build_schedule_context(...) 
as context:`"\n )\n\n return self._resources\n\n @public\n @property\n def sensor_name(self) -> str:\n """The name of the sensor."""\n return self._sensor_name\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """The run of the job."""\n return self._dagster_run\n\n @public\n @property\n def dagster_event(self) -> DagsterEvent:\n """The event associated with the job run status."""\n return self._dagster_event\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """The current instance."""\n return self._instance\n\n @public\n @property\n def log(self) -> logging.Logger:\n """The logger for the current sensor evaluation."""\n if not self._logger:\n self._logger = InstigationLogger()\n\n return self._logger\n\n @public\n @property\n def partition_key(self) -> Optional[str]:\n """Optional[str]: The partition key of the relevant run."""\n return self._partition_key\n\n def __enter__(self) -> "RunStatusSensorContext":\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None
\n\n\n
[docs]class RunFailureSensorContext(RunStatusSensorContext):\n """The ``context`` object available to a decorated function of ``run_failure_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n dagster_run (DagsterRun): the failed run.\n """\n\n @public\n @property\n def failure_event(self) -> DagsterEvent:\n """The run failure event.\n\n If the run failed because of an error inside a step, get_step_failure_events will have more\n details on the step failure.\n """\n return self.dagster_event\n\n
[docs] @public\n def get_step_failure_events(self) -> Sequence[DagsterEvent]:\n """The step failure event for each step in the run that failed.\n\n Examples:\n .. code-block:: python\n\n error_strings_by_step_key = {\n # includes the stack trace\n event.step_key: event.event_specific_data.error.to_string()\n for event in context.get_step_failure_events()\n }\n """\n records = self.instance.get_records_for_run(\n run_id=self.dagster_run.run_id, of_type=DagsterEventType.STEP_FAILURE\n ).records\n return [cast(DagsterEvent, record.event_log_entry.dagster_event) for record in records]
\n\n\n
[docs]def build_run_status_sensor_context(\n sensor_name: str,\n dagster_event: DagsterEvent,\n dagster_instance: DagsterInstance,\n dagster_run: DagsterRun,\n context: Optional[SensorEvaluationContext] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n) -> RunStatusSensorContext:\n """Builds run status sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@run_status_sensor` or `@run_failure_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n dagster_event (DagsterEvent): A DagsterEvent with the same event type as the one that\n triggers the run_status_sensor\n dagster_instance (DagsterInstance): The dagster instance configured for the context.\n dagster_run (DagsterRun): DagsterRun object from running a job\n resources (Optional[Mapping[str, object]]): A dictionary of resources to be made available\n to the sensor.\n\n Examples:\n .. code-block:: python\n\n instance = DagsterInstance.ephemeral()\n result = my_job.execute_in_process(instance=instance)\n\n dagster_run = result.dagster_run\n dagster_event = result.get_job_success_event() # or get_job_failure_event()\n\n context = build_run_status_sensor_context(\n sensor_name="run_status_sensor_to_invoke",\n dagster_instance=instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n )\n run_status_sensor_to_invoke(context)\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return RunStatusSensorContext(\n sensor_name=sensor_name,\n instance=dagster_instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n resource_defs=wrap_resources_for_execution(resources),\n logger=context.log if context else None,\n partition_key=partition_key,\n )
\n\n\n@overload\ndef run_failure_sensor(\n name: RunFailureSensorEvaluationFn,\n) -> SensorDefinition: ...\n\n\n@overload\ndef run_failure_sensor(\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Callable[[RunFailureSensorEvaluationFn], SensorDefinition,]: ...\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef run_failure_sensor(\n name: Optional[Union[RunFailureSensorEvaluationFn, str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Union[SensorDefinition, Callable[[RunFailureSensorEvaluationFn], SensorDefinition,]]:\n """Creates a sensor that reacts to job failure events, where the decorated function will be\n run when a run fails.\n\n Takes a :py:class:`~dagster.RunFailureSensorContext`.\n\n Args:\n name (Optional[str]): The name of the job failure sensor. Defaults to the name of the\n decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n The jobs in the current repository that will be monitored by this failure sensor.\n Defaults to None, which means the alert will be sent when any job in the current\n repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. 
If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n (deprecated in favor of monitored_jobs) The jobs in the current repository that will be\n monitored by this failure sensor. Defaults to None, which means the alert will be sent\n when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJob]]): The job a RunRequest should\n execute if yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJob]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def inner(\n fn: RunFailureSensorEvaluationFn,\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n if name is None or callable(name):\n sensor_name = fn.__name__\n else:\n sensor_name = name\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_status_sensor(\n run_status=DagsterRunStatus.FAILURE,\n name=sensor_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n request_job=request_job,\n request_jobs=request_jobs,\n )\n @functools.wraps(fn)\n def _run_failure_sensor(*args, **kwargs) -> Any:\n args_modified = [\n arg.for_run_failure() if isinstance(arg, RunStatusSensorContext) else arg\n for arg in args\n ]\n kwargs_modified = {\n k: v.for_run_failure() if isinstance(v, RunStatusSensorContext) else v\n for k, v in kwargs.items()\n }\n return fn(*args_modified, **kwargs_modified)\n\n return _run_failure_sensor\n\n # This case is for when 
decorator is used bare, without arguments\n if callable(name):\n return inner(name)\n\n return inner
\n\n\n
[docs]class RunStatusSensorDefinition(SensorDefinition):\n """Define a sensor that reacts to a given status of job execution, where the decorated\n function will be evaluated when a run is at the given status.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n run_status (DagsterRunStatus): The status of a run which will be\n monitored by the sensor.\n run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, DagsterRunReaction]]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.RunStatusSensorContext`.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, JobSelector, RepositorySelector, CodeLocationSelector]]]):\n The jobs in the current repository that will be monitored by this sensor. Defaults to\n None, which means the alert will be sent when any job in the repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition]]): The job a RunRequest should\n execute if yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def __init__(\n self,\n name: str,\n run_status: DagsterRunStatus,\n run_status_sensor_fn: RunStatusSensorEvaluationFunction,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n from dagster._core.definitions.selector import (\n CodeLocationSelector,\n JobSelector,\n RepositorySelector,\n )\n from dagster._core.event_api import RunShardedEventsCursor\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n check.str_param(name, "name")\n check.inst_param(run_status, "run_status", DagsterRunStatus)\n check.callable_param(run_status_sensor_fn, "run_status_sensor_fn")\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.opt_list_param(\n monitored_jobs,\n "monitored_jobs",\n (\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n RepositorySelector,\n JobSelector,\n CodeLocationSelector,\n ),\n )\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n resource_arg_names: Set[str] = {arg.name for arg in 
get_resource_args(run_status_sensor_fn)}\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n # coerce CodeLocationSelectors to RepositorySelectors with repo name "__repository__"\n monitored_jobs = [\n job.to_repository_selector() if isinstance(job, CodeLocationSelector) else job\n for job in (monitored_jobs or [])\n ]\n\n self._run_status_sensor_fn = check.callable_param(\n run_status_sensor_fn, "run_status_sensor_fn"\n )\n event_type = PIPELINE_RUN_STATUS_TO_EVENT_TYPE[run_status]\n\n # split monitored_jobs into external repos, external jobs, and jobs in the current repo\n other_repos = (\n [x for x in monitored_jobs if isinstance(x, RepositorySelector)]\n if monitored_jobs\n else []\n )\n\n other_repo_jobs = (\n [x for x in monitored_jobs if isinstance(x, JobSelector)] if monitored_jobs else []\n )\n\n current_repo_jobs = (\n [x for x in monitored_jobs if not isinstance(x, (JobSelector, RepositorySelector))]\n if monitored_jobs\n else []\n )\n\n def _wrapped_fn(\n context: SensorEvaluationContext,\n ) -> Iterator[Union[RunRequest, SkipReason, DagsterRunReaction, SensorResult]]:\n # initiate the cursor to (most recent event id, current timestamp) when:\n # * it's the first time starting the sensor\n # * or, the cursor isn't in valid format (backcompt)\n if context.cursor is None or not RunStatusSensorCursor.is_valid(context.cursor):\n most_recent_event_records = list(\n context.instance.get_event_records(\n EventRecordsFilter(event_type=event_type), ascending=False, limit=1\n )\n )\n most_recent_event_id = (\n most_recent_event_records[0].storage_id\n if len(most_recent_event_records) == 1\n else -1\n )\n\n new_cursor = RunStatusSensorCursor(\n update_timestamp=pendulum.now("UTC").isoformat(),\n record_id=most_recent_event_id,\n )\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initiating {name}. 
Set cursor to {new_cursor}")\n return\n\n record_id, update_timestamp = RunStatusSensorCursor.from_json(context.cursor)\n\n # Fetch events after the cursor id\n # * we move the cursor forward to the latest visited event's id to avoid revisits\n # * when the daemon is down, bc we persist the cursor info, we can go back to where we\n # left and backfill alerts for the qualified events (up to 5 at a time) during the downtime\n # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage.\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n after_cursor=RunShardedEventsCursor(\n id=record_id,\n run_updated_after=cast(datetime, pendulum.parse(update_timestamp)),\n ),\n event_type=event_type,\n ),\n ascending=True,\n limit=5,\n )\n\n for event_record in event_records:\n event_log_entry = event_record.event_log_entry\n storage_id = event_record.storage_id\n\n # get run info\n run_records = context.instance.get_run_records(\n filters=RunsFilter(run_ids=[event_log_entry.run_id])\n )\n\n # skip if we couldn't find the right run\n if len(run_records) != 1:\n # bc we couldn't find the run, we use the event timestamp as the approximate\n # run update timestamp\n approximate_update_timestamp = utc_datetime_from_timestamp(\n event_log_entry.timestamp\n )\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=approximate_update_timestamp.isoformat(),\n ).to_json()\n )\n continue\n\n dagster_run = run_records[0].dagster_run\n update_timestamp = run_records[0].update_timestamp\n\n job_match = False\n\n # if monitor_all_repositories is provided, then we want to run the sensor for all jobs in all repositories\n if monitor_all_repositories:\n job_match = True\n\n # check if the run is in the current repository and (if provided) one of jobs specified in monitored_jobs\n if (\n not job_match\n and\n # the job has a repository (not manually executed)\n 
dagster_run.external_job_origin\n and\n # the job belongs to the current repository\n dagster_run.external_job_origin.external_repository_origin.repository_name\n == context.repository_name\n ):\n if monitored_jobs:\n if dagster_run.job_name in map(lambda x: x.name, current_repo_jobs):\n job_match = True\n else:\n job_match = True\n\n if not job_match:\n # check if the run is one of the jobs specified by JobSelector or RepositorySelector (ie in another repo)\n # make a JobSelector for the run in question\n external_repository_origin = check.not_none(\n dagster_run.external_job_origin\n ).external_repository_origin\n run_job_selector = JobSelector(\n location_name=external_repository_origin.code_location_origin.location_name,\n repository_name=external_repository_origin.repository_name,\n job_name=dagster_run.job_name,\n )\n if run_job_selector in other_repo_jobs:\n job_match = True\n\n # make a RepositorySelector for the run in question\n run_repo_selector = RepositorySelector(\n location_name=external_repository_origin.code_location_origin.location_name,\n repository_name=external_repository_origin.repository_name,\n )\n if run_repo_selector in other_repos:\n job_match = True\n\n if not job_match:\n # the run in question doesn't match any of the criteria for we advance the cursor and move on\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n continue\n\n serializable_error = None\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n try:\n with RunStatusSensorContext(\n sensor_name=name,\n dagster_run=dagster_run,\n dagster_event=event_log_entry.dagster_event,\n instance=context.instance,\n resource_defs=context.resource_defs,\n logger=context.log,\n partition_key=dagster_run.tags.get("dagster/partition"),\n ) as sensor_context, user_code_error_boundary(\n RunStatusSensorExecutionError,\n lambda: f'Error occurred 
during the execution sensor "{name}".',\n ):\n context_param_name = get_context_param_name(run_status_sensor_fn)\n context_param = (\n {context_param_name: sensor_context} if context_param_name else {}\n )\n\n sensor_return = run_status_sensor_fn(\n **context_param,\n **resource_args_populated,\n )\n\n if sensor_return is not None:\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=update_timestamp.isoformat(),\n ).to_json()\n )\n\n if isinstance(sensor_return, SensorResult):\n if sensor_return.cursor:\n raise DagsterInvariantViolationError(\n f"Error in run status sensor {name}: Sensor returned a"\n " SensorResult with a cursor value. The cursor is managed"\n " by the sensor and should not be modified by a user."\n )\n yield sensor_return\n elif isinstance(\n sensor_return,\n (RunRequest, SkipReason, DagsterRunReaction),\n ):\n yield sensor_return\n else:\n yield from sensor_return\n return\n except RunStatusSensorExecutionError as run_status_sensor_execution_error:\n # When the user code errors, we report error to the sensor tick not the original run.\n serializable_error = serializable_error_info_from_exc_info(\n run_status_sensor_execution_error.original_exc_info\n )\n\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n\n # Yield DagsterRunReaction to indicate the execution success/failure.\n # The sensor machinery would\n # * report back to the original run if success\n # * update cursor and job state\n yield DagsterRunReaction(\n dagster_run=dagster_run,\n run_status=run_status,\n error=serializable_error,\n )\n\n super(RunStatusSensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n job=request_job,\n jobs=request_jobs,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def 
__call__(self, *args, **kwargs) -> RawSensorEvaluationFunctionReturn:\n context_param_name = get_context_param_name(self._run_status_sensor_fn)\n context = get_sensor_context_from_args_or_kwargs(\n self._run_status_sensor_fn,\n args,\n kwargs,\n context_type=RunStatusSensorContext,\n )\n context_param = {context_param_name: context} if context_param_name and context else {}\n\n resources = validate_and_get_resource_dict(\n context.resources if context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n return self._run_status_sensor_fn(**context_param, **resources)\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.RUN_STATUS
\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef run_status_sensor(\n run_status: DagsterRunStatus,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Callable[[RunStatusSensorEvaluationFunction], RunStatusSensorDefinition,]:\n """Creates a sensor that reacts to a given status of job execution, where the decorated\n function will be run when a job is at the given status.\n\n Takes a :py:class:`~dagster.RunStatusSensorContext`.\n\n Args:\n run_status (DagsterRunStatus): The status of run execution which will be\n monitored by the sensor.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n Jobs in the current repository that will be monitored by this sensor. Defaults to None, which means the alert will\n be sent when any job in the repository matches the requested run_status. 
Jobs in external repositories can be monitored by using\n RepositorySelector or JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the Dagster instance.\n If set to True, an error will be raised if you also specify monitored_jobs or job_selection.\n Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n (deprecated in favor of monitored_jobs) Jobs in the current repository that will be\n monitored by this sensor. Defaults to None, which means the alert will be sent when\n any job in the repository matches the requested run_status.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job that should be\n executed if a RunRequest is yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def inner(\n fn: RunStatusSensorEvaluationFunction,\n ) -> RunStatusSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n if jobs and monitor_all_repositories:\n DagsterInvalidDefinitionError(\n "Cannot specify both monitor_all_repositories and"\n f" {'monitored_jobs' if monitored_jobs else 'job_selection'}."\n )\n\n return RunStatusSensorDefinition(\n name=sensor_name,\n run_status=run_status,\n run_status_sensor_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n request_job=request_job,\n request_jobs=request_jobs,\n 
)\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/run_status_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_status_sensor_definition"}, "schedule_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.schedule_definition

\nimport copy\nimport logging\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, deprecated_param, public\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import IHasInternalInit, ensure_gen\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.schedules import is_valid_cron_schedule\n\nfrom ..decorator_utils import has_at_least_one_parameter\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom ..instance import DagsterInstance\nfrom ..instance.ref import InstanceRef\nfrom ..storage.dagster_run import DagsterRun\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .target import DirectTarget, ExecutableDefinition, RepoRelativeTarget\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom .utils import check_valid_name, validate_tags\n\nif TYPE_CHECKING:\n    from dagster import ResourceDefinition\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\nT = TypeVar("T")\n\nRunConfig: TypeAlias = Mapping[str, Any]\nRunRequestIterator: TypeAlias = Iterator[Union[RunRequest, 
SkipReason]]\n\nScheduleEvaluationFunctionReturn: TypeAlias = Union[\n    RunRequest, SkipReason, RunConfig, RunRequestIterator, Sequence[RunRequest]\n]\nRawScheduleEvaluationFunction: TypeAlias = Callable[..., ScheduleEvaluationFunctionReturn]\n\nScheduleRunConfigFunction: TypeAlias = Union[\n    Callable[["ScheduleEvaluationContext"], RunConfig],\n    Callable[[], RunConfig],\n]\n\nScheduleTagsFunction: TypeAlias = Callable[["ScheduleEvaluationContext"], Mapping[str, str]]\nScheduleShouldExecuteFunction: TypeAlias = Callable[["ScheduleEvaluationContext"], bool]\nScheduleExecutionFunction: TypeAlias = Union[\n    Callable[["ScheduleEvaluationContext"], Any],\n    "DecoratedScheduleFunction",\n]\n\n\n@whitelist_for_serdes\nclass DefaultScheduleStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\ndef get_or_create_schedule_context(\n    fn: Callable, *args: Any, **kwargs: Any\n) -> "ScheduleEvaluationContext":\n    """Based on the passed resource function and the arguments passed to it, returns the\n    user-passed ScheduleEvaluationContext or creates one if it is not passed.\n\n    Raises an exception if the user passes more than one argument or if the user-provided\n    function requires a context parameter but none is passed.\n    """\n    from dagster._config.pythonic_config import is_coercible_to_resource\n    from dagster._core.definitions.sensor_definition import get_context_param_name\n\n    context_param_name = get_context_param_name(fn)\n\n    kwarg_keys_non_resource = set(kwargs.keys()) - {param.name for param in get_resource_args(fn)}\n    if len(args) + len(kwarg_keys_non_resource) > 1:\n        raise DagsterInvalidInvocationError(\n            "Schedule invocation received multiple non-resource arguments. 
Only a first "\n            "positional context parameter should be provided when invoking."\n        )\n\n    if any(is_coercible_to_resource(arg) for arg in args):\n        raise DagsterInvalidInvocationError(\n            "If directly invoking a schedule, you may not provide resources as"\n            " positional arguments, only as keyword arguments."\n        )\n\n    context: Optional[ScheduleEvaluationContext] = None\n\n    if len(args) > 0:\n        context = check.opt_inst(args[0], ScheduleEvaluationContext)\n    elif len(kwargs) > 0:\n        if context_param_name and context_param_name not in kwargs:\n            raise DagsterInvalidInvocationError(\n                f"Schedule invocation expected argument '{context_param_name}'."\n            )\n        context = check.opt_inst(\n            kwargs.get(context_param_name or "context"), ScheduleEvaluationContext\n        )\n    elif context_param_name:\n        # If the context parameter is present but no value was provided, we error\n        raise DagsterInvalidInvocationError(\n            "Schedule evaluation function expected context argument, but no context argument "\n            "was provided when invoking."\n        )\n\n    context = context or build_schedule_context()\n    resource_args_from_kwargs = {}\n\n    resource_args = {param.name for param in get_resource_args(fn)}\n    for resource_arg in resource_args:\n        if resource_arg in kwargs:\n            resource_args_from_kwargs[resource_arg] = kwargs[resource_arg]\n\n    if resource_args_from_kwargs:\n        return context.merge_resources(resource_args_from_kwargs)\n\n    return context\n\n\n
[docs]class ScheduleEvaluationContext:\n """The context object available as the first argument various functions defined on a :py:class:`dagster.ScheduleDefinition`.\n\n A `ScheduleEvaluationContext` object is passed as the first argument to ``run_config_fn``, ``tags_fn``,\n and ``should_execute``.\n\n Users should not instantiate this object directly. To construct a `ScheduleEvaluationContext` for testing purposes, use :py:func:`dagster.build_schedule_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import schedule, ScheduleEvaluationContext\n\n @schedule\n def the_schedule(context: ScheduleEvaluationContext):\n ...\n\n """\n\n __slots__ = [\n "_instance_ref",\n "_scheduled_execution_time",\n "_exit_stack",\n "_instance",\n "_log_key",\n "_logger",\n "_repository_name",\n "_resource_defs",\n "_schedule_name",\n "_resources_cm",\n "_resources",\n "_cm_scope_entered",\n "_repository_def",\n ]\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n scheduled_execution_time: Optional[datetime],\n repository_name: Optional[str] = None,\n schedule_name: Optional[str] = None,\n resources: Optional[Mapping[str, "ResourceDefinition"]] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n ):\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._exit_stack = ExitStack()\n self._instance = None\n\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._scheduled_execution_time = check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n )\n self._log_key = (\n [\n repository_name,\n schedule_name,\n scheduled_execution_time.strftime("%Y%m%d_%H%M%S"),\n ]\n if repository_name and schedule_name and scheduled_execution_time\n else None\n )\n self._logger = None\n self._repository_name = repository_name\n self._schedule_name = schedule_name\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resources\n 
self._resources = None\n self._cm_scope_entered = False\n self._repository_def = check.opt_inst_param(\n repository_def, "repository_def", RepositoryDefinition\n )\n\n def __enter__(self) -> "ScheduleEvaluationContext":\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n @public\n @property\n def resources(self) -> Resources:\n """Mapping of resource key to resource definition to be made available\n during schedule execution.\n """\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n # Early exit if no resources are defined. This skips unnecessary initialization\n # entirely. This allows users to run user code servers in cases where they\n # do not have access to the instance if they use a subset of features do\n # that do not require instance access. In this case, if they do not use\n # resources on schedules they do not require the instance, so we do not\n # instantiate it\n #\n # Tracking at https://github.com/dagster-io/dagster/issues/14345\n if not self._resource_defs:\n self._resources = ScopedResourcesBuilder.build_empty()\n return self._resources\n\n instance = self.instance if self._instance or self._instance_ref else None\n\n resources_cm = build_resources(resources=self._resource_defs, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. 
You can use the following syntax"\n " to open a context manager: `with build_sensor_context(...) as context:`"\n )\n\n return self._resources\n\n def merge_resources(self, resources_dict: Mapping[str, Any]) -> "ScheduleEvaluationContext":\n """Merge the specified resources into this context.\n This method is intended to be used by the Dagster framework, and should not be called by user code.\n\n Args:\n resources_dict (Mapping[str, Any]): The resources to replace in the context.\n """\n check.invariant(\n self._resources is None, "Cannot merge resources in context that has been initialized."\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return ScheduleEvaluationContext(\n instance_ref=self._instance_ref,\n scheduled_execution_time=self._scheduled_execution_time,\n repository_name=self._repository_name,\n schedule_name=self._schedule_name,\n resources={\n **(self._resource_defs or {}),\n **wrap_resources_for_execution(resources_dict),\n },\n repository_def=self._repository_def,\n )\n\n @public\n @property\n def instance(self) -> "DagsterInstance":\n """DagsterInstance: The current DagsterInstance."""\n # self._instance_ref should only ever be None when this ScheduleEvaluationContext was\n # constructed under test.\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was provided."\n )\n if not self._instance:\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def instance_ref(self) -> Optional[InstanceRef]:\n """The serialized instance configured to run the schedule."""\n return self._instance_ref\n\n @public\n @property\n def scheduled_execution_time(self) -> datetime:\n """The time in which the execution was scheduled to happen. 
May differ slightly\n from both the actual execution time and the time at which the run config is computed.\n """\n if self._scheduled_execution_time is None:\n check.failed(\n "Attempting to access scheduled_execution_time, but no scheduled_execution_time was"\n " set on this context"\n )\n\n return self._scheduled_execution_time\n\n @property\n def log(self) -> logging.Logger:\n if self._logger:\n return self._logger\n\n if not self._instance_ref:\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n repository_name=self._repository_name,\n name=self._schedule_name,\n )\n )\n\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n self.instance,\n repository_name=self._repository_name,\n name=self._schedule_name,\n )\n )\n return cast(InstigationLogger, self._logger)\n\n def has_captured_logs(self):\n return self._logger and self._logger.has_captured_logs()\n\n @property\n def log_key(self) -> Optional[List[str]]:\n return self._log_key\n\n @property\n def repository_def(self) -> "RepositoryDefinition":\n if not self._repository_def:\n raise DagsterInvariantViolationError(\n "Attempted to access repository_def, but no repository_def was provided."\n )\n return self._repository_def
\n\n\nclass DecoratedScheduleFunction(NamedTuple):\n """Wrapper around the decorated schedule function. Keeps track of both to better support the\n optimal return value for direct invocation of the evaluation function.\n """\n\n decorated_fn: RawScheduleEvaluationFunction\n wrapped_fn: Callable[[ScheduleEvaluationContext], RunRequestIterator]\n has_context_arg: bool\n\n\n
[docs]def build_schedule_context(\n instance: Optional[DagsterInstance] = None,\n scheduled_execution_time: Optional[datetime] = None,\n resources: Optional[Mapping[str, object]] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n instance_ref: Optional["InstanceRef"] = None,\n) -> ScheduleEvaluationContext:\n """Builds schedule execution context using the provided parameters.\n\n The instance provided to ``build_schedule_context`` must be persistent;\n DagsterInstance.ephemeral() will result in an error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the schedule.\n scheduled_execution_time (datetime): The time in which the execution was scheduled to\n happen. May differ slightly from both the actual execution time and the time at which\n the run config is computed.\n\n Examples:\n .. code-block:: python\n\n context = build_schedule_context(instance)\n\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n\n return ScheduleEvaluationContext(\n instance_ref=(\n instance_ref\n if instance_ref\n else instance.get_ref() if instance and instance.is_persistent else None\n ),\n scheduled_execution_time=check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n ),\n resources=wrap_resources_for_execution(resources),\n repository_def=repository_def,\n )
\n\n\n@whitelist_for_serdes\nclass ScheduleExecutionData(\n NamedTuple(\n "_ScheduleExecutionData",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_message", Optional[str]),\n ("captured_log_key", Optional[Sequence[str]]),\n ],\n )\n):\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_message: Optional[str] = None,\n captured_log_key: Optional[Sequence[str]] = None,\n ):\n check.opt_sequence_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_list_param(captured_log_key, "captured_log_key", str)\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return super(ScheduleExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n captured_log_key=captured_log_key,\n )\n\n\ndef validate_and_get_schedule_resource_dict(\n resources: Resources, schedule_name: str, required_resource_keys: Set[str]\n) -> Dict[str, Any]:\n """Validates that the context has all the required resources and returns a dictionary of\n resource key to resource object.\n """\n for k in required_resource_keys:\n if not hasattr(resources, k):\n raise DagsterInvalidDefinitionError(\n f"Resource with key '{k}' required by schedule '{schedule_name}' was not provided."\n )\n\n return {k: getattr(resources, k) for k in required_resource_keys}\n\n\n
[docs]@deprecated_param(\n param="environment_vars",\n breaking_version="2.0",\n additional_warn_text=(\n "It is no longer necessary. Schedules will have access to all environment variables set in"\n " the containing environment, and can safely be deleted."\n ),\n)\nclass ScheduleDefinition(IHasInternalInit):\n """Define a schedule that targets a job.\n\n Args:\n name (Optional[str]): The name of the schedule to create. Defaults to the job name plus\n "_schedule".\n cron_schedule (Union[str, Sequence[str]]): A valid cron string or sequence of cron strings\n specifying when the schedule will run, e.g., ``'45 23 * * 6'`` for a schedule that runs\n at 11:45 PM every Saturday. If a sequence is provided, then the schedule will run for\n the union of all execution times for the provided cron strings, e.g.,\n ``['45 23 * * 6', '30 9 * * 0]`` for a schedule that runs at 11:45 PM every Saturday and\n 9:30 AM every Sunday.\n execution_fn (Callable[ScheduleEvaluationContext]): The core evaluation function for the\n schedule, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.ScheduleEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n run_config (Optional[Mapping]): The config that parameterizes this execution,\n as a dict.\n run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Mapping]]]): A function that\n takes a ScheduleEvaluationContext object and returns the run configuration that\n parameterizes this execution, as a dict. You may set only one of ``run_config``,\n ``run_config_fn``, and ``execution_fn``.\n tags (Optional[Mapping[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]]): A\n function that generates tags to attach to the schedules runs. 
Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags``, ``tags_fn``, and ``execution_fn``.\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs\n at schedule execution time to determine whether a schedule should execute or skip. Takes\n a :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this\n schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n required_resource_keys (Optional[Set[str]]): The set of resource keys required by the schedule.\n """\n\n def with_updated_job(self, new_job: ExecutableDefinition) -> "ScheduleDefinition":\n """Returns a copy of this schedule with the job replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return ScheduleDefinition.dagster_internal_init(\n name=self.name,\n cron_schedule=self._cron_schedule,\n job_name=self.job_name,\n execution_timezone=self.execution_timezone,\n execution_fn=self._execution_fn,\n description=self.description,\n job=new_job,\n default_status=self.default_status,\n environment_vars=self._environment_vars,\n required_resource_keys=self._raw_required_resource_keys,\n run_config=None, # run_config, tags, should_execute encapsulated in execution_fn\n run_config_fn=None,\n tags=None,\n tags_fn=None,\n should_execute=None,\n )\n\n def __init__(\n self,\n name: Optional[str] = None,\n *,\n cron_schedule: Optional[Union[str, Sequence[str]]] = None,\n job_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n run_config_fn: Optional[ScheduleRunConfigFunction] = None,\n tags: Optional[Mapping[str, str]] = None,\n tags_fn: Optional[ScheduleTagsFunction] = None,\n should_execute: Optional[ScheduleShouldExecuteFunction] = None,\n environment_vars: Optional[Mapping[str, str]] = None,\n execution_timezone: Optional[str] = None,\n execution_fn: Optional[ScheduleExecutionFunction] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n self._cron_schedule = check.inst_param(cron_schedule, "cron_schedule", (str, Sequence))\n if not isinstance(self._cron_schedule, str):\n check.sequence_param(self._cron_schedule, "cron_schedule", of_type=str) # 
type: ignore\n\n if not is_valid_cron_schedule(self._cron_schedule): # type: ignore\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{self._cron_schedule}' for schedule '{name}''. "\n "Dagster recognizes standard cron expressions consisting of 5 fields."\n )\n\n if job is not None:\n self._target: Union[DirectTarget, RepoRelativeTarget] = DirectTarget(job)\n else:\n self._target = RepoRelativeTarget(\n job_name=check.str_param(job_name, "job_name"),\n op_selection=None,\n )\n\n if name:\n self._name = check_valid_name(name)\n elif job_name:\n self._name = job_name + "_schedule"\n elif job:\n self._name = job.name + "_schedule"\n\n self._description = check.opt_str_param(description, "description")\n\n self._environment_vars = check.opt_mapping_param(\n environment_vars, "environment_vars", key_type=str, value_type=str\n )\n\n self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")\n\n if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both execution_fn and individual run_config/tags arguments "\n "to ScheduleDefinition. Must provide only one of the two."\n )\n elif execution_fn:\n self._execution_fn: Optional[Union[Callable[..., Any], DecoratedScheduleFunction]] = (\n None\n )\n if isinstance(execution_fn, DecoratedScheduleFunction):\n self._execution_fn = execution_fn\n else:\n self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")\n self._run_config_fn = None\n else:\n if run_config_fn and run_config:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both run_config_fn and run_config as arguments"\n " to ScheduleDefinition. 
Must provide only one of the two."\n )\n\n def _default_run_config_fn(context: ScheduleEvaluationContext) -> RunConfig:\n return check.opt_dict_param(run_config, "run_config")\n\n self._run_config_fn = check.opt_callable_param(\n run_config_fn, "run_config_fn", default=_default_run_config_fn\n )\n\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n tags = validate_tags(tags, allow_reserved_tags=False)\n tags_fn = lambda _context: tags\n else:\n tags_fn = check.opt_callable_param(\n tags_fn, "tags_fn", default=lambda _context: cast(Mapping[str, str], {})\n )\n self._tags_fn = tags_fn\n self._tags = tags\n\n self._should_execute: ScheduleShouldExecuteFunction = check.opt_callable_param(\n should_execute, "should_execute", default=lambda _context: True\n )\n\n # Several type-ignores are present in this function to work around bugs in mypy\n # inference.\n def _execution_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n f"Error occurred during the execution of should_execute for schedule {name}"\n ),\n ):\n if not self._should_execute(context):\n yield SkipReason(f"should_execute function for {name} returned false.")\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n f"Error occurred during the execution of run_config_fn for schedule {name}"\n ),\n ):\n _run_config_fn = check.not_none(self._run_config_fn)\n evaluated_run_config = copy.deepcopy(\n _run_config_fn(context)\n if has_at_least_one_parameter(_run_config_fn)\n else _run_config_fn() # type: ignore # (strict type guard)\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {name}",\n ):\n evaluated_tags = validate_tags(tags_fn(context), allow_reserved_tags=False)\n\n 
yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n\n self._execution_fn = _execution_fn\n\n if self._execution_timezone:\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(self._execution_timezone) # type: ignore\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n f"Invalid execution timezone {self._execution_timezone} for {name}"\n ) from e\n\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultScheduleStatus\n )\n\n resource_arg_names: Set[str] = (\n {arg.name for arg in get_resource_args(self._execution_fn.decorated_fn)}\n if isinstance(self._execution_fn, DecoratedScheduleFunction)\n else set()\n )\n\n check.param_invariant(\n len(required_resource_keys or []) == 0 or len(resource_arg_names) == 0,\n "Cannot specify resource requirements in both @schedule decorator and as arguments to"\n " the decorated function",\n )\n\n self._raw_required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._required_resource_keys = self._raw_required_resource_keys or resource_arg_names\n\n @staticmethod\n def dagster_internal_init(\n *,\n name: Optional[str],\n cron_schedule: Optional[Union[str, Sequence[str]]],\n job_name: Optional[str],\n run_config: Optional[Any],\n run_config_fn: Optional[ScheduleRunConfigFunction],\n tags: Optional[Mapping[str, str]],\n tags_fn: Optional[ScheduleTagsFunction],\n should_execute: Optional[ScheduleShouldExecuteFunction],\n environment_vars: Optional[Mapping[str, str]],\n execution_timezone: Optional[str],\n execution_fn: Optional[ScheduleExecutionFunction],\n description: Optional[str],\n job: Optional[ExecutableDefinition],\n default_status: DefaultScheduleStatus,\n required_resource_keys: Optional[Set[str]],\n ) -> "ScheduleDefinition":\n return ScheduleDefinition(\n name=name,\n cron_schedule=cron_schedule,\n job_name=job_name,\n run_config=run_config,\n 
run_config_fn=run_config_fn,\n tags=tags,\n tags_fn=tags_fn,\n should_execute=should_execute,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n execution_fn=execution_fn,\n description=description,\n job=job,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> ScheduleEvaluationFunctionReturn:\n from dagster._core.definitions.sensor_definition import get_context_param_name\n\n from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n if not isinstance(self._execution_fn, DecoratedScheduleFunction):\n raise DagsterInvalidInvocationError(\n "Schedule invocation is only supported for schedules created via the schedule "\n "decorators."\n )\n\n context_param_name = get_context_param_name(self._execution_fn.decorated_fn)\n context = get_or_create_schedule_context(self._execution_fn.decorated_fn, *args, **kwargs)\n context_param = {context_param_name: context} if context_param_name else {}\n\n resources = validate_and_get_schedule_resource_dict(\n context.resources, self._name, self._required_resource_keys\n )\n result = self._execution_fn.decorated_fn(**context_param, **resources)\n\n if isinstance(result, dict):\n return copy.deepcopy(result)\n else:\n return result\n\n @public\n @property\n def name(self) -> str:\n """str: The name of the schedule."""\n return self._name\n\n @public\n @property\n def job_name(self) -> str:\n """str: The name of the job targeted by this schedule."""\n return self._target.job_name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A description for this schedule."""\n return self._description\n\n @public\n @property\n def cron_schedule(self) -> Union[str, Sequence[str]]:\n """Union[str, Sequence[str]]: The cron schedule representing when this schedule will be evaluated."""\n return self._cron_schedule # type: ignore\n\n @public\n @deprecated(\n breaking_version="2.0",\n 
additional_warn_text="Setting this property no longer has any effect.",\n )\n @property\n def environment_vars(self) -> Mapping[str, str]:\n """Mapping[str, str]: Environment variables to export to the cron schedule."""\n return self._environment_vars\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this schedule."""\n return self._required_resource_keys\n\n @public\n @property\n def execution_timezone(self) -> Optional[str]:\n """Optional[str]: The timezone in which this schedule will be evaluated."""\n return self._execution_timezone\n\n @public\n @property\n def job(self) -> Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]:\n """Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]: The job that is\n targeted by this schedule.\n """\n if isinstance(self._target, DirectTarget):\n return self._target.target\n raise DagsterInvalidDefinitionError("No job was provided to ScheduleDefinition.")\n\n def evaluate_tick(self, context: "ScheduleEvaluationContext") -> ScheduleExecutionData:\n """Evaluate schedule using the provided context.\n\n Args:\n context (ScheduleEvaluationContext): The context with which to evaluate this schedule.\n\n Returns:\n ScheduleExecutionData: Contains list of run requests, or skip message if present.\n\n """\n from dagster._core.definitions.partition import CachingDynamicPartitionsLoader\n\n check.inst_param(context, "context", ScheduleEvaluationContext)\n execution_fn: Callable[..., "ScheduleEvaluationFunctionReturn"]\n if isinstance(self._execution_fn, DecoratedScheduleFunction):\n execution_fn = self._execution_fn.wrapped_fn\n else:\n execution_fn = cast(\n Callable[..., "ScheduleEvaluationFunctionReturn"],\n self._execution_fn,\n )\n\n result = list(ensure_gen(execution_fn(context)))\n\n skip_message: Optional[str] = None\n\n run_requests: List[RunRequest] = []\n if not result or result == [None]:\n run_requests = 
[]\n skip_message = "Schedule function returned an empty result"\n elif len(result) == 1:\n item = check.inst(result[0], (SkipReason, RunRequest))\n if isinstance(item, RunRequest):\n run_requests = [item]\n skip_message = None\n elif isinstance(item, SkipReason):\n run_requests = []\n skip_message = item.skip_message\n else:\n # NOTE: mypy is not correctly reading this cast-- not sure why\n # (pyright reads it fine). Hence the type-ignores below.\n result = cast(List[RunRequest], check.is_list(result, of_type=RunRequest))\n check.invariant(\n not any(not request.run_key for request in result),\n "Schedules that return multiple RunRequests must specify a run_key in each"\n " RunRequest",\n )\n run_requests = result\n skip_message = None\n\n dynamic_partitions_store = (\n CachingDynamicPartitionsLoader(context.instance) if context.instance_ref else None\n )\n\n # clone all the run requests with resolved tags and config\n resolved_run_requests = []\n for run_request in run_requests:\n if run_request.partition_key and not run_request.has_resolved_partition():\n if context.repository_def is None:\n raise DagsterInvariantViolationError(\n "Must provide repository def to build_schedule_context when yielding"\n " partitioned run requests"\n )\n\n scheduled_target = context.repository_def.get_job(self._target.job_name)\n resolved_request = run_request.with_resolved_tags_and_config(\n target_definition=scheduled_target,\n dynamic_partitions_requests=[],\n current_time=context.scheduled_execution_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n else:\n resolved_request = run_request\n\n resolved_run_requests.append(\n resolved_request.with_replaced_attrs(\n tags=merge_dicts(resolved_request.tags, DagsterRun.tags_for_schedule(self))\n )\n )\n\n return ScheduleExecutionData(\n run_requests=resolved_run_requests,\n skip_message=skip_message,\n captured_log_key=context.log_key if context.has_captured_logs() else None,\n )\n\n def has_loadable_target(self):\n 
return isinstance(self._target, DirectTarget)\n\n @property\n def targets_unresolved_asset_job(self) -> bool:\n return self.has_loadable_target() and isinstance(\n self.load_target(), UnresolvedAssetJobDefinition\n )\n\n def load_target(\n self,\n ) -> Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]:\n if isinstance(self._target, DirectTarget):\n return self._target.load()\n\n check.failed("Target is not loadable")\n\n @public\n @property\n def default_status(self) -> DefaultScheduleStatus:\n """DefaultScheduleStatus: The default status for this schedule when it is first loaded in\n a code location.\n """\n return self._default_status
\n
", "current_page_name": "_modules/dagster/_core/definitions/schedule_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.schedule_definition"}, "selector": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.selector

\nfrom typing import AbstractSet, Iterable, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.repository_definition import SINGLETON_REPOSITORY_NAME\nfrom dagster._serdes import create_snapshot_id, whitelist_for_serdes\n\n\nclass JobSubsetSelector(\n    NamedTuple(\n        "_JobSubsetSelector",\n        [\n            ("location_name", str),\n            ("repository_name", str),\n            ("job_name", str),\n            ("op_selection", Optional[Sequence[str]]),\n            ("asset_selection", Optional[AbstractSet[AssetKey]]),\n            ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n        ],\n    )\n):\n    """The information needed to resolve a job within a host process."""\n\n    def __new__(\n        cls,\n        location_name: str,\n        repository_name: str,\n        job_name: str,\n        op_selection: Optional[Sequence[str]],\n        asset_selection: Optional[Iterable[AssetKey]] = None,\n        asset_check_selection: Optional[Iterable[AssetCheckKey]] = None,\n    ):\n        asset_selection = set(asset_selection) if asset_selection else None\n        asset_check_selection = (\n            set(asset_check_selection) if asset_check_selection is not None else None\n        )\n        return super(JobSubsetSelector, cls).__new__(\n            cls,\n            location_name=check.str_param(location_name, "location_name"),\n            repository_name=check.str_param(repository_name, "repository_name"),\n            job_name=check.str_param(job_name, "job_name"),\n            op_selection=check.opt_nullable_sequence_param(op_selection, "op_selection", str),\n            asset_selection=check.opt_nullable_set_param(\n                asset_selection, "asset_selection", AssetKey\n            ),\n            
asset_check_selection=check.opt_nullable_set_param(\n                asset_check_selection, "asset_check_selection", AssetCheckKey\n            ),\n        )\n\n    def to_graphql_input(self):\n        return {\n            "repositoryLocationName": self.location_name,\n            "repositoryName": self.repository_name,\n            "pipelineName": self.job_name,\n            "solidSelection": self.op_selection,\n        }\n\n    def with_op_selection(self, op_selection: Optional[Sequence[str]]) -> Self:\n        check.invariant(\n            self.op_selection is None,\n            f"Can not invoke with_op_selection when op_selection={self.op_selection} is"\n            " already set",\n        )\n        return JobSubsetSelector(\n            self.location_name, self.repository_name, self.job_name, op_selection\n        )\n\n\n
[docs]@whitelist_for_serdes\nclass JobSelector(\n NamedTuple(\n "_JobSelector", [("location_name", str), ("repository_name", str), ("job_name", str)]\n )\n):\n def __new__(\n cls,\n location_name: str,\n repository_name: Optional[str] = None,\n job_name: Optional[str] = None,\n ):\n return super(JobSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.opt_str_param(\n repository_name,\n "repository_name",\n default=SINGLETON_REPOSITORY_NAME,\n ),\n job_name=check.str_param(\n job_name,\n "job_name",\n "Must provide job_name argument even though it is marked as optional in the "\n "function signature. repository_name, a truly optional parameter, is before "\n "that argument and actually optional. Use of keyword arguments is "\n "recommended to avoid confusion.",\n ),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "jobName": self.job_name,\n }\n\n @property\n def selector_id(self):\n return create_snapshot_id(self)\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return JobSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n job_name=graphql_data["jobName"],\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RepositorySelector(\n NamedTuple("_RepositorySelector", [("location_name", str), ("repository_name", str)])\n):\n def __new__(cls, location_name: str, repository_name: str):\n return super(RepositorySelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n }\n\n @property\n def selector_id(self):\n return create_snapshot_id(self)\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return RepositorySelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n )
\n\n\nclass CodeLocationSelector(NamedTuple("_CodeLocationSelector", [("location_name", str)])):\n def __new__(cls, location_name: str):\n return super(CodeLocationSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n )\n\n def to_repository_selector(self) -> RepositorySelector:\n return RepositorySelector(\n location_name=self.location_name, repository_name=SINGLETON_REPOSITORY_NAME\n )\n\n\nclass ScheduleSelector(\n NamedTuple(\n "_ScheduleSelector",\n [("location_name", str), ("repository_name", str), ("schedule_name", str)],\n )\n):\n def __new__(cls, location_name: str, repository_name: str, schedule_name: str):\n return super(ScheduleSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n schedule_name=check.str_param(schedule_name, "schedule_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "scheduleName": self.schedule_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return ScheduleSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n schedule_name=graphql_data["scheduleName"],\n )\n\n\nclass ResourceSelector(NamedTuple):\n location_name: str\n repository_name: str\n resource_name: str\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "resourceName": self.resource_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return ResourceSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n resource_name=graphql_data["resourceName"],\n )\n\n\nclass SensorSelector(\n NamedTuple(\n "_SensorSelector", [("location_name", str), ("repository_name", str), ("sensor_name", 
str)]\n )\n):\n def __new__(cls, location_name: str, repository_name: str, sensor_name: str):\n return super(SensorSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "sensorName": self.sensor_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return SensorSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n sensor_name=graphql_data["sensorName"],\n )\n\n\n@whitelist_for_serdes\nclass InstigatorSelector(\n NamedTuple(\n "_InstigatorSelector", [("location_name", str), ("repository_name", str), ("name", str)]\n )\n):\n def __new__(cls, location_name: str, repository_name: str, name: str):\n return super(InstigatorSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n name=check.str_param(name, "name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "name": self.name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return InstigatorSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n name=graphql_data["name"],\n )\n\n\nclass GraphSelector(\n NamedTuple(\n "_GraphSelector", [("location_name", str), ("repository_name", str), ("graph_name", str)]\n )\n):\n """The information needed to resolve a graph within a host process."""\n\n def __new__(cls, location_name: str, repository_name: str, graph_name: str):\n return super(GraphSelector, cls).__new__(\n cls,\n 
location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n graph_name=check.str_param(graph_name, "graph_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "graphName": self.graph_name,\n }\n\n\n@whitelist_for_serdes\nclass PartitionSetSelector(\n NamedTuple(\n "_PartitionSetSelector",\n [("location_name", str), ("repository_name", str), ("partition_set_name", str)],\n )\n):\n """The information needed to resolve a partition set within a host process."""\n\n def __new__(cls, location_name: str, repository_name: str, partition_set_name: str):\n return super(PartitionSetSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n partition_set_name=check.str_param(partition_set_name, "partition_set_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "partitionSetName": self.partition_set_name,\n }\n\n\nclass PartitionRangeSelector(\n NamedTuple(\n "_PartitionRangeSelector",\n [("start", str), ("end", str)],\n )\n):\n """The information needed to resolve a partition range."""\n\n def __new__(cls, start: str, end: str):\n return super(PartitionRangeSelector, cls).__new__(\n cls,\n start=check.inst_param(start, "start", str),\n end=check.inst_param(end, "end", str),\n )\n\n def to_graphql_input(self):\n return {\n "start": self.start,\n "end": self.end,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return PartitionRangeSelector(\n start=graphql_data["start"],\n end=graphql_data["end"],\n )\n\n\nclass PartitionsSelector(\n NamedTuple(\n "_PartitionsSelector",\n [("partition_range", PartitionRangeSelector)],\n )\n):\n """The information needed to define selection partitions.\n Using 
partition_range as property name to avoid shadowing Python 'range' builtin .\n """\n\n def __new__(cls, partition_range: PartitionRangeSelector):\n return super(PartitionsSelector, cls).__new__(\n cls,\n partition_range=check.inst_param(partition_range, "range", PartitionRangeSelector),\n )\n\n def to_graphql_input(self):\n return {\n "range": self.partition_range.to_graphql_input(),\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return PartitionsSelector(\n partition_range=PartitionRangeSelector.from_graphql_input(graphql_data["range"])\n )\n\n\nclass PartitionsByAssetSelector(\n NamedTuple(\n "PartitionsByAssetSelector",\n [\n ("asset_key", AssetKey),\n ("partitions", Optional[PartitionsSelector]),\n ],\n )\n):\n """The information needed to define partitions selection for a given asset key."""\n\n def __new__(cls, asset_key: AssetKey, partitions: Optional[PartitionsSelector] = None):\n return super(PartitionsByAssetSelector, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n partitions=check.opt_inst_param(partitions, "partitions", PartitionsSelector),\n )\n\n def to_graphql_input(self):\n return {\n "assetKey": self.asset_key.to_graphql_input(),\n "partitions": self.partitions.to_graphql_input() if self.partitions else None,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n asset_key = graphql_data["assetKey"]\n partitions = graphql_data.get("partitions")\n return PartitionsByAssetSelector(\n asset_key=AssetKey.from_graphql_input(asset_key),\n partitions=PartitionsSelector.from_graphql_input(partitions) if partitions else None,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/selector", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.selector"}, "sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.sensor_definition

\nimport inspect\nimport logging\nfrom collections import defaultdict\nfrom contextlib import ExitStack\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_check_evaluation import AssetCheckEvaluation\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n)\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.partition import (\n    CachingDynamicPartitionsLoader,\n)\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.definitions.resource_definition import (\n    Resources,\n)\nfrom dagster._core.definitions.scoped_resources_builder import ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import IHasInternalInit, normalize_to_repository\n\nfrom ..decorator_utils import (\n    get_function_params,\n)\nfrom .asset_selection import AssetSelection\nfrom .graph_definition import GraphDefinition\nfrom .run_request import (\n    AddDynamicPartitionsRequest,\n    DagsterRunReaction,\n    DeleteDynamicPartitionsRequest,\n    RunRequest,\n    SensorResult,\n    SkipReason,\n)\nfrom .target import DirectTarget, ExecutableDefinition, 
RepoRelativeTarget\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster import ResourceDefinition\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n\n@whitelist_for_serdes\nclass DefaultSensorStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\n@whitelist_for_serdes\nclass SensorType(Enum):\n    STANDARD = "STANDARD"\n    RUN_STATUS = "RUN_STATUS"\n    ASSET = "ASSET"\n    MULTI_ASSET = "MULTI_ASSET"\n    FRESHNESS_POLICY = "FRESHNESS_POLICY"\n    UNKNOWN = "UNKNOWN"\n\n\nDEFAULT_SENSOR_DAEMON_INTERVAL = 30\n\n\n
[docs]class SensorEvaluationContext:\n """The context object available as the argument to the evaluation function of a :py:class:`dagster.SensorDefinition`.\n\n Users should not instantiate this object directly. To construct a\n `SensorEvaluationContext` for testing purposes, use :py:func:`dagster.\n build_sensor_context`.\n\n Attributes:\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest\n last_completion_time (float): DEPRECATED The last time that the sensor was evaluated (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n repository_def (Optional[RepositoryDefinition]): The repository or that\n the sensor belongs to. If needed by the sensor top-level resource definitions will be\n pulled from this repository. You can provide either this or `definitions`.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n resources (Optional[Dict[str, Any]]): A dict of resource keys to resource\n definitions to be made available during sensor execution.\n\n Example:\n .. 
code-block:: python\n\n from dagster import sensor, SensorEvaluationContext\n\n @sensor\n def the_sensor(context: SensorEvaluationContext):\n ...\n\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n repository_def: Optional["RepositoryDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n sensor_name: Optional[str] = None,\n resources: Optional[Mapping[str, "ResourceDefinition"]] = None,\n definitions: Optional["Definitions"] = None,\n ):\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._exit_stack = ExitStack()\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._last_completion_time = check.opt_float_param(\n last_completion_time, "last_completion_time"\n )\n self._last_run_key = check.opt_str_param(last_run_key, "last_run_key")\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._repository_name = check.opt_str_param(repository_name, "repository_name")\n self._repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n error_on_none=False,\n )\n self._instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n self._sensor_name = sensor_name\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resources\n self._resources = None\n self._cm_scope_entered = False\n\n self._log_key = (\n [\n repository_name,\n sensor_name,\n pendulum.now("UTC").strftime("%Y%m%d_%H%M%S"),\n ]\n if repository_name and sensor_name\n else None\n )\n self._logger: Optional[InstigationLogger] = None\n self._cursor_updated = False\n\n def __enter__(self) -> "SensorEvaluationContext":\n 
self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n def merge_resources(self, resources_dict: Mapping[str, Any]) -> "SensorEvaluationContext":\n """Merge the specified resources into this context.\n\n This method is intended to be used by the Dagster framework, and should not be called by user code.\n\n Args:\n resources_dict (Mapping[str, Any]): The resources to replace in the context.\n """\n check.invariant(\n self._resources is None, "Cannot merge resources in context that has been initialized."\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return SensorEvaluationContext(\n instance_ref=self._instance_ref,\n last_completion_time=self._last_completion_time,\n last_run_key=self._last_run_key,\n cursor=self._cursor,\n repository_name=self._repository_name,\n repository_def=self._repository_def,\n instance=self._instance,\n sensor_name=self._sensor_name,\n resources={\n **(self._resource_defs or {}),\n **wrap_resources_for_execution(resources_dict),\n },\n )\n\n @public\n @property\n def resources(self) -> Resources:\n """Resources: A mapping from resource key to instantiated resources for this sensor."""\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n """\n This is similar to what we do in e.g. the op context - we set up a resource\n building context manager, and immediately enter it. 
This is so that in cases\n where a user is not using any context-manager based resources, they don't\n need to enter this SensorEvaluationContext themselves.\n\n For example:\n\n my_sensor(build_sensor_context(resources={"my_resource": my_non_cm_resource})\n\n will work ok, but for a CM resource we must do\n\n with build_sensor_context(resources={"my_resource": my_cm_resource}) as context:\n my_sensor(context)\n """\n\n # Early exit if no resources are defined. This skips unnecessary initialization\n # entirely. This allows users to run user code servers in cases where they\n # do not have access to the instance if they use a subset of features do\n # that do not require instance access. In this case, if they do not use\n # resources on sensors they do not require the instance, so we do not\n # instantiate it\n #\n # Tracking at https://github.com/dagster-io/dagster/issues/14345\n if not self._resource_defs:\n self._resources = ScopedResourcesBuilder.build_empty()\n return self._resources\n\n instance = self.instance if self._instance or self._instance_ref else None\n\n resources_cm = build_resources(resources=self._resource_defs or {}, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. You can use the following syntax"\n " to open a context manager: `with build_schedule_context(...) 
as context:`"\n )\n\n return self._resources\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current DagsterInstance."""\n # self._instance_ref should only ever be None when this SensorEvaluationContext was\n # constructed under test.\n if not self._instance:\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was"\n " provided."\n )\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def instance_ref(self) -> Optional[InstanceRef]:\n return self._instance_ref\n\n @public\n @property\n def last_completion_time(self) -> Optional[float]:\n """Optional[float]: Timestamp representing the last time this sensor completed an evaluation."""\n return self._last_completion_time\n\n @public\n @property\n def last_run_key(self) -> Optional[str]:\n """Optional[str]: The run key supplied to the most recent RunRequest produced by this sensor."""\n return self._last_run_key\n\n @public\n @property\n def cursor(self) -> Optional[str]:\n """The cursor value for this sensor, which was set in an earlier sensor evaluation."""\n return self._cursor\n\n
[docs] @public\n def update_cursor(self, cursor: Optional[str]) -> None:\n """Updates the cursor value for this sensor, which will be provided on the context for the\n next sensor evaluation.\n\n This can be used to keep track of progress and avoid duplicate work across sensor\n evaluations.\n\n Args:\n cursor (Optional[str]):\n """\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._cursor_updated = True
\n\n @property\n def cursor_updated(self) -> bool:\n return self._cursor_updated\n\n @public\n @property\n def repository_name(self) -> Optional[str]:\n """Optional[str]: The name of the repository that this sensor resides in."""\n return self._repository_name\n\n @public\n @property\n def repository_def(self) -> Optional["RepositoryDefinition"]:\n """Optional[RepositoryDefinition]: The RepositoryDefinition that this sensor resides in."""\n return self._repository_def\n\n @property\n def log(self) -> logging.Logger:\n if self._logger:\n return self._logger\n\n if not self._instance_ref:\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n repository_name=self._repository_name,\n name=self._sensor_name,\n )\n )\n return cast(logging.Logger, self._logger)\n\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n self.instance,\n repository_name=self._repository_name,\n name=self._sensor_name,\n )\n )\n return cast(logging.Logger, self._logger)\n\n def has_captured_logs(self):\n return self._logger and self._logger.has_captured_logs()\n\n @property\n def log_key(self) -> Optional[List[str]]:\n return self._log_key
\n\n\nRawSensorEvaluationFunctionReturn = Union[\n Iterator[Union[SkipReason, RunRequest, DagsterRunReaction, SensorResult]],\n Sequence[RunRequest],\n SkipReason,\n RunRequest,\n DagsterRunReaction,\n SensorResult,\n]\nRawSensorEvaluationFunction: TypeAlias = Callable[..., RawSensorEvaluationFunctionReturn]\n\nSensorEvaluationFunction: TypeAlias = Callable[..., Sequence[Union[SkipReason, RunRequest]]]\n\n\ndef get_context_param_name(fn: Callable) -> Optional[str]:\n """Determines the sensor's context parameter name by excluding all resource parameters."""\n resource_params = {param.name for param in get_resource_args(fn)}\n\n return next(\n (param.name for param in get_function_params(fn) if param.name not in resource_params), None\n )\n\n\ndef validate_and_get_resource_dict(\n resources: Resources, sensor_name: str, required_resource_keys: Set[str]\n) -> Dict[str, Any]:\n """Validates that the context has all the required resources and returns a dictionary of\n resource key to resource object.\n """\n for k in required_resource_keys:\n if not hasattr(resources, k):\n raise DagsterInvalidDefinitionError(\n f"Resource with key '{k}' required by sensor '{sensor_name}' was not provided."\n )\n\n return {k: getattr(resources, k) for k in required_resource_keys}\n\n\ndef _check_dynamic_partitions_requests(\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n) -> None:\n req_keys_to_add_by_partitions_def_name = defaultdict(set)\n req_keys_to_delete_by_partitions_def_name = defaultdict(set)\n\n for req in dynamic_partitions_requests:\n duplicate_req_keys_to_delete = req_keys_to_delete_by_partitions_def_name.get(\n req.partitions_def_name, set()\n ).intersection(req.partition_keys)\n duplicate_req_keys_to_add = req_keys_to_add_by_partitions_def_name.get(\n req.partitions_def_name, set()\n ).intersection(req.partition_keys)\n if isinstance(req, AddDynamicPartitionsRequest):\n if duplicate_req_keys_to_delete:\n 
raise DagsterInvariantViolationError(\n "Dynamic partition requests cannot contain both add and delete requests for"\n " the same partition keys.Invalid request: partitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_delete}"\n )\n elif duplicate_req_keys_to_add:\n raise DagsterInvariantViolationError(\n "Cannot request to add duplicate dynamic partition keys: \\npartitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_add}"\n )\n req_keys_to_add_by_partitions_def_name[req.partitions_def_name].update(\n req.partition_keys\n )\n elif isinstance(req, DeleteDynamicPartitionsRequest):\n if duplicate_req_keys_to_delete:\n raise DagsterInvariantViolationError(\n "Cannot request to add duplicate dynamic partition keys: \\npartitions_def_name"\n f" '{req.partitions_def_name}', partition_keys:"\n f" {req_keys_to_add_by_partitions_def_name}"\n )\n elif duplicate_req_keys_to_add:\n raise DagsterInvariantViolationError(\n "Dynamic partition requests cannot contain both add and delete requests for"\n " the same partition keys.Invalid request: partitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_add}"\n )\n req_keys_to_delete_by_partitions_def_name[req.partitions_def_name].update(\n req.partition_keys\n )\n else:\n check.failed(f"Unexpected dynamic partition request type: {req}")\n\n\n
[docs]class SensorDefinition(IHasInternalInit):\n """Define a sensor that initiates a set of runs based on some external state.\n\n Args:\n evaluation_fn (Callable[[SensorEvaluationContext]]): The core evaluation function for the\n sensor, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n name (Optional[str]): The name of the sensor to create. Defaults to name of evaluation_fn\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[GraphDefinition, JobDefinition, UnresolvedAssetJob]): The job to execute when this sensor fires.\n jobs (Optional[Sequence[GraphDefinition, JobDefinition, UnresolvedAssetJob]]): (experimental) A list of jobs to execute when this sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n asset_selection (AssetSelection): (Experimental) an asset selection to launch a run for if\n the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n\n def with_updated_jobs(self, new_jobs: Sequence[ExecutableDefinition]) -> "SensorDefinition":\n """Returns a copy of this sensor with the jobs replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return SensorDefinition.dagster_internal_init(\n name=self.name,\n evaluation_fn=self._raw_fn,\n minimum_interval_seconds=self.minimum_interval_seconds,\n description=self.description,\n job_name=None, # if original init was passed job name, was resolved to a job\n jobs=new_jobs if len(new_jobs) > 1 else None,\n job=new_jobs[0] if len(new_jobs) == 1 else None,\n default_status=self.default_status,\n asset_selection=self.asset_selection,\n required_resource_keys=self._raw_required_resource_keys,\n )\n\n def with_updated_job(self, new_job: ExecutableDefinition) -> "SensorDefinition":\n """Returns a copy of this sensor with the job replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return self.with_updated_jobs([new_job])\n\n def __init__(\n self,\n name: Optional[str] = None,\n *,\n evaluation_fn: Optional[RawSensorEvaluationFunction] = None,\n job_name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n asset_selection: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n from dagster._config.pythonic_config import validate_resource_annotated_function\n\n if evaluation_fn is None:\n raise DagsterInvalidDefinitionError("Must provide evaluation_fn to SensorDefinition.")\n\n if (\n sum(\n [\n int(job is not None),\n int(jobs is not None),\n int(job_name is not None),\n int(asset_selection is not None),\n ]\n )\n > 1\n ):\n raise 
DagsterInvalidDefinitionError(\n "Attempted to provide more than one of 'job', 'jobs', 'job_name', and "\n "'asset_selection' params to SensorDefinition. Must provide only one."\n )\n\n jobs = jobs if jobs else [job] if job else None\n\n targets: Optional[List[Union[RepoRelativeTarget, DirectTarget]]] = None\n if job_name:\n targets = [\n RepoRelativeTarget(\n job_name=check.str_param(job_name, "job_name"),\n op_selection=None,\n )\n ]\n elif job:\n targets = [DirectTarget(job)]\n elif jobs:\n targets = [DirectTarget(job) for job in jobs]\n elif asset_selection:\n targets = []\n\n if name:\n self._name = check_valid_name(name)\n else:\n self._name = evaluation_fn.__name__\n\n self._raw_fn: RawSensorEvaluationFunction = check.callable_param(\n evaluation_fn, "evaluation_fn"\n )\n self._evaluation_fn: Union[\n SensorEvaluationFunction,\n Callable[\n [SensorEvaluationContext],\n List[Union[SkipReason, RunRequest, DagsterRunReaction]],\n ],\n ] = wrap_sensor_evaluation(self._name, evaluation_fn)\n self._min_interval = check.opt_int_param(\n minimum_interval_seconds, "minimum_interval_seconds", DEFAULT_SENSOR_DAEMON_INTERVAL\n )\n self._description = check.opt_str_param(description, "description")\n self._targets: Sequence[Union[RepoRelativeTarget, DirectTarget]] = check.opt_list_param(\n targets, "targets", (DirectTarget, RepoRelativeTarget)\n )\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultSensorStatus\n )\n self._asset_selection = check.opt_inst_param(\n asset_selection, "asset_selection", AssetSelection\n )\n validate_resource_annotated_function(self._raw_fn)\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(self._raw_fn)}\n\n check.param_invariant(\n len(required_resource_keys or []) == 0 or len(resource_arg_names) == 0,\n "Cannot specify resource requirements in both @sensor decorator and as arguments to"\n " the decorated function",\n )\n self._raw_required_resource_keys = check.opt_set_param(\n 
required_resource_keys, "required_resource_keys", of_type=str\n )\n self._required_resource_keys = self._raw_required_resource_keys or resource_arg_names\n\n @staticmethod\n def dagster_internal_init(\n *,\n name: Optional[str],\n evaluation_fn: Optional[RawSensorEvaluationFunction],\n job_name: Optional[str],\n minimum_interval_seconds: Optional[int],\n description: Optional[str],\n job: Optional[ExecutableDefinition],\n jobs: Optional[Sequence[ExecutableDefinition]],\n default_status: DefaultSensorStatus,\n asset_selection: Optional[AssetSelection],\n required_resource_keys: Optional[Set[str]],\n ) -> "SensorDefinition":\n return SensorDefinition(\n name=name,\n evaluation_fn=evaluation_fn,\n job_name=job_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=asset_selection,\n required_resource_keys=required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> RawSensorEvaluationFunctionReturn:\n context_param_name_if_present = get_context_param_name(self._raw_fn)\n context = get_or_create_sensor_context(self._raw_fn, *args, **kwargs)\n\n context_param = (\n {context_param_name_if_present: context} if context_param_name_if_present else {}\n )\n\n resources = validate_and_get_resource_dict(\n context.resources, self.name, self._required_resource_keys\n )\n return self._raw_fn(**context_param, **resources)\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this sensor."""\n return self._required_resource_keys\n\n @public\n @property\n def name(self) -> str:\n """str: The name of this sensor."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A description for this sensor."""\n return self._description\n\n @public\n @property\n def minimum_interval_seconds(self) -> Optional[int]:\n """Optional[int]: The 
minimum number of seconds between sequential evaluations of this sensor."""\n return self._min_interval\n\n @property\n def targets(self) -> Sequence[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets\n\n @public\n @property\n def job(self) -> Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]:\n """Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]: The job that is\n targeted by this schedule.\n """\n if self._targets:\n if len(self._targets) == 1 and isinstance(self._targets[0], DirectTarget):\n return self._targets[0].target\n elif len(self._targets) > 1:\n raise DagsterInvalidDefinitionError(\n "Job property not available when SensorDefinition has multiple jobs."\n )\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n @public\n @property\n def jobs(self) -> List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]]:\n """List[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]: A list of jobs\n that are targeted by this schedule.\n """\n if self._targets and all(isinstance(target, DirectTarget) for target in self._targets):\n return [target.target for target in self._targets] # type: ignore # (illegible conditional)\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.STANDARD\n\n def evaluate_tick(self, context: "SensorEvaluationContext") -> "SensorExecutionData":\n """Evaluate sensor using the provided context.\n\n Args:\n context (SensorEvaluationContext): The context with which to evaluate this sensor.\n\n Returns:\n SensorExecutionData: Contains list of run requests, or skip message if present.\n\n """\n context = check.inst_param(context, "context", SensorEvaluationContext)\n\n result = self._evaluation_fn(context)\n\n skip_message: Optional[str] = None\n run_requests: List[RunRequest] = []\n dagster_run_reactions: List[DagsterRunReaction] = 
[]\n dynamic_partitions_requests: Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ] = []\n updated_cursor = context.cursor\n asset_events = []\n\n if not result or result == [None]:\n skip_message = "Sensor function returned an empty result"\n elif len(result) == 1:\n item = result[0]\n check.inst(item, (SkipReason, RunRequest, DagsterRunReaction, SensorResult))\n\n if isinstance(item, SensorResult):\n run_requests = list(item.run_requests) if item.run_requests else []\n skip_message = (\n item.skip_reason.skip_message\n if item.skip_reason\n else (None if run_requests else "Sensor function returned an empty result")\n )\n\n _check_dynamic_partitions_requests(\n item.dynamic_partitions_requests or [],\n )\n dynamic_partitions_requests = item.dynamic_partitions_requests or []\n\n if item.cursor and context.cursor_updated:\n raise DagsterInvariantViolationError(\n "SensorResult.cursor cannot be set if context.update_cursor() was called."\n )\n updated_cursor = item.cursor\n asset_events = item.asset_events\n\n elif isinstance(item, RunRequest):\n run_requests = [item]\n elif isinstance(item, SkipReason):\n skip_message = item.skip_message if isinstance(item, SkipReason) else None\n elif isinstance(item, DagsterRunReaction):\n dagster_run_reactions = (\n [cast(DagsterRunReaction, item)] if isinstance(item, DagsterRunReaction) else []\n )\n else:\n check.failed(f"Unexpected type {type(item)} in sensor result")\n else:\n if any(isinstance(item, SensorResult) for item in result):\n check.failed(\n "When a SensorResult is returned from a sensor, it must be the only object"\n " returned."\n )\n\n check.is_list(result, (SkipReason, RunRequest, DagsterRunReaction))\n has_skip = any(map(lambda x: isinstance(x, SkipReason), result))\n run_requests = [item for item in result if isinstance(item, RunRequest)]\n dagster_run_reactions = [\n item for item in result if isinstance(item, DagsterRunReaction)\n ]\n\n if has_skip:\n if 
len(run_requests) > 0:\n check.failed(\n "Expected a single SkipReason or one or more RunRequests: received both "\n "RunRequest and SkipReason"\n )\n elif len(dagster_run_reactions) > 0:\n check.failed(\n "Expected a single SkipReason or one or more DagsterRunReaction: "\n "received both DagsterRunReaction and SkipReason"\n )\n else:\n check.failed("Expected a single SkipReason: received multiple SkipReasons")\n\n _check_dynamic_partitions_requests(dynamic_partitions_requests)\n resolved_run_requests = self.resolve_run_requests(\n run_requests, context, self._asset_selection, dynamic_partitions_requests\n )\n\n return SensorExecutionData(\n resolved_run_requests,\n skip_message,\n updated_cursor,\n dagster_run_reactions,\n captured_log_key=context.log_key if context.has_captured_logs() else None,\n dynamic_partitions_requests=dynamic_partitions_requests,\n asset_events=asset_events,\n )\n\n def has_loadable_targets(self) -> bool:\n for target in self._targets:\n if isinstance(target, DirectTarget):\n return True\n return False\n\n def load_targets(\n self,\n ) -> Sequence[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]]:\n """Returns job/graph definitions that have been directly passed into the sensor definition.\n Any jobs or graphs that are referenced by name will not be loaded.\n """\n targets = []\n for target in self._targets:\n if isinstance(target, DirectTarget):\n targets.append(target.load())\n return targets\n\n def resolve_run_requests(\n self,\n run_requests: Sequence[RunRequest],\n context: SensorEvaluationContext,\n asset_selection: Optional[AssetSelection],\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n ) -> Sequence[RunRequest]:\n def _get_repo_job_by_name(context: SensorEvaluationContext, job_name: str) -> JobDefinition:\n if context.repository_def is None:\n raise DagsterInvariantViolationError(\n "Must provide repository def to build_sensor_context when 
yielding partitioned"\n " run requests"\n )\n return context.repository_def.get_job(job_name)\n\n has_multiple_targets = len(self._targets) > 1\n target_names = [target.job_name for target in self._targets]\n\n if run_requests and len(self._targets) == 0 and not self._asset_selection:\n raise Exception(\n f"Error in sensor {self._name}: Sensor evaluation function returned a RunRequest "\n "for a sensor lacking a specified target (job_name, job, or jobs). Targets "\n "can be specified by providing job, jobs, or job_name to the @sensor "\n "decorator."\n )\n\n if asset_selection:\n run_requests = [\n *_run_requests_with_base_asset_jobs(run_requests, context, asset_selection)\n ]\n\n dynamic_partitions_store = (\n CachingDynamicPartitionsLoader(context.instance) if context.instance_ref else None\n )\n\n # Run requests may contain an invalid target, or a partition key that does not exist.\n # We will resolve these run requests, applying the target and partition config/tags.\n resolved_run_requests = []\n for run_request in run_requests:\n if run_request.job_name is None and has_multiple_targets:\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest that did not"\n " specify job_name for the requested run. Expected one of:"\n f" {target_names}"\n )\n elif (\n run_request.job_name\n and run_request.job_name not in target_names\n and not asset_selection\n ):\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest with job_name "\n f"{run_request.job_name}. 
Expected one of: {target_names}"\n )\n\n if run_request.partition_key and not run_request.has_resolved_partition():\n selected_job = _get_repo_job_by_name(\n context, run_request.job_name if run_request.job_name else target_names[0]\n )\n resolved_run_requests.append(\n run_request.with_resolved_tags_and_config(\n target_definition=selected_job,\n current_time=None,\n dynamic_partitions_store=dynamic_partitions_store,\n dynamic_partitions_requests=dynamic_partitions_requests,\n )\n )\n else:\n resolved_run_requests.append(run_request)\n\n return resolved_run_requests\n\n @property\n def _target(self) -> Optional[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets[0] if self._targets else None\n\n @public\n @property\n def job_name(self) -> Optional[str]:\n """Optional[str]: The name of the job that is targeted by this sensor."""\n if len(self._targets) > 1:\n raise DagsterInvalidInvocationError(\n f"Cannot use `job_name` property for sensor {self.name}, which targets multiple"\n " jobs."\n )\n return self._targets[0].job_name\n\n @public\n @property\n def default_status(self) -> DefaultSensorStatus:\n """DefaultSensorStatus: The default status for this sensor when it is first loaded in\n a code location.\n """\n return self._default_status\n\n @property\n def asset_selection(self) -> Optional[AssetSelection]:\n return self._asset_selection
\n\n\n@whitelist_for_serdes(\n storage_field_names={"dagster_run_reactions": "pipeline_run_reactions"},\n)\nclass SensorExecutionData(\n NamedTuple(\n "_SensorExecutionData",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_message", Optional[str]),\n ("cursor", Optional[str]),\n ("dagster_run_reactions", Optional[Sequence[DagsterRunReaction]]),\n ("captured_log_key", Optional[Sequence[str]]),\n (\n "dynamic_partitions_requests",\n Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ],\n ),\n (\n "asset_events",\n Sequence[Union[AssetMaterialization, AssetObservation, AssetCheckEvaluation]],\n ),\n ],\n )\n):\n dagster_run_reactions: Optional[Sequence[DagsterRunReaction]]\n\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_message: Optional[str] = None,\n cursor: Optional[str] = None,\n dagster_run_reactions: Optional[Sequence[DagsterRunReaction]] = None,\n captured_log_key: Optional[Sequence[str]] = None,\n dynamic_partitions_requests: Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ] = None,\n asset_events: Optional[\n Sequence[Union[AssetMaterialization, AssetObservation, AssetCheckEvaluation]]\n ] = None,\n ):\n check.opt_sequence_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_str_param(cursor, "cursor")\n check.opt_sequence_param(dagster_run_reactions, "dagster_run_reactions", DagsterRunReaction)\n check.opt_list_param(captured_log_key, "captured_log_key", str)\n check.opt_sequence_param(\n dynamic_partitions_requests,\n "dynamic_partitions_requests",\n (AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest),\n )\n check.opt_sequence_param(\n asset_events,\n "asset_events",\n (AssetMaterialization, AssetObservation, AssetCheckEvaluation),\n )\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return 
super(SensorExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n cursor=cursor,\n dagster_run_reactions=dagster_run_reactions,\n captured_log_key=captured_log_key,\n dynamic_partitions_requests=dynamic_partitions_requests,\n asset_events=asset_events or [],\n )\n\n\ndef wrap_sensor_evaluation(\n sensor_name: str,\n fn: RawSensorEvaluationFunction,\n) -> SensorEvaluationFunction:\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(fn)}\n\n def _wrapped_fn(context: SensorEvaluationContext):\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, sensor_name, resource_arg_names\n )\n\n context_param_name_if_present = get_context_param_name(fn)\n context_param = (\n {context_param_name_if_present: context} if context_param_name_if_present else {}\n )\n raw_evaluation_result = fn(**context_param, **resource_args_populated)\n\n def check_returned_scalar(scalar):\n if isinstance(scalar, (SkipReason, RunRequest, SensorResult)):\n return scalar\n elif scalar is not None:\n raise Exception(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{scalar} of type {type(scalar)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n\n if inspect.isgenerator(raw_evaluation_result):\n result = []\n try:\n while True:\n result.append(next(raw_evaluation_result))\n except StopIteration as e:\n # captures the case where the evaluation function has a yield and also returns a\n # value\n if e.value is not None:\n result.append(check_returned_scalar(e.value))\n\n return result\n elif isinstance(raw_evaluation_result, list):\n return raw_evaluation_result\n else:\n return [check_returned_scalar(raw_evaluation_result)]\n\n return _wrapped_fn\n\n\n
[docs]def build_sensor_context(\n instance: Optional[DagsterInstance] = None,\n cursor: Optional[str] = None,\n repository_name: Optional[str] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n sensor_name: Optional[str] = None,\n resources: Optional[Mapping[str, object]] = None,\n definitions: Optional["Definitions"] = None,\n instance_ref: Optional["InstanceRef"] = None,\n) -> SensorEvaluationContext:\n """Builds sensor execution context using the provided parameters.\n\n This function can be used to provide a context to the invocation of a sensor definition.If\n provided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\n error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.\n cursor (Optional[str]): A cursor value to provide to the evaluation of the sensor.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n repository_def (Optional[RepositoryDefinition]): The repository that the sensor belongs to.\n If needed by the sensor top-level resource definitions will be pulled from this repository.\n You can provide either this or `definitions`.\n resources (Optional[Mapping[str, ResourceDefinition]]): A set of resource definitions\n to provide to the sensor. If passed, these will override any resource definitions\n provided by the repository.\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n\n Examples:\n .. 
code-block:: python\n\n context = build_sensor_context()\n my_sensor(context)\n\n """\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n check.opt_str_param(cursor, "cursor")\n check.opt_str_param(repository_name, "repository_name")\n repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n error_on_none=False,\n )\n\n return SensorEvaluationContext(\n instance_ref=instance_ref,\n last_completion_time=None,\n last_run_key=None,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n repository_def=repository_def,\n sensor_name=sensor_name,\n resources=wrap_resources_for_execution(resources),\n )
\n\n\nT = TypeVar("T")\n\n\ndef get_sensor_context_from_args_or_kwargs(\n fn: Callable,\n args: Tuple[Any, ...],\n kwargs: Dict[str, Any],\n context_type: Type[T],\n) -> Optional[T]:\n from dagster._config.pythonic_config import is_coercible_to_resource\n\n context_param_name = get_context_param_name(fn)\n\n kwarg_keys_non_resource = set(kwargs.keys()) - {param.name for param in get_resource_args(fn)}\n if len(args) + len(kwarg_keys_non_resource) > 1:\n raise DagsterInvalidInvocationError(\n "Sensor invocation received multiple non-resource arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n if any(is_coercible_to_resource(arg) for arg in args):\n raise DagsterInvalidInvocationError(\n "If directly invoking a sensor, you may not provide resources as"\n " positional"\n " arguments, only as keyword arguments."\n )\n\n context: Optional[T] = None\n\n if len(args) > 0:\n context = check.opt_inst(args[0], context_type)\n elif len(kwargs) > 0:\n if context_param_name and context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Sensor invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst(kwargs.get(context_param_name or "context"), context_type)\n elif context_param_name:\n # If the context parameter is present but no value was provided, we error\n raise DagsterInvalidInvocationError(\n "Sensor evaluation function expected context argument, but no context argument "\n "was provided when invoking."\n )\n\n return context\n\n\ndef get_or_create_sensor_context(\n fn: Callable,\n *args: Any,\n **kwargs: Any,\n) -> SensorEvaluationContext:\n """Based on the passed resource function and the arguments passed to it, returns the\n user-passed SensorEvaluationContext or creates one if it is not passed.\n\n Raises an exception if the user passes more than one argument or if the user-provided\n function requires a context parameter but none is passed.\n """\n context = (\n 
get_sensor_context_from_args_or_kwargs(\n fn,\n args,\n kwargs,\n context_type=SensorEvaluationContext,\n )\n or build_sensor_context()\n )\n resource_args_from_kwargs = {}\n\n resource_args = {param.name for param in get_resource_args(fn)}\n for resource_arg in resource_args:\n if resource_arg in kwargs:\n resource_args_from_kwargs[resource_arg] = kwargs[resource_arg]\n\n if resource_args_from_kwargs:\n return context.merge_resources(resource_args_from_kwargs)\n\n return context\n\n\ndef _run_requests_with_base_asset_jobs(\n run_requests: Iterable[RunRequest],\n context: SensorEvaluationContext,\n outer_asset_selection: AssetSelection,\n) -> Sequence[RunRequest]:\n """For sensors that target asset selections instead of jobs, finds the corresponding base asset\n for a selected set of assets.\n """\n asset_graph = context.repository_def.asset_graph # type: ignore # (possible none)\n result = []\n for run_request in run_requests:\n if run_request.asset_selection:\n asset_keys = run_request.asset_selection\n\n unexpected_asset_keys = (\n AssetSelection.keys(*asset_keys) - outer_asset_selection\n ).resolve(asset_graph)\n if unexpected_asset_keys:\n raise DagsterInvalidSubsetError(\n "RunRequest includes asset keys that are not part of sensor's asset_selection:"\n f" {unexpected_asset_keys}"\n )\n else:\n asset_keys = outer_asset_selection.resolve(asset_graph)\n\n base_job = context.repository_def.get_implicit_job_def_for_assets(asset_keys) # type: ignore # (possible none)\n result.append(\n run_request.with_replaced_attrs(\n job_name=base_job.name, asset_selection=list(asset_keys) # type: ignore # (possible none)\n )\n )\n\n return result\n
", "current_page_name": "_modules/dagster/_core/definitions/sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.sensor_definition"}, "source_asset": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.source_asset

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param, public\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.data_version import (\n    DATA_VERSION_TAG,\n    DataVersion,\n    DataVersionsByPartition,\n)\nfrom dagster._core.definitions.events import AssetKey, AssetObservation, CoercibleToAssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataMapping,\n    normalize_metadata,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.resource_requirement import (\n    ResourceAddable,\n    ResourceRequirement,\n    SourceAssetIOManagerRequirement,\n    ensure_requirements_satisfied,\n    get_resource_key_conflicts,\n)\nfrom dagster._core.definitions.utils import (\n    DEFAULT_GROUP_NAME,\n    DEFAULT_IO_MANAGER_KEY,\n    validate_group_name,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidObservationError,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.decorators.op_decorator import (\n        DecoratedOpFunction,\n    )\nfrom dagster._core.storage.io_manager import IOManagerDefinition\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import disable_dagster_warnings\n\n# Going with this catch-all for the time-being to permit pythonic resources\nSourceAssetObserveFunction: TypeAlias = Callable[..., Any]\n\n\ndef 
wrap_source_asset_observe_fn_in_op_compute_fn(\n    source_asset: "SourceAsset",\n) -> "DecoratedOpFunction":\n    from dagster._core.definitions.decorators.op_decorator import (\n        DecoratedOpFunction,\n        is_context_provided,\n    )\n    from dagster._core.execution.context.compute import (\n        OpExecutionContext,\n    )\n\n    check.not_none(source_asset.observe_fn, "Must be an observable source asset")\n    assert source_asset.observe_fn  # for type checker\n\n    observe_fn = source_asset.observe_fn\n\n    observe_fn_has_context = is_context_provided(get_function_params(observe_fn))\n\n    def fn(context: OpExecutionContext) -> None:\n        resource_kwarg_keys = [param.name for param in get_resource_args(observe_fn)]\n        resource_kwargs = {key: getattr(context.resources, key) for key in resource_kwarg_keys}\n        observe_fn_return_value = (\n            observe_fn(context, **resource_kwargs)\n            if observe_fn_has_context\n            else observe_fn(**resource_kwargs)\n        )\n\n        if isinstance(observe_fn_return_value, DataVersion):\n            if source_asset.partitions_def is not None:\n                raise DagsterInvalidObservationError(\n                    f"{source_asset.key} is partitioned, so its observe function should return a"\n                    " DataVersionsByPartition, not a DataVersion"\n                )\n\n            context.log_event(\n                AssetObservation(\n                    asset_key=source_asset.key,\n                    tags={DATA_VERSION_TAG: observe_fn_return_value.value},\n                )\n            )\n        elif isinstance(observe_fn_return_value, DataVersionsByPartition):\n            if source_asset.partitions_def is None:\n                raise DagsterInvalidObservationError(\n                    f"{source_asset.key} is not partitioned, so its observe function should return"\n                    " a DataVersion, not a DataVersionsByPartition"\n                
)\n\n            for (\n                partition_key,\n                data_version,\n            ) in observe_fn_return_value.data_versions_by_partition.items():\n                context.log_event(\n                    AssetObservation(\n                        asset_key=source_asset.key,\n                        tags={DATA_VERSION_TAG: data_version.value},\n                        partition=partition_key,\n                    )\n                )\n        else:\n            raise DagsterInvalidObservationError(\n                f"Observe function for {source_asset.key} must return a DataVersion or"\n                " DataVersionsByPartition, but returned a value of type"\n                f" {type(observe_fn_return_value)}"\n            )\n\n    return DecoratedOpFunction(fn)\n\n\n
[docs]@experimental_param(param="resource_defs")\n@experimental_param(param="io_manager_def")\nclass SourceAsset(ResourceAddable):\n """A SourceAsset represents an asset that will be loaded by (but not updated by) Dagster.\n\n Attributes:\n key (Union[AssetKey, Sequence[str], str]): The key of the asset.\n metadata (Mapping[str, MetadataValue]): Metadata associated with the asset.\n io_manager_key (Optional[str]): The key for the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n io_manager_def (Optional[IOManagerDefinition]): (Experimental) The definition of the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): (Experimental) resource definitions that may be required by the :py:class:`dagster.IOManagerDefinition` provided in the `io_manager_def` argument.\n description (Optional[str]): The description of the asset.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n observe_fn (Optional[SourceAssetObserveFunction]) Observation function for the source asset.\n """\n\n key: PublicAttr[AssetKey]\n metadata: PublicAttr[MetadataMapping]\n raw_metadata: PublicAttr[ArbitraryMetadataMapping]\n io_manager_key: PublicAttr[Optional[str]]\n _io_manager_def: PublicAttr[Optional[IOManagerDefinition]]\n description: PublicAttr[Optional[str]]\n partitions_def: PublicAttr[Optional[PartitionsDefinition]]\n group_name: PublicAttr[str]\n resource_defs: PublicAttr[Dict[str, ResourceDefinition]]\n observe_fn: PublicAttr[Optional[SourceAssetObserveFunction]]\n _node_def: Optional[OpDefinition] # computed lazily\n auto_observe_interval_minutes: Optional[float]\n\n def __init__(\n self,\n key: CoercibleToAssetKey,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n io_manager_key: Optional[str] = None,\n io_manager_def: 
Optional[object] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n group_name: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n observe_fn: Optional[SourceAssetObserveFunction] = None,\n *,\n auto_observe_interval_minutes: Optional[float] = None,\n # This is currently private because it is necessary for source asset observation functions,\n # but we have not yet decided on a final API for associated one or more ops with a source\n # asset. If we were to make this public, then we would have a canonical public\n # `required_resource_keys` used for observation that might end up conflicting with a set of\n # required resource keys for a different operation.\n _required_resource_keys: Optional[AbstractSet[str]] = None,\n # Add additional fields to with_resources and with_group below\n ):\n from dagster._core.execution.build_resources import (\n wrap_resources_for_execution,\n )\n\n self.key = AssetKey.from_coercible(key)\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n self.raw_metadata = metadata\n self.metadata = normalize_metadata(metadata, allow_invalid=True)\n\n resource_defs_dict = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n if io_manager_def:\n if not io_manager_key:\n io_manager_key = self.key.to_python_identifier("io_manager")\n\n if (\n io_manager_key in resource_defs_dict\n and resource_defs_dict[io_manager_key] != io_manager_def\n ):\n raise DagsterInvalidDefinitionError(\n f"Provided conflicting definitions for io manager key '{io_manager_key}'."\n " Please provide only one definition per key."\n )\n\n resource_defs_dict[io_manager_key] = io_manager_def\n\n self.resource_defs = wrap_resources_for_execution(resource_defs_dict)\n\n self.io_manager_key = check.opt_str_param(io_manager_key, "io_manager_key")\n self.partitions_def = check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n )\n self.group_name = 
validate_group_name(group_name)\n self.description = check.opt_str_param(description, "description")\n self.observe_fn = check.opt_callable_param(observe_fn, "observe_fn")\n self._required_resource_keys = check.opt_set_param(\n _required_resource_keys, "_required_resource_keys", of_type=str\n )\n self._node_def = None\n self.auto_observe_interval_minutes = check.opt_numeric_param(\n auto_observe_interval_minutes, "auto_observe_interval_minutes"\n )\n\n def get_io_manager_key(self) -> str:\n return self.io_manager_key or DEFAULT_IO_MANAGER_KEY\n\n @property\n def io_manager_def(self) -> Optional[IOManagerDefinition]:\n io_manager_key = self.get_io_manager_key()\n return cast(\n Optional[IOManagerDefinition],\n self.resource_defs.get(io_manager_key) if io_manager_key else None,\n )\n\n @public\n @property\n def op(self) -> OpDefinition:\n """OpDefinition: The OpDefinition associated with the observation function of an observable\n source asset.\n\n Throws an error if the asset is not observable.\n """\n check.invariant(\n isinstance(self.node_def, OpDefinition),\n "The NodeDefinition for this AssetsDefinition is not of type OpDefinition.",\n )\n return cast(OpDefinition, self.node_def)\n\n @public\n @property\n def is_observable(self) -> bool:\n """bool: Whether the asset is observable."""\n return self.node_def is not None\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return {requirement.key for requirement in self.get_resource_requirements()}\n\n @property\n def node_def(self) -> Optional[OpDefinition]:\n """Op that generates observation metadata for a source asset."""\n if self.observe_fn is None:\n return None\n\n if self._node_def is None:\n self._node_def = OpDefinition(\n compute_fn=wrap_source_asset_observe_fn_in_op_compute_fn(self),\n name=self.key.to_python_identifier(),\n description=self.description,\n required_resource_keys=self._required_resource_keys,\n )\n return self._node_def\n\n def with_resources(self, resource_defs) -> 
"SourceAsset":\n from dagster._core.execution.resources_init import get_transitive_required_resource_keys\n\n overlapping_keys = get_resource_key_conflicts(self.resource_defs, resource_defs)\n if overlapping_keys:\n raise DagsterInvalidInvocationError(\n f"SourceAsset with key {self.key} has conflicting resource "\n "definitions with provided resources for the following keys: "\n f"{sorted(list(overlapping_keys))}. Either remove the existing "\n "resources from the asset or change the resource keys so that "\n "they don't overlap."\n )\n\n merged_resource_defs = merge_dicts(resource_defs, self.resource_defs)\n\n # Ensure top-level resource requirements are met - except for\n # io_manager, since that is a default it can be resolved later.\n ensure_requirements_satisfied(merged_resource_defs, list(self.get_resource_requirements()))\n\n io_manager_def = merged_resource_defs.get(self.get_io_manager_key())\n if not io_manager_def and self.get_io_manager_key() != DEFAULT_IO_MANAGER_KEY:\n raise DagsterInvalidDefinitionError(\n f"SourceAsset with asset key {self.key} requires IO manager with key"\n f" '{self.get_io_manager_key()}', but none was provided."\n )\n relevant_keys = get_transitive_required_resource_keys(\n {*self._required_resource_keys, self.get_io_manager_key()}, merged_resource_defs\n )\n\n relevant_resource_defs = {\n key: resource_def\n for key, resource_def in merged_resource_defs.items()\n if key in relevant_keys\n }\n\n io_manager_key = (\n self.get_io_manager_key()\n if self.get_io_manager_key() != DEFAULT_IO_MANAGER_KEY\n else None\n )\n with disable_dagster_warnings():\n return SourceAsset(\n key=self.key,\n io_manager_key=io_manager_key,\n description=self.description,\n partitions_def=self.partitions_def,\n metadata=self.raw_metadata,\n resource_defs=relevant_resource_defs,\n group_name=self.group_name,\n observe_fn=self.observe_fn,\n auto_observe_interval_minutes=self.auto_observe_interval_minutes,\n 
_required_resource_keys=self._required_resource_keys,\n )\n\n def with_attributes(\n self, group_name: Optional[str] = None, key: Optional[AssetKey] = None\n ) -> "SourceAsset":\n if group_name is not None and self.group_name != DEFAULT_GROUP_NAME:\n raise DagsterInvalidDefinitionError(\n "A group name has already been provided to source asset"\n f" {self.key.to_user_string()}"\n )\n\n with disable_dagster_warnings():\n return SourceAsset(\n key=key or self.key,\n metadata=self.raw_metadata,\n io_manager_key=self.io_manager_key,\n io_manager_def=self.io_manager_def,\n description=self.description,\n partitions_def=self.partitions_def,\n group_name=group_name,\n resource_defs=self.resource_defs,\n observe_fn=self.observe_fn,\n auto_observe_interval_minutes=self.auto_observe_interval_minutes,\n _required_resource_keys=self._required_resource_keys,\n )\n\n def get_resource_requirements(self) -> Iterator[ResourceRequirement]:\n if self.node_def is not None:\n yield from self.node_def.get_resource_requirements()\n yield SourceAssetIOManagerRequirement(\n key=self.get_io_manager_key(), asset_key=self.key.to_string()\n )\n for source_key, resource_def in self.resource_defs.items():\n yield from resource_def.get_resource_requirements(outer_context=source_key)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, SourceAsset):\n return False\n else:\n return (\n self.key == other.key\n and self.raw_metadata == other.raw_metadata\n and self.io_manager_key == other.io_manager_key\n and self.description == other.description\n and self.group_name == other.group_name\n and self.resource_defs == other.resource_defs\n and self.observe_fn == other.observe_fn\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/source_asset", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.source_asset"}, "step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.step_launcher

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Iterator, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.execution.plan.state import KnownExecutionState\n\n\n
[docs]class StepRunRef(\n NamedTuple(\n "_StepRunRef",\n [\n ("run_config", Mapping[str, object]),\n ("dagster_run", DagsterRun),\n ("run_id", str),\n ("retry_mode", RetryMode),\n ("step_key", str),\n ("recon_job", ReconstructableJob),\n ("known_state", Optional["KnownExecutionState"]),\n ],\n )\n):\n """A serializable object that specifies what's needed to hydrate a step so\n that it can be executed in a process outside the plan process.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n run_config: Mapping[str, object],\n dagster_run: DagsterRun,\n run_id: str,\n retry_mode: RetryMode,\n step_key: str,\n recon_job: ReconstructableJob,\n known_state: Optional["KnownExecutionState"],\n ):\n from dagster._core.execution.plan.state import KnownExecutionState\n\n return super(StepRunRef, cls).__new__(\n cls,\n check.mapping_param(run_config, "run_config", key_type=str),\n check.inst_param(dagster_run, "dagster_run", DagsterRun),\n check.str_param(run_id, "run_id"),\n check.inst_param(retry_mode, "retry_mode", RetryMode),\n check.str_param(step_key, "step_key"),\n check.inst_param(recon_job, "recon_job", ReconstructableJob),\n check.opt_inst_param(known_state, "known_state", KnownExecutionState),\n )
\n\n\n
[docs]class StepLauncher(ABC):\n """A StepLauncher is responsible for executing steps, either in-process or in an external process."""\n\n @abstractmethod\n def launch_step(self, step_context: "StepExecutionContext") -> Iterator["DagsterEvent"]:\n """Args:\n step_context (StepExecutionContext): The context that we're executing the step in.\n\n Returns:\n Iterator[DagsterEvent]: The events for the step.\n """
\n
", "current_page_name": "_modules/dagster/_core/definitions/step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.step_launcher"}, "time_window_partition_mapping": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.time_window_partition_mapping

\nfrom datetime import datetime\nfrom typing import NamedTuple, Optional, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param\nfrom dagster._core.definitions.partition import PartitionsDefinition, PartitionsSubset\nfrom dagster._core.definitions.partition_mapping import PartitionMapping, UpstreamPartitionsResult\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    TimeWindowPartitionsSubset,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._serdes import whitelist_for_serdes\n\n\n
[docs]@whitelist_for_serdes\n@experimental_param(param="allow_nonexistent_upstream_partitions")\nclass TimeWindowPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_TimeWindowPartitionMapping",\n [\n ("start_offset", PublicAttr[int]),\n ("end_offset", PublicAttr[int]),\n ("allow_nonexistent_upstream_partitions", PublicAttr[bool]),\n ],\n ),\n):\n """The default mapping between two TimeWindowPartitionsDefinitions.\n\n A partition in the downstream partitions definition is mapped to all partitions in the upstream\n asset whose time windows overlap it.\n\n This means that, if the upstream and downstream partitions definitions share the same time\n period, then this mapping is essentially the identity partition mapping - plus conversion of\n datetime formats.\n\n If the upstream time period is coarser than the downstream time period, then each partition in\n the downstream asset will map to a single (larger) upstream partition. E.g. if the downstream is\n hourly and the upstream is daily, then each hourly partition in the downstream will map to the\n daily partition in the upstream that contains that hour.\n\n If the upstream time period is finer than the downstream time period, then each partition in the\n downstream asset will map to multiple upstream partitions. E.g. if the downstream is daily and\n the upstream is hourly, then each daily partition in the downstream asset will map to the 24\n hourly partitions in the upstream that occur on that day.\n\n Attributes:\n start_offset (int): If not 0, then the starts of the upstream windows are shifted by this\n offset relative to the starts of the downstream windows. For example, if start_offset=-1\n and end_offset=0, then the downstream partition "2022-07-04" would map to the upstream\n partitions "2022-07-03" and "2022-07-04". Only permitted to be non-zero when the\n upstream and downstream PartitionsDefinitions are the same. 
Defaults to 0.\n end_offset (int): If not 0, then the ends of the upstream windows are shifted by this\n offset relative to the ends of the downstream windows. For example, if start_offset=0\n and end_offset=1, then the downstream partition "2022-07-04" would map to the upstream\n partitions "2022-07-04" and "2022-07-05". Only permitted to be non-zero when the\n upstream and downstream PartitionsDefinitions are the same. Defaults to 0.\n allow_nonexistent_upstream_partitions (bool): Defaults to false. If true, does not\n raise an error when mapped upstream partitions fall outside the start-end time window of the\n partitions def. For example, if the upstream partitions def starts on "2023-01-01" but\n the downstream starts on "2022-01-01", setting this bool to true would return no\n partition keys when get_upstream_partitions_for_partitions is called with "2022-06-01".\n When set to false, would raise an error.\n\n Examples:\n .. code-block:: python\n\n from dagster import DailyPartitionsDefinition, TimeWindowPartitionMapping, AssetIn, asset\n\n partitions_def = DailyPartitionsDefinition(start_date="2020-01-01")\n\n @asset(partitions_def=partitions_def)\n def asset1():\n ...\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "asset1": AssetIn(\n partition_mapping=TimeWindowPartitionMapping(start_offset=-1)\n )\n }\n )\n def asset2(asset1):\n ...\n """\n\n def __new__(\n cls,\n start_offset: int = 0,\n end_offset: int = 0,\n allow_nonexistent_upstream_partitions: bool = False,\n ):\n return super(TimeWindowPartitionMapping, cls).__new__(\n cls,\n start_offset=check.int_param(start_offset, "start_offset"),\n end_offset=check.int_param(end_offset, "end_offset"),\n allow_nonexistent_upstream_partitions=check.bool_param(\n allow_nonexistent_upstream_partitions,\n "allow_nonexistent_upstream_partitions",\n ),\n )\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: 
PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if not isinstance(downstream_partitions_subset, TimeWindowPartitionsSubset):\n check.failed("downstream_partitions_subset must be a TimeWindowPartitionsSubset")\n\n return self._map_partitions(\n downstream_partitions_subset.partitions_def,\n upstream_partitions_def,\n downstream_partitions_subset,\n start_offset=self.start_offset,\n end_offset=self.end_offset,\n current_time=current_time,\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: Optional[PartitionsDefinition],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n """Returns the partitions in the downstream asset that map to the given upstream partitions.\n\n Filters for partitions that exist at the given current_time, fetching the current time\n if not provided.\n """\n return self._map_partitions(\n upstream_partitions_subset.partitions_def,\n downstream_partitions_def,\n upstream_partitions_subset,\n end_offset=-self.start_offset,\n start_offset=-self.end_offset,\n current_time=current_time,\n ).partitions_subset\n\n def _map_partitions(\n self,\n from_partitions_def: PartitionsDefinition,\n to_partitions_def: Optional[PartitionsDefinition],\n from_partitions_subset: PartitionsSubset,\n start_offset: int,\n end_offset: int,\n current_time: Optional[datetime] = None,\n ) -> UpstreamPartitionsResult:\n """Maps the partitions in from_partitions_subset to partitions in to_partitions_def.\n\n If partitions in from_partitions_subset represent time windows that do not exist in\n to_partitions_def, raises an error if raise_error_on_invalid_mapped_partition is True.\n Otherwise, filters out the partitions that do not exist in to_partitions_def and returns\n the filtered 
subset, also returning a bool indicating whether there were mapped time windows\n that did not exist in to_partitions_def.\n """\n if not isinstance(from_partitions_subset, TimeWindowPartitionsSubset):\n check.failed("from_partitions_subset must be a TimeWindowPartitionsSubset")\n\n if not isinstance(from_partitions_def, TimeWindowPartitionsDefinition):\n check.failed("from_partitions_def must be a TimeWindowPartitionsDefinition")\n\n if not isinstance(to_partitions_def, TimeWindowPartitionsDefinition):\n check.failed("to_partitions_def must be a TimeWindowPartitionsDefinition")\n\n if (start_offset != 0 or end_offset != 0) and (\n from_partitions_def.cron_schedule != to_partitions_def.cron_schedule\n ):\n raise DagsterInvalidDefinitionError(\n "Can't use the start_offset or end_offset parameters of"\n " TimeWindowPartitionMapping when the cron schedule of the upstream"\n " PartitionsDefinition is different than the cron schedule of the downstream"\n f" one. Attempted to map from cron schedule '{from_partitions_def.cron_schedule}' "\n f"to cron schedule '{to_partitions_def.cron_schedule}'."\n )\n\n if to_partitions_def.timezone != from_partitions_def.timezone:\n raise DagsterInvalidDefinitionError("Timezones don't match")\n\n # skip fancy mapping logic in the simple case\n if from_partitions_def == to_partitions_def and start_offset == 0 and end_offset == 0:\n return UpstreamPartitionsResult(from_partitions_subset, [])\n\n time_windows = []\n for from_partition_time_window in from_partitions_subset.included_time_windows:\n from_start_dt, from_end_dt = from_partition_time_window\n offsetted_start_dt = _offsetted_datetime(\n from_partitions_def, from_start_dt, start_offset\n )\n offsetted_end_dt = _offsetted_datetime(from_partitions_def, from_end_dt, end_offset)\n\n to_start_partition_key = (\n to_partitions_def.get_partition_key_for_timestamp(\n offsetted_start_dt.timestamp(), end_closed=False\n )\n if offsetted_start_dt is not None\n else None\n )\n 
to_end_partition_key = (\n to_partitions_def.get_partition_key_for_timestamp(\n offsetted_end_dt.timestamp(), end_closed=True\n )\n if offsetted_end_dt is not None\n else None\n )\n\n if to_start_partition_key is not None or to_end_partition_key is not None:\n window_start = (\n to_partitions_def.start_time_for_partition_key(to_start_partition_key)\n if to_start_partition_key\n else cast(TimeWindow, to_partitions_def.get_first_partition_window()).start\n )\n window_end = (\n to_partitions_def.end_time_for_partition_key(to_end_partition_key)\n if to_end_partition_key\n else cast(TimeWindow, to_partitions_def.get_last_partition_window()).end\n )\n\n if window_start < window_end:\n time_windows.append(TimeWindow(window_start, window_end))\n\n first_window = to_partitions_def.get_first_partition_window(current_time=current_time)\n last_window = to_partitions_def.get_last_partition_window(current_time=current_time)\n\n filtered_time_windows = []\n required_but_nonexistent_partition_keys = set()\n\n for time_window in time_windows:\n if (\n first_window\n and last_window\n and time_window.start <= last_window.start\n and time_window.end >= first_window.end\n ):\n window_start = max(time_window.start, first_window.start)\n window_end = min(time_window.end, last_window.end)\n filtered_time_windows.append(TimeWindow(window_start, window_end))\n\n if self.allow_nonexistent_upstream_partitions:\n # If allowed to have nonexistent upstream partitions, do not consider\n # out of range partitions to be invalid\n continue\n else:\n invalid_time_window = None\n if not (first_window and last_window) or (\n time_window.start < first_window.start and time_window.end > last_window.end\n ):\n invalid_time_window = time_window\n elif time_window.start < first_window.start:\n invalid_time_window = TimeWindow(\n time_window.start, min(time_window.end, first_window.start)\n )\n elif time_window.end > last_window.end:\n invalid_time_window = TimeWindow(\n max(time_window.start, 
last_window.end), time_window.end\n )\n\n if invalid_time_window:\n required_but_nonexistent_partition_keys.update(\n set(\n to_partitions_def.get_partition_keys_in_time_window(\n time_window=invalid_time_window\n )\n )\n )\n\n return UpstreamPartitionsResult(\n TimeWindowPartitionsSubset(\n to_partitions_def,\n num_partitions=sum(\n len(to_partitions_def.get_partition_keys_in_time_window(time_window))\n for time_window in filtered_time_windows\n ),\n included_time_windows=filtered_time_windows,\n ),\n sorted(list(required_but_nonexistent_partition_keys)),\n )
\n\n\ndef _offsetted_datetime(\n partitions_def: TimeWindowPartitionsDefinition, dt: datetime, offset: int\n) -> Optional[datetime]:\n for _ in range(abs(offset)):\n if offset < 0:\n prev_window = partitions_def.get_prev_partition_window(dt)\n if prev_window is None:\n return None\n\n dt = prev_window.start\n else:\n # TODO: what if we're at the end of the line?\n next_window = partitions_def.get_next_partition_window(dt)\n if next_window is None:\n return None\n\n dt = next_window.end\n\n return dt\n
", "current_page_name": "_modules/dagster/_core/definitions/time_window_partition_mapping", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.time_window_partition_mapping"}, "time_window_partitions": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.time_window_partitions

\nimport functools\nimport hashlib\nimport json\nimport re\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    FrozenSet,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._utils.partitions import DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\nfrom dagster._utils.schedules import (\n    cron_string_iterator,\n    is_valid_cron_schedule,\n    reverse_cron_string_iterator,\n)\n\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidDeserializationVersionError,\n)\nfrom .partition import (\n    DEFAULT_DATE_FORMAT,\n    PartitionedConfig,\n    PartitionsDefinition,\n    PartitionsSubset,\n    ScheduleType,\n    cron_schedule_from_schedule_type_and_offsets,\n)\nfrom .partition_key_range import PartitionKeyRange\n\n\n
[docs]class TimeWindow(NamedTuple):\n """An interval that is closed at the start and open at the end.\n\n Attributes:\n start (datetime): A pendulum datetime that marks the start of the window.\n end (datetime): A pendulum datetime that marks the end of the window.\n """\n\n start: PublicAttr[datetime]\n end: PublicAttr[datetime]
\n\n\n
[docs]class TimeWindowPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_TimeWindowPartitionsDefinition",\n [\n ("start", PublicAttr[datetime]),\n ("timezone", PublicAttr[str]),\n ("end", PublicAttr[Optional[datetime]]),\n ("fmt", PublicAttr[str]),\n ("end_offset", PublicAttr[int]),\n ("cron_schedule", PublicAttr[str]),\n ],\n ),\n):\n r"""A set of partitions where each partitions corresponds to a time window.\n\n The provided cron_schedule determines the bounds of the time windows. E.g. a cron_schedule of\n "0 0 \\\\* \\\\* \\\\*" will result in daily partitions that start at midnight and end at midnight of the\n following day.\n\n The string partition_key associated with each partition corresponds to the start of the\n partition's time window.\n\n The first partition in the set will start on at the first cron_schedule tick that is equal to\n or after the given start datetime. The last partition in the set will end before the current\n time, unless the end_offset argument is set to a positive number.\n\n Args:\n cron_schedule (str): Determines the bounds of the time windows.\n start (datetime): The first partition in the set will start on at the first cron_schedule\n tick that is equal to or after this value.\n timezone (Optional[str]): The timezone in which each time should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end (datetime): The last partition (excluding) in the set.\n fmt (str): The date format to use for partition_keys.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n """\n\n def __new__(\n cls,\n start: Union[datetime, str],\n fmt: str,\n end: Union[datetime, str, None] = None,\n schedule_type: Optional[ScheduleType] = None,\n timezone: Optional[str] = None,\n end_offset: int = 0,\n minute_offset: Optional[int] = None,\n hour_offset: Optional[int] = None,\n day_offset: Optional[int] = None,\n cron_schedule: Optional[str] = None,\n ):\n check.opt_str_param(timezone, "timezone")\n timezone = timezone or "UTC"\n\n if isinstance(start, datetime):\n start_dt = pendulum.instance(start, tz=timezone)\n else:\n start_dt = pendulum.instance(datetime.strptime(start, fmt), tz=timezone)\n\n if not end:\n end_dt = None\n elif isinstance(end, datetime):\n end_dt = pendulum.instance(end, tz=timezone)\n else:\n end_dt = pendulum.instance(datetime.strptime(end, fmt), tz=timezone)\n\n if cron_schedule is not None:\n check.invariant(\n schedule_type is None and not minute_offset and not hour_offset and not day_offset,\n "If cron_schedule argument is provided, then schedule_type, minute_offset, "\n "hour_offset, and day_offset can't also be provided",\n )\n else:\n if schedule_type is None:\n check.failed("One of schedule_type and cron_schedule must be provided")\n\n cron_schedule = cron_schedule_from_schedule_type_and_offsets(\n schedule_type=schedule_type,\n minute_offset=minute_offset or 0,\n hour_offset=hour_offset or 0,\n day_offset=day_offset or 0,\n )\n\n if not is_valid_cron_schedule(cron_schedule):\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{cron_schedule}' for a"\n " TimeWindowPartitionsDefinition."\n )\n\n return super(TimeWindowPartitionsDefinition, cls).__new__(\n cls, start_dt, timezone, end_dt, fmt, end_offset, cron_schedule\n )\n\n def get_current_timestamp(self, current_time: Optional[datetime] = None) -> float:\n return (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else 
pendulum.now(self.timezone)\n ).timestamp()\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # Method added for performance reasons.\n # Fetching partition keys requires significantly more compute time to\n # string format datetimes.\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n\n num_partitions = 0\n for time_window in self._iterate_time_windows(self.start):\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n break\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n num_partitions += 1\n\n if time_window.end.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n if self.end_offset < 0:\n num_partitions += self.end_offset\n\n return num_partitions\n\n def get_partition_keys_between_indexes(\n self, start_idx: int, end_idx: int, current_time: Optional[datetime] = None\n ) -> List[str]:\n # Fetches the partition keys between the given start and end indices.\n # Start index is inclusive, end index is exclusive.\n # Method added for performance reasons, to only string format\n # partition keys included within the indices.\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n partition_keys = []\n reached_end = False\n\n for idx, time_window in enumerate(self._iterate_time_windows(self.start)):\n if time_window.end.timestamp() >= current_timestamp:\n reached_end = True\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n reached_end = True\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n if idx >= start_idx and idx < end_idx:\n partition_keys.append(time_window.start.strftime(self.fmt))\n if time_window.end.timestamp() > 
current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n if len(partition_keys) >= end_idx - start_idx:\n break\n\n if reached_end and self.end_offset < 0:\n partition_keys = partition_keys[: self.end_offset]\n\n return partition_keys\n\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n partition_keys: List[str] = []\n for time_window in self._iterate_time_windows(self.start):\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n break\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n partition_keys.append(time_window.start.strftime(self.fmt))\n\n if time_window.end.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n if self.end_offset < 0:\n partition_keys = partition_keys[: self.end_offset]\n\n return partition_keys\n\n def _get_validated_time_window_for_partition_key(\n self, partition_key: str, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n """Returns a TimeWindow for the given partition key if it is valid, otherwise returns None."""\n try:\n time_window = self.time_window_for_partition_key(partition_key)\n except ValueError:\n return None\n\n first_partition_window = self.get_first_partition_window(current_time=current_time)\n last_partition_window = self.get_last_partition_window(current_time=current_time)\n if (\n first_partition_window is None\n or last_partition_window is None\n or time_window.start < first_partition_window.start\n or time_window.start > last_partition_window.start\n or time_window.start.strftime(self.fmt) != partition_key\n ):\n return None\n\n return time_window\n\n def __str__(self) -> str:\n schedule_str = (\n self.schedule_type.value.capitalize() 
if self.schedule_type else self.cron_schedule\n )\n partition_def_str = (\n f"{schedule_str}, starting {self.start.strftime(self.fmt)} {self.timezone}."\n )\n if self.end_offset != 0:\n partition_def_str += (\n " End offsetted by"\n f" {self.end_offset} partition{'' if self.end_offset == 1 else 's'}."\n )\n return partition_def_str\n\n def __repr__(self):\n # Between python 3.8 and 3.9 the repr of a datetime object changed.\n # Replaces start time with timestamp as a workaround to make sure the repr is consistent across versions.\n return (\n f"TimeWindowPartitionsDefinition(start={self.start.timestamp()},"\n f" timezone='{self.timezone}', fmt='{self.fmt}', end_offset={self.end_offset},"\n f" cron_schedule='{self.cron_schedule}')"\n )\n\n def __hash__(self):\n return hash(tuple(self.__repr__()))\n\n @functools.lru_cache(maxsize=100)\n def _time_window_for_partition_key(self, *, partition_key: str) -> TimeWindow:\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n return next(iter(self._iterate_time_windows(partition_key_dt)))\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n return self._time_window_for_partition_key(partition_key=partition_key)\n\n @functools.lru_cache(maxsize=5)\n def time_windows_for_partition_keys(\n self,\n partition_keys: FrozenSet[str],\n validate: bool = True,\n ) -> Sequence[TimeWindow]:\n if len(partition_keys) == 0:\n return []\n\n sorted_pks = sorted(partition_keys, key=lambda pk: datetime.strptime(pk, self.fmt))\n cur_windows_iterator = iter(\n self._iterate_time_windows(\n pendulum.instance(datetime.strptime(sorted_pks[0], self.fmt), tz=self.timezone)\n )\n )\n partition_key_time_windows: List[TimeWindow] = []\n for partition_key in sorted_pks:\n next_window = next(cur_windows_iterator)\n if next_window.start.strftime(self.fmt) == partition_key:\n partition_key_time_windows.append(next_window)\n else:\n cur_windows_iterator = iter(\n 
self._iterate_time_windows(\n pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n )\n )\n partition_key_time_windows.append(next(cur_windows_iterator))\n\n if validate:\n start_time_window = self.get_first_partition_window()\n end_time_window = self.get_last_partition_window()\n\n if start_time_window is None or end_time_window is None:\n check.failed("No partitions in the PartitionsDefinition")\n\n start_timestamp = start_time_window.start.timestamp()\n end_timestamp = end_time_window.end.timestamp()\n\n partition_key_time_windows = [\n tw\n for tw in partition_key_time_windows\n if tw.start.timestamp() >= start_timestamp and tw.end.timestamp() <= end_timestamp\n ]\n return partition_key_time_windows\n\n def start_time_for_partition_key(self, partition_key: str) -> datetime:\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n # the datetime format might not include granular components, so we need to recover them\n # we make the assumption that the parsed partition key is <= the start datetime\n return next(iter(self._iterate_time_windows(partition_key_dt))).start\n\n def get_next_partition_key(\n self, partition_key: str, current_time: Optional[datetime] = None\n ) -> Optional[str]:\n last_partition_window = self.get_last_partition_window(current_time)\n if last_partition_window is None:\n return None\n\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n windows_iter = iter(self._iterate_time_windows(partition_key_dt))\n next(windows_iter)\n start_time = next(windows_iter).start\n if start_time >= last_partition_window.end:\n return None\n else:\n return start_time.strftime(self.fmt)\n\n def get_next_partition_window(\n self, end_dt: datetime, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n last_partition_window = self.get_last_partition_window(current_time)\n if last_partition_window is 
None:\n return None\n\n windows_iter = iter(self._iterate_time_windows(end_dt))\n next_window = next(windows_iter)\n if next_window.start >= last_partition_window.end:\n return None\n else:\n return next_window\n\n def get_prev_partition_window(self, start_dt: datetime) -> Optional[TimeWindow]:\n windows_iter = iter(self._reverse_iterate_time_windows(start_dt))\n prev_window = next(windows_iter)\n first_partition_window = self.get_first_partition_window()\n if first_partition_window is None or prev_window.start < first_partition_window.start:\n return None\n else:\n return prev_window\n\n @functools.lru_cache(maxsize=5)\n def _get_first_partition_window(self, *, current_time: datetime) -> Optional[TimeWindow]:\n current_timestamp = current_time.timestamp()\n\n time_window = next(iter(self._iterate_time_windows(self.start)))\n\n if self.end_offset == 0:\n return time_window if time_window.end.timestamp() <= current_timestamp else None\n elif self.end_offset > 0:\n iterator = iter(self._iterate_time_windows(current_time))\n # first returned time window is time window of current time\n curr_window_plus_offset = next(iterator)\n for _ in range(self.end_offset):\n curr_window_plus_offset = next(iterator)\n return (\n time_window\n if time_window.end.timestamp() <= curr_window_plus_offset.start.timestamp()\n else None\n )\n else:\n # end offset < 0\n end_window = None\n iterator = iter(self._reverse_iterate_time_windows(current_time))\n for _ in range(abs(self.end_offset)):\n end_window = next(iterator)\n\n if end_window is None:\n check.failed("end_window should not be None")\n\n return (\n time_window if time_window.end.timestamp() <= end_window.start.timestamp() else None\n )\n\n def get_first_partition_window(\n self, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n current_time = cast(\n datetime,\n (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ),\n )\n return 
self._get_first_partition_window(current_time=current_time)\n\n @functools.lru_cache(maxsize=5)\n def _get_last_partition_window(self, *, current_time: datetime) -> Optional[TimeWindow]:\n if self.get_first_partition_window(current_time) is None:\n return None\n\n current_time = (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n )\n\n if self.end and self.end < current_time:\n current_time = self.end\n\n if self.end_offset == 0:\n return next(iter(self._reverse_iterate_time_windows(current_time)))\n else:\n # TODO: make this efficient\n last_partition_key = super().get_last_partition_key(current_time)\n return (\n self.time_window_for_partition_key(last_partition_key)\n if last_partition_key\n else None\n )\n\n def get_last_partition_window(\n self, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n current_time = cast(\n datetime,\n (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ),\n )\n return self._get_last_partition_window(current_time=current_time)\n\n def get_first_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[str]:\n first_window = self.get_first_partition_window(current_time)\n if first_window is None:\n return None\n\n return first_window.start.strftime(self.fmt)\n\n def get_last_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[str]:\n last_window = self.get_last_partition_window(current_time)\n if last_window is None:\n return None\n\n return last_window.start.strftime(self.fmt)\n\n def end_time_for_partition_key(self, partition_key: str) -> datetime:\n return self.time_window_for_partition_key(partition_key).end\n\n @functools.lru_cache(maxsize=5)\n def get_partition_keys_in_time_window(self, time_window: 
TimeWindow) -> Sequence[str]:\n result: List[str] = []\n for partition_time_window in self._iterate_time_windows(time_window.start):\n if partition_time_window.start < time_window.end:\n result.append(partition_time_window.start.strftime(self.fmt))\n else:\n break\n return result\n\n def get_partition_key_range_for_time_window(self, time_window: TimeWindow) -> PartitionKeyRange:\n start_partition_key = self.get_partition_key_for_timestamp(time_window.start.timestamp())\n end_partition_key = self.get_partition_key_for_timestamp(\n cast(TimeWindow, self.get_prev_partition_window(time_window.end)).start.timestamp()\n )\n\n return PartitionKeyRange(start_partition_key, end_partition_key)\n\n def get_partition_keys_in_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n start_time = self.start_time_for_partition_key(partition_key_range.start)\n end_time = self.end_time_for_partition_key(partition_key_range.end)\n\n return self.get_partition_keys_in_time_window(TimeWindow(start_time, end_time))\n\n @public\n @property\n def schedule_type(self) -> Optional[ScheduleType]:\n """Optional[ScheduleType]: An enum representing the partition cadence (hourly, daily,\n weekly, or monthly).\n """\n if re.fullmatch(r"\\d+ \\* \\* \\* \\*", self.cron_schedule):\n return ScheduleType.HOURLY\n elif re.fullmatch(r"\\d+ \\d+ \\* \\* \\*", self.cron_schedule):\n return ScheduleType.DAILY\n elif re.fullmatch(r"\\d+ \\d+ \\* \\* \\d+", self.cron_schedule):\n return ScheduleType.WEEKLY\n elif re.fullmatch(r"\\d+ \\d+ \\d+ \\* \\*", self.cron_schedule):\n return ScheduleType.MONTHLY\n else:\n return None\n\n @public\n @property\n def minute_offset(self) -> int:\n """int: Number of minutes past the hour to "split" partitions. 
Defaults to 0.\n\n For example, returns 15 if each partition starts at 15 minutes past the hour.\n """\n match = re.fullmatch(r"(\\d+) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no minute offset")\n return int(match.groups()[0])\n\n @public\n @property\n def hour_offset(self) -> int:\n """int: Number of hours past 00:00 to "split" partitions. Defaults to 0.\n\n For example, returns 1 if each partition starts at 01:00.\n """\n match = re.fullmatch(r"(\\d+|\\*) (\\d+) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no hour offset")\n return int(match.groups()[1])\n\n @public\n @property\n def day_offset(self) -> int:\n """int: For a weekly or monthly partitions definition, returns the day to "split" partitions\n by. Each partition will start on this day, and end before this day in the following\n week/month. Returns 0 if the day_offset parameter is unset in the\n WeeklyPartitionsDefinition, MonthlyPartitionsDefinition, or the provided cron schedule.\n\n For weekly partitions, returns a value between 0 (representing Sunday) and 6 (representing\n Saturday). 
Providing a value of 1 means that a partition will exist weekly from Monday to\n the following Sunday.\n\n For monthly partitions, returns a value between 0 (the first day of the month) and 31 (the\n last possible day of the month).\n """\n schedule_type = self.schedule_type\n if schedule_type == ScheduleType.WEEKLY:\n match = re.fullmatch(r"(\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no day offset")\n return int(match.groups()[4])\n elif schedule_type == ScheduleType.MONTHLY:\n match = re.fullmatch(r"(\\d+|\\*) (\\d+|\\*) (\\d+) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no day offset")\n return int(match.groups()[2])\n else:\n check.failed(f"Unsupported schedule type for day_offset: {schedule_type}")\n\n
[docs] @public\n def get_cron_schedule(\n self,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n ) -> str:\n """The schedule executes at the cadence specified by the partitioning, but may overwrite\n the minute/hour/day offset of the partitioning.\n\n This is useful e.g. if you have partitions that span midnight to midnight but you want to\n schedule a job that runs at 2 am.\n """\n if (\n minute_of_hour is None\n and hour_of_day is None\n and day_of_week is None\n and day_of_month is None\n ):\n return self.cron_schedule\n\n schedule_type = self.schedule_type\n if schedule_type is None:\n check.failed(\n f"{self.cron_schedule} does not support"\n " minute_of_hour/hour_of_day/day_of_week/day_of_month arguments"\n )\n\n minute_of_hour = cast(\n int,\n check.opt_int_param(minute_of_hour, "minute_of_hour", default=self.minute_offset),\n )\n\n if schedule_type == ScheduleType.HOURLY:\n check.invariant(\n hour_of_day is None, "Cannot set hour parameter with hourly partitions."\n )\n else:\n hour_of_day = cast(\n int, check.opt_int_param(hour_of_day, "hour_of_day", default=self.hour_offset)\n )\n\n if schedule_type == ScheduleType.DAILY:\n check.invariant(\n day_of_week is None, "Cannot set day of week parameter with daily partitions."\n )\n check.invariant(\n day_of_month is None, "Cannot set day of month parameter with daily partitions."\n )\n\n if schedule_type == ScheduleType.MONTHLY:\n default = self.day_offset or 1\n day_offset = check.opt_int_param(day_of_month, "day_of_month", default=default)\n elif schedule_type == ScheduleType.WEEKLY:\n default = self.day_offset or 0\n day_offset = check.opt_int_param(day_of_week, "day_of_week", default=default)\n else:\n day_offset = 0\n\n return cron_schedule_from_schedule_type_and_offsets(\n schedule_type,\n minute_offset=minute_of_hour,\n hour_offset=hour_of_day or 0,\n day_offset=day_offset,\n )
\n\n def _iterate_time_windows(self, start: datetime) -> Iterable[TimeWindow]:\n """Returns an infinite generator of time windows that start after the given start time."""\n start_timestamp = pendulum.instance(start, tz=self.timezone).timestamp()\n iterator = cron_string_iterator(\n start_timestamp=start_timestamp,\n cron_string=self.cron_schedule,\n execution_timezone=self.timezone,\n )\n prev_time = next(iterator)\n while prev_time.timestamp() < start_timestamp:\n prev_time = next(iterator)\n\n while True:\n next_time = next(iterator)\n yield TimeWindow(prev_time, next_time)\n prev_time = next_time\n\n def _reverse_iterate_time_windows(self, end: datetime) -> Iterable[TimeWindow]:\n """Returns an infinite generator of time windows that end before the given end time."""\n end_timestamp = pendulum.instance(end, tz=self.timezone).timestamp()\n iterator = reverse_cron_string_iterator(\n end_timestamp=end_timestamp,\n cron_string=self.cron_schedule,\n execution_timezone=self.timezone,\n )\n\n prev_time = next(iterator)\n while prev_time.timestamp() > end_timestamp:\n prev_time = next(iterator)\n\n while True:\n next_time = next(iterator)\n yield TimeWindow(next_time, prev_time)\n prev_time = next_time\n\n def get_partition_key_for_timestamp(self, timestamp: float, end_closed: bool = False) -> str:\n """Args:\n timestamp (float): Timestamp from the unix epoch, UTC.\n end_closed (bool): Whether the interval is closed at the end or at the beginning.\n """\n iterator = cron_string_iterator(\n timestamp, self.cron_schedule, self.timezone, start_offset=-1\n )\n # prev will be < timestamp\n prev = next(iterator)\n # prev_next will be >= timestamp\n prev_next = next(iterator)\n\n if end_closed or prev_next.timestamp() > timestamp:\n return prev.strftime(self.fmt)\n else:\n return prev_next.strftime(self.fmt)\n\n def less_than(self, partition_key1: str, partition_key2: str) -> bool:\n """Returns true if the partition_key1 is earlier than partition_key2."""\n return 
self.start_time_for_partition_key(\n partition_key1\n ) < self.start_time_for_partition_key(partition_key2)\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset"]:\n return TimeWindowPartitionsSubset\n\n def empty_subset(self) -> "PartitionsSubset":\n return self.partitions_subset_class.empty_subset(self)\n\n def is_valid_partition_key(self, partition_key: str) -> bool:\n try:\n partition_time = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n return partition_time >= self.start\n except ValueError:\n return False\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(self.__repr__().encode("utf-8")).hexdigest()\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n return bool(self._get_validated_time_window_for_partition_key(partition_key, current_time))
\n\n\n
[docs]class DailyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of daily partitions.\n\n The first partition in the set will start at the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset and/or hour_offset are used, the start and end times of\n each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n DailyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n DailyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(DailyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.DAILY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\ndef wrap_time_window_run_config_fn(\n run_config_fn: Optional[Callable[[datetime, datetime], Mapping[str, Any]]],\n partitions_def: TimeWindowPartitionsDefinition,\n) -> Callable[[str], Mapping[str, Any]]:\n def _run_config_wrapper(key: str) -> Mapping[str, Any]:\n if not run_config_fn:\n return {}\n time_window = partitions_def.time_window_for_partition_key(key)\n return run_config_fn(time_window.start, time_window.end)\n\n return _run_config_wrapper\n\n\ndef wrap_time_window_tags_fn(\n tags_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]],\n partitions_def: TimeWindowPartitionsDefinition,\n) -> Callable[[str], Mapping[str, str]]:\n def _tag_wrapper(key: str) -> Mapping[str, str]:\n if not tags_fn:\n return {}\n time_window = partitions_def.time_window_for_partition_key(key)\n return tags_fn(time_window.start, time_window.end)\n\n return _tag_wrapper\n\n\n
[docs]def daily_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[DailyPartitionsDefinition],\n]:\n """Defines run config over a set of daily partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the bounds\n of the date partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset and/or hour_offset are used, the start and end times of each partition\n will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @daily_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n @daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[DailyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = DailyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class HourlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of hourly partitions.\n\n The first partition in the set will start on the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset is provided, the start and end times of each partition\n will be minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n\n return super(HourlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.HOURLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def hourly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[HourlyPartitionsDefinition],\n]:\n """Defines run config over a set of hourly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset is provided, the start and end times of each partition will be\n minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[HourlyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = HourlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class MonthlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of monthly partitions.\n\n The first partition in the set will start at the soonest first of the month after start_date\n at midnight. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and\n end date of each partition will be day_offset. If minute_offset and/or hour_offset are used,\n the start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n MonthlyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n MonthlyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(MonthlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.MONTHLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def monthly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[MonthlyPartitionsDefinition],\n]:\n """Defines run config over a set of monthly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at midnight on the soonest first of the month after\n start_date. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and end\n date of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\n start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. 
"America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @monthly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n @monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[MonthlyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = MonthlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class WeeklyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """Defines a set of weekly partitions.\n\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n WeeklyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n WeeklyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(WeeklyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.WEEKLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def weekly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[WeeklyPartitionsDefinition],\n]:\n """Defines run config over a set of weekly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. 
"America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @weekly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n @weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[WeeklyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = WeeklyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\nclass TimeWindowPartitionsSubset(PartitionsSubset):\n # Every time we change the serialization format, we should increment the version number.\n # This will ensure that we can gracefully degrade when deserializing old data.\n SERIALIZATION_VERSION = 1\n\n def __init__(\n self,\n partitions_def: TimeWindowPartitionsDefinition,\n num_partitions: int,\n included_time_windows: Optional[Sequence[TimeWindow]] = None,\n included_partition_keys: Optional[AbstractSet[str]] = None,\n ):\n self._partitions_def = check.inst_param(\n partitions_def, "partitions_def", TimeWindowPartitionsDefinition\n )\n self._included_time_windows = included_time_windows\n self._num_partitions = num_partitions\n\n check.param_invariant(\n not (included_partition_keys and included_time_windows),\n "Cannot specify both included_partition_keys and included_time_windows",\n )\n self._included_time_windows = check.opt_nullable_sequence_param(\n included_time_windows, "included_time_windows", of_type=TimeWindow\n )\n\n self._included_partition_keys = check.opt_nullable_set_param(\n included_partition_keys, "included_partition_keys", of_type=str\n )\n\n @property\n def included_time_windows(self) -> Sequence[TimeWindow]:\n if self._included_time_windows is None:\n result_time_windows, _ = self._add_partitions_to_time_windows(\n initial_windows=[],\n partition_keys=list(check.not_none(self._included_partition_keys)),\n validate=False,\n )\n self._included_time_windows = result_time_windows\n return self._included_time_windows\n\n def _get_partition_time_windows_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n ) -> Sequence[TimeWindow]:\n """Returns a list of partition time windows that are not in the subset.\n Each time window is a single partition.\n """\n first_tw = self._partitions_def.get_first_partition_window(current_time=current_time)\n last_tw = self._partitions_def.get_last_partition_window(current_time=current_time)\n\n if not first_tw or not last_tw:\n 
check.failed("No partitions found")\n\n if len(self.included_time_windows) == 0:\n return [TimeWindow(first_tw.start, last_tw.end)]\n\n time_windows = []\n if first_tw.start < self.included_time_windows[0].start:\n time_windows.append(TimeWindow(first_tw.start, self.included_time_windows[0].start))\n\n for i in range(len(self.included_time_windows) - 1):\n if self.included_time_windows[i].start >= last_tw.end:\n break\n if self.included_time_windows[i].end < last_tw.end:\n if self.included_time_windows[i + 1].start <= last_tw.end:\n time_windows.append(\n TimeWindow(\n self.included_time_windows[i].end,\n self.included_time_windows[i + 1].start,\n )\n )\n else:\n time_windows.append(\n TimeWindow(\n self.included_time_windows[i].end,\n last_tw.end,\n )\n )\n\n if last_tw.end > self.included_time_windows[-1].end:\n time_windows.append(TimeWindow(self.included_time_windows[-1].end, last_tw.end))\n\n return time_windows\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n partition_keys: List[str] = []\n for tw in self._get_partition_time_windows_not_in_subset(current_time):\n partition_keys.extend(self._partitions_def.get_partition_keys_in_time_window(tw))\n return partition_keys\n\n @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n if self._included_partition_keys is None:\n return [\n pk\n for time_window in self.included_time_windows\n for pk in self._partitions_def.get_partition_keys_in_time_window(time_window)\n ]\n return list(self._included_partition_keys) if self._included_partition_keys else []\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n return [\n self._partitions_def.get_partition_key_range_for_time_window(window)\n for window in 
self.included_time_windows\n ]\n\n def _add_partitions_to_time_windows(\n self,\n initial_windows: Sequence[TimeWindow],\n partition_keys: Sequence[str],\n validate: bool = True,\n ) -> Tuple[Sequence[TimeWindow], int]:\n """Merges a set of partition keys into an existing set of time windows, returning the\n minimized set of time windows and the number of partitions added.\n """\n result_windows = [*initial_windows]\n time_windows = self._partitions_def.time_windows_for_partition_keys(\n frozenset(partition_keys), validate=validate\n )\n num_added_partitions = 0\n for window in sorted(time_windows):\n # go in reverse order because it's more common to add partitions at the end than the\n # beginning\n for i in reversed(range(len(result_windows))):\n included_window = result_windows[i]\n lt_end_of_range = window.start < included_window.end\n gte_start_of_range = window.start >= included_window.start\n\n if lt_end_of_range and gte_start_of_range:\n break\n\n if not lt_end_of_range:\n merge_with_range = included_window.end == window.start\n merge_with_later_range = i + 1 < len(result_windows) and (\n window.end == result_windows[i + 1].start\n )\n\n if merge_with_range and merge_with_later_range:\n result_windows[i] = TimeWindow(\n included_window.start, result_windows[i + 1].end\n )\n del result_windows[i + 1]\n elif merge_with_range:\n result_windows[i] = TimeWindow(included_window.start, window.end)\n elif merge_with_later_range:\n result_windows[i + 1] = TimeWindow(window.start, result_windows[i + 1].end)\n else:\n result_windows.insert(i + 1, window)\n\n num_added_partitions += 1\n break\n else:\n if result_windows and window.start == result_windows[0].start:\n result_windows[0] = TimeWindow(window.start, included_window.end) # type: ignore\n else:\n result_windows.insert(0, window)\n\n num_added_partitions += 1\n\n return result_windows, num_added_partitions\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "TimeWindowPartitionsSubset":\n # if 
we are representing things as a static set of keys, continue doing so\n if self._included_partition_keys is not None:\n new_partitions = {*self._included_partition_keys, *partition_keys}\n return TimeWindowPartitionsSubset(\n self._partitions_def,\n num_partitions=len(new_partitions),\n included_partition_keys=new_partitions,\n )\n\n result_windows, added_partitions = self._add_partitions_to_time_windows(\n self.included_time_windows, list(partition_keys)\n )\n\n return TimeWindowPartitionsSubset(\n self._partitions_def,\n num_partitions=self._num_partitions + added_partitions,\n included_time_windows=result_windows,\n )\n\n @classmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition, serialized: str\n ) -> "PartitionsSubset":\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n check.failed("Partitions definition must be a TimeWindowPartitionsDefinition")\n partitions_def = cast(TimeWindowPartitionsDefinition, partitions_def)\n\n loaded = json.loads(serialized)\n\n def tuples_to_time_windows(tuples):\n return [\n TimeWindow(\n pendulum.from_timestamp(tup[0], tz=partitions_def.timezone),\n pendulum.from_timestamp(tup[1], tz=partitions_def.timezone),\n )\n for tup in tuples\n ]\n\n if isinstance(loaded, list):\n # backwards compatibility\n time_windows = tuples_to_time_windows(loaded)\n num_partitions = sum(\n len(partitions_def.get_partition_keys_in_time_window(time_window))\n for time_window in time_windows\n )\n elif isinstance(loaded, dict) and (\n "version" not in loaded or loaded["version"] == cls.SERIALIZATION_VERSION\n ): # version 1\n time_windows = tuples_to_time_windows(loaded["time_windows"])\n num_partitions = loaded["num_partitions"]\n else:\n raise DagsterInvalidDeserializationVersionError(\n f"Attempted to deserialize partition subset with version {loaded.get('version')},"\n f" but only version {cls.SERIALIZATION_VERSION} is supported."\n )\n\n return TimeWindowPartitionsSubset(\n partitions_def, 
num_partitions=num_partitions, included_time_windows=time_windows\n )\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n if serialized_partitions_def_unique_id:\n return (\n partitions_def.get_serializable_unique_identifier()\n == serialized_partitions_def_unique_id\n )\n\n if (\n serialized_partitions_def_class_name\n # note: all TimeWindowPartitionsDefinition subclasses will get serialized as raw\n # TimeWindowPartitionsDefinitions, so this class name check will not always pass,\n # hence the unique id check above\n and serialized_partitions_def_class_name != partitions_def.__class__.__name__\n ):\n return False\n\n data = json.loads(serialized)\n return isinstance(data, list) or (\n isinstance(data, dict)\n and data.get("time_windows") is not None\n and data.get("num_partitions") is not None\n )\n\n @classmethod\n def empty_subset(cls, partitions_def: PartitionsDefinition) -> "PartitionsSubset":\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n check.failed("Partitions definition must be a TimeWindowPartitionsDefinition")\n partitions_def = cast(TimeWindowPartitionsDefinition, partitions_def)\n return cls(partitions_def, 0, [], set())\n\n def serialize(self) -> str:\n return json.dumps(\n {\n "version": self.SERIALIZATION_VERSION,\n "time_windows": [\n (window.start.timestamp(), window.end.timestamp())\n for window in self.included_time_windows\n ],\n "num_partitions": self._num_partitions,\n }\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition:\n return self._partitions_def\n\n def __eq__(self, other):\n return (\n isinstance(other, TimeWindowPartitionsSubset)\n and self._partitions_def == other._partitions_def\n and (\n # faster comparison, but will not catch all cases\n (\n self._included_time_windows == other._included_time_windows\n and 
self._included_partition_keys == other._included_partition_keys\n )\n # slower comparison, catches all cases\n or self.included_time_windows == other.included_time_windows\n )\n )\n\n def __len__(self) -> int:\n return self._num_partitions\n\n def __contains__(self, partition_key: str) -> bool:\n if self._included_partition_keys is not None:\n return partition_key in self._included_partition_keys\n\n time_window = self._partitions_def.time_window_for_partition_key(partition_key)\n\n return any(\n time_window.start >= included_time_window.start\n and time_window.start < included_time_window.end\n for included_time_window in self.included_time_windows\n )\n\n def __repr__(self) -> str:\n return f"TimeWindowPartitionsSubset({self.get_partition_key_ranges()})"\n\n\nclass PartitionRangeStatus(Enum):\n MATERIALIZING = "MATERIALIZING"\n MATERIALIZED = "MATERIALIZED"\n FAILED = "FAILED"\n\n\nPARTITION_RANGE_STATUS_PRIORITY = [\n PartitionRangeStatus.MATERIALIZING,\n PartitionRangeStatus.FAILED,\n PartitionRangeStatus.MATERIALIZED,\n]\n\n\nclass PartitionTimeWindowStatus:\n def __init__(self, time_window: TimeWindow, status: PartitionRangeStatus):\n self.time_window = time_window\n self.status = status\n\n def __repr__(self):\n return f"({self.time_window.start} - {self.time_window.end}): {self.status.value}"\n\n def __eq__(self, other):\n return (\n isinstance(other, PartitionTimeWindowStatus)\n and self.time_window == other.time_window\n and self.status == other.status\n )\n\n\ndef _flatten(\n high_pri_time_windows: List[PartitionTimeWindowStatus],\n low_pri_time_windows: List[PartitionTimeWindowStatus],\n) -> List[PartitionTimeWindowStatus]:\n high_pri_time_windows = sorted(high_pri_time_windows, key=lambda t: t.time_window.start)\n low_pri_time_windows = sorted(low_pri_time_windows, key=lambda t: t.time_window.start)\n\n high_pri_idx = 0\n low_pri_idx = 0\n\n filtered_low_pri: List[PartitionTimeWindowStatus] = []\n\n # slice and dice the low pri time windows so there's 
no overlap with high pri\n while True:\n if low_pri_idx >= len(low_pri_time_windows):\n # reached end of materialized\n break\n if high_pri_idx >= len(high_pri_time_windows):\n # reached end of failed, add all remaining materialized bc there's no overlap\n filtered_low_pri.extend(low_pri_time_windows[low_pri_idx:])\n break\n\n low_pri_tw = low_pri_time_windows[low_pri_idx]\n high_pri_tw = high_pri_time_windows[high_pri_idx]\n\n if low_pri_tw.time_window.start < high_pri_tw.time_window.start:\n if low_pri_tw.time_window.end <= high_pri_tw.time_window.start:\n # low_pri_tw is entirely before high pri\n filtered_low_pri.append(low_pri_tw)\n low_pri_idx += 1\n else:\n # high pri cuts the low pri short\n filtered_low_pri.append(\n PartitionTimeWindowStatus(\n TimeWindow(\n low_pri_tw.time_window.start,\n high_pri_tw.time_window.start,\n ),\n low_pri_tw.status,\n )\n )\n\n if low_pri_tw.time_window.end > high_pri_tw.time_window.end:\n # the low pri time window will continue on the other end of the high pri\n # and get split in two. Modify low_pri[low_pri_idx] to be\n # the second half of the low pri time window. It will be added in the next iteration.\n # (don't add it now, because we need to check if it overlaps with the next high pri)\n low_pri_time_windows[low_pri_idx] = PartitionTimeWindowStatus(\n TimeWindow(high_pri_tw.time_window.end, low_pri_tw.time_window.end),\n low_pri_tw.status,\n )\n high_pri_idx += 1\n else:\n # the rest of the low pri time window is inside the high pri time window\n low_pri_idx += 1\n else:\n if low_pri_tw.time_window.start >= high_pri_tw.time_window.end:\n # high pri is entirely before low pri. The next high pri may overlap\n high_pri_idx += 1\n elif low_pri_tw.time_window.end <= high_pri_tw.time_window.end:\n # low pri is entirely within high pri, skip it\n low_pri_idx += 1\n else:\n # high pri cuts out the start of the low pri. It will continue on the other end.\n # Modify low_pri[low_pri_idx] to shorten the start. 
It will be added\n # in the next iteration. (don't add it now, because we need to check if it overlaps with the next high pri)\n low_pri_time_windows[low_pri_idx] = PartitionTimeWindowStatus(\n TimeWindow(high_pri_tw.time_window.end, low_pri_tw.time_window.end),\n low_pri_tw.status,\n )\n high_pri_idx += 1\n\n # combine the high pri windwos with the filtered low pri windows\n flattened_time_windows = high_pri_time_windows\n flattened_time_windows.extend(filtered_low_pri)\n flattened_time_windows.sort(key=lambda t: t.time_window.start)\n return flattened_time_windows\n\n\ndef fetch_flattened_time_window_ranges(\n subsets: Mapping[PartitionRangeStatus, TimeWindowPartitionsSubset]\n) -> Sequence[PartitionTimeWindowStatus]:\n """Given potentially overlapping subsets, return a flattened list of timewindows where the highest priority status wins\n on overlaps.\n """\n prioritized_subsets = sorted(\n [(status, subset) for status, subset in subsets.items()],\n key=lambda t: PARTITION_RANGE_STATUS_PRIORITY.index(t[0]),\n )\n\n # progressively add lower priority time windows to the list of higher priority time windows\n flattened_time_window_statuses = []\n for status, subset in prioritized_subsets:\n subset_time_window_statuses = [\n PartitionTimeWindowStatus(tw, status) for tw in subset.included_time_windows\n ]\n flattened_time_window_statuses = _flatten(\n flattened_time_window_statuses, subset_time_window_statuses\n )\n\n return flattened_time_window_statuses\n\n\ndef has_one_dimension_time_window_partitioning(\n partitions_def: Optional[PartitionsDefinition],\n) -> bool:\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return True\n elif isinstance(partitions_def, MultiPartitionsDefinition):\n time_window_dims = [\n dim\n for dim in partitions_def.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n if len(time_window_dims) == 1:\n return 
True\n\n return False\n\n\ndef get_time_partitions_def(\n partitions_def: Optional[PartitionsDefinition],\n) -> Optional[TimeWindowPartitionsDefinition]:\n """For a given PartitionsDefinition, return the associated TimeWindowPartitionsDefinition if it\n exists.\n """\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if partitions_def is None:\n return None\n elif isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return partitions_def\n elif isinstance(\n partitions_def, MultiPartitionsDefinition\n ) and has_one_dimension_time_window_partitioning(partitions_def):\n return cast(\n TimeWindowPartitionsDefinition, partitions_def.time_window_dimension.partitions_def\n )\n else:\n return None\n\n\ndef get_time_partition_key(\n partitions_def: Optional[PartitionsDefinition], partition_key: Optional[str]\n) -> str:\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if partitions_def is None or partition_key is None:\n check.failed(\n "Cannot get time partitions key from when partitions def is None or partition key is"\n " None"\n )\n elif isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return partition_key\n elif isinstance(partitions_def, MultiPartitionsDefinition):\n return partitions_def.get_partition_key_from_str(partition_key).keys_by_dimension[\n partitions_def.time_window_dimension.name\n ]\n else:\n check.failed(f"Cannot get time partition from non-time partitions def {partitions_def}")\n
", "current_page_name": "_modules/dagster/_core/definitions/time_window_partitions", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.time_window_partitions"}, "unresolved_asset_job_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.unresolved_asset_job_definition

\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Mapping, NamedTuple, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated\nfrom dagster._core.definitions import AssetKey\nfrom dagster._core.definitions.run_request import RunRequest\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.instance import DynamicPartitionsStore\n\nfrom .asset_layer import build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .metadata import RawMetadataValue\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import (\n        AssetSelection,\n        ExecutorDefinition,\n        HookDefinition,\n        JobDefinition,\n        PartitionedConfig,\n        PartitionsDefinition,\n        ResourceDefinition,\n    )\n    from dagster._core.definitions.asset_graph import InternalAssetGraph\n    from dagster._core.definitions.asset_selection import CoercibleToAssetSelection\n    from dagster._core.definitions.run_config import RunConfig\n\n\nclass UnresolvedAssetJobDefinition(\n    NamedTuple(\n        "_UnresolvedAssetJobDefinition",\n        [\n            ("name", str),\n            ("selection", "AssetSelection"),\n            (\n                "config",\n                Optional[Union[ConfigMapping, Mapping[str, Any], "PartitionedConfig"]],\n            ),\n            ("description", Optional[str]),\n            ("tags", Optional[Mapping[str, Any]]),\n            ("metadata", Optional[Mapping[str, RawMetadataValue]]),\n            ("partitions_def", Optional["PartitionsDefinition"]),\n            ("executor_def", Optional["ExecutorDefinition"]),\n            ("hooks", Optional[AbstractSet["HookDefinition"]]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        selection: "AssetSelection",\n        config: Optional[\n            Union[ConfigMapping, Mapping[str, Any], 
"PartitionedConfig", "RunConfig"]\n        ] = None,\n        description: Optional[str] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet["HookDefinition"]] = None,\n    ):\n        from dagster._core.definitions import (\n            AssetSelection,\n            ExecutorDefinition,\n            HookDefinition,\n            PartitionsDefinition,\n        )\n        from dagster._core.definitions.run_config import convert_config_input\n\n        return super(UnresolvedAssetJobDefinition, cls).__new__(\n            cls,\n            name=check.str_param(name, "name"),\n            selection=check.inst_param(selection, "selection", AssetSelection),\n            config=convert_config_input(config),\n            description=check.opt_str_param(description, "description"),\n            tags=check.opt_mapping_param(tags, "tags"),\n            metadata=check.opt_mapping_param(metadata, "metadata"),\n            partitions_def=check.opt_inst_param(\n                partitions_def, "partitions_def", PartitionsDefinition\n            ),\n            executor_def=check.opt_inst_param(executor_def, "partitions_def", ExecutorDefinition),\n            hooks=check.opt_nullable_set_param(hooks, "hooks", of_type=HookDefinition),\n        )\n\n    @deprecated(\n        breaking_version="2.0.0",\n        additional_warn_text="Directly instantiate `RunRequest(partition_key=...)` instead.",\n    )\n    def run_request_for_partition(\n        self,\n        partition_key: str,\n        run_key: Optional[str] = None,\n        tags: Optional[Mapping[str, str]] = None,\n        asset_selection: Optional[Sequence[AssetKey]] = None,\n        run_config: Optional[Mapping[str, Any]] = None,\n        current_time: Optional[datetime] = None,\n        
dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n    ) -> RunRequest:\n        """Creates a RunRequest object for a run that processes the given partition.\n\n        Args:\n            partition_key: The key of the partition to request a run for.\n            run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n                only one run is created per run key across all sensor evaluations.  For schedules,\n                ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n                value means that a run will always be launched per evaluation.\n            tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n                to the launched run.\n            run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n                a :py:class:`PartitionedConfig`, this value will override replace the config\n                provided by it.\n            current_time (Optional[datetime]): Used to determine which time-partitions exist.\n                Defaults to now.\n            dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n                object that is responsible for fetching dynamic partitions. Required when the\n                partitions definition is a DynamicPartitionsDefinition with a name defined. 
Users\n                can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n        Returns:\n            RunRequest: an object that requests a run to process the given partition.\n        """\n        from dagster._core.definitions.partition import (\n            DynamicPartitionsDefinition,\n            PartitionedConfig,\n        )\n\n        if not self.partitions_def:\n            check.failed("Called run_request_for_partition on a non-partitioned job")\n\n        partitioned_config = PartitionedConfig.from_flexible_config(\n            self.config, self.partitions_def\n        )\n\n        if (\n            isinstance(self.partitions_def, DynamicPartitionsDefinition)\n            and self.partitions_def.name\n        ):\n            # Do not support using run_request_for_partition with dynamic partitions,\n            # since this requires querying the instance once per run request for the\n            # existent dynamic partitions\n            check.failed(\n                "run_request_for_partition is not supported for dynamic partitions. 
Instead, use"\n                " RunRequest(partition_key=...)"\n            )\n\n        self.partitions_def.validate_partition_key(\n            partition_key,\n            current_time=current_time,\n            dynamic_partitions_store=dynamic_partitions_store,\n        )\n\n        run_config = (\n            run_config\n            if run_config is not None\n            else partitioned_config.get_run_config_for_partition_key(partition_key)\n        )\n        run_request_tags = {\n            **(tags or {}),\n            **partitioned_config.get_tags_for_partition_key(partition_key),\n        }\n\n        return RunRequest(\n            job_name=self.name,\n            run_key=run_key,\n            run_config=run_config,\n            tags=run_request_tags,\n            asset_selection=asset_selection,\n            partition_key=partition_key,\n        )\n\n    def resolve(\n        self,\n        asset_graph: "InternalAssetGraph",\n        default_executor_def: Optional["ExecutorDefinition"] = None,\n        resource_defs: Optional[Mapping[str, "ResourceDefinition"]] = None,\n    ) -> "JobDefinition":\n        """Resolve this UnresolvedAssetJobDefinition into a JobDefinition."""\n        assets = asset_graph.assets\n        source_assets = asset_graph.source_assets\n        selected_asset_keys = self.selection.resolve(asset_graph)\n        selected_asset_checks = self.selection.resolve_checks(asset_graph)\n\n        asset_keys_by_partitions_def = defaultdict(set)\n        for asset_key in selected_asset_keys:\n            partitions_def = asset_graph.get_partitions_def(asset_key)\n            if partitions_def is not None:\n                asset_keys_by_partitions_def[partitions_def].add(asset_key)\n\n        if len(asset_keys_by_partitions_def) > 1:\n            keys_by_partitions_def_str = "\\n".join(\n                f"{partitions_def}: {asset_keys}"\n                for partitions_def, asset_keys in asset_keys_by_partitions_def.items()\n            )\n   
         raise DagsterInvalidDefinitionError(\n                f"Multiple partitioned assets exist in assets job '{self.name}'. Selected assets"\n                " must have the same partitions definitions, but the selected assets have"\n                f" different partitions definitions: \\n{keys_by_partitions_def_str}"\n            )\n\n        inferred_partitions_def = (\n            next(iter(asset_keys_by_partitions_def.keys()))\n            if asset_keys_by_partitions_def\n            else None\n        )\n        if (\n            inferred_partitions_def\n            and self.partitions_def != inferred_partitions_def\n            and self.partitions_def is not None\n        ):\n            raise DagsterInvalidDefinitionError(\n                f"Job '{self.name}' received a partitions_def of {self.partitions_def}, but the"\n                f" selected assets {next(iter(asset_keys_by_partitions_def.values()))} have a"\n                f" non-matching partitions_def of {inferred_partitions_def}"\n            )\n\n        return build_asset_selection_job(\n            name=self.name,\n            assets=assets,\n            asset_checks=asset_graph.asset_checks,\n            config=self.config,\n            source_assets=source_assets,\n            description=self.description,\n            tags=self.tags,\n            metadata=self.metadata,\n            asset_selection=selected_asset_keys,\n            asset_check_selection=selected_asset_checks,\n            partitions_def=self.partitions_def if self.partitions_def else inferred_partitions_def,\n            executor_def=self.executor_def or default_executor_def,\n            hooks=self.hooks,\n            resource_defs=resource_defs,\n        )\n\n\n
[docs]def define_asset_job(\n name: str,\n selection: Optional["CoercibleToAssetSelection"] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, Any], "PartitionedConfig", "RunConfig"]\n ] = None,\n description: Optional[str] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet["HookDefinition"]] = None,\n) -> UnresolvedAssetJobDefinition:\n """Creates a definition of a job which will either materialize a selection of assets or observe\n a selection of source assets. This will only be resolved to a JobDefinition once placed in a\n code location.\n\n Args:\n name (str):\n The name for the job.\n selection (Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]):\n The assets that will be materialized or observed when the job is run.\n\n The selected assets must all be included in the assets that are passed to the assets\n argument of the Definitions object that this job is included on.\n\n The string "my_asset*" selects my_asset and all downstream assets within the code\n location. A list of strings represents the union of all assets selected by strings\n within the list.\n\n The selection will be resolved to a set of assets when the location is loaded. If the\n selection resolves to all source assets, the created job will perform source asset\n observations. If the selection resolves to all regular assets, the created job will\n materialize assets. 
If the selection resolves to a mixed set of source assets and\n regular assets, an error will be thrown.\n\n config:\n Describes how the Job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n tags (Optional[Mapping[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Mapping[str, RawMetadataValue]]): Arbitrary metadata about the job.\n Keys are displayed string labels, and values are one of the following: string, float,\n int, JSON-serializable dict, JSON-serializable list, and one of the data classes\n returned by a MetadataValue static method.\n description (Optional[str]):\n A description for the Job.\n partitions_def (Optional[PartitionsDefinition]):\n Defines the set of partitions for this job. All AssetDefinitions selected for this job\n must have a matching PartitionsDefinition. If no PartitionsDefinition is provided, the\n PartitionsDefinition will be inferred from the selected AssetDefinitions.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. 
Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n\n\n Returns:\n UnresolvedAssetJobDefinition: The job, which can be placed inside a code location.\n\n Examples:\n .. code-block:: python\n\n # A job that targets all assets in the code location:\n @asset\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets")],\n )\n\n # A job that targets a single asset\n @asset\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets", selection=[asset1])],\n )\n\n # A job that targets all the assets in a group:\n defs = Definitions(\n assets=assets,\n jobs=[define_asset_job("marketing_job", selection=AssetSelection.groups("marketing"))],\n )\n\n @observable_source_asset\n def source_asset():\n ...\n\n # A job that observes a source asset:\n defs = Definitions(\n assets=assets,\n jobs=[define_asset_job("observation_job", selection=[source_asset])],\n )\n\n # Resources are supplied to the assets, not the job:\n @asset(required_resource_keys={"slack_client"})\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets")],\n resources={"slack_client": prod_slack_client},\n )\n\n """\n from dagster._core.definitions import AssetSelection\n\n # convert string-based selections to AssetSelection objects\n if selection is None:\n resolved_selection = AssetSelection.all()\n else:\n resolved_selection = AssetSelection.from_coercible(selection)\n\n return UnresolvedAssetJobDefinition(\n name=name,\n selection=resolved_selection,\n config=config,\n description=description,\n tags=tags,\n metadata=metadata,\n partitions_def=partitions_def,\n executor_def=executor_def,\n hooks=hooks,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/unresolved_asset_job_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.unresolved_asset_job_definition"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.utils

\nimport keyword\nimport os\nimport re\nfrom glob import glob\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, cast\n\nimport yaml\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.storage.tags import check_reserved_tags\nfrom dagster._utils.yaml_utils import merge_yaml_strings, merge_yamls\n\nDEFAULT_OUTPUT = "result"\nDEFAULT_GROUP_NAME = "default"  # asset group_name used when none is provided\nDEFAULT_IO_MANAGER_KEY = "io_manager"\n\nDISALLOWED_NAMES = set(\n    [\n        "context",\n        "conf",\n        "config",\n        "meta",\n        "arg_dict",\n        "dict",\n        "input_arg_dict",\n        "output_arg_dict",\n        "int",\n        "str",\n        "float",\n        "bool",\n        "input",\n        "output",\n        "type",\n    ]\n    + list(keyword.kwlist)  # just disallow all python keywords\n)\n\nVALID_NAME_REGEX_STR = r"^[A-Za-z0-9_]+$"\nVALID_NAME_REGEX = re.compile(VALID_NAME_REGEX_STR)\n\n\nclass NoValueSentinel:\n    """Sentinel value to distinguish unset from None."""\n\n\ndef has_valid_name_chars(name: str) -> bool:\n    return bool(VALID_NAME_REGEX.match(name))\n\n\ndef check_valid_name(name: str, allow_list: Optional[List[str]] = None) -> str:\n    check.str_param(name, "name")\n\n    if allow_list and name in allow_list:\n        return name\n\n    if name in DISALLOWED_NAMES:\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. It conflicts with a Dagster or python'\n            " reserved keyword."\n        )\n\n    check_valid_chars(name)\n\n    check.invariant(is_valid_name(name))\n    return name\n\n\ndef check_valid_chars(name: str):\n    if not has_valid_name_chars(name):\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. 
Names must be in regex'\n            f" {VALID_NAME_REGEX_STR}."\n        )\n\n\ndef is_valid_name(name: str) -> bool:\n    check.str_param(name, "name")\n\n    return name not in DISALLOWED_NAMES and has_valid_name_chars(name)\n\n\ndef _kv_str(key: object, value: object) -> str:\n    return f'{key}="{value!r}"'\n\n\ndef struct_to_string(name: str, **kwargs: object) -> str:\n    # Sort the kwargs to ensure consistent representations across Python versions\n    props_str = ", ".join([_kv_str(key, value) for key, value in sorted(kwargs.items())])\n    return f"{name}({props_str})"\n\n\ndef validate_tags(\n    tags: Optional[Mapping[str, Any]], allow_reserved_tags: bool = True\n) -> Mapping[str, str]:\n    valid_tags: Dict[str, str] = {}\n    for key, value in check.opt_mapping_param(tags, "tags", key_type=str).items():\n        if not isinstance(value, str):\n            valid = False\n            err_reason = f'Could not JSON encode value "{value}"'\n            str_val = None\n            try:\n                str_val = seven.json.dumps(value)\n                err_reason = (\n                    'JSON encoding "{json}" of value "{val}" is not equivalent to original value'\n                    .format(json=str_val, val=value)\n                )\n\n                valid = seven.json.loads(str_val) == value\n            except Exception:\n                pass\n\n            if not valid:\n                raise DagsterInvalidDefinitionError(\n                    f'Invalid value for tag "{key}", {err_reason}. 
Tag values must be strings '\n                    "or meet the constraint that json.loads(json.dumps(value)) == value."\n                )\n\n            valid_tags[key] = str_val  # type: ignore  # (possible none)\n        else:\n            valid_tags[key] = value\n\n    if not allow_reserved_tags:\n        check_reserved_tags(valid_tags)\n\n    return valid_tags\n\n\ndef validate_group_name(group_name: Optional[str]) -> str:\n    """Ensures a string name is valid and returns a default if no name provided."""\n    if group_name:\n        check_valid_chars(group_name)\n        return group_name\n    return DEFAULT_GROUP_NAME\n\n\n
[docs]def config_from_files(config_files: Sequence[str]) -> Mapping[str, Any]:\n """Constructs run config from YAML files.\n\n Args:\n config_files (List[str]): List of paths or glob patterns for yaml files\n to load and parse as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from provided YAML files.\n\n Raises:\n FileNotFoundError: When a config file produces no results\n DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse\n error.\n """\n config_files = check.opt_sequence_param(config_files, "config_files")\n\n filenames = []\n for file_glob in config_files or []:\n globbed_files = glob(file_glob)\n if not globbed_files:\n raise DagsterInvariantViolationError(\n f'File or glob pattern "{file_glob}" for "config_files" produced no results.'\n )\n\n filenames += [os.path.realpath(globbed_file) for globbed_file in globbed_files]\n\n try:\n run_config = merge_yamls(filenames)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing files {filenames} "\n f"loaded by file/patterns {config_files}."\n ) from err\n\n return check.is_dict(cast(Dict[str, object], run_config), key_type=str)
\n\n\n
[docs]def config_from_yaml_strings(yaml_strings: Sequence[str]) -> Mapping[str, Any]:\n """Static constructor for run configs from YAML strings.\n\n Args:\n yaml_strings (List[str]): List of yaml strings to parse as the run config.\n\n Returns:\n Dict[Str, Any]: A run config dictionary constructed from the provided yaml strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n yaml_strings = check.sequence_param(yaml_strings, "yaml_strings", of_type=str)\n\n try:\n run_config = merge_yaml_strings(yaml_strings)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing YAMLs {yaml_strings} "\n ) from err\n\n return check.is_dict(cast(Dict[str, object], run_config), key_type=str)
\n\n\n
[docs]def config_from_pkg_resources(pkg_resource_defs: Sequence[Tuple[str, str]]) -> Mapping[str, Any]:\n """Load a run config from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n .. code-block:: python\n\n config_from_pkg_resources(\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n pkg_resource_defs (List[(str, str)]): List of pkg_resource modules/files to\n load as the run config.\n\n Returns:\n Dict[Str, Any]: A run config dictionary constructed from the provided yaml strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n import pkg_resources # expensive, import only on use\n\n pkg_resource_defs = check.sequence_param(pkg_resource_defs, "pkg_resource_defs", of_type=tuple)\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs}."\n ) from err\n\n return config_from_yaml_strings(yaml_strings=yaml_strings)
\n
", "current_page_name": "_modules/dagster/_core/definitions/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.utils"}, "version_strategy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.version_strategy

\nimport hashlib\nimport inspect\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Any, NamedTuple, Optional\n\nfrom dagster._annotations import public\n\nif TYPE_CHECKING:\n    from .op_definition import OpDefinition\n    from .resource_definition import ResourceDefinition\n\n\n
[docs]class OpVersionContext(NamedTuple):\n """Provides execution-time information for computing the version for an op.\n\n Attributes:\n op_def (OpDefinition): The definition of the op to compute a version for.\n op_config (Any): The parsed config to be passed to the op during execution.\n """\n\n op_def: "OpDefinition"\n op_config: Any
\n\n\n
[docs]class ResourceVersionContext(NamedTuple):\n """Provides execution-time information for computing the version for a resource.\n\n Attributes:\n resource_def (ResourceDefinition): The definition of the resource whose version will be computed.\n resource_config (Any): The parsed config to be passed to the resource during execution.\n """\n\n resource_def: "ResourceDefinition"\n resource_config: Any
\n\n\n
[docs]class VersionStrategy(ABC):\n """Abstract class for defining a strategy to version ops and resources.\n\n When subclassing, `get_op_version` must be implemented, and\n `get_resource_version` can be optionally implemented.\n\n `get_op_version` should ingest an OpVersionContext, and `get_resource_version` should ingest a\n ResourceVersionContext. From that, each synthesize a unique string called\n a `version`, which will\n be tagged to outputs of that op in the job. Providing a\n `VersionStrategy` instance to a\n job will enable memoization on that job, such that only steps whose\n outputs do not have an up-to-date version will run.\n """\n\n
[docs] @public\n @abstractmethod\n def get_op_version(self, context: OpVersionContext) -> str:\n """Computes a version for an op.\n\n Args:\n context (OpVersionContext): The context for computing the version.\n\n Returns:\n str: The version for the op.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n """Computes a version for a resource.\n\n Args:\n context (ResourceVersionContext): The context for computing the version.\n\n Returns:\n Optional[str]: The version for the resource. If None, the resource will not be\n memoized.\n """\n return None
\n\n\n
[docs]class SourceHashVersionStrategy(VersionStrategy):\n """VersionStrategy that checks for changes to the source code of ops and resources.\n\n Only checks for changes within the immediate body of the op/resource's\n decorated function (or compute function, if the op/resource was\n constructed directly from a definition).\n """\n\n def _get_source_hash(self, fn):\n code_as_str = inspect.getsource(fn)\n return hashlib.sha1(code_as_str.encode("utf-8")).hexdigest()\n\n
[docs] @public\n def get_op_version(self, context: OpVersionContext) -> str:\n """Computes a version for an op by hashing its source code.\n\n Args:\n context (OpVersionContext): The context for computing the version.\n\n Returns:\n str: The version for the op.\n """\n compute_fn = context.op_def.compute_fn\n if callable(compute_fn):\n return self._get_source_hash(compute_fn)\n else:\n return self._get_source_hash(compute_fn.decorated_fn)
\n\n
[docs] @public\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n """Computes a version for a resource by hashing its source code.\n\n Args:\n context (ResourceVersionContext): The context for computing the version.\n\n Returns:\n Optional[str]: The version for the resource. If None, the resource will not be\n memoized.\n """\n return self._get_source_hash(context.resource_def.resource_fn)
\n
", "current_page_name": "_modules/dagster/_core/definitions/version_strategy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.version_strategy"}}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.errors

\n"""Core Dagster error classes.\n\nAll errors thrown by the Dagster framework inherit from :py:class:`~dagster.DagsterError`. Users\nshould not subclass this base class for their own exceptions.\n\nThere is another exception base class, :py:class:`~dagster.DagsterUserCodeExecutionError`, which is\nused by the framework in concert with the :py:func:`~dagster._core.errors.user_code_error_boundary`.\n\nDagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\n:py:class:`~dagster.DagsterUserCodeExecutionError`.\n\nThe wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.\n"""\n\nimport sys\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, Type\n\nimport dagster._check as check\nfrom dagster._utils.interrupts import raise_interrupts_as\n\nif TYPE_CHECKING:\n    from dagster._core.log_manager import DagsterLogManager\n\n\nclass DagsterExecutionInterruptedError(BaseException):\n    """Pipeline execution was interrupted during the execution process.\n\n    Just like KeyboardInterrupt this inherits from BaseException\n    as to not be accidentally caught by code that catches Exception\n    and thus prevent the interpreter from exiting.\n    """\n\n\n
[docs]class DagsterError(Exception):\n """Base class for all errors thrown by the Dagster framework.\n\n Users should not subclass this base class for their own exceptions.\n """\n\n @property\n def is_user_code_error(self):\n """Returns true if this error is attributable to user code."""\n return False
\n\n\n
[docs]class DagsterInvalidDefinitionError(DagsterError):\n """Indicates that the rules for a definition have been violated by the user."""
\n\n\nclass DagsterInvalidObservationError(DagsterError):\n """Indicates that an invalid value was returned from a source asset observation function."""\n\n\n
[docs]class DagsterInvalidSubsetError(DagsterError):\n """Indicates that a subset of a pipeline is invalid because either:\n - One or more ops in the specified subset do not exist on the job.'\n - The subset produces an invalid job.\n """
\n\n\nclass DagsterInvalidDeserializationVersionError(DagsterError):\n """Indicates that a serialized value has an unsupported version and cannot be deserialized."""\n\n\nPYTHONIC_CONFIG_ERROR_VERBIAGE = """\nThis config type can be a:\n - Python primitive type\n - int, float, bool, str, list\n - A Python Dict or List type containing other valid types\n - Custom data classes extending dagster.Config\n - A Pydantic discriminated union type (https://docs.pydantic.dev/usage/types/#discriminated-unions-aka-tagged-unions)\n"""\n\nPYTHONIC_RESOURCE_ADDITIONAL_TYPES = """\n\nIf this config type represents a resource dependency, its annotation must either:\n - Extend dagster.ConfigurableResource, dagster.ConfigurableIOManager, or\n - Be wrapped in a ResourceDependency annotation, e.g. ResourceDependency[{invalid_type_str}]\n"""\n\n\ndef _generate_pythonic_config_error_message(\n config_class: Optional[Type],\n field_name: Optional[str],\n invalid_type: Any,\n is_resource: bool = False,\n) -> str:\n invalid_type_name = getattr(invalid_type, "__name__", "<my type>")\n pythonic_config_error_verbiage = (\n PYTHONIC_CONFIG_ERROR_VERBIAGE + (PYTHONIC_RESOURCE_ADDITIONAL_TYPES if is_resource else "")\n ).format(invalid_type_str=invalid_type_name)\n\n return ("""\nError defining Dagster config class{config_class}{field_name}.\nUnable to resolve config type {invalid_type} to a supported Dagster config type.\n\n{PYTHONIC_CONFIG_ERROR_VERBIAGE}""").format(\n config_class=f" {config_class!r}" if config_class else "",\n field_name=f" on field '{field_name}'" if field_name else "",\n invalid_type=repr(invalid_type),\n PYTHONIC_CONFIG_ERROR_VERBIAGE=pythonic_config_error_verbiage,\n )\n\n\nclass DagsterInvalidPythonicConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a Pythonic config or resource class with an invalid value."""\n\n def __init__(\n self,\n config_class: Optional[Type],\n field_name: Optional[str],\n invalid_type: Any,\n is_resource: 
bool = False,\n **kwargs,\n ):\n self.invalid_type = invalid_type\n self.field_name = field_name\n self.config_class = config_class\n super(DagsterInvalidPythonicConfigDefinitionError, self).__init__(\n _generate_pythonic_config_error_message(\n config_class=config_class,\n field_name=field_name,\n invalid_type=invalid_type,\n is_resource=is_resource,\n ),\n **kwargs,\n )\n\n\nclass DagsterInvalidDagsterTypeInPythonicConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a Pythonic config or resource class with a DagsterType\n annotated field.\n """\n\n def __init__(\n self,\n config_class_name: str,\n field_name: Optional[str],\n **kwargs,\n ):\n self.field_name = field_name\n super(DagsterInvalidDagsterTypeInPythonicConfigDefinitionError, self).__init__(\n f"""Error defining Dagster config class '{config_class_name}' on field '{field_name}'. DagsterTypes cannot be used to annotate a config type. DagsterType is meant only for type checking and coercion in op and asset inputs and outputs.\n{PYTHONIC_CONFIG_ERROR_VERBIAGE}""",\n **kwargs,\n )\n\n\nCONFIG_ERROR_VERBIAGE = """\nThis value can be a:\n - Field\n - Python primitive types that resolve to dagster config types\n - int, float, bool, str, list.\n - A dagster config type: Int, Float, Bool, Array, Optional, Selector, Shape, Permissive, Map\n - A bare python dictionary, which is wrapped in Field(Shape(...)). Any values\n in the dictionary get resolved by the same rules, recursively.\n - A python list with a single entry that can resolve to a type, e.g. [int]\n"""\n\n\n
[docs]class DagsterInvalidConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a config with an invalid value.\n\n Acceptable values for config types are any of:\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type: :py:data:`~dagster.Int`, :py:data:`~dagster.Float`,\n :py:data:`~dagster.Bool`, :py:data:`~dagster.String`,\n :py:data:`~dagster.StringSource`, :py:data:`~dagster.Any`,\n :py:class:`~dagster.Array`, :py:data:`~dagster.Noneable`, :py:data:`~dagster.Enum`,\n :py:class:`~dagster.Selector`, :py:class:`~dagster.Shape`, or\n :py:class:`~dagster.Permissive`.\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n 5. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self, original_root, current_value, stack, reason=None, **kwargs):\n self.original_root = original_root\n self.current_value = current_value\n self.stack = stack\n super(DagsterInvalidConfigDefinitionError, self).__init__(\n (\n "Error defining config. Original value passed: {original_root}. "\n "{stack_str}{current_value} "\n "cannot be resolved.{reason_str}"\n + CONFIG_ERROR_VERBIAGE\n ).format(\n original_root=repr(original_root),\n stack_str="Error at stack path :" + ":".join(stack) + ". " if stack else "",\n current_value=repr(current_value),\n reason_str=f" Reason: {reason}." if reason else "",\n ),\n **kwargs,\n )
\n\n\n
[docs]class DagsterInvariantViolationError(DagsterError):\n """Indicates the user has violated a well-defined invariant that can only be enforced\n at runtime.\n """
\n\n\n
[docs]class DagsterExecutionStepNotFoundError(DagsterError):\n """Thrown when the user specifies execution step keys that do not exist."""\n\n def __init__(self, *args, **kwargs):\n self.step_keys = check.list_param(kwargs.pop("step_keys"), "step_keys", str)\n super(DagsterExecutionStepNotFoundError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterExecutionPlanSnapshotNotFoundError(DagsterError):\n """Thrown when an expected execution plan snapshot could not be found on a PipelineRun."""\n\n\n
[docs]class DagsterRunNotFoundError(DagsterError):\n """Thrown when a run cannot be found in run storage."""\n\n def __init__(self, *args, **kwargs):\n self.invalid_run_id = check.str_param(kwargs.pop("invalid_run_id"), "invalid_run_id")\n super(DagsterRunNotFoundError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterStepOutputNotFoundError(DagsterError):\n """Indicates that previous step outputs required for an execution step to proceed are not\n available.\n """\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterStepOutputNotFoundError, self).__init__(*args, **kwargs)
\n\n\n@contextmanager\ndef raise_execution_interrupts() -> Iterator[None]:\n with raise_interrupts_as(DagsterExecutionInterruptedError):\n yield\n\n\n
[docs]@contextmanager\ndef user_code_error_boundary(\n error_cls: Type["DagsterUserCodeExecutionError"],\n msg_fn: Callable[[], str],\n log_manager: Optional["DagsterLogManager"] = None,\n **kwargs: object,\n) -> Iterator[None]:\n """Wraps the execution of user-space code in an error boundary. This places a uniform\n policy around any user code invoked by the framework. This ensures that all user\n errors are wrapped in an exception derived from DagsterUserCodeExecutionError,\n and that the original stack trace of the user error is preserved, so that it\n can be reported without confusing framework code in the stack trace, if a\n tool author wishes to do so.\n\n Examples:\n .. code-block:: python\n\n with user_code_error_boundary(\n # Pass a class that inherits from DagsterUserCodeExecutionError\n DagsterExecutionStepExecutionError,\n # Pass a function that produces a message\n "Error occurred during step execution"\n ):\n call_user_provided_function()\n\n """\n check.callable_param(msg_fn, "msg_fn")\n check.class_param(error_cls, "error_cls", superclass=DagsterUserCodeExecutionError)\n\n with raise_execution_interrupts():\n if log_manager:\n log_manager.begin_python_log_capture()\n try:\n yield\n except DagsterError as de:\n # The system has thrown an error that is part of the user-framework contract\n raise de\n except Exception as e:\n # An exception has been thrown by user code and computation should cease\n # with the error reported further up the stack\n raise error_cls(\n msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs\n ) from e\n finally:\n if log_manager:\n log_manager.end_python_log_capture()
\n\n\n
[docs]class DagsterUserCodeExecutionError(DagsterError):\n """This is the base class for any exception that is meant to wrap an\n :py:class:`~python:Exception` thrown by user code. It wraps that existing user code.\n The ``original_exc_info`` argument to the constructor is meant to be a tuple of the type\n returned by :py:func:`sys.exc_info <python:sys.exc_info>` at the call site of the constructor.\n\n Users should not subclass this base class for their own exceptions and should instead throw\n freely from user code. User exceptions will be automatically wrapped and rethrown.\n """\n\n def __init__(self, *args, **kwargs):\n # original_exc_info should be gotten from a sys.exc_info() call at the\n # callsite inside of the exception handler. this will allow consuming\n # code to *re-raise* the user error in it's original format\n # for cleaner error reporting that does not have framework code in it\n user_exception = check.inst_param(kwargs.pop("user_exception"), "user_exception", Exception)\n original_exc_info = check.tuple_param(kwargs.pop("original_exc_info"), "original_exc_info")\n\n check.invariant(original_exc_info[0] is not None)\n\n super(DagsterUserCodeExecutionError, self).__init__(args[0], *args[1:], **kwargs)\n\n self.user_exception = check.opt_inst_param(user_exception, "user_exception", Exception)\n self.original_exc_info = original_exc_info\n\n @property\n def is_user_code_error(self) -> bool:\n return True
\n\n\n
[docs]class DagsterTypeCheckError(DagsterUserCodeExecutionError):\n """Indicates an error in the op type system at runtime. E.g. a op receives an\n unexpected input, or produces an output that does not match the type of the output definition.\n """
\n\n\nclass DagsterExecutionLoadInputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while loading an input for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.input_name = check.str_param(kwargs.pop("input_name"), "input_name")\n super(DagsterExecutionLoadInputError, self).__init__(*args, **kwargs)\n\n\nclass DagsterExecutionHandleOutputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while handling an output for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterExecutionHandleOutputError, self).__init__(*args, **kwargs)\n\n\n
[docs]class DagsterExecutionStepExecutionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of an execution step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.op_name = check.str_param(kwargs.pop("op_name"), "op_name")\n self.op_def_name = check.str_param(kwargs.pop("op_def_name"), "op_def_name")\n super(DagsterExecutionStepExecutionError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterResourceFunctionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of the ``resource_fn`` in a\n :py:class:`~dagster.ResourceDefinition` during resource initialization.\n """
\n\n\n
[docs]class DagsterConfigMappingFunctionError(DagsterUserCodeExecutionError):\n """Indicates that an unexpected error occurred while executing the body of a config mapping\n function defined in a :py:class:`~dagster.JobDefinition` or `~dagster.GraphDefinition` during\n config parsing.\n """
\n\n\nclass DagsterTypeLoadingError(DagsterUserCodeExecutionError):\n """Indicates that an unexpected error occurred while executing the body of an type load\n function defined in a :py:class:`~dagster.DagsterTypeLoader` during loading of a custom type.\n """\n\n\n
[docs]class DagsterUnknownResourceError(DagsterError, AttributeError):\n # inherits from AttributeError as it is raised within a __getattr__ call... used to support\n # object hasattr method\n """Indicates that an unknown resource was accessed in the body of an execution step. May often\n happen by accessing a resource in the compute function of an op without first supplying the\n op with the correct `required_resource_keys` argument.\n """\n\n def __init__(self, resource_name, *args, **kwargs):\n self.resource_name = check.str_param(resource_name, "resource_name")\n msg = (\n f"Unknown resource `{resource_name}`. Specify `{resource_name}` as a required resource "\n "on the compute / config function that accessed it."\n )\n super(DagsterUnknownResourceError, self).__init__(msg, *args, **kwargs)
\n\n\nclass DagsterInvalidInvocationError(DagsterError):\n """Indicates that an error has occurred when an op has been invoked, but before the actual\n core compute has been reached.\n """\n\n\n
[docs]class DagsterInvalidConfigError(DagsterError):\n """Thrown when provided config is invalid (does not type check against the relevant config\n schema).\n """\n\n def __init__(self, preamble, errors, config_value, *args, **kwargs):\n from dagster._config import EvaluationError\n\n check.str_param(preamble, "preamble")\n self.errors = check.list_param(errors, "errors", of_type=EvaluationError)\n self.config_value = config_value\n\n error_msg = preamble\n error_messages = []\n\n for i_error, error in enumerate(self.errors):\n error_messages.append(error.message)\n error_msg += f"\\n Error {i_error + 1}: {error.message}"\n\n self.message = error_msg\n self.error_messages = error_messages\n\n super(DagsterInvalidConfigError, self).__init__(error_msg, *args, **kwargs)
\n\n\n
[docs]class DagsterUnmetExecutorRequirementsError(DagsterError):\n """Indicates the resolved executor is incompatible with the state of other systems\n such as the :py:class:`~dagster._core.instance.DagsterInstance` or system storage configuration.\n """
\n\n\n
[docs]class DagsterSubprocessError(DagsterError):\n """An exception has occurred in one or more of the child processes dagster manages.\n This error forwards the message and stack trace for all of the collected errors.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.subprocess_error_infos = check.list_param(\n kwargs.pop("subprocess_error_infos"), "subprocess_error_infos", SerializableErrorInfo\n )\n super(DagsterSubprocessError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterUserCodeUnreachableError(DagsterError):\n """Dagster was unable to reach a user code server to fetch information about user code."""\n\n\nclass DagsterUserCodeProcessError(DagsterError):\n """An exception has occurred in a user code process that the host process raising this error\n was communicating with.\n """\n\n @staticmethod\n def from_error_info(error_info):\n from dagster._utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterUserCodeProcessError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterUserCodeProcessError, self).__init__(*args, **kwargs)\n\n\nclass DagsterMaxRetriesExceededError(DagsterError):\n """Raised when raise_on_error is true, and retries were exceeded, this error should be raised."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterMaxRetriesExceededError, self).__init__(*args, **kwargs)\n\n @staticmethod\n def from_error_info(error_info):\n from dagster._utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterMaxRetriesExceededError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n\nclass DagsterCodeLocationNotFoundError(DagsterError):\n pass\n\n\nclass DagsterCodeLocationLoadError(DagsterError):\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.load_error_infos = 
check.list_param(\n kwargs.pop("load_error_infos"),\n "load_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterCodeLocationLoadError, self).__init__(*args, **kwargs)\n\n\nclass DagsterLaunchFailedError(DagsterError):\n """Indicates an error while attempting to launch a pipeline run."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterLaunchFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterBackfillFailedError(DagsterError):\n """Indicates an error while attempting to launch a backfill."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterBackfillFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterRunAlreadyExists(DagsterError):\n """Indicates that a pipeline run already exists in a run storage."""\n\n\nclass DagsterSnapshotDoesNotExist(DagsterError):\n """Indicates you attempted to create a pipeline run with a nonexistent snapshot id."""\n\n\nclass DagsterRunConflict(DagsterError):\n """Indicates that a conflicting pipeline run exists in a run storage."""\n\n\n
[docs]class DagsterTypeCheckDidNotPass(DagsterError):\n """Indicates that a type check failed.\n\n This is raised when ``raise_on_error`` is ``True`` in calls to the synchronous job and\n graph execution APIs (e.g. `graph.execute_in_process()`, `job.execute_in_process()` -- typically\n within a test), and a :py:class:`~dagster.DagsterType`'s type check fails by returning either\n ``False`` or an instance of :py:class:`~dagster.TypeCheck` whose ``success`` member is ``False``.\n """\n\n def __init__(self, description=None, metadata=None, dagster_type=None):\n from dagster import DagsterType\n from dagster._core.definitions.metadata import normalize_metadata\n\n super(DagsterTypeCheckDidNotPass, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n )\n self.dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)
\n\n\nclass DagsterAssetCheckFailedError(DagsterError):\n """Indicates than an asset check failed."""\n\n\n
[docs]class DagsterEventLogInvalidForRun(DagsterError):\n """Raised when the event logs for a historical run are malformed or invalid."""\n\n def __init__(self, run_id):\n self.run_id = check.str_param(run_id, "run_id")\n super(DagsterEventLogInvalidForRun, self).__init__(\n f"Event logs invalid for run id {run_id}"\n )
\n\n\nclass ScheduleExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of schedule."""\n\n\nclass SensorExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of a sensor (or its job)."""\n\n\nclass PartitionExecutionError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions of a partition set schedule."""\n\n\nclass DagsterInvalidAssetKey(DagsterError):\n """Error raised by invalid asset key."""\n\n\nclass DagsterInvalidMetadata(DagsterError):\n """Error raised by invalid metadata parameters."""\n\n\nclass HookExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined hook."""\n\n\nclass RunStatusSensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined run status sensor."""\n\n\nclass FreshnessPolicySensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined freshness policy sensor."""\n\n\nclass DagsterImportError(DagsterError):\n """Import error raised while importing user-code."""\n\n\nclass JobError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions for a defined Job."""\n\n\nclass DagsterUnknownStepStateError(DagsterError):\n """When job execution completes with steps in an unknown state."""\n\n\nclass DagsterObjectStoreError(DagsterError):\n """Errors during an object store operation."""\n\n\nclass DagsterInvalidPropertyError(DagsterError):\n """Indicates that an invalid property was accessed. 
May often happen by accessing a property\n that no longer exists after breaking changes.\n """\n\n\nclass DagsterHomeNotSetError(DagsterError):\n """The user has tried to use a command that requires an instance or invoke DagsterInstance.get()\n without setting DAGSTER_HOME env var.\n """\n\n\nclass DagsterUnknownPartitionError(DagsterError):\n """The user has tried to access run config for a partition name that does not exist."""\n\n\nclass DagsterUndefinedDataVersionError(DagsterError):\n """The user attempted to retrieve the most recent logical version for an asset, but no logical version is defined."""\n\n\nclass DagsterAssetBackfillDataLoadError(DagsterError):\n """Indicates that an asset backfill is now unloadable. May happen when (1) a code location containing\n targeted assets is unloadable or (2) and asset or an asset's partitions definition has been removed.\n """\n\n\nclass DagsterDefinitionChangedDeserializationError(DagsterError):\n """Indicates that a stored value can't be deserialized because the definition needed to interpret\n it has changed.\n """\n\n\nclass DagsterPipesExecutionError(DagsterError):\n """Indicates that an error occurred during the execution of an external process."""\n
", "current_page_name": "_modules/dagster/_core/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.errors"}, "event_api": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.event_api

\nfrom datetime import datetime\nfrom typing import Callable, Mapping, NamedTuple, Optional, Sequence, Union\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization, AssetObservation\nfrom dagster._core.errors import DagsterInvalidInvocationError\nfrom dagster._core.events import DagsterEventType\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._serdes import whitelist_for_serdes\n\nEventHandlerFn: TypeAlias = Callable[[EventLogEntry, str], None]\n\n\n
[docs]class RunShardedEventsCursor(NamedTuple):\n """Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\n performance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\n run-sharded storages, the id field is ignored, since they may not be unique across shards.\n """\n\n id: int\n run_updated_after: datetime
\n\n\n
[docs]@whitelist_for_serdes\nclass EventLogRecord(NamedTuple):\n """Internal representation of an event record, as stored in a\n :py:class:`~dagster._core.storage.event_log.EventLogStorage`.\n\n Users should not instantiate this class directly.\n """\n\n storage_id: PublicAttr[int]\n event_log_entry: PublicAttr[EventLogEntry]\n\n @property\n def run_id(self) -> str:\n return self.event_log_entry.run_id\n\n @property\n def timestamp(self) -> float:\n return self.event_log_entry.timestamp\n\n @property\n def asset_key(self) -> Optional[AssetKey]:\n dagster_event = self.event_log_entry.dagster_event\n if dagster_event:\n return dagster_event.asset_key\n\n return None\n\n @property\n def partition_key(self) -> Optional[str]:\n dagster_event = self.event_log_entry.dagster_event\n if dagster_event:\n return dagster_event.partition\n\n return None\n\n @property\n def asset_materialization(self) -> Optional[AssetMaterialization]:\n return self.event_log_entry.asset_materialization\n\n @property\n def asset_observation(self) -> Optional[AssetObservation]:\n return self.event_log_entry.asset_observation
\n\n\n
[docs]@whitelist_for_serdes\nclass EventRecordsFilter(\n NamedTuple(\n "_EventRecordsFilter",\n [\n ("event_type", DagsterEventType),\n ("asset_key", Optional[AssetKey]),\n ("asset_partitions", Optional[Sequence[str]]),\n ("after_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("before_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("after_timestamp", Optional[float]),\n ("before_timestamp", Optional[float]),\n ("storage_ids", Optional[Sequence[int]]),\n ("tags", Optional[Mapping[str, Union[str, Sequence[str]]]]),\n ],\n )\n):\n """Defines a set of filter fields for fetching a set of event log entries or event log records.\n\n Args:\n event_type (DagsterEventType): Filter argument for dagster event type\n asset_key (Optional[AssetKey]): Asset key for which to get asset materialization event\n entries / records.\n asset_partitions (Optional[List[str]]): Filter parameter such that only asset\n events with a partition value matching one of the provided values. Only\n valid when the `asset_key` parameter is provided.\n after_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that only\n records with storage_id greater than the provided value are returned. Using a\n run-sharded events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n before_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that\n records with storage_id less than the provided value are returned. 
Using a run-sharded\n events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n after_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp greater than the provided value are returned.\n before_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp less than the provided value are returned.\n """\n\n def __new__(\n cls,\n event_type: DagsterEventType,\n asset_key: Optional[AssetKey] = None,\n asset_partitions: Optional[Sequence[str]] = None,\n after_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n before_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n after_timestamp: Optional[float] = None,\n before_timestamp: Optional[float] = None,\n storage_ids: Optional[Sequence[int]] = None,\n tags: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,\n ):\n check.opt_sequence_param(asset_partitions, "asset_partitions", of_type=str)\n check.inst_param(event_type, "event_type", DagsterEventType)\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str)\n if tags and event_type is not DagsterEventType.ASSET_MATERIALIZATION:\n raise DagsterInvalidInvocationError(\n "Can only filter by tags for asset materialization events"\n )\n\n # type-ignores work around mypy type inference bug\n return super(EventRecordsFilter, cls).__new__(\n cls,\n event_type=event_type,\n asset_key=check.opt_inst_param(asset_key, "asset_key", AssetKey),\n asset_partitions=asset_partitions,\n after_cursor=check.opt_inst_param(\n after_cursor, "after_cursor", (int, RunShardedEventsCursor)\n ),\n before_cursor=check.opt_inst_param(\n before_cursor, "before_cursor", (int, RunShardedEventsCursor)\n ),\n after_timestamp=check.opt_float_param(after_timestamp, "after_timestamp"),\n before_timestamp=check.opt_float_param(before_timestamp, "before_timestamp"),\n 
storage_ids=check.opt_nullable_sequence_param(storage_ids, "storage_ids", of_type=int),\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/_core/event_api", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.event_api"}, "events": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.events

\n"""Structured representations of system events."""\nimport logging\nimport os\nimport sys\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    HookDefinition,\n    NodeHandle,\n)\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.events import AssetLineageInfo, ObjectStoreOperationType\nfrom dagster._core.definitions.metadata import (\n    MetadataFieldSerializer,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import HookExecutionError\nfrom dagster._core.execution.context.system import IPlanContext, IStepContext, StepExecutionContext\nfrom dagster._core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster._core.execution.plan.inputs import StepInputData\nfrom dagster._core.execution.plan.objects import StepFailureData, StepRetryData, StepSuccessData\nfrom dagster._core.execution.plan.outputs import StepOutputData\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.captured_log_manager import CapturedLogContext\nfrom dagster._core.storage.dagster_run import DagsterRunStatus\nfrom dagster._serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.serdes import UnpackContext\nfrom dagster._utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info\nfrom dagster._utils.timing import format_duration\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.events import ObjectStoreOperation\n    from dagster._core.execution.plan.plan 
import ExecutionPlan\n    from dagster._core.execution.plan.step import StepKind\n\n\nEventSpecificData = Union[\n    StepOutputData,\n    StepFailureData,\n    StepSuccessData,\n    "StepMaterializationData",\n    "StepExpectationResultData",\n    StepInputData,\n    "EngineEventData",\n    "HookErroredData",\n    StepRetryData,\n    "JobFailureData",\n    "JobCanceledData",\n    "ObjectStoreOperationResultData",\n    "HandledOutputData",\n    "LoadedInputData",\n    "ComputeLogsCaptureData",\n    "AssetObservationData",\n    "AssetMaterializationPlannedData",\n    "AssetCheckEvaluation",\n    "AssetCheckEvaluationPlanned",\n]\n\n\n
[docs]class DagsterEventType(str, Enum):\n """The types of events that may be yielded by op and job execution."""\n\n STEP_OUTPUT = "STEP_OUTPUT"\n STEP_INPUT = "STEP_INPUT"\n STEP_FAILURE = "STEP_FAILURE"\n STEP_START = "STEP_START"\n STEP_SUCCESS = "STEP_SUCCESS"\n STEP_SKIPPED = "STEP_SKIPPED"\n\n # The process carrying out step execution is starting/started. Shown as a\n # marker start/end in the Dagster UI.\n STEP_WORKER_STARTING = "STEP_WORKER_STARTING"\n STEP_WORKER_STARTED = "STEP_WORKER_STARTED"\n\n # Resource initialization for execution has started/succeede/failed. Shown\n # as a marker start/end in the Dagster UI.\n RESOURCE_INIT_STARTED = "RESOURCE_INIT_STARTED"\n RESOURCE_INIT_SUCCESS = "RESOURCE_INIT_SUCCESS"\n RESOURCE_INIT_FAILURE = "RESOURCE_INIT_FAILURE"\n\n STEP_UP_FOR_RETRY = "STEP_UP_FOR_RETRY" # "failed" but want to retry\n STEP_RESTARTED = "STEP_RESTARTED"\n\n ASSET_MATERIALIZATION = "ASSET_MATERIALIZATION"\n ASSET_MATERIALIZATION_PLANNED = "ASSET_MATERIALIZATION_PLANNED"\n ASSET_OBSERVATION = "ASSET_OBSERVATION"\n STEP_EXPECTATION_RESULT = "STEP_EXPECTATION_RESULT"\n ASSET_CHECK_EVALUATION_PLANNED = "ASSET_CHECK_EVALUATION_PLANNED"\n ASSET_CHECK_EVALUATION = "ASSET_CHECK_EVALUATION"\n\n # We want to display RUN_* events in the Dagster UI and in our LogManager output, but in order to\n # support backcompat for our storage layer, we need to keep the persisted value to be strings\n # of the form "PIPELINE_*". 
We may have user code that pass in the DagsterEventType\n # enum values into storage APIs (like get_event_records, which takes in an EventRecordsFilter).\n RUN_ENQUEUED = "PIPELINE_ENQUEUED"\n RUN_DEQUEUED = "PIPELINE_DEQUEUED"\n RUN_STARTING = "PIPELINE_STARTING" # Launch is happening, execution hasn't started yet\n RUN_START = "PIPELINE_START" # Execution has started\n RUN_SUCCESS = "PIPELINE_SUCCESS"\n RUN_FAILURE = "PIPELINE_FAILURE"\n RUN_CANCELING = "PIPELINE_CANCELING"\n RUN_CANCELED = "PIPELINE_CANCELED"\n\n # Keep these legacy enum values around, to keep back-compatability for user code that might be\n # using these constants to filter event records\n PIPELINE_ENQUEUED = RUN_ENQUEUED\n PIPELINE_DEQUEUED = RUN_DEQUEUED\n PIPELINE_STARTING = RUN_STARTING\n PIPELINE_START = RUN_START\n PIPELINE_SUCCESS = RUN_SUCCESS\n PIPELINE_FAILURE = RUN_FAILURE\n PIPELINE_CANCELING = RUN_CANCELING\n PIPELINE_CANCELED = RUN_CANCELED\n\n OBJECT_STORE_OPERATION = "OBJECT_STORE_OPERATION"\n ASSET_STORE_OPERATION = "ASSET_STORE_OPERATION"\n LOADED_INPUT = "LOADED_INPUT"\n HANDLED_OUTPUT = "HANDLED_OUTPUT"\n\n ENGINE_EVENT = "ENGINE_EVENT"\n\n HOOK_COMPLETED = "HOOK_COMPLETED"\n HOOK_ERRORED = "HOOK_ERRORED"\n HOOK_SKIPPED = "HOOK_SKIPPED"\n\n ALERT_START = "ALERT_START"\n ALERT_SUCCESS = "ALERT_SUCCESS"\n ALERT_FAILURE = "ALERT_FAILURE"\n\n LOGS_CAPTURED = "LOGS_CAPTURED"
\n\n\nEVENT_TYPE_VALUE_TO_DISPLAY_STRING = {\n "PIPELINE_ENQUEUED": "RUN_ENQUEUED",\n "PIPELINE_DEQUEUED": "RUN_DEQUEUED",\n "PIPELINE_STARTING": "RUN_STARTING",\n "PIPELINE_START": "RUN_START",\n "PIPELINE_SUCCESS": "RUN_SUCCESS",\n "PIPELINE_FAILURE": "RUN_FAILURE",\n "PIPELINE_CANCELING": "RUN_CANCELING",\n "PIPELINE_CANCELED": "RUN_CANCELED",\n}\n\nSTEP_EVENTS = {\n DagsterEventType.STEP_INPUT,\n DagsterEventType.STEP_START,\n DagsterEventType.STEP_OUTPUT,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.STEP_SUCCESS,\n DagsterEventType.STEP_SKIPPED,\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.STEP_EXPECTATION_RESULT,\n DagsterEventType.ASSET_CHECK_EVALUATION,\n DagsterEventType.OBJECT_STORE_OPERATION,\n DagsterEventType.HANDLED_OUTPUT,\n DagsterEventType.LOADED_INPUT,\n DagsterEventType.STEP_RESTARTED,\n DagsterEventType.STEP_UP_FOR_RETRY,\n}\n\nFAILURE_EVENTS = {\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.RUN_CANCELED,\n}\n\nPIPELINE_EVENTS = {\n DagsterEventType.RUN_ENQUEUED,\n DagsterEventType.RUN_DEQUEUED,\n DagsterEventType.RUN_STARTING,\n DagsterEventType.RUN_START,\n DagsterEventType.RUN_SUCCESS,\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.RUN_CANCELING,\n DagsterEventType.RUN_CANCELED,\n}\n\nHOOK_EVENTS = {\n DagsterEventType.HOOK_COMPLETED,\n DagsterEventType.HOOK_ERRORED,\n DagsterEventType.HOOK_SKIPPED,\n}\n\nALERT_EVENTS = {\n DagsterEventType.ALERT_START,\n DagsterEventType.ALERT_SUCCESS,\n DagsterEventType.ALERT_FAILURE,\n}\n\nMARKER_EVENTS = {\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.STEP_WORKER_STARTING,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n}\n\n\nEVENT_TYPE_TO_PIPELINE_RUN_STATUS = {\n DagsterEventType.RUN_START: DagsterRunStatus.STARTED,\n DagsterEventType.RUN_SUCCESS: DagsterRunStatus.SUCCESS,\n 
DagsterEventType.RUN_FAILURE: DagsterRunStatus.FAILURE,\n DagsterEventType.RUN_ENQUEUED: DagsterRunStatus.QUEUED,\n DagsterEventType.RUN_STARTING: DagsterRunStatus.STARTING,\n DagsterEventType.RUN_CANCELING: DagsterRunStatus.CANCELING,\n DagsterEventType.RUN_CANCELED: DagsterRunStatus.CANCELED,\n}\n\nPIPELINE_RUN_STATUS_TO_EVENT_TYPE = {v: k for k, v in EVENT_TYPE_TO_PIPELINE_RUN_STATUS.items()}\n\nASSET_EVENTS = {\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n}\n\nASSET_CHECK_EVENTS = {\n DagsterEventType.ASSET_CHECK_EVALUATION,\n DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED,\n}\n\n\ndef _assert_type(\n method: str,\n expected_type: Union[DagsterEventType, Sequence[DagsterEventType]],\n actual_type: DagsterEventType,\n) -> None:\n _expected_type = (\n [expected_type] if isinstance(expected_type, DagsterEventType) else expected_type\n )\n check.invariant(\n actual_type in _expected_type,\n f"{method} only callable when event_type is"\n f" {','.join([t.value for t in _expected_type])}, called on {actual_type}",\n )\n\n\ndef _validate_event_specific_data(\n event_type: DagsterEventType, event_specific_data: Optional["EventSpecificData"]\n) -> Optional["EventSpecificData"]:\n if event_type == DagsterEventType.STEP_OUTPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepOutputData)\n elif event_type == DagsterEventType.STEP_FAILURE:\n check.inst_param(event_specific_data, "event_specific_data", StepFailureData)\n elif event_type == DagsterEventType.STEP_SUCCESS:\n check.inst_param(event_specific_data, "event_specific_data", StepSuccessData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION:\n check.inst_param(event_specific_data, "event_specific_data", StepMaterializationData)\n elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:\n check.inst_param(event_specific_data, "event_specific_data", StepExpectationResultData)\n elif event_type == 
DagsterEventType.STEP_INPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepInputData)\n elif event_type in (\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.STEP_WORKER_STARTING,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n ):\n check.inst_param(event_specific_data, "event_specific_data", EngineEventData)\n elif event_type == DagsterEventType.HOOK_ERRORED:\n check.inst_param(event_specific_data, "event_specific_data", HookErroredData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n check.inst_param(\n event_specific_data, "event_specific_data", AssetMaterializationPlannedData\n )\n elif event_type == DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED:\n check.inst_param(event_specific_data, "event_specific_data", AssetCheckEvaluationPlanned)\n elif event_type == DagsterEventType.ASSET_CHECK_EVALUATION:\n check.inst_param(event_specific_data, "event_specific_data", AssetCheckEvaluation)\n\n return event_specific_data\n\n\ndef log_step_event(step_context: IStepContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n step_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for step {step_context.step.key}",\n dagster_event=event,\n )\n\n\ndef log_job_event(job_context: IPlanContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n job_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for pipeline {job_context.job_name}",\n dagster_event=event,\n )\n\n\ndef log_resource_event(log_manager: DagsterLogManager, event: "DagsterEvent") -> None:\n event_specific_data = cast(EngineEventData, 
event.event_specific_data)\n\n log_level = logging.ERROR if event_specific_data.error else logging.DEBUG\n log_manager.log_dagster_event(level=log_level, msg=event.message or "", dagster_event=event)\n\n\nclass DagsterEventSerializer(NamedTupleSerializer["DagsterEvent"]):\n def before_unpack(self, context, unpacked_dict: Any) -> Dict[str, Any]:\n event_type_value, event_specific_data = _handle_back_compat(\n unpacked_dict["event_type_value"], unpacked_dict.get("event_specific_data")\n )\n unpacked_dict["event_type_value"] = event_type_value\n unpacked_dict["event_specific_data"] = event_specific_data\n\n return unpacked_dict\n\n def handle_unpack_error(\n self,\n exc: Exception,\n context: UnpackContext,\n storage_dict: Dict[str, Any],\n ) -> "DagsterEvent":\n event_type_value, _ = _handle_back_compat(\n storage_dict["event_type_value"], storage_dict.get("event_specific_data")\n )\n step_key = storage_dict.get("step_key")\n orig_message = storage_dict.get("message")\n new_message = (\n f"Could not deserialize event of type {event_type_value}. This event may have been"\n " written by a newer version of Dagster."\n + (f' Original message: "{orig_message}"' if orig_message else "")\n )\n return DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n job_name=storage_dict["pipeline_name"],\n message=new_message,\n step_key=step_key,\n event_specific_data=EngineEventData(\n error=serializable_error_info_from_exc_info(sys.exc_info())\n ),\n )\n\n\n
[docs]@whitelist_for_serdes(\n serializer=DagsterEventSerializer,\n storage_field_names={\n "node_handle": "solid_handle",\n "job_name": "pipeline_name",\n },\n)\nclass DagsterEvent(\n NamedTuple(\n "_DagsterEvent",\n [\n ("event_type_value", str),\n ("job_name", str),\n ("step_handle", Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]]),\n ("node_handle", Optional[NodeHandle]),\n ("step_kind_value", Optional[str]),\n ("logging_tags", Optional[Mapping[str, str]]),\n ("event_specific_data", Optional["EventSpecificData"]),\n ("message", Optional[str]),\n ("pid", Optional[int]),\n ("step_key", Optional[str]),\n ],\n )\n):\n """Events yielded by op and job execution.\n\n Users should not instantiate this class.\n\n Attributes:\n event_type_value (str): Value for a DagsterEventType.\n job_name (str)\n node_handle (NodeHandle)\n step_kind_value (str): Value for a StepKind.\n logging_tags (Dict[str, str])\n event_specific_data (Any): Type must correspond to event_type_value.\n message (str)\n pid (int)\n step_key (Optional[str]): DEPRECATED\n """\n\n @staticmethod\n def from_step(\n event_type: "DagsterEventType",\n step_context: IStepContext,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n message=check.opt_str_param(message, "message"),\n pid=os.getpid(),\n )\n\n log_step_event(step_context, event)\n\n return event\n\n @staticmethod\n def from_job(\n event_type: DagsterEventType,\n job_context: IPlanContext,\n message: Optional[str] = None,\n event_specific_data: Optional["EventSpecificData"] = 
None,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n ) -> "DagsterEvent":\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n )\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=job_context.job_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n step_handle=step_handle,\n pid=os.getpid(),\n )\n\n log_job_event(job_context, event)\n\n return event\n\n @staticmethod\n def from_resource(\n event_type: DagsterEventType,\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n message: Optional[str] = None,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=job_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(\n DagsterEventType.ENGINE_EVENT, event_specific_data\n ),\n step_handle=execution_plan.step_handle_for_single_step_plans(),\n pid=os.getpid(),\n )\n log_resource_event(log_manager, event)\n return event\n\n def __new__(\n cls,\n event_type_value: str,\n job_name: str,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n node_handle: Optional[NodeHandle] = None,\n step_kind_value: Optional[str] = None,\n logging_tags: Optional[Mapping[str, str]] = None,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n pid: Optional[int] = None,\n # legacy\n step_key: Optional[str] = None,\n ):\n # old events may contain node_handle but not step_handle\n if node_handle is not None and step_handle is None:\n step_handle = StepHandle(node_handle)\n\n # Legacy events may have step_key set directly, preserve those to stay 
in sync\n # with legacy execution plan snapshots.\n if step_handle is not None and step_key is None:\n step_key = step_handle.to_key()\n\n return super(DagsterEvent, cls).__new__(\n cls,\n check.str_param(event_type_value, "event_type_value"),\n check.str_param(job_name, "job_name"),\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n ),\n check.opt_inst_param(node_handle, "node_handle", NodeHandle),\n check.opt_str_param(step_kind_value, "step_kind_value"),\n check.opt_mapping_param(logging_tags, "logging_tags"),\n _validate_event_specific_data(DagsterEventType(event_type_value), event_specific_data),\n check.opt_str_param(message, "message"),\n check.opt_int_param(pid, "pid"),\n check.opt_str_param(step_key, "step_key"),\n )\n\n @property\n def node_name(self) -> str:\n check.invariant(self.node_handle is not None)\n node_handle = cast(NodeHandle, self.node_handle)\n return node_handle.name\n\n @public\n @property\n def event_type(self) -> DagsterEventType:\n """DagsterEventType: The type of this event."""\n return DagsterEventType(self.event_type_value)\n\n @public\n @property\n def is_step_event(self) -> bool:\n """bool: If this event relates to a specific step."""\n return self.event_type in STEP_EVENTS\n\n @public\n @property\n def is_hook_event(self) -> bool:\n """bool: If this event relates to the execution of a hook."""\n return self.event_type in HOOK_EVENTS\n\n @property\n def is_alert_event(self) -> bool:\n return self.event_type in ALERT_EVENTS\n\n @property\n def step_kind(self) -> "StepKind":\n from dagster._core.execution.plan.step import StepKind\n\n return StepKind(self.step_kind_value)\n\n @public\n @property\n def is_step_success(self) -> bool:\n """bool: If this event is of type STEP_SUCCESS."""\n return self.event_type == DagsterEventType.STEP_SUCCESS\n\n @public\n @property\n def is_successful_output(self) -> bool:\n """bool: If this event is of type STEP_OUTPUT."""\n return self.event_type == 
DagsterEventType.STEP_OUTPUT\n\n @public\n @property\n def is_step_start(self) -> bool:\n """bool: If this event is of type STEP_START."""\n return self.event_type == DagsterEventType.STEP_START\n\n @public\n @property\n def is_step_failure(self) -> bool:\n """bool: If this event is of type STEP_FAILURE."""\n return self.event_type == DagsterEventType.STEP_FAILURE\n\n @public\n @property\n def is_resource_init_failure(self) -> bool:\n """bool: If this event is of type RESOURCE_INIT_FAILURE."""\n return self.event_type == DagsterEventType.RESOURCE_INIT_FAILURE\n\n @public\n @property\n def is_step_skipped(self) -> bool:\n """bool: If this event is of type STEP_SKIPPED."""\n return self.event_type == DagsterEventType.STEP_SKIPPED\n\n @public\n @property\n def is_step_up_for_retry(self) -> bool:\n """bool: If this event is of type STEP_UP_FOR_RETRY."""\n return self.event_type == DagsterEventType.STEP_UP_FOR_RETRY\n\n @public\n @property\n def is_step_restarted(self) -> bool:\n """bool: If this event is of type STEP_RESTARTED."""\n return self.event_type == DagsterEventType.STEP_RESTARTED\n\n @property\n def is_job_success(self) -> bool:\n return self.event_type == DagsterEventType.RUN_SUCCESS\n\n @property\n def is_job_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n @property\n def is_run_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n @public\n @property\n def is_failure(self) -> bool:\n """bool: If this event represents the failure of a run or step."""\n return self.event_type in FAILURE_EVENTS\n\n @property\n def is_job_event(self) -> bool:\n return self.event_type in PIPELINE_EVENTS\n\n @public\n @property\n def is_engine_event(self) -> bool:\n """bool: If this event is of type ENGINE_EVENT."""\n return self.event_type == DagsterEventType.ENGINE_EVENT\n\n @public\n @property\n def is_handled_output(self) -> bool:\n """bool: If this event is of type HANDLED_OUTPUT."""\n return self.event_type 
== DagsterEventType.HANDLED_OUTPUT\n\n @public\n @property\n def is_loaded_input(self) -> bool:\n """bool: If this event is of type LOADED_INPUT."""\n return self.event_type == DagsterEventType.LOADED_INPUT\n\n @public\n @property\n def is_step_materialization(self) -> bool:\n """bool: If this event is of type ASSET_MATERIALIZATION."""\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION\n\n @public\n @property\n def is_expectation_result(self) -> bool:\n """bool: If this event is of type STEP_EXPECTATION_RESULT."""\n return self.event_type == DagsterEventType.STEP_EXPECTATION_RESULT\n\n @public\n @property\n def is_asset_observation(self) -> bool:\n """bool: If this event is of type ASSET_OBSERVATION."""\n return self.event_type == DagsterEventType.ASSET_OBSERVATION\n\n @public\n @property\n def is_asset_materialization_planned(self) -> bool:\n """bool: If this event is of type ASSET_MATERIALIZATION_PLANNED."""\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED\n\n @public\n @property\n def asset_key(self) -> Optional[AssetKey]:\n """Optional[AssetKey]: For events that correspond to a specific asset_key / partition\n (ASSET_MATERIALIZTION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\n asset key. Otherwise, returns None.\n """\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.asset_key\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.asset_key\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.asset_key\n else:\n return None\n\n @public\n @property\n def partition(self) -> Optional[str]:\n """Optional[AssetKey]: For events that correspond to a specific asset_key / partition\n (ASSET_MATERIALIZTION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\n partition. 
Otherwise, returns None.\n """\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.partition\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.partition\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.partition\n else:\n return None\n\n @property\n def step_input_data(self) -> "StepInputData":\n _assert_type("step_input_data", DagsterEventType.STEP_INPUT, self.event_type)\n return cast(StepInputData, self.event_specific_data)\n\n @property\n def step_output_data(self) -> StepOutputData:\n _assert_type("step_output_data", DagsterEventType.STEP_OUTPUT, self.event_type)\n return cast(StepOutputData, self.event_specific_data)\n\n @property\n def step_success_data(self) -> "StepSuccessData":\n _assert_type("step_success_data", DagsterEventType.STEP_SUCCESS, self.event_type)\n return cast(StepSuccessData, self.event_specific_data)\n\n @property\n def step_failure_data(self) -> "StepFailureData":\n _assert_type("step_failure_data", DagsterEventType.STEP_FAILURE, self.event_type)\n return cast(StepFailureData, self.event_specific_data)\n\n @property\n def step_retry_data(self) -> "StepRetryData":\n _assert_type("step_retry_data", DagsterEventType.STEP_UP_FOR_RETRY, self.event_type)\n return cast(StepRetryData, self.event_specific_data)\n\n @property\n def step_materialization_data(self) -> "StepMaterializationData":\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data)\n\n @property\n def asset_observation_data(self) -> "AssetObservationData":\n _assert_type("asset_observation_data", DagsterEventType.ASSET_OBSERVATION, self.event_type)\n return cast(AssetObservationData, self.event_specific_data)\n\n @property\n def 
asset_materialization_planned_data(self) -> "AssetMaterializationPlannedData":\n _assert_type(\n "asset_materialization_planned",\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n self.event_type,\n )\n return cast(AssetMaterializationPlannedData, self.event_specific_data)\n\n @property\n def asset_check_planned_data(self) -> "AssetCheckEvaluationPlanned":\n _assert_type(\n "asset_check_planned",\n DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED,\n self.event_type,\n )\n return cast(AssetCheckEvaluationPlanned, self.event_specific_data)\n\n @property\n def step_expectation_result_data(self) -> "StepExpectationResultData":\n _assert_type(\n "step_expectation_result_data",\n DagsterEventType.STEP_EXPECTATION_RESULT,\n self.event_type,\n )\n return cast(StepExpectationResultData, self.event_specific_data)\n\n @property\n def materialization(self) -> AssetMaterialization:\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data).materialization\n\n @property\n def asset_check_evaluation_data(self) -> AssetCheckEvaluation:\n _assert_type(\n "asset_check_evaluation", DagsterEventType.ASSET_CHECK_EVALUATION, self.event_type\n )\n return cast(AssetCheckEvaluation, self.event_specific_data)\n\n @property\n def job_failure_data(self) -> "JobFailureData":\n _assert_type("job_failure_data", DagsterEventType.RUN_FAILURE, self.event_type)\n return cast(JobFailureData, self.event_specific_data)\n\n @property\n def engine_event_data(self) -> "EngineEventData":\n _assert_type(\n "engine_event_data",\n [\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.STEP_WORKER_STARTING,\n ],\n self.event_type,\n )\n return cast(EngineEventData, self.event_specific_data)\n\n @property\n def hook_completed_data(self) 
-> Optional["EventSpecificData"]:\n _assert_type("hook_completed_data", DagsterEventType.HOOK_COMPLETED, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_errored_data(self) -> "HookErroredData":\n _assert_type("hook_errored_data", DagsterEventType.HOOK_ERRORED, self.event_type)\n return cast(HookErroredData, self.event_specific_data)\n\n @property\n def hook_skipped_data(self) -> Optional["EventSpecificData"]:\n _assert_type("hook_skipped_data", DagsterEventType.HOOK_SKIPPED, self.event_type)\n return self.event_specific_data\n\n @property\n def logs_captured_data(self) -> "ComputeLogsCaptureData":\n _assert_type("logs_captured_data", DagsterEventType.LOGS_CAPTURED, self.event_type)\n return cast(ComputeLogsCaptureData, self.event_specific_data)\n\n @staticmethod\n def step_output_event(\n step_context: StepExecutionContext, step_output_data: StepOutputData\n ) -> "DagsterEvent":\n output_def = step_context.op.output_def_named(\n step_output_data.step_output_handle.output_name\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_OUTPUT,\n step_context=step_context,\n event_specific_data=step_output_data,\n message=(\n 'Yielded output "{output_name}"{mapping_clause} of type'\n ' "{output_type}".{type_check_clause}'.format(\n output_name=step_output_data.step_output_handle.output_name,\n output_type=output_def.dagster_type.display_name,\n type_check_clause=(\n (\n " Warning! 
Type check failed."\n if not step_output_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_output_data.type_check_data\n else " (No type check)."\n ),\n mapping_clause=(\n f' mapping key "{step_output_data.step_output_handle.mapping_key}"'\n if step_output_data.step_output_handle.mapping_key\n else ""\n ),\n )\n ),\n )\n\n @staticmethod\n def step_failure_event(\n step_context: IStepContext,\n step_failure_data: "StepFailureData",\n message=None,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_FAILURE,\n step_context=step_context,\n event_specific_data=step_failure_data,\n message=(message or f'Execution of step "{step_context.step.key}" failed.'),\n )\n\n @staticmethod\n def step_retry_event(\n step_context: IStepContext, step_retry_data: "StepRetryData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_UP_FOR_RETRY,\n step_context=step_context,\n event_specific_data=step_retry_data,\n message=(\n 'Execution of step "{step_key}" failed and has requested a retry{wait_str}.'.format(\n step_key=step_context.step.key,\n wait_str=(\n f" in {step_retry_data.seconds_to_wait} seconds"\n if step_retry_data.seconds_to_wait\n else ""\n ),\n )\n ),\n )\n\n @staticmethod\n def step_input_event(\n step_context: StepExecutionContext, step_input_data: "StepInputData"\n ) -> "DagsterEvent":\n input_def = step_context.op_def.input_def_named(step_input_data.input_name)\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_INPUT,\n step_context=step_context,\n event_specific_data=step_input_data,\n message='Got input "{input_name}" of type "{input_type}".{type_check_clause}'.format(\n input_name=step_input_data.input_name,\n input_type=input_def.dagster_type.display_name,\n type_check_clause=(\n (\n " Warning! 
Type check failed."\n if not step_input_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_input_data.type_check_data\n else " (No type check)."\n ),\n ),\n )\n\n @staticmethod\n def step_start_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_START,\n step_context=step_context,\n message=f'Started execution of step "{step_context.step.key}".',\n )\n\n @staticmethod\n def step_restarted_event(step_context: IStepContext, previous_attempts: int) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_RESTARTED,\n step_context=step_context,\n message='Started re-execution (attempt # {n}) of step "{step_key}".'.format(\n step_key=step_context.step.key, n=previous_attempts + 1\n ),\n )\n\n @staticmethod\n def step_success_event(\n step_context: IStepContext, success: "StepSuccessData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SUCCESS,\n step_context=step_context,\n event_specific_data=success,\n message='Finished execution of step "{step_key}" in {duration}.'.format(\n step_key=step_context.step.key,\n duration=format_duration(success.duration_ms),\n ),\n )\n\n @staticmethod\n def step_skipped_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SKIPPED,\n step_context=step_context,\n message=f'Skipped execution of step "{step_context.step.key}".',\n )\n\n @staticmethod\n def asset_materialization(\n step_context: IStepContext,\n materialization: AssetMaterialization,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n step_context=step_context,\n event_specific_data=StepMaterializationData(materialization),\n message=(\n materialization.description\n if materialization.description\n else "Materialized value{label_clause}.".format(\n label_clause=f" {materialization.label}" 
if materialization.label else ""\n )\n ),\n )\n\n @staticmethod\n def asset_observation(\n step_context: IStepContext, observation: AssetObservation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n step_context=step_context,\n event_specific_data=AssetObservationData(observation),\n )\n\n @staticmethod\n def asset_check_evaluation(\n step_context: IStepContext, asset_check_evaluation: AssetCheckEvaluation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_CHECK_EVALUATION,\n step_context=step_context,\n event_specific_data=asset_check_evaluation,\n )\n\n @staticmethod\n def step_expectation_result(\n step_context: IStepContext, expectation_result: ExpectationResult\n ) -> "DagsterEvent":\n def _msg():\n if expectation_result.description:\n return expectation_result.description\n\n return "Expectation{label_clause} {result_verb}".format(\n label_clause=" " + expectation_result.label if expectation_result.label else "",\n result_verb="passed" if expectation_result.success else "failed",\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_EXPECTATION_RESULT,\n step_context=step_context,\n event_specific_data=StepExpectationResultData(expectation_result),\n message=_msg(),\n )\n\n @staticmethod\n def job_start(job_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_START,\n job_context,\n message=f'Started execution of run for "{job_context.job_name}".',\n )\n\n @staticmethod\n def job_success(job_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_SUCCESS,\n job_context,\n message=f'Finished execution of run for "{job_context.job_name}".',\n )\n\n @staticmethod\n def job_failure(\n job_context_or_name: Union[IPlanContext, str],\n context_msg: str,\n error_info: Optional[SerializableErrorInfo] = None,\n ) -> "DagsterEvent":\n check.str_param(context_msg, 
"context_msg")\n if isinstance(job_context_or_name, IPlanContext):\n return DagsterEvent.from_job(\n DagsterEventType.RUN_FAILURE,\n job_context_or_name,\n message=(\n f'Execution of run for "{job_context_or_name.job_name}" failed. {context_msg}'\n ),\n event_specific_data=JobFailureData(error_info),\n )\n else:\n # when the failure happens trying to bring up context, the job_context hasn't been\n # built and so can't use from_pipeline\n check.str_param(job_context_or_name, "pipeline_name")\n event = DagsterEvent(\n event_type_value=DagsterEventType.RUN_FAILURE.value,\n job_name=job_context_or_name,\n event_specific_data=JobFailureData(error_info),\n message=f'Execution of run for "{job_context_or_name}" failed. {context_msg}',\n pid=os.getpid(),\n )\n return event\n\n @staticmethod\n def job_canceled(\n job_context: IPlanContext, error_info: Optional[SerializableErrorInfo] = None\n ) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_CANCELED,\n job_context,\n message=f'Execution of run for "{job_context.job_name}" canceled.',\n event_specific_data=JobCanceledData(\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo)\n ),\n )\n\n @staticmethod\n def step_worker_starting(\n step_context: IStepContext,\n message: str,\n metadata: Mapping[str, MetadataValue],\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n DagsterEventType.STEP_WORKER_STARTING,\n step_context,\n message=message,\n event_specific_data=EngineEventData(\n metadata=metadata, marker_start="step_process_start"\n ),\n )\n\n @staticmethod\n def step_worker_started(\n log_manager: DagsterLogManager,\n job_name: str,\n message: str,\n metadata: Mapping[str, MetadataValue],\n step_key: Optional[str],\n ) -> "DagsterEvent":\n event = DagsterEvent(\n DagsterEventType.STEP_WORKER_STARTED.value,\n job_name=job_name,\n message=message,\n event_specific_data=EngineEventData(metadata=metadata, marker_end="step_process_start"),\n pid=os.getpid(),\n 
step_key=step_key,\n )\n log_manager.log_dagster_event(\n level=logging.DEBUG,\n msg=message,\n dagster_event=event,\n )\n return event\n\n @staticmethod\n def resource_init_start(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_STARTED,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Starting initialization of resources [{}].".format(\n ", ".join(sorted(resource_keys))\n ),\n event_specific_data=EngineEventData(metadata={}, marker_start="resources"),\n )\n\n @staticmethod\n def resource_init_success(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_instances: Mapping[str, Any],\n resource_init_times: Mapping[str, str],\n ) -> "DagsterEvent":\n metadata = {}\n for key in resource_instances.keys():\n metadata[key] = MetadataValue.python_artifact(resource_instances[key].__class__)\n metadata[f"{key}:init_time"] = resource_init_times[key]\n\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Finished initialization of resources [{}].".format(\n ", ".join(sorted(resource_init_times.keys()))\n ),\n event_specific_data=EngineEventData(\n metadata=metadata,\n marker_end="resources",\n ),\n )\n\n @staticmethod\n def resource_init_failure(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_FAILURE,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Initialization of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n 
metadata={},\n marker_end="resources",\n error=error,\n ),\n )\n\n @staticmethod\n def resource_teardown_failure(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.ENGINE_EVENT,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Teardown of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata={},\n marker_start=None,\n marker_end=None,\n error=error,\n ),\n )\n\n @staticmethod\n def engine_event(\n plan_context: IPlanContext,\n message: str,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n if isinstance(plan_context, IStepContext):\n return DagsterEvent.from_step(\n DagsterEventType.ENGINE_EVENT,\n step_context=plan_context,\n event_specific_data=event_specific_data,\n message=message,\n )\n else:\n return DagsterEvent.from_job(\n DagsterEventType.ENGINE_EVENT,\n plan_context,\n message,\n event_specific_data=event_specific_data,\n )\n\n @staticmethod\n def object_store_operation(\n step_context: IStepContext, object_store_operation_result: "ObjectStoreOperation"\n ) -> "DagsterEvent":\n object_store_name = (\n f"{object_store_operation_result.object_store_name} "\n if object_store_operation_result.object_store_name\n else ""\n )\n\n serialization_strategy_modifier = (\n f" using {object_store_operation_result.serialization_strategy_name}"\n if object_store_operation_result.serialization_strategy_name\n else ""\n )\n\n value_name = object_store_operation_result.value_name\n\n if (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.SET_OBJECT\n ):\n message = (\n f"Stored intermediate object for output {value_name} in "\n f"{object_store_name}object store{serialization_strategy_modifier}."\n )\n elif (\n 
ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.GET_OBJECT\n ):\n message = (\n f"Retrieved intermediate object for input {value_name} in "\n f"{object_store_name}object store{serialization_strategy_modifier}."\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.CP_OBJECT\n ):\n message = (\n "Copied intermediate object for input {value_name} from {key} to {dest_key}"\n ).format(\n value_name=value_name,\n key=object_store_operation_result.key,\n dest_key=object_store_operation_result.dest_key,\n )\n else:\n message = ""\n\n return DagsterEvent.from_step(\n DagsterEventType.OBJECT_STORE_OPERATION,\n step_context,\n event_specific_data=ObjectStoreOperationResultData(\n op=object_store_operation_result.op,\n value_name=value_name,\n address=object_store_operation_result.key,\n metadata={"key": MetadataValue.path(object_store_operation_result.key)},\n version=object_store_operation_result.version,\n mapping_key=object_store_operation_result.mapping_key,\n ),\n message=message,\n )\n\n @staticmethod\n def handled_output(\n step_context: IStepContext,\n output_name: str,\n manager_key: str,\n message_override: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ) -> "DagsterEvent":\n message = f'Handled output "{output_name}" using IO manager "{manager_key}"'\n return DagsterEvent.from_step(\n event_type=DagsterEventType.HANDLED_OUTPUT,\n step_context=step_context,\n event_specific_data=HandledOutputData(\n output_name=output_name,\n manager_key=manager_key,\n metadata=metadata if metadata else {},\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def loaded_input(\n step_context: IStepContext,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n message_override: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ) -> 
"DagsterEvent":\n message = f'Loaded input "{input_name}" using input manager "{manager_key}"'\n if upstream_output_name:\n message += f', from output "{upstream_output_name}" of step "{upstream_step_key}"'\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.LOADED_INPUT,\n step_context=step_context,\n event_specific_data=LoadedInputData(\n input_name=input_name,\n manager_key=manager_key,\n upstream_output_name=upstream_output_name,\n upstream_step_key=upstream_step_key,\n metadata=metadata if metadata else {},\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def hook_completed(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_COMPLETED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n message=(\n f'Finished the execution of hook "{hook_def.name}" triggered for'\n f' "{step_context.op.name}".'\n ),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def hook_errored(\n step_context: StepExecutionContext, error: HookExecutionError\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_ERRORED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n event_specific_data=_validate_event_specific_data(\n event_type,\n HookErroredData(\n error=serializable_error_info_from_exc_info(error.original_exc_info)\n ),\n ),\n )\n\n step_context.log.log_dagster_event(level=logging.ERROR, msg=str(error), dagster_event=event)\n\n return 
event\n\n @staticmethod\n def hook_skipped(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_SKIPPED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n message=(\n f'Skipped the execution of hook "{hook_def.name}". It did not meet its triggering '\n f'condition during the execution of "{step_context.op.name}".'\n ),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def legacy_compute_log_step_event(step_context: StepExecutionContext):\n step_key = step_context.step.key\n return DagsterEvent.from_step(\n DagsterEventType.LOGS_CAPTURED,\n step_context,\n message=f"Started capturing logs for step: {step_key}.",\n event_specific_data=ComputeLogsCaptureData(\n step_keys=[step_key],\n file_key=step_key,\n ),\n )\n\n @staticmethod\n def capture_logs(\n job_context: IPlanContext,\n step_keys: Sequence[str],\n log_key: Sequence[str],\n log_context: CapturedLogContext,\n ):\n file_key = log_key[-1]\n return DagsterEvent.from_job(\n DagsterEventType.LOGS_CAPTURED,\n job_context,\n message=f"Started capturing logs in process (pid: {os.getpid()}).",\n event_specific_data=ComputeLogsCaptureData(\n step_keys=step_keys,\n file_key=file_key,\n external_stdout_url=log_context.external_stdout_url,\n external_stderr_url=log_context.external_stderr_url,\n external_url=log_context.external_url,\n ),\n )
\n\n\ndef get_step_output_event(\n events: Sequence[DagsterEvent], step_key: str, output_name: Optional[str] = "result"\n) -> Optional["DagsterEvent"]:\n check.sequence_param(events, "events", of_type=DagsterEvent)\n check.str_param(step_key, "step_key")\n check.str_param(output_name, "output_name")\n for event in events:\n if (\n event.event_type == DagsterEventType.STEP_OUTPUT\n and event.step_key == step_key\n and event.step_output_data.output_name == output_name\n ):\n return event\n return None\n\n\n@whitelist_for_serdes\nclass AssetObservationData(\n NamedTuple("_AssetObservation", [("asset_observation", AssetObservation)])\n):\n def __new__(cls, asset_observation: AssetObservation):\n return super(AssetObservationData, cls).__new__(\n cls,\n asset_observation=check.inst_param(\n asset_observation, "asset_observation", AssetObservation\n ),\n )\n\n\n@whitelist_for_serdes\nclass StepMaterializationData(\n NamedTuple(\n "_StepMaterializationData",\n [\n ("materialization", AssetMaterialization),\n ("asset_lineage", Sequence[AssetLineageInfo]),\n ],\n )\n):\n def __new__(\n cls,\n materialization: AssetMaterialization,\n asset_lineage: Optional[Sequence[AssetLineageInfo]] = None,\n ):\n return super(StepMaterializationData, cls).__new__(\n cls,\n materialization=check.inst_param(\n materialization, "materialization", AssetMaterialization\n ),\n asset_lineage=check.opt_sequence_param(\n asset_lineage, "asset_lineage", of_type=AssetLineageInfo\n ),\n )\n\n\n@whitelist_for_serdes\nclass AssetMaterializationPlannedData(\n NamedTuple(\n "_AssetMaterializationPlannedData",\n [("asset_key", AssetKey), ("partition", Optional[str])],\n )\n):\n def __new__(cls, asset_key: AssetKey, partition: Optional[str] = None):\n return super(AssetMaterializationPlannedData, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n partition=check.opt_str_param(partition, "partition"),\n )\n\n\n@whitelist_for_serdes\nclass StepExpectationResultData(\n 
NamedTuple(\n "_StepExpectationResultData",\n [\n ("expectation_result", ExpectationResult),\n ],\n )\n):\n def __new__(cls, expectation_result: ExpectationResult):\n return super(StepExpectationResultData, cls).__new__(\n cls,\n expectation_result=check.inst_param(\n expectation_result, "expectation_result", ExpectationResult\n ),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass ObjectStoreOperationResultData(\n NamedTuple(\n "_ObjectStoreOperationResultData",\n [\n ("op", ObjectStoreOperationType),\n ("value_name", Optional[str]),\n ("metadata", Mapping[str, MetadataValue]),\n ("address", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n value_name: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n address: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperationResultData, cls).__new__(\n cls,\n op=cast(ObjectStoreOperationType, check.str_param(op, "op")),\n value_name=check.opt_str_param(value_name, "value_name"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n address=check.opt_str_param(address, "address"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass EngineEventData(\n NamedTuple(\n "_EngineEventData",\n [\n ("metadata", Mapping[str, MetadataValue]),\n ("error", Optional[SerializableErrorInfo]),\n ("marker_start", Optional[str]),\n ("marker_end", Optional[str]),\n ],\n )\n):\n # serdes log\n # * added optional error\n # * added marker_start / marker_end\n #\n def __new__(\n 
cls,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n error: Optional[SerializableErrorInfo] = None,\n marker_start: Optional[str] = None,\n marker_end: Optional[str] = None,\n ):\n return super(EngineEventData, cls).__new__(\n cls,\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n marker_start=check.opt_str_param(marker_start, "marker_start"),\n marker_end=check.opt_str_param(marker_end, "marker_end"),\n )\n\n @staticmethod\n def in_process(\n pid: int, step_keys_to_execute: Optional[Sequence[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata={\n "pid": MetadataValue.text(str(pid)),\n **(\n {"step_keys": MetadataValue.text(str(step_keys_to_execute))}\n if step_keys_to_execute\n else {}\n ),\n }\n )\n\n @staticmethod\n def multiprocess(\n pid: int, step_keys_to_execute: Optional[Sequence[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata={\n "pid": MetadataValue.text(str(pid)),\n **(\n {"step_keys": MetadataValue.text(str(step_keys_to_execute))}\n if step_keys_to_execute\n else {}\n ),\n }\n )\n\n @staticmethod\n def interrupted(steps_interrupted: Sequence[str]) -> "EngineEventData":\n return EngineEventData(\n metadata={"steps_interrupted": MetadataValue.text(str(steps_interrupted))}\n )\n\n @staticmethod\n def engine_error(error: SerializableErrorInfo) -> "EngineEventData":\n return EngineEventData(metadata={}, error=error)\n\n\n@whitelist_for_serdes(storage_name="PipelineFailureData")\nclass JobFailureData(\n NamedTuple(\n "_JobFailureData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(JobFailureData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes(storage_name="PipelineCanceledData")\nclass JobCanceledData(\n 
NamedTuple(\n "_JobCanceledData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(JobCanceledData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HookErroredData(\n NamedTuple(\n "_HookErroredData",\n [\n ("error", SerializableErrorInfo),\n ],\n )\n):\n def __new__(cls, error: SerializableErrorInfo):\n return super(HookErroredData, cls).__new__(\n cls, error=check.inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass HandledOutputData(\n NamedTuple(\n "_HandledOutputData",\n [\n ("output_name", str),\n ("manager_key", str),\n ("metadata", Mapping[str, MetadataValue]),\n ],\n )\n):\n def __new__(\n cls,\n output_name: str,\n manager_key: str,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ):\n return super(HandledOutputData, cls).__new__(\n cls,\n output_name=check.str_param(output_name, "output_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass LoadedInputData(\n NamedTuple(\n "_LoadedInputData",\n [\n ("input_name", str),\n ("manager_key", str),\n ("upstream_output_name", Optional[str]),\n ("upstream_step_key", Optional[str]),\n ("metadata", Mapping[str, MetadataValue]),\n ],\n )\n):\n def __new__(\n cls,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ):\n return super(LoadedInputData, cls).__new__(\n cls,\n 
input_name=check.str_param(input_name, "input_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n upstream_output_name=check.opt_str_param(upstream_output_name, "upstream_output_name"),\n upstream_step_key=check.opt_str_param(upstream_step_key, "upstream_step_key"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n )\n\n\n@whitelist_for_serdes(storage_field_names={"file_key": "log_key"})\nclass ComputeLogsCaptureData(\n NamedTuple(\n "_ComputeLogsCaptureData",\n [\n ("file_key", str), # renamed log_key => file_key to avoid confusion\n ("step_keys", Sequence[str]),\n ("external_url", Optional[str]),\n ("external_stdout_url", Optional[str]),\n ("external_stderr_url", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n file_key: str,\n step_keys: Sequence[str],\n external_url: Optional[str] = None,\n external_stdout_url: Optional[str] = None,\n external_stderr_url: Optional[str] = None,\n ):\n return super(ComputeLogsCaptureData, cls).__new__(\n cls,\n file_key=check.str_param(file_key, "file_key"),\n step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n external_url=check.opt_str_param(external_url, "external_url"),\n external_stdout_url=check.opt_str_param(external_stdout_url, "external_stdout_url"),\n external_stderr_url=check.opt_str_param(external_stderr_url, "external_stderr_url"),\n )\n\n\n###################################################################################################\n# THE GRAVEYARD\n#\n# -|- -|- -|-\n# | | |\n# _-'~~~~~`-_ . _-'~~~~~`-_ _-'~~~~~`-_\n# .' '. .' '. .' 
'.\n# | R I P | | R I P | | R I P |\n# | | | | | |\n# | Synthetic | | Asset | | Pipeline |\n# | Process | | Store | | Init |\n# | Events | | Operations | | Failures |\n# | | | | | |\n###################################################################################################\n\n\n# Old data structures referenced below\n# class AssetStoreOperationData(NamedTuple):\n# op: str\n# step_key: str\n# output_name: str\n# asset_store_key: str\n#\n#\n# class AssetStoreOperationType(Enum):\n# SET_ASSET = "SET_ASSET"\n# GET_ASSET = "GET_ASSET"\n#\n#\n# class PipelineInitFailureData(NamedTuple):\n# error: SerializableErrorInfo\n\n\ndef _handle_back_compat(\n event_type_value: str,\n event_specific_data: Optional[Dict[str, Any]],\n) -> Tuple[str, Optional[Dict[str, Any]]]:\n # transform old specific process events in to engine events\n if event_type_value in [\n "PIPELINE_PROCESS_START",\n "PIPELINE_PROCESS_STARTED",\n "PIPELINE_PROCESS_EXITED",\n ]:\n return "ENGINE_EVENT", {"__class__": "EngineEventData"}\n\n # changes asset store ops in to get/set asset\n elif event_type_value == "ASSET_STORE_OPERATION":\n assert (\n event_specific_data is not None\n ), "ASSET_STORE_OPERATION event must have specific data"\n if event_specific_data["op"] in (\n "GET_ASSET",\n '{"__enum__": "AssetStoreOperationType.GET_ASSET"}',\n ):\n return (\n "LOADED_INPUT",\n {\n "__class__": "LoadedInputData",\n "input_name": event_specific_data["output_name"],\n "manager_key": event_specific_data["asset_store_key"],\n },\n )\n if event_specific_data["op"] in (\n "SET_ASSET",\n '{"__enum__": "AssetStoreOperationType.SET_ASSET"}',\n ):\n return (\n "HANDLED_OUTPUT",\n {\n "__class__": "HandledOutputData",\n "output_name": event_specific_data["output_name"],\n "manager_key": event_specific_data["asset_store_key"],\n },\n )\n\n # previous name for ASSET_MATERIALIZATION was STEP_MATERIALIZATION\n if event_type_value == "STEP_MATERIALIZATION":\n assert event_specific_data is not None, 
"STEP_MATERIALIZATION event must have specific data"\n return "ASSET_MATERIALIZATION", event_specific_data\n\n # transform PIPELINE_INIT_FAILURE to PIPELINE_FAILURE\n if event_type_value == "PIPELINE_INIT_FAILURE":\n assert (\n event_specific_data is not None\n ), "PIPELINE_INIT_FAILURE event must have specific data"\n return "PIPELINE_FAILURE", {\n "__class__": "PipelineFailureData",\n "error": event_specific_data.get("error"),\n }\n\n return event_type_value, event_specific_data\n
", "current_page_name": "_modules/dagster/_core/events", "customsidebar": null, "favicon_url": null, "log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.events.log

\nfrom typing import Mapping, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.events import AssetMaterialization, AssetObservation\nfrom dagster._core.events import DagsterEvent, DagsterEventType\nfrom dagster._core.utils import coerce_valid_log_level\nfrom dagster._serdes.serdes import (\n    deserialize_value,\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._utils.error import SerializableErrorInfo\nfrom dagster._utils.log import (\n    JsonEventLoggerHandler,\n    StructuredLoggerHandler,\n    StructuredLoggerMessage,\n    construct_single_handler_logger,\n)\n\n\n
[docs]@whitelist_for_serdes(\n # These were originally distinguished from each other but ended up being empty subclasses\n # of EventLogEntry -- instead of using the subclasses we were relying on\n # EventLogEntry.is_dagster_event to distinguish events that originate in the logging\n # machinery from events that are yielded by user code\n old_storage_names={"DagsterEventRecord", "LogMessageRecord", "EventRecord"},\n old_fields={"message": ""},\n storage_field_names={"job_name": "pipeline_name"},\n)\nclass EventLogEntry(\n NamedTuple(\n "_EventLogEntry",\n [\n ("error_info", PublicAttr[Optional[SerializableErrorInfo]]),\n ("level", PublicAttr[Union[str, int]]),\n ("user_message", PublicAttr[str]),\n ("run_id", PublicAttr[str]),\n ("timestamp", PublicAttr[float]),\n ("step_key", PublicAttr[Optional[str]]),\n ("job_name", PublicAttr[Optional[str]]),\n ("dagster_event", PublicAttr[Optional[DagsterEvent]]),\n ],\n )\n):\n """Entries in the event log.\n\n Users should not instantiate this object directly. These entries may originate from the logging machinery (DagsterLogManager/context.log), from\n framework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n (e.g. Output).\n\n Args:\n error_info (Optional[SerializableErrorInfo]): Error info for an associated exception, if\n any, as generated by serializable_error_info_from_exc_info and friends.\n level (Union[str, int]): The Python log level at which to log this event. Note that\n framework and user code events are also logged to Python logging. This value may be an\n integer or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.\n user_message (str): For log messages, this is the user-generated message.\n run_id (str): The id of the run which generated this event.\n timestamp (float): The Unix timestamp of this event.\n step_key (Optional[str]): The step key for the step which generated this event. 
Some events\n are generated outside of a step context.\n job_name (Optional[str]): The job which generated this event. Some events are\n generated outside of a job context.\n dagster_event (Optional[DagsterEvent]): For framework and user events, the associated\n structured event.\n """\n\n def __new__(\n cls,\n error_info,\n level,\n user_message,\n run_id,\n timestamp,\n step_key=None,\n job_name=None,\n dagster_event=None,\n ):\n return super(EventLogEntry, cls).__new__(\n cls,\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo),\n coerce_valid_log_level(level),\n check.str_param(user_message, "user_message"),\n check.str_param(run_id, "run_id"),\n check.float_param(timestamp, "timestamp"),\n check.opt_str_param(step_key, "step_key"),\n check.opt_str_param(job_name, "job_name"),\n check.opt_inst_param(dagster_event, "dagster_event", DagsterEvent),\n )\n\n @public\n @property\n def is_dagster_event(self) -> bool:\n """bool: If this entry contains a DagsterEvent."""\n return bool(self.dagster_event)\n\n
[docs] @public\n def get_dagster_event(self) -> DagsterEvent:\n """DagsterEvent: Returns the DagsterEvent contained within this entry. If this entry does not\n contain a DagsterEvent, an error will be raised.\n """\n if not isinstance(self.dagster_event, DagsterEvent):\n check.failed(\n "Not a dagster event, check is_dagster_event before calling get_dagster_event",\n )\n\n return self.dagster_event
\n\n def to_json(self):\n return serialize_value(self)\n\n @staticmethod\n def from_json(json_str: str):\n return deserialize_value(json_str, EventLogEntry)\n\n @public\n @property\n def dagster_event_type(self) -> Optional[DagsterEventType]:\n """Optional[DagsterEventType]: The type of the DagsterEvent contained by this entry, if any."""\n return self.dagster_event.event_type if self.dagster_event else None\n\n @public\n @property\n def message(self) -> str:\n """Return the message from the structured DagsterEvent if present, fallback to user_message."""\n if self.is_dagster_event:\n msg = self.get_dagster_event().message\n if msg is not None:\n return msg\n\n return self.user_message\n\n @property\n def asset_materialization(self) -> Optional[AssetMaterialization]:\n if (\n self.dagster_event\n and self.dagster_event.event_type_value == DagsterEventType.ASSET_MATERIALIZATION\n ):\n materialization = self.dagster_event.step_materialization_data.materialization\n if isinstance(materialization, AssetMaterialization):\n return materialization\n\n return None\n\n @property\n def asset_observation(self) -> Optional[AssetObservation]:\n if (\n self.dagster_event\n and self.dagster_event.event_type_value == DagsterEventType.ASSET_OBSERVATION\n ):\n observation = self.dagster_event.asset_observation_data.asset_observation\n if isinstance(observation, AssetObservation):\n return observation\n\n return None\n\n @property\n def tags(self) -> Optional[Mapping[str, str]]:\n materialization = self.asset_materialization\n if materialization:\n return materialization.tags\n\n observation = self.asset_observation\n if observation:\n return observation.tags\n\n return None
\n\n\ndef construct_event_record(logger_message: StructuredLoggerMessage) -> EventLogEntry:\n check.inst_param(logger_message, "logger_message", StructuredLoggerMessage)\n\n return EventLogEntry(\n level=logger_message.level,\n user_message=logger_message.meta["orig_message"],\n run_id=logger_message.meta["run_id"],\n timestamp=logger_message.record.created,\n step_key=logger_message.meta.get("step_key"),\n job_name=logger_message.meta.get("job_name"),\n dagster_event=logger_message.meta.get("dagster_event"),\n error_info=None,\n )\n\n\ndef construct_event_logger(event_record_callback):\n """Callback receives a stream of event_records. Piggybacks on the logging machinery."""\n check.callable_param(event_record_callback, "event_record_callback")\n\n return construct_single_handler_logger(\n "event-logger",\n "debug",\n StructuredLoggerHandler(\n lambda logger_message: event_record_callback(construct_event_record(logger_message))\n ),\n )\n\n\ndef construct_json_event_logger(json_path):\n """Record a stream of event records to json."""\n check.str_param(json_path, "json_path")\n return construct_single_handler_logger(\n "json-event-record-logger",\n "debug",\n JsonEventLoggerHandler(\n json_path,\n lambda record: construct_event_record(\n StructuredLoggerMessage(\n name=record.name,\n message=record.msg,\n level=record.levelno,\n meta=record.dagster_meta,\n record=record,\n )\n ),\n ),\n )\n
", "current_page_name": "_modules/dagster/_core/events/log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.events"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.events.log"}, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.events"}, "execution": {"api": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.api

\nimport sys\nfrom contextlib import contextmanager\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions import IJob, JobDefinition\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.repository_definition import RepositoryLoadData\nfrom dagster._core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.context.system import PlanOrchestrationContext\nfrom dagster._core.execution.plan.execute_plan import inner_plan_execution_iterator\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.instance import DagsterInstance, InstanceRef\nfrom dagster._core.selector import parse_step_selection\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.system_config.objects import ResolvedRunConfig\nfrom dagster._core.telemetry import log_dagster_event, log_repo_stats, telemetry_wrapper\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.interrupts import capture_interrupts\nfrom dagster._utils.merger import merge_dicts\n\nfrom .context_creation_job import (\n    ExecutionContextManager,\n    PlanExecutionContextManager,\n    PlanOrchestrationContextManager,\n    orchestration_context_event_generator,\n    scoped_job_context,\n)\nfrom .job_execution_result import JobExecutionResult\n\nif TYPE_CHECKING:\n    from dagster._core.execution.plan.outputs 
import StepOutputHandle\n\n## Brief guide to the execution APIs\n# | function name               | operates over      | sync  | supports    | creates new DagsterRun  |\n# |                             |                    |       | reexecution | in instance             |\n# | --------------------------- | ------------------ | ----- | ----------- | ----------------------- |\n# | execute_job                 | ReconstructableJob | sync  | yes         | yes                     |\n# | execute_run_iterator        | DagsterRun         | async | (1)         | no                      |\n# | execute_run                 | DagsterRun         | sync  | (1)         | no                      |\n# | execute_plan_iterator       | ExecutionPlan      | async | (2)         | no                      |\n# | execute_plan                | ExecutionPlan      | sync  | (2)         | no                      |\n#\n# Notes on reexecution support:\n# (1) The appropriate bits must be set on the DagsterRun passed to this function. 
Specifically,\n#     parent_run_id and root_run_id must be set and consistent, and if a resolved_op_selection or\n#     step_keys_to_execute are set they must be consistent with the parent and root runs.\n# (2) As for (1), but the ExecutionPlan passed must also agree in all relevant bits.\n\n\ndef execute_run_iterator(\n    job: IJob,\n    dagster_run: DagsterRun,\n    instance: DagsterInstance,\n    resume_from_failure: bool = False,\n) -> Iterator[DagsterEvent]:\n    check.inst_param(job, "job", IJob)\n    check.inst_param(dagster_run, "dagster_run", DagsterRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if dagster_run.status == DagsterRunStatus.CANCELED:\n        # This can happen if the run was force-terminated while it was starting\n        def gen_execute_on_cancel():\n            yield instance.report_engine_event(\n                "Not starting execution since the run was canceled before execution could start",\n                dagster_run,\n            )\n\n        return gen_execute_on_cancel()\n\n    if not resume_from_failure:\n        if dagster_run.status not in (DagsterRunStatus.NOT_STARTED, DagsterRunStatus.STARTING):\n            if dagster_run.is_finished:\n\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a run worker that started after the run had already finished.",\n                        dagster_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            elif instance.run_monitoring_enabled:\n                # This can happen if the pod was unexpectedly restarted by the cluster - ignore it since\n                # the run monitoring daemon will also spin up a new pod\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a duplicate run that was started from somewhere other than"\n        
                " the run monitor daemon",\n                        dagster_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            else:\n\n                def gen_fail_restarted_run_worker():\n                    yield instance.report_engine_event(\n                        f"{dagster_run.job_name} ({dagster_run.run_id}) started a new"\n                        f" run worker while the run was already in state {dagster_run.status}."\n                        " This most frequently happens when the run worker unexpectedly stops"\n                        " and is restarted by the cluster. Marking the run as failed.",\n                        dagster_run,\n                    )\n                    yield instance.report_run_failed(dagster_run)\n\n                return gen_fail_restarted_run_worker()\n\n    else:\n        check.invariant(\n            dagster_run.status == DagsterRunStatus.STARTED\n            or dagster_run.status == DagsterRunStatus.STARTING,\n            desc=(\n                "Run of {} ({}) in state {}, expected STARTED or STARTING because it's "\n                "resuming from a run worker failure".format(\n                    dagster_run.job_name, dagster_run.run_id, dagster_run.status\n                )\n            ),\n        )\n\n    if (\n        dagster_run.resolved_op_selection\n        or dagster_run.asset_selection\n        or dagster_run.asset_check_selection\n    ):\n        # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created\n        # note that when we receive the solids to execute via DagsterRun, it won't support\n        # solid selection query syntax\n        job = job.get_subset(\n            op_selection=(\n                list(dagster_run.resolved_op_selection)\n                if dagster_run.resolved_op_selection\n                else None\n            ),\n            asset_selection=dagster_run.asset_selection,\n            
asset_check_selection=dagster_run.asset_check_selection,\n        )\n\n    execution_plan = _get_execution_plan_from_run(job, dagster_run, instance)\n    if isinstance(job, ReconstructableJob):\n        job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n    return iter(\n        ExecuteRunWithPlanIterable(\n            execution_plan=execution_plan,\n            iterator=job_execution_iterator,\n            execution_context_manager=PlanOrchestrationContextManager(\n                context_event_generator=orchestration_context_event_generator,\n                job=job,\n                execution_plan=execution_plan,\n                dagster_run=dagster_run,\n                instance=instance,\n                run_config=dagster_run.run_config,\n                raise_on_error=False,\n                executor_defs=None,\n                output_capture=None,\n                resume_from_failure=resume_from_failure,\n            ),\n        )\n    )\n\n\ndef execute_run(\n    job: IJob,\n    dagster_run: DagsterRun,\n    instance: DagsterInstance,\n    raise_on_error: bool = False,\n) -> JobExecutionResult:\n    """Executes an existing job run synchronously.\n\n    Synchronous version of execute_run_iterator.\n\n    Args:\n        job (IJob): The pipeline to execute.\n        dagster_run (DagsterRun): The run to execute\n        instance (DagsterInstance): The instance in which the run has been created.\n        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n            Defaults to ``False``.\n\n    Returns:\n        JobExecutionResult: The result of the execution.\n    """\n    if isinstance(job, JobDefinition):\n        raise DagsterInvariantViolationError(\n            "execute_run requires a reconstructable job but received job definition directly"\n            " instead. To support hand-off to other processes please wrap your definition in a call"\n            " to reconstructable(). 
Learn more about reconstructable here:"\n            " https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n        )\n\n    check.inst_param(job, "job", IJob)\n    check.inst_param(dagster_run, "dagster_run", DagsterRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if dagster_run.status == DagsterRunStatus.CANCELED:\n        message = "Not starting execution since the run was canceled before execution could start"\n        instance.report_engine_event(\n            message,\n            dagster_run,\n        )\n        raise DagsterInvariantViolationError(message)\n\n    check.invariant(\n        dagster_run.status == DagsterRunStatus.NOT_STARTED\n        or dagster_run.status == DagsterRunStatus.STARTING,\n        desc="Run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(\n            dagster_run.job_name, dagster_run.run_id, dagster_run.status\n        ),\n    )\n    if (\n        dagster_run.resolved_op_selection\n        or dagster_run.asset_selection\n        or dagster_run.asset_check_selection\n    ):\n        # when `execute_run` is directly called, the sub job hasn't been created\n        # note that when we receive the solids to execute via DagsterRun, it won't support\n        # solid selection query syntax\n        job = job.get_subset(\n            op_selection=(\n                list(dagster_run.resolved_op_selection)\n                if dagster_run.resolved_op_selection\n                else None\n            ),\n            asset_selection=dagster_run.asset_selection,\n            asset_check_selection=dagster_run.asset_check_selection,\n        )\n\n    execution_plan = _get_execution_plan_from_run(job, dagster_run, instance)\n    if isinstance(job, ReconstructableJob):\n        job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}\n\n    _execute_run_iterable = ExecuteRunWithPlanIterable(\n        
execution_plan=execution_plan,\n        iterator=job_execution_iterator,\n        execution_context_manager=PlanOrchestrationContextManager(\n            context_event_generator=orchestration_context_event_generator,\n            job=job,\n            execution_plan=execution_plan,\n            dagster_run=dagster_run,\n            instance=instance,\n            run_config=dagster_run.run_config,\n            raise_on_error=raise_on_error,\n            executor_defs=None,\n            output_capture=output_capture,\n        ),\n    )\n    event_list = list(_execute_run_iterable)\n\n    # We need to reload the run object after execution for it to be accurate\n    reloaded_dagster_run = check.not_none(instance.get_run_by_id(dagster_run.run_id))\n\n    return JobExecutionResult(\n        job.get_definition(),\n        scoped_job_context(\n            execution_plan,\n            job,\n            reloaded_dagster_run.run_config,\n            reloaded_dagster_run,\n            instance,\n        ),\n        event_list,\n        reloaded_dagster_run,\n    )\n\n\n@contextmanager\ndef ephemeral_instance_if_missing(\n    instance: Optional[DagsterInstance],\n) -> Iterator[DagsterInstance]:\n    if instance:\n        yield instance\n    else:\n        with DagsterInstance.ephemeral() as ephemeral_instance:\n            yield ephemeral_instance\n\n\n
[docs]class ReexecutionOptions(NamedTuple):\n """Reexecution options for python-based execution in Dagster.\n\n Args:\n parent_run_id (str): The run_id of the run to reexecute.\n step_selection (Sequence[str]):\n The list of step selections to reexecute. Must be a subset or match of the\n set of steps executed in the original run. For example:\n\n - ``['some_op']``: selects ``some_op`` itself.\n - ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n - ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n """\n\n parent_run_id: str\n step_selection: Sequence[str] = []\n\n @staticmethod\n def from_failure(run_id: str, instance: DagsterInstance) -> "ReexecutionOptions":\n """Creates reexecution options from a failed run.\n\n Args:\n run_id (str): The run_id of the failed run. Run must fail in order to be reexecuted.\n instance (DagsterInstance): The DagsterInstance that the original run occurred in.\n\n Returns:\n ReexecutionOptions: Reexecution options to pass to a python execution.\n """\n from dagster._core.execution.plan.state import KnownExecutionState\n\n parent_run = check.not_none(instance.get_run_by_id(run_id))\n check.invariant(\n parent_run.status == DagsterRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n # Tried to thread through KnownExecutionState to execution plan creation, but little benefit.\n # It is recalculated later by the re-execution machinery.\n step_keys_to_execute, _ = KnownExecutionState.build_resume_retry_reexecution(\n instance, parent_run=cast(DagsterRun, instance.get_run_by_id(run_id))\n )\n return ReexecutionOptions(parent_run_id=run_id, step_selection=step_keys_to_execute)
\n\n\n
[docs]def execute_job(\n job: ReconstructableJob,\n instance: "DagsterInstance",\n run_config: Any = None,\n tags: Optional[Mapping[str, Any]] = None,\n raise_on_error: bool = False,\n op_selection: Optional[Sequence[str]] = None,\n reexecution_options: Optional[ReexecutionOptions] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n) -> JobExecutionResult:\n """Execute a job synchronously.\n\n This API represents dagster's python entrypoint for out-of-process\n execution. For most testing purposes, :py:meth:`~dagster.JobDefinition.\n execute_in_process` will be more suitable, but when wanting to run\n execution using an out-of-process executor (such as :py:class:`dagster.\n multiprocess_executor`), then `execute_job` is suitable.\n\n `execute_job` expects a persistent :py:class:`DagsterInstance` for\n execution, meaning the `$DAGSTER_HOME` environment variable must be set.\n It also expects a reconstructable pointer to a :py:class:`JobDefinition` so\n that it can be reconstructed in separate processes. This can be done by\n wrapping the ``JobDefinition`` in a call to :py:func:`dagster.\n reconstructable`.\n\n .. code-block:: python\n\n from dagster import DagsterInstance, execute_job, job, reconstructable\n\n @job\n def the_job():\n ...\n\n instance = DagsterInstance.get()\n result = execute_job(reconstructable(the_job), instance=instance)\n assert result.success\n\n\n If using the :py:meth:`~dagster.GraphDefinition.to_job` method to\n construct the ``JobDefinition``, then the invocation must be wrapped in a\n module-scope function, which can be passed to ``reconstructable``.\n\n .. 
code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def the_graph():\n ...\n\n def define_job():\n return the_graph.to_job(...)\n\n result = execute_job(reconstructable(define_job), ...)\n\n Since `execute_job` is potentially executing outside of the current\n process, output objects need to be retrieved by use of the provided job's\n io managers. Output objects can be retrieved by opening the result of\n `execute_job` as a context manager.\n\n .. code-block:: python\n\n from dagster import execute_job\n\n with execute_job(...) as result:\n output_obj = result.output_for_node("some_op")\n\n ``execute_job`` can also be used to reexecute a run, by providing a :py:class:`ReexecutionOptions` object.\n\n .. code-block:: python\n\n from dagster import ReexecutionOptions, execute_job\n\n instance = DagsterInstance.get()\n\n options = ReexecutionOptions.from_failure(run_id=failed_run_id, instance)\n execute_job(reconstructable(job), instance, reexecution_options=options)\n\n Parameters:\n job (ReconstructableJob): A reconstructable pointer to a :py:class:`JobDefinition`.\n instance (DagsterInstance): The instance to execute against.\n run_config (Optional[dict]): The configuration that parametrizes this run, as a dict.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to run logs.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``False``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single\n op names) to execute. 
For example:\n\n - ``['some_op']``: selects ``some_op`` itself.\n - ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n - ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n reexecution_options (Optional[ReexecutionOptions]):\n Reexecution options to provide to the run, if this run is\n intended to be a reexecution of a previous run. Cannot be used in\n tandem with the ``op_selection`` argument.\n\n Returns:\n :py:class:`JobExecutionResult`: The result of job execution.\n """\n check.inst_param(job, "job", ReconstructableJob)\n check.inst_param(instance, "instance", DagsterInstance)\n check.opt_sequence_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n # get the repository load data here because we call job.get_definition() later in this fn\n job_def, _ = _job_with_repository_load_data(job)\n\n if reexecution_options is not None and op_selection is not None:\n raise DagsterInvariantViolationError(\n "re-execution and op selection cannot be used together at this time."\n )\n\n if reexecution_options:\n if run_config is None:\n run = check.not_none(instance.get_run_by_id(reexecution_options.parent_run_id))\n run_config = run.run_config\n return _reexecute_job(\n job_arg=job_def,\n parent_run_id=reexecution_options.parent_run_id,\n run_config=run_config,\n step_selection=list(reexecution_options.step_selection),\n tags=tags,\n instance=instance,\n raise_on_error=raise_on_error,\n )\n else:\n return _logged_execute_job(\n job_arg=job_def,\n instance=instance,\n run_config=run_config,\n tags=tags,\n op_selection=op_selection,\n raise_on_error=raise_on_error,\n asset_selection=asset_selection,\n )
\n\n\n@telemetry_wrapper\ndef _logged_execute_job(\n job_arg: Union[IJob, JobDefinition],\n instance: DagsterInstance,\n run_config: Optional[Mapping[str, object]] = None,\n tags: Optional[Mapping[str, str]] = None,\n op_selection: Optional[Sequence[str]] = None,\n raise_on_error: bool = True,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n) -> JobExecutionResult:\n check.inst_param(instance, "instance", DagsterInstance)\n\n job_arg, repository_load_data = _job_with_repository_load_data(job_arg)\n\n (\n job_arg,\n run_config,\n tags,\n resolved_op_selection,\n op_selection,\n ) = _check_execute_job_args(\n job_arg=job_arg,\n run_config=run_config,\n tags=tags,\n op_selection=op_selection,\n )\n\n log_repo_stats(instance=instance, job=job_arg, source="execute_pipeline")\n\n dagster_run = instance.create_run_for_job(\n job_def=job_arg.get_definition(),\n run_config=run_config,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n tags=tags,\n job_code_origin=(\n job_arg.get_python_origin() if isinstance(job_arg, ReconstructableJob) else None\n ),\n repository_load_data=repository_load_data,\n asset_selection=frozenset(asset_selection) if asset_selection else None,\n )\n\n return execute_run(\n job_arg,\n dagster_run,\n instance,\n raise_on_error=raise_on_error,\n )\n\n\ndef _reexecute_job(\n job_arg: Union[IJob, JobDefinition],\n parent_run_id: str,\n run_config: Optional[Mapping[str, object]] = None,\n step_selection: Optional[Sequence[str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> JobExecutionResult:\n """Reexecute an existing job run."""\n check.opt_sequence_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n job_arg, repository_load_data = _job_with_repository_load_data(job_arg)\n\n (job_arg, run_config, tags, _, 
_) = _check_execute_job_args(\n job_arg=job_arg,\n run_config=run_config,\n tags=tags,\n )\n\n parent_dagster_run = execute_instance.get_run_by_id(parent_run_id)\n if parent_dagster_run is None:\n check.failed(\n f"No parent run with id {parent_run_id} found in instance.",\n )\n\n execution_plan: Optional[ExecutionPlan] = None\n # resolve step selection DSL queries using parent execution information\n if step_selection:\n execution_plan = _resolve_reexecute_step_selection(\n execute_instance,\n job_arg,\n run_config,\n cast(DagsterRun, parent_dagster_run),\n step_selection,\n )\n\n if parent_dagster_run.asset_selection:\n job_arg = job_arg.get_subset(\n op_selection=None, asset_selection=parent_dagster_run.asset_selection\n )\n\n dagster_run = execute_instance.create_run_for_job(\n job_def=job_arg.get_definition(),\n execution_plan=execution_plan,\n run_config=run_config,\n tags=tags,\n op_selection=parent_dagster_run.op_selection,\n asset_selection=parent_dagster_run.asset_selection,\n resolved_op_selection=parent_dagster_run.resolved_op_selection,\n root_run_id=parent_dagster_run.root_run_id or parent_dagster_run.run_id,\n parent_run_id=parent_dagster_run.run_id,\n job_code_origin=(\n job_arg.get_python_origin() if isinstance(job_arg, ReconstructableJob) else None\n ),\n repository_load_data=repository_load_data,\n )\n\n return execute_run(\n job_arg,\n dagster_run,\n execute_instance,\n raise_on_error=raise_on_error,\n )\n check.failed("Should not reach here.")\n\n\ndef execute_plan_iterator(\n execution_plan: ExecutionPlan,\n job: IJob,\n dagster_run: DagsterRun,\n instance: DagsterInstance,\n retry_mode: Optional[RetryMode] = None,\n run_config: Optional[Mapping[str, object]] = None,\n) -> Iterator[DagsterEvent]:\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(job, "job", IJob)\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.inst_param(instance, "instance", DagsterInstance)\n retry_mode = 
check.opt_inst_param(retry_mode, "retry_mode", RetryMode, RetryMode.DISABLED)\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n if isinstance(job, ReconstructableJob):\n job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n return iter(\n ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=inner_plan_execution_iterator,\n execution_context_manager=PlanExecutionContextManager(\n job=job,\n retry_mode=retry_mode,\n execution_plan=execution_plan,\n run_config=run_config,\n dagster_run=dagster_run,\n instance=instance,\n ),\n )\n )\n\n\ndef execute_plan(\n execution_plan: ExecutionPlan,\n job: IJob,\n instance: DagsterInstance,\n dagster_run: DagsterRun,\n run_config: Optional[Mapping[str, object]] = None,\n retry_mode: Optional[RetryMode] = None,\n) -> Sequence[DagsterEvent]:\n """This is the entry point of dagster-graphql executions. For the dagster CLI entry point, see\n execute_job() above.\n """\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(job, "job", IJob)\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n run_config = check.opt_mapping_param(run_config, "run_config")\n check.opt_inst_param(retry_mode, "retry_mode", RetryMode)\n\n return list(\n execute_plan_iterator(\n execution_plan=execution_plan,\n job=job,\n run_config=run_config,\n dagster_run=dagster_run,\n instance=instance,\n retry_mode=retry_mode,\n )\n )\n\n\ndef _get_execution_plan_from_run(\n job: IJob,\n dagster_run: DagsterRun,\n instance: DagsterInstance,\n) -> ExecutionPlan:\n execution_plan_snapshot = (\n instance.get_execution_plan_snapshot(dagster_run.execution_plan_snapshot_id)\n if dagster_run.execution_plan_snapshot_id\n else None\n )\n\n # Rebuild from snapshot if able and selection has not changed\n if (\n execution_plan_snapshot is not None\n and execution_plan_snapshot.can_reconstruct_plan\n and 
job.resolved_op_selection == dagster_run.resolved_op_selection\n and job.asset_selection == dagster_run.asset_selection\n and job.asset_check_selection == dagster_run.asset_check_selection\n ):\n return ExecutionPlan.rebuild_from_snapshot(\n dagster_run.job_name,\n execution_plan_snapshot,\n )\n\n return create_execution_plan(\n job,\n run_config=dagster_run.run_config,\n step_keys_to_execute=dagster_run.step_keys_to_execute,\n instance_ref=instance.get_ref() if instance.is_persistent else None,\n repository_load_data=(\n execution_plan_snapshot.repository_load_data if execution_plan_snapshot else None\n ),\n known_state=(\n execution_plan_snapshot.initial_known_state if execution_plan_snapshot else None\n ),\n )\n\n\ndef create_execution_plan(\n job: Union[IJob, JobDefinition],\n run_config: Optional[Mapping[str, object]] = None,\n step_keys_to_execute: Optional[Sequence[str]] = None,\n known_state: Optional[KnownExecutionState] = None,\n instance_ref: Optional[InstanceRef] = None,\n tags: Optional[Mapping[str, str]] = None,\n repository_load_data: Optional[RepositoryLoadData] = None,\n) -> ExecutionPlan:\n if isinstance(job, IJob):\n # If you have repository_load_data, make sure to use it when building plan\n if isinstance(job, ReconstructableJob) and repository_load_data is not None:\n job = job.with_repository_load_data(repository_load_data)\n job_def = job.get_definition()\n else:\n job_def = job\n\n run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n tags = check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n known_state = check.opt_inst_param(\n known_state,\n "known_state",\n KnownExecutionState,\n default=KnownExecutionState(),\n )\n repository_load_data = check.opt_inst_param(\n repository_load_data, "repository_load_data", RepositoryLoadData\n )\n\n 
resolved_run_config = ResolvedRunConfig.build(job_def, run_config)\n\n return ExecutionPlan.build(\n job_def,\n resolved_run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance_ref=instance_ref,\n tags=tags,\n repository_load_data=repository_load_data,\n )\n\n\ndef job_execution_iterator(\n job_context: PlanOrchestrationContext, execution_plan: ExecutionPlan\n) -> Iterator[DagsterEvent]:\n """A complete execution of a pipeline. Yields pipeline start, success,\n and failure events.\n\n Args:\n pipeline_context (PlanOrchestrationContext):\n execution_plan (ExecutionPlan):\n """\n # TODO: restart event?\n if not job_context.resume_from_failure:\n yield DagsterEvent.job_start(job_context)\n\n job_exception_info = None\n job_canceled_info = None\n failed_steps = []\n generator_closed = False\n try:\n for event in job_context.executor.execute(job_context, execution_plan):\n if event.is_step_failure:\n failed_steps.append(event.step_key)\n elif event.is_resource_init_failure and event.step_key:\n failed_steps.append(event.step_key)\n\n # Telemetry\n log_dagster_event(event, job_context)\n\n yield event\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n job_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise\n except (KeyboardInterrupt, DagsterExecutionInterruptedError):\n job_canceled_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise\n except BaseException:\n job_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise # finally block will run before this is re-raised\n finally:\n if job_canceled_info:\n reloaded_run = job_context.instance.get_run_by_id(job_context.run_id)\n if reloaded_run and reloaded_run.status == 
DagsterRunStatus.CANCELING:\n event = DagsterEvent.job_canceled(job_context, job_canceled_info)\n elif reloaded_run and reloaded_run.status == DagsterRunStatus.CANCELED:\n # This happens if the run was force-terminated but was still able to send\n # a cancellation request\n event = DagsterEvent.engine_event(\n job_context,\n "Computational resources were cleaned up after the run was forcibly marked"\n " as canceled.",\n EngineEventData(),\n )\n elif job_context.instance.run_will_resume(job_context.run_id):\n event = DagsterEvent.engine_event(\n job_context,\n "Execution was interrupted unexpectedly. No user initiated termination"\n " request was found, not treating as failure because run will be resumed.",\n EngineEventData(),\n )\n elif reloaded_run and reloaded_run.status == DagsterRunStatus.FAILURE:\n event = DagsterEvent.engine_event(\n job_context,\n "Execution was interrupted for a run that was already in a failure state.",\n EngineEventData(),\n )\n else:\n event = DagsterEvent.job_failure(\n job_context,\n "Execution was interrupted unexpectedly. "\n "No user initiated termination request was found, treating as failure.",\n job_canceled_info,\n )\n elif job_exception_info:\n event = DagsterEvent.job_failure(\n job_context,\n "An exception was thrown during execution.",\n job_exception_info,\n )\n elif failed_steps:\n event = DagsterEvent.job_failure(\n job_context,\n f"Steps failed: {failed_steps}.",\n )\n else:\n event = DagsterEvent.job_success(job_context)\n if not generator_closed:\n yield event\n\n\nclass ExecuteRunWithPlanIterable:\n """Utility class to consolidate execution logic.\n\n This is a class and not a function because, e.g., in constructing a `scoped_pipeline_context`\n for `JobExecutionResult`, we need to pull out the `pipeline_context` after we're done\n yielding events. This broadly follows a pattern we make use of in other places,\n cf. 
`dagster._utils.EventGenerationManager`.\n """\n\n def __init__(\n self,\n execution_plan: ExecutionPlan,\n iterator: Callable[..., Iterator[DagsterEvent]],\n execution_context_manager: ExecutionContextManager[Any],\n ):\n self.execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n self.iterator = check.callable_param(iterator, "iterator")\n self.execution_context_manager = check.inst_param(\n execution_context_manager, "execution_context_manager", ExecutionContextManager\n )\n\n self.job_context = None\n\n def __iter__(self) -> Iterator[DagsterEvent]:\n # Since interrupts can't be raised at arbitrary points safely, delay them until designated\n # checkpoints during the execution.\n # To be maximally certain that interrupts are always caught during an execution process,\n # you can safely add an additional `with capture_interrupts()` at the very beginning of the\n # process that performs the execution.\n with capture_interrupts():\n yield from self.execution_context_manager.prepare_context()\n self.job_context = self.execution_context_manager.get_context()\n generator_closed = False\n try:\n if self.job_context: # False if we had a pipeline init failure\n yield from self.iterator(\n execution_plan=self.execution_plan,\n job_context=self.job_context,\n )\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n raise\n finally:\n for event in self.execution_context_manager.shutdown_context():\n if not generator_closed:\n yield event\n\n\ndef _check_execute_job_args(\n job_arg: Union[JobDefinition, IJob],\n run_config: Optional[Mapping[str, object]],\n tags: Optional[Mapping[str, str]],\n op_selection: Optional[Sequence[str]] = None,\n) -> Tuple[\n IJob,\n Optional[Mapping],\n Mapping[str, str],\n Optional[AbstractSet[str]],\n Optional[Sequence[str]],\n]:\n ijob = InMemoryJob(job_arg) if 
isinstance(job_arg, JobDefinition) else job_arg\n job_def = job_arg if isinstance(job_arg, JobDefinition) else job_arg.get_definition()\n\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str)\n check.opt_sequence_param(op_selection, "op_selection", of_type=str)\n\n tags = merge_dicts(job_def.tags, tags)\n\n # generate job subset from the given op_selection\n if op_selection:\n ijob = ijob.get_subset(op_selection=op_selection)\n\n return (\n ijob,\n run_config,\n tags,\n ijob.resolved_op_selection,\n op_selection,\n )\n\n\ndef _resolve_reexecute_step_selection(\n instance: DagsterInstance,\n job: IJob,\n run_config: Optional[Mapping],\n parent_dagster_run: DagsterRun,\n step_selection: Sequence[str],\n) -> ExecutionPlan:\n if parent_dagster_run.op_selection:\n job = job.get_subset(op_selection=parent_dagster_run.op_selection)\n\n state = KnownExecutionState.build_for_reexecution(instance, parent_dagster_run)\n\n parent_plan = create_execution_plan(\n job,\n parent_dagster_run.run_config,\n known_state=state,\n )\n step_keys_to_execute = parse_step_selection(parent_plan.get_all_step_deps(), step_selection)\n execution_plan = create_execution_plan(\n job,\n run_config,\n step_keys_to_execute=list(step_keys_to_execute),\n known_state=state.update_for_step_selection(step_keys_to_execute),\n tags=parent_dagster_run.tags,\n )\n return execution_plan\n\n\ndef _job_with_repository_load_data(\n job_arg: Union[JobDefinition, IJob],\n) -> Tuple[Union[JobDefinition, IJob], Optional[RepositoryLoadData]]:\n """For ReconstructableJob, generate and return any required RepositoryLoadData, alongside\n a ReconstructableJob with this repository load data baked in.\n """\n if isinstance(job_arg, ReconstructableJob):\n # Unless this ReconstructableJob alread has repository_load_data attached, this will\n # force the repository_load_data to be computed from scratch.\n repository_load_data = 
job_arg.repository.get_definition().repository_load_data\n return job_arg.with_repository_load_data(repository_load_data), repository_load_data\n return job_arg, None\n
", "current_page_name": "_modules/dagster/_core/execution/api", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.api"}, "build_resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.build_resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Generator, Mapping, Optional, cast\n\nimport dagster._check as check\nfrom dagster._config import process_config\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster._core.definitions.run_config import define_resource_dictionary_cls\nfrom dagster._core.errors import DagsterInvalidConfigError\nfrom dagster._core.execution.resources_init import resource_initialization_manager\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster._core.system_config.objects import ResourceConfig, config_map_resources\n\nfrom .api import ephemeral_instance_if_missing\nfrom .context_creation_job import initialize_console_manager\n\n\ndef get_mapped_resource_config(\n    resource_defs: Mapping[str, ResourceDefinition], resource_config: Mapping[str, Any]\n) -> Mapping[str, ResourceConfig]:\n    resource_config_schema = define_resource_dictionary_cls(\n        resource_defs, set(resource_defs.keys())\n    )\n    config_evr = process_config(resource_config_schema, resource_config)\n    if not config_evr.success:\n        raise DagsterInvalidConfigError(\n            "Error in config for resources ",\n            config_evr.errors,\n            resource_config,\n        )\n    config_value = cast(Dict[str, Any], config_evr.value)\n    return config_map_resources(resource_defs, config_value)\n\n\n
[docs]@contextmanager\ndef build_resources(\n resources: Mapping[str, Any],\n instance: Optional[DagsterInstance] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n dagster_run: Optional[DagsterRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n) -> Generator[Resources, None, None]:\n """Context manager that yields resources using provided resource definitions and run config.\n\n This API allows for using resources in an independent context. Resources will be initialized\n with the provided run config, and optionally, dagster_run. The resulting resources will be\n yielded on a dictionary keyed identically to that provided for `resource_defs`. Upon exiting the\n context, resources will also be torn down safely.\n\n Args:\n resources (Mapping[str, Any]): Resource instances or definitions to build. All\n required resource dependencies to a given resource must be contained within this\n dictionary, or the resource build will fail.\n instance (Optional[DagsterInstance]): The dagster instance configured to instantiate\n resources on.\n resource_config (Optional[Mapping[str, Any]]): A dict representing the config to be\n provided to each resource during initialization and teardown.\n dagster_run (Optional[PipelineRun]): The pipeline run to provide during resource\n initialization and teardown. If the provided resources require either the `dagster_run`\n or `run_id` attributes of the provided context during resource initialization and/or\n teardown, this must be provided, or initialization will fail.\n log_manager (Optional[DagsterLogManager]): Log Manager to use during resource\n initialization. Defaults to system log manager.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import resource, build_resources\n\n @resource\n def the_resource():\n return "foo"\n\n with build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n assert resources.from_def == "foo"\n assert resources.from_val == "bar"\n\n """\n resources = check.mapping_param(resources, "resource_defs", key_type=str)\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n log_manager = check.opt_inst_param(log_manager, "log_manager", DagsterLogManager)\n resource_defs = wrap_resources_for_execution(resources)\n mapped_resource_config = get_mapped_resource_config(resource_defs, resource_config)\n\n with ephemeral_instance_if_missing(instance) as dagster_instance:\n resources_manager = resource_initialization_manager(\n resource_defs=resource_defs,\n resource_configs=mapped_resource_config,\n log_manager=log_manager if log_manager else initialize_console_manager(dagster_run),\n execution_plan=None,\n dagster_run=dagster_run,\n resource_keys_to_init=set(resource_defs.keys()),\n instance=dagster_instance,\n emit_persistent_events=False,\n )\n try:\n list(resources_manager.generate_setup_events())\n instantiated_resources = check.inst(\n resources_manager.get_object(), ScopedResourcesBuilder\n )\n yield instantiated_resources.build(\n set(instantiated_resources.resource_instance_dict.keys())\n )\n finally:\n list(resources_manager.generate_teardown_events())
\n\n\ndef wrap_resources_for_execution(\n resources: Optional[Mapping[str, Any]] = None\n) -> Dict[str, ResourceDefinition]:\n return (\n {\n resource_key: wrap_resource_for_execution(resource)\n for resource_key, resource in resources.items()\n }\n if resources\n else {}\n )\n\n\ndef wrap_resource_for_execution(resource: Any) -> ResourceDefinition:\n from dagster._config.pythonic_config import ConfigurableResourceFactory, PartialResource\n\n # Wrap instantiated resource values in a resource definition.\n # If an instantiated IO manager is provided, wrap it in an IO manager definition.\n if isinstance(resource, (ConfigurableResourceFactory, PartialResource)):\n return resource.get_resource_definition()\n elif isinstance(resource, ResourceDefinition):\n return resource\n elif isinstance(resource, IOManager):\n return IOManagerDefinition.hardcoded_io_manager(resource)\n else:\n return ResourceDefinition.hardcoded_resource(resource)\n
", "current_page_name": "_modules/dagster/_core/execution/build_resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.build_resources"}, "context": {"compute": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.compute

\nfrom abc import ABC, ABCMeta, abstractmethod\nfrom inspect import _empty as EmptyAnnotation\nfrom typing import (\n    AbstractSet,\n    Any,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey, AssetCheckSpec\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.data_version import (\n    DataProvenance,\n    DataVersion,\n    extract_data_provenance_from_entry,\n)\nfrom dagster._core.definitions.decorators.op_decorator import DecoratedOpFunction\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    UserEvent,\n)\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._utils.forked_pdb import ForkedPdb\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n)\n\nfrom .system import StepExecutionContext\n\n\n# This metaclass has to 
exist for OpExecutionContext to have a metaclass\nclass AbstractComputeMetaclass(ABCMeta):\n    pass\n\n\nclass AbstractComputeExecutionContext(ABC, metaclass=AbstractComputeMetaclass):\n    """Base class for op context implemented by OpExecutionContext and DagstermillExecutionContext."""\n\n    @abstractmethod\n    def has_tag(self, key: str) -> bool:\n        """Implement this method to check if a logging tag is set."""\n\n    @abstractmethod\n    def get_tag(self, key: str) -> Optional[str]:\n        """Implement this method to get a logging tag."""\n\n    @property\n    @abstractmethod\n    def run_id(self) -> str:\n        """The run id for the context."""\n\n    @property\n    @abstractmethod\n    def op_def(self) -> OpDefinition:\n        """The op definition corresponding to the execution step being executed."""\n\n    @property\n    @abstractmethod\n    def job_def(self) -> JobDefinition:\n        """The job being executed."""\n\n    @property\n    @abstractmethod\n    def run(self) -> DagsterRun:\n        """The DagsterRun object corresponding to the execution."""\n\n    @property\n    @abstractmethod\n    def resources(self) -> Any:\n        """Resources available in the execution context."""\n\n    @property\n    @abstractmethod\n    def log(self) -> DagsterLogManager:\n        """The log manager available in the execution context."""\n\n    @property\n    @abstractmethod\n    def op_config(self) -> Any:\n        """The parsed config specific to this op."""\n\n\nclass OpExecutionContextMetaClass(AbstractComputeMetaclass):\n    def __instancecheck__(cls, instance) -> bool:\n        # This makes isinstance(context, OpExecutionContext) throw a deprecation warning when\n        # context is an AssetExecutionContext. 
This metaclass can be deleted once AssetExecutionContext\n        # has been split into it's own class in 1.7.0\n        if type(instance) is AssetExecutionContext and cls is not AssetExecutionContext:\n            deprecation_warning(\n                subject="AssetExecutionContext",\n                additional_warn_text=(\n                    "Starting in version 1.7.0 AssetExecutionContext will no longer be a subclass"\n                    " of OpExecutionContext."\n                ),\n                breaking_version="1.7.0",\n                stacklevel=1,\n            )\n        return super().__instancecheck__(instance)\n\n\n
[docs]class OpExecutionContext(AbstractComputeExecutionContext, metaclass=OpExecutionContextMetaClass):\n """The ``context`` object that can be made available as the first argument to the function\n used for computing an op or asset.\n\n This context object provides system information such as resources, config, and logging.\n\n To construct an execution context for testing purposes, use :py:func:`dagster.build_op_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import op, OpExecutionContext\n\n @op\n def hello_world(context: OpExecutionContext):\n context.log.info("Hello, world!")\n """\n\n __slots__ = ["_step_execution_context"]\n\n def __init__(self, step_execution_context: StepExecutionContext):\n self._step_execution_context = check.inst_param(\n step_execution_context,\n "step_execution_context",\n StepExecutionContext,\n )\n self._pdb: Optional[ForkedPdb] = None\n self._events: List[DagsterEvent] = []\n self._output_metadata: Dict[str, Any] = {}\n\n @public\n @property\n def op_config(self) -> Any:\n """Any: The parsed config specific to this op."""\n return self._step_execution_context.op_config\n\n @property\n def dagster_run(self) -> DagsterRun:\n """PipelineRun: The current pipeline run."""\n return self._step_execution_context.dagster_run\n\n @property\n def run(self) -> DagsterRun:\n """DagsterRun: The current run."""\n return self.dagster_run\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current Dagster instance."""\n return self._step_execution_context.instance\n\n @public\n @property\n def pdb(self) -> ForkedPdb:\n """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the op.\n\n Example:\n .. 
code-block:: python\n\n @op\n def debug(context):\n context.pdb.set_trace()\n """\n if self._pdb is None:\n self._pdb = ForkedPdb()\n\n return self._pdb\n\n @property\n def file_manager(self):\n """Deprecated access to the file manager.\n\n :meta private:\n """\n raise DagsterInvalidPropertyError(\n "You have attempted to access the file manager which has been moved to resources in"\n " 0.10.0. Please access it via `context.resources.file_manager` instead."\n )\n\n @public\n @property\n def resources(self) -> Any:\n """Resources: The currently available resources."""\n return self._step_execution_context.resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n """Optional[StepLauncher]: The current step launcher, if any."""\n return self._step_execution_context.step_launcher\n\n @public\n @property\n def run_id(self) -> str:\n """str: The id of the current execution's run."""\n return self._step_execution_context.run_id\n\n @public\n @property\n def run_config(self) -> Mapping[str, object]:\n """dict: The run config for the current execution."""\n return self._step_execution_context.run_config\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The currently executing pipeline."""\n return self._step_execution_context.job_def\n\n @public\n @property\n def job_name(self) -> str:\n """str: The name of the currently executing pipeline."""\n return self._step_execution_context.job_name\n\n @public\n @property\n def log(self) -> DagsterLogManager:\n """DagsterLogManager: The log manager available in the execution context."""\n return self._step_execution_context.log\n\n @property\n def node_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self._step_execution_context.node_handle\n\n @property\n def op_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self.node_handle\n\n @property\n def op(self) -> Node:\n 
"""Node: The object representing the invoked op within the graph.\n\n :meta private:\n\n """\n return self._step_execution_context.job_def.get_node(self.node_handle)\n\n @public\n @property\n def op_def(self) -> OpDefinition:\n """OpDefinition: The current op definition."""\n return cast(OpDefinition, self.op.definition)\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n return self._step_execution_context.has_partition_key\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run. Or if the current run is operating\n over a range of partitions (ie. a backfill of several partitions executed in a single run).\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_key)\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n """\n return self._step_execution_context.partition_key\n\n @deprecated(breaking_version="2.0", additional_warn_text="Use `partition_key_range` instead.")\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The range of partition keys for the current run.\n\n If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n end. Raises an error if the current run is not a partitioned run.\n """\n return self.partition_key_range\n\n @public\n @property\n def partition_key_range(self) -> PartitionKeyRange:\n """The range of partition keys for the current run.\n\n If run is for a single partition key, returns a `PartitionKeyRange` with the same start and\n end. Raises an error if the current run is not a partitioned run.\n\n Examples:\n .. 
code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_key_range)\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n """\n return self._step_execution_context.asset_partition_key_range\n\n @public\n @property\n def partition_time_window(self) -> TimeWindow:\n """The partition time window for the current run.\n\n Raises an error if the current run is not a partitioned run, or if the job's partition\n definition is not a TimeWindowPartitionsDefinition.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_time_window)\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n """\n return self._step_execution_context.partition_time_window\n\n
[docs] @public\n def has_tag(self, key: str) -> bool:\n """Check if a logging tag is set.\n\n Args:\n key (str): The tag to check.\n\n Returns:\n bool: Whether the tag is set.\n """\n return self._step_execution_context.has_tag(key)
\n\n
[docs] @public\n def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag.\n\n Args:\n key (tag): The tag to get.\n\n Returns:\n Optional[str]: The value of the tag, if present.\n """\n return self._step_execution_context.get_tag(key)
\n\n @property\n def run_tags(self) -> Mapping[str, str]:\n """Mapping[str, str]: The tags for the current run."""\n return self._step_execution_context.run_tags\n\n def has_events(self) -> bool:\n return bool(self._events)\n\n def consume_events(self) -> Iterator[DagsterEvent]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the beginning of the op's computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n
[docs] @public\n def log_event(self, event: UserEvent) -> None:\n """Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.\n\n Events logged with this method will appear in the list of DagsterEvents, as well as the event log.\n\n Args:\n event (Union[AssetMaterialization, AssetObservation, ExpectationResult]): The event to log.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import op, AssetMaterialization\n\n @op\n def log_materialization(context):\n context.log_event(AssetMaterialization("foo"))\n """\n if isinstance(event, AssetMaterialization):\n self._events.append(\n DagsterEvent.asset_materialization(self._step_execution_context, event)\n )\n elif isinstance(event, AssetObservation):\n self._events.append(DagsterEvent.asset_observation(self._step_execution_context, event))\n elif isinstance(event, ExpectationResult):\n self._events.append(\n DagsterEvent.step_expectation_result(self._step_execution_context, event)\n )\n else:\n check.failed(f"Unexpected event {event}")
\n\n
[docs] @public\n def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n """Add metadata to one of the outputs of an op.\n\n This can be invoked multiple times per output in the body of an op. If the same key is\n passed multiple times, the value associated with the last call will be used.\n\n Args:\n metadata (Mapping[str, Any]): The metadata to attach to the output\n output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n mapping_key (Optional[str]): The mapping key of the output to attach metadata to. If the\n output is not dynamic, this argument does not need to be provided.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import Out, op\n from typing import Tuple\n\n @op\n def add_metadata(context):\n context.add_output_metadata({"foo", "bar"})\n return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n @op(out={"a": Out(), "b": Out()})\n def add_metadata_two_outputs(context) -> Tuple[str, int]:\n context.add_output_metadata({"foo": "bar"}, output_name="b")\n context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n return ("dog", 5)\n\n """\n metadata = check.mapping_param(metadata, "metadata", key_type=str)\n output_name = check.opt_str_param(output_name, "output_name")\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n self._step_execution_context.add_output_metadata(\n metadata=metadata, output_name=output_name, mapping_key=mapping_key\n )
\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n return self._step_execution_context.get_output_metadata(\n output_name=output_name, mapping_key=mapping_key\n )\n\n def get_step_execution_context(self) -> StepExecutionContext:\n """Allows advanced users (e.g. framework authors) to punch through to the underlying\n step execution context.\n\n :meta private:\n\n Returns:\n StepExecutionContext: The underlying system context.\n """\n return self._step_execution_context\n\n @public\n @property\n def retry_number(self) -> int:\n """Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc."""\n return self._step_execution_context.previous_attempt_count\n\n def describe_op(self):\n return self._step_execution_context.describe_op()\n\n
[docs] @public\n def get_mapping_key(self) -> Optional[str]:\n """Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None."""\n return self._step_execution_context.step.get_mapping_key()
\n\n #############################################################################################\n # asset related methods\n #############################################################################################\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The AssetKey for the current asset. In a multi_asset, use asset_key_for_output instead."""\n if self.has_assets_def and len(self.assets_def.keys_by_output_name.keys()) > 1:\n raise DagsterInvariantViolationError(\n "Cannot call `context.asset_key` in a multi_asset with more than one asset. Use"\n " `context.asset_key_for_output` instead."\n )\n return next(iter(self.assets_def.keys_by_output_name.values()))\n\n @public\n @property\n def has_assets_def(self) -> bool:\n """If there is a backing AssetsDefinition for what is currently executing."""\n assets_def = self.job_def.asset_layer.assets_def_for_node(self.node_handle)\n return assets_def is not None\n\n @public\n @property\n def assets_def(self) -> AssetsDefinition:\n """The backing AssetsDefinition for what is currently executing, errors if not available."""\n assets_def = self.job_def.asset_layer.assets_def_for_node(self.node_handle)\n if assets_def is None:\n raise DagsterInvalidPropertyError(\n f"Op '{self.op.name}' does not have an assets definition."\n )\n return assets_def\n\n @public\n @property\n def selected_asset_keys(self) -> AbstractSet[AssetKey]:\n """Get the set of AssetKeys this execution is expected to materialize."""\n if not self.has_assets_def:\n return set()\n return self.assets_def.keys\n\n @public\n @property\n def has_asset_checks_def(self) -> bool:\n """Return a boolean indicating the presence of a backing AssetChecksDefinition\n for the current execution.\n\n Returns:\n bool: True if there is a backing AssetChecksDefinition for the current execution, otherwise False.\n """\n return self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle) is not None\n\n @public\n @property\n def 
asset_checks_def(self) -> AssetChecksDefinition:\n """The backing AssetChecksDefinition for what is currently executing, errors if not\n available.\n\n Returns:\n AssetChecksDefinition.\n """\n asset_checks_def = self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle)\n if asset_checks_def is None:\n raise DagsterInvalidPropertyError(\n f"Op '{self.op.name}' does not have an asset checks definition."\n )\n\n return asset_checks_def\n\n @public\n @property\n def selected_asset_check_keys(self) -> AbstractSet[AssetCheckKey]:\n if self.has_assets_def:\n return self.assets_def.check_keys\n\n if self.has_asset_checks_def:\n check.failed("Subset selection is not yet supported within an AssetChecksDefinition")\n\n return set()\n\n @public\n @property\n def selected_output_names(self) -> AbstractSet[str]:\n """Get the output names that correspond to the current selection of assets this execution is expected to materialize."""\n # map selected asset keys to the output names they correspond to\n selected_asset_keys = self.selected_asset_keys\n selected_outputs: Set[str] = set()\n for output_name in self.op.output_dict.keys():\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, output_name\n )\n if any( # For graph-backed assets, check if a downstream asset is selected\n [\n asset_key in selected_asset_keys\n for asset_key in self.job_def.asset_layer.downstream_dep_assets(\n self.node_handle, output_name\n )\n ]\n ) or (asset_info and asset_info.key in selected_asset_keys):\n selected_outputs.add(output_name)\n\n return selected_outputs\n\n
[docs] @public\n def asset_key_for_output(self, output_name: str = "result") -> AssetKey:\n """Return the AssetKey for the corresponding output."""\n asset_output_info = self.job_def.asset_layer.asset_info_for_output(\n node_handle=self.op_handle, output_name=output_name\n )\n if asset_output_info is None:\n check.failed(f"Output '{output_name}' has no asset")\n else:\n return asset_output_info.key
\n\n
[docs] @public\n def output_for_asset_key(self, asset_key: AssetKey) -> str:\n """Return the output name for the corresponding asset key."""\n node_output_handle = self.job_def.asset_layer.node_output_handle_for_asset(asset_key)\n if node_output_handle is None:\n check.failed(f"Asset key '{asset_key}' has no output")\n else:\n return node_output_handle.output_name
\n\n
[docs] @public\n def asset_key_for_input(self, input_name: str) -> AssetKey:\n """Return the AssetKey for the corresponding input."""\n key = self.job_def.asset_layer.asset_key_for_input(\n node_handle=self.op_handle, input_name=input_name\n )\n if key is None:\n check.failed(f"Input '{input_name}' has no asset")\n else:\n return key
\n\n
[docs] @public\n def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n """Returns the asset partition key for the given output.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_for_output())\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_for_output("first_asset"))\n context.log.info(context.asset_partition_key_for_output("second_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n # "2023-08-21"\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n """\n return self._step_execution_context.asset_partition_key_for_output(output_name)
\n\n
[docs] @public\n def asset_partitions_time_window_for_output(self, output_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partitions_time_window_for_output`` to get the TimeWindow of all of the partitions\n being materialized by the backfill.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_time_window_for_output())\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_time_window_for_output("first_asset"))\n context.log.info(context.asset_partitions_time_window_for_output("second_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # 
TimeWindow("2023-08-21", "2023-08-22")\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partitions_time_window_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n """\n return self._step_execution_context.asset_partitions_time_window_for_output(output_name)
\n\n
[docs] @public\n def asset_partition_key_range_for_output(\n self, output_name: str = "result"\n ) -> PartitionKeyRange:\n """Return the PartitionKeyRange for the corresponding output. Errors if the run is not partitioned.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_key_range_for_output`` to get all of the partitions being materialized\n by the backfill.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition key range for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_range_for_output())\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_range_for_output("first_asset"))\n context.log.info(context.asset_partition_key_range_for_output("second_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def 
self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_range_for_output())\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n """\n return self._step_execution_context.asset_partition_key_range_for_output(output_name)
\n\n
[docs] @public\n def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n """Return the PartitionKeyRange for the corresponding input. Errors if the asset depends on a\n non-contiguous chunk of the input.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_key_range_for_input`` to get the range of partitions keys of the input that\n are relevant to that backfill.\n\n Args:\n input_name (str): The name of the input to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_range_for_input("self_dependent_asset"))\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this 
asset will log:\n # PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n """\n return self._step_execution_context.asset_partition_key_range_for_input(input_name)
\n\n
[docs] @public\n def asset_partition_key_for_input(self, input_name: str) -> str:\n """Returns the partition key of the upstream asset corresponding to the given input.\n\n Args:\n input_name (str): The name of the input to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_for_input("upstream_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_for_input("self_dependent_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-20"\n\n """\n return self._step_execution_context.asset_partition_key_for_input(input_name)
\n\n
[docs] @public\n def asset_partitions_def_for_output(self, output_name: str = "result") -> PartitionsDefinition:\n """The PartitionsDefinition on the asset corresponding to this output.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the PartitionsDefinition for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_def_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_def_for_output("first_asset"))\n context.log.info(context.asset_partitions_def_for_output("second_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n # DailyPartitionsDefinition("2023-08-20")\n\n """\n asset_key = self.asset_key_for_output(output_name)\n result = self._step_execution_context.job_def.asset_layer.partitions_def_for_asset(\n asset_key\n )\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result
\n\n
[docs] @public\n def asset_partitions_def_for_input(self, input_name: str) -> PartitionsDefinition:\n """The PartitionsDefinition on the upstream asset corresponding to this input.\n\n Args:\n input_name (str): The name of the input to get the PartitionsDefinition for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_def_for_input("upstream_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n\n """\n asset_key = self.asset_key_for_input(input_name)\n result = self._step_execution_context.job_def.asset_layer.partitions_def_for_asset(\n asset_key\n )\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result
\n\n
[docs] @public\n def asset_partition_keys_for_output(self, output_name: str = "result") -> Sequence[str]:\n """Returns a list of the partition keys for the given output.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_keys_for_output`` to get all of the partitions being materialized\n by the backfill.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition keys for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_keys_for_output())\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_keys_for_output("first_asset"))\n context.log.info(context.asset_partition_keys_for_output("second_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: 
AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_keys_for_output())\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n """\n return self.asset_partitions_def_for_output(output_name).get_partition_keys_in_range(\n self._step_execution_context.asset_partition_key_range_for_output(output_name),\n dynamic_partitions_store=self.instance,\n )
\n\n
[docs] @public\n def asset_partition_keys_for_input(self, input_name: str) -> Sequence[str]:\n """Returns a list of the partition keys of the upstream asset corresponding to the\n given input.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_keys_for_input`` to get all of the partition keys of the input that\n are relevant to that backfill.\n\n Args:\n input_name (str): The name of the input to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-20", "2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_keys_for_input("self_dependent_asset"))\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-20", 
"2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n """\n return list(\n self._step_execution_context.asset_partitions_subset_for_input(\n input_name\n ).get_partition_keys()\n )
\n\n
[docs] @public\n def asset_partitions_time_window_for_input(self, input_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partitions_time_window_for_input`` to get the time window of the input that\n are relevant to that backfill.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n\n Args:\n input_name (str): The name of the input to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-21")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n 
"self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partitions_time_window_for_input("self_dependent_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-21")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-25")\n\n """\n return self._step_execution_context.asset_partitions_time_window_for_input(input_name)
\n\n
[docs] @public\n @experimental\n def get_asset_provenance(self, asset_key: AssetKey) -> Optional[DataProvenance]:\n """Return the provenance information for the most recent materialization of an asset.\n\n Args:\n asset_key (AssetKey): Key of the asset for which to retrieve provenance.\n\n Returns:\n Optional[DataProvenance]: Provenance information for the most recent\n materialization of the asset. Returns `None` if the asset was never materialized or\n the materialization record is too old to contain provenance information.\n """\n record = self.instance.get_latest_data_version_record(asset_key)\n\n return (\n None if record is None else extract_data_provenance_from_entry(record.event_log_entry)\n )
\n\n def set_data_version(self, asset_key: AssetKey, data_version: DataVersion) -> None:\n """Set the data version for an asset being materialized by the currently executing step.\n This is useful for external execution situations where it is not possible to return\n an `Output`.\n\n Args:\n asset_key (AssetKey): Key of the asset for which to set the data version.\n data_version (DataVersion): The data version to set.\n """\n self._step_execution_context.set_data_version(asset_key, data_version)\n\n @property\n def asset_check_spec(self) -> AssetCheckSpec:\n asset_checks_def = check.not_none(\n self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle),\n "This context does not correspond to an AssetChecksDefinition",\n )\n return asset_checks_def.spec\n\n # In this mode no conversion is done on returned values and missing but expected outputs are not\n # allowed.\n @property\n def requires_typed_event_stream(self) -> bool:\n return self._step_execution_context.requires_typed_event_stream\n\n @property\n def typed_event_stream_error_message(self) -> Optional[str]:\n return self._step_execution_context.typed_event_stream_error_message\n\n def set_requires_typed_event_stream(self, *, error_message: Optional[str] = None) -> None:\n self._step_execution_context.set_requires_typed_event_stream(error_message=error_message)
\n\n\n
[docs]class AssetExecutionContext(OpExecutionContext):\n def __init__(self, step_execution_context: StepExecutionContext):\n super().__init__(step_execution_context=step_execution_context)
\n\n\ndef build_execution_context(\n step_context: StepExecutionContext,\n) -> Union[OpExecutionContext, AssetExecutionContext]:\n """Get the correct context based on the type of step (op or asset) and the user provided context\n type annotation. Follows these rules.\n\n step type annotation result\n asset AssetExecutionContext AssetExecutionContext\n asset OpExecutionContext OpExecutionContext\n asset None AssetExecutionContext\n op AssetExecutionContext Error - we cannot init an AssetExecutionContext w/o an AssetsDefinition\n op OpExecutionContext OpExecutionContext\n op None OpExecutionContext\n For ops in graph-backed assets\n step type annotation result\n op AssetExecutionContext AssetExecutionContext\n op OpExecutionContext OpExecutionContext\n op None OpExecutionContext\n """\n is_sda_step = step_context.is_sda_step\n is_op_in_graph_asset = is_sda_step and step_context.is_op_in_graph\n context_annotation = EmptyAnnotation\n compute_fn = step_context.op_def._compute_fn # noqa: SLF001\n compute_fn = (\n compute_fn\n if isinstance(compute_fn, DecoratedOpFunction)\n else DecoratedOpFunction(compute_fn)\n )\n if compute_fn.has_context_arg():\n context_param = compute_fn.get_context_arg()\n context_annotation = context_param.annotation\n\n # It would be nice to do this check at definition time, rather than at run time, but we don't\n # know if the op is part of an op job or a graph-backed asset until we have the step execution context\n if context_annotation is AssetExecutionContext and not is_sda_step:\n # AssetExecutionContext requires an AssetsDefinition during init, so an op in an op job\n # cannot be annotated with AssetExecutionContext\n raise DagsterInvalidDefinitionError(\n "Cannot annotate @op `context` parameter with type AssetExecutionContext unless the"\n " op is part of a graph-backed asset. 
`context` must be annotated with"\n " OpExecutionContext, or left blank."\n )\n\n if context_annotation is EmptyAnnotation:\n # if no type hint has been given, default to:\n # * AssetExecutionContext for sda steps, not in graph-backed assets\n # * OpExecutionContext for non sda steps\n # * OpExecutionContext for ops in graph-backed assets\n if is_op_in_graph_asset or not is_sda_step:\n return OpExecutionContext(step_context)\n return AssetExecutionContext(step_context)\n if context_annotation is AssetExecutionContext:\n return AssetExecutionContext(step_context)\n return OpExecutionContext(step_context)\n
", "current_page_name": "_modules/dagster/_core/execution/context/compute", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.compute"}, "hook": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.hook

\nimport warnings\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, Mapping, Optional, Set, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom ...definitions.composition import PendingNodeInvocation\nfrom ...definitions.decorators.graph_decorator import graph\nfrom ...definitions.dependency import Node\nfrom ...definitions.hook_definition import HookDefinition\nfrom ...definitions.op_definition import OpDefinition\nfrom ...definitions.resource_definition import IContainsGenerator, Resources\nfrom ...errors import DagsterInvalidPropertyError, DagsterInvariantViolationError\nfrom ...log_manager import DagsterLogManager\nfrom ..plan.step import ExecutionStep\nfrom ..plan.utils import RetryRequestedFromPolicy\nfrom .system import StepExecutionContext\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set when a `HookContext` is constructed from "\n        "`build_hook_context`."\n    )\n\n\ndef _check_property_on_test_context(\n    context: "HookContext", attr_str: str, user_facing_name: str, param_on_builder: str\n):\n    """Check if attribute is not None on context. If none, error, and point user in direction of\n    how to specify the parameter on the context object.\n    """\n    value = getattr(context, attr_str)\n    if value is None:\n        raise DagsterInvalidPropertyError(\n            f"Attribute '{user_facing_name}' was not provided when "\n            f"constructing context. Provide a value for the '{param_on_builder}' parameter on "\n            "'build_hook_context'. To learn more, check out the testing hooks section of Dagster's "\n            "concepts docs: https://docs.dagster.io/concepts/ops-jobs-graphs/op-hooks#testing-hooks"\n        )\n    else:\n        return value\n\n\n
[docs]class HookContext:\n """The ``context`` object available to a hook function on an DagsterEvent."""\n\n def __init__(\n self,\n step_execution_context: StepExecutionContext,\n hook_def: HookDefinition,\n ):\n self._step_execution_context = step_execution_context\n self._hook_def = check.inst_param(hook_def, "hook_def", HookDefinition)\n self._required_resource_keys = hook_def.required_resource_keys\n self._resources = step_execution_context.scoped_resources_builder.build(\n self._required_resource_keys\n )\n\n @public\n @property\n def job_name(self) -> str:\n """The name of the job where this hook is being triggered."""\n return self._step_execution_context.job_name\n\n @public\n @property\n def run_id(self) -> str:\n """The id of the run where this hook is being triggered."""\n return self._step_execution_context.run_id\n\n @public\n @property\n def hook_def(self) -> HookDefinition:\n """The hook that the context object belongs to."""\n return self._hook_def\n\n @public\n @property\n def instance(self) -> "DagsterInstance":\n """The instance configured to run the current job."""\n return self._step_execution_context.instance\n\n @property\n def op(self) -> Node:\n """The op instance associated with the hook."""\n return self._step_execution_context.op\n\n @property\n def step(self) -> ExecutionStep:\n warnings.warn(\n "The step property of HookContext has been deprecated, and will be removed "\n "in a future release."\n )\n return self._step_execution_context.step\n\n @public\n @property\n def step_key(self) -> str:\n """The key for the step where this hook is being triggered."""\n return self._step_execution_context.step.key\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """Resources required by this hook."""\n return self._required_resource_keys\n\n @public\n @property\n def resources(self) -> "Resources":\n """Resources available in the hook context."""\n return self._resources\n\n @property\n def solid_config(self) -> 
Any:\n solid_config = self._step_execution_context.resolved_run_config.ops.get(\n str(self._step_execution_context.step.node_handle)\n )\n return solid_config.config if solid_config else None\n\n @public\n @property\n def op_config(self) -> Any:\n """The parsed config specific to this op."""\n return self.solid_config\n\n # Because of the fact that we directly use the log manager of the step, if a user calls\n # hook_context.log.with_tags, then they will end up mutating the step's logging tags as well.\n # This is not problematic because the hook only runs after the step has been completed.\n @public\n @property\n def log(self) -> DagsterLogManager:\n """Centralized log dispatch from user code."""\n return self._step_execution_context.log\n\n @property\n def solid_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed solid.\n\n Returns:\n Optional[BaseException]: the exception object, None if the solid execution succeeds.\n """\n return self.op_exception\n\n @public\n @property\n def op_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed op."""\n exc = self._step_execution_context.step_exception\n\n if isinstance(exc, RetryRequestedFromPolicy):\n return exc.__cause__\n\n return exc\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n results: Dict[str, Union[Any, Dict[str, Any]]] = {}\n captured = self._step_execution_context.step_output_capture\n\n if captured is None:\n check.failed("Outputs were unexpectedly not captured for hook")\n\n # make the returned values more user-friendly\n for step_output_handle, value in captured.items():\n if step_output_handle.mapping_key:\n if results.get(step_output_handle.output_name) is None:\n 
results[step_output_handle.output_name] = {\n step_output_handle.mapping_key: value\n }\n else:\n results[step_output_handle.output_name][step_output_handle.mapping_key] = value\n else:\n results[step_output_handle.output_name] = value\n\n return results\n\n @public\n @property\n def op_output_values(self):\n """Computed output values in an op."""\n return self.solid_output_values
\n\n\nclass UnboundHookContext(HookContext):\n def __init__(\n self,\n resources: Mapping[str, Any],\n op: Optional[Union[OpDefinition, PendingNodeInvocation]],\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n instance: Optional["DagsterInstance"],\n ):\n from ..build_resources import build_resources, wrap_resources_for_execution\n from ..context_creation_job import initialize_console_manager\n\n self._op = None\n if op is not None:\n\n @graph(name="hook_context_container")\n def temp_graph():\n op()\n\n self._op = temp_graph.nodes[0]\n\n # Open resource context manager\n self._resource_defs = wrap_resources_for_execution(resources)\n self._resources_cm = build_resources(self._resource_defs)\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n self._instance = instance\n\n self._log = initialize_console_manager(None)\n\n self._cm_scope_entered = False\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc: Any):\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def 
step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def required_resource_keys(self) -> Set[str]:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def resources(self) -> "Resources":\n if self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_hook_context(...) as context:`"\n )\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log\n\n @property\n def op_exception(self) -> Optional[BaseException]:\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n @property\n def instance(self) -> "DagsterInstance":\n if not self._instance:\n raise DagsterInvariantViolationError(\n "Tried to access the HookContext instance, but no instance was provided to"\n " `build_hook_context`."\n )\n\n return self._instance\n\n\nclass BoundHookContext(HookContext):\n def __init__(\n self,\n hook_def: HookDefinition,\n resources: Resources,\n op: Optional[Node],\n log_manager: DagsterLogManager,\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n 
instance: Optional["DagsterInstance"],\n ):\n self._hook_def = hook_def\n self._resources = resources\n self._op = op\n self._log_manager = log_manager\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n self._instance = instance\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._hook_def.required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log_manager\n\n @property\n def op_exception(self):\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n @property\n def instance(self) -> 
"DagsterInstance":\n if not self._instance:\n raise DagsterInvariantViolationError(\n "Tried to access the HookContext instance, but no instance was provided to"\n " `build_hook_context`."\n )\n\n return self._instance\n\n\n
[docs]def build_hook_context(\n resources: Optional[Mapping[str, Any]] = None,\n op: Optional[Union[OpDefinition, PendingNodeInvocation]] = None,\n run_id: Optional[str] = None,\n job_name: Optional[str] = None,\n op_exception: Optional[Exception] = None,\n instance: Optional["DagsterInstance"] = None,\n) -> UnboundHookContext:\n """Builds hook context from provided parameters.\n\n ``build_hook_context`` can be used as either a function or a context manager. If there is a\n provided resource to ``build_hook_context`` that is a context manager, then it must be used as a\n context manager. This function can be used to provide the context argument to the invocation of\n a hook definition.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can\n either be values or resource definitions.\n op (Optional[OpDefinition, PendingNodeInvocation]): The op definition which the\n hook may be associated with.\n run_id (Optional[str]): The id of the run in which the hook is invoked (provided for mocking purposes).\n job_name (Optional[str]): The name of the job in which the hook is used (provided for mocking purposes).\n op_exception (Optional[Exception]): The exception that caused the hook to be triggered.\n instance (Optional[DagsterInstance]): The Dagster instance configured to run the hook.\n\n Examples:\n .. 
code-block:: python\n\n context = build_hook_context()\n hook_to_invoke(context)\n\n with build_hook_context(resources={"foo": context_manager_resource}) as context:\n hook_to_invoke(context)\n """\n op = check.opt_inst_param(op, "op", (OpDefinition, PendingNodeInvocation))\n\n from dagster._core.instance import DagsterInstance\n\n return UnboundHookContext(\n resources=check.opt_mapping_param(resources, "resources", key_type=str),\n op=op,\n run_id=check.opt_str_param(run_id, "run_id"),\n job_name=check.opt_str_param(job_name, "job_name"),\n op_exception=check.opt_inst_param(op_exception, "op_exception", Exception),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/hook", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.hook"}, "init": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.init

\nfrom typing import Any, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\n\n\n
[docs]class InitResourceContext:\n """The context object available as the argument to the initialization function of a :py:class:`dagster.ResourceDefinition`.\n\n Users should not instantiate this object directly. To construct an `InitResourceContext` for testing purposes, use :py:func:`dagster.build_init_resource_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import resource, InitResourceContext\n\n @resource\n def the_resource(init_context: InitResourceContext):\n init_context.log.info("Hello, world!")\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Resources,\n resource_def: Optional[ResourceDefinition] = None,\n instance: Optional[DagsterInstance] = None,\n dagster_run: Optional[DagsterRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n ):\n self._resource_config = resource_config\n self._resource_def = resource_def\n self._log_manager = log_manager\n self._instance = instance\n self._resources = resources\n self._dagster_run = dagster_run\n\n @public\n @property\n def resource_config(self) -> Any:\n """The configuration data provided by the run config. The schema\n for this data is defined by the ``config_field`` argument to\n :py:class:`ResourceDefinition`.\n """\n return self._resource_config\n\n @public\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n """The definition of the resource currently being constructed."""\n return self._resource_def\n\n @public\n @property\n def resources(self) -> Resources:\n """The resources that are available to the resource that we are initalizing."""\n return self._resources\n\n @public\n @property\n def instance(self) -> Optional[DagsterInstance]:\n """The Dagster instance configured for the current execution context."""\n return self._instance\n\n @property\n def dagster_run(self) -> Optional[DagsterRun]:\n """The dagster run to use. 
When initializing resources outside of execution context, this will be None."""\n return self._dagster_run\n\n @public\n @property\n def log(self) -> Optional[DagsterLogManager]:\n """The Dagster log manager configured for the current execution context."""\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @public\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n """The log manager for this run of the job."""\n return self._log_manager\n\n @public\n @property\n def run_id(self) -> Optional[str]:\n """The id for this run of the job or pipeline. When initializing resources outside of\n execution context, this will be None.\n """\n return self.dagster_run.run_id if self.dagster_run else None\n\n def replace_config(self, config: Any) -> "InitResourceContext":\n return InitResourceContext(\n resource_config=config,\n resources=self.resources,\n instance=self.instance,\n resource_def=self.resource_def,\n dagster_run=self.dagster_run,\n log_manager=self.log,\n )
\n\n\nclass UnboundInitResourceContext(InitResourceContext):\n """Resource initialization context outputted by ``build_init_resource_context``.\n\n Represents a context whose config has not yet been validated against a resource definition,\n hence the inability to access the `resource_def` attribute. When an instance of\n ``UnboundInitResourceContext`` is passed to a resource invocation, config is validated,\n and it is subsumed into an `InitResourceContext`, which contains the resource_def validated\n against.\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Optional[Union[Resources, Mapping[str, Any]]],\n instance: Optional[DagsterInstance],\n ):\n from dagster._core.execution.api import ephemeral_instance_if_missing\n from dagster._core.execution.build_resources import (\n build_resources,\n wrap_resources_for_execution,\n )\n from dagster._core.execution.context_creation_job import initialize_console_manager\n\n self._instance_provided = (\n check.opt_inst_param(instance, "instance", DagsterInstance) is not None\n )\n # Construct ephemeral instance if missing\n self._instance_cm = ephemeral_instance_if_missing(instance)\n # Pylint can't infer that the ephemeral_instance context manager has an __enter__ method,\n # so ignore lint error\n instance = self._instance_cm.__enter__()\n\n if isinstance(resources, Resources):\n check.failed("Should not have a Resources object directly from this initialization")\n\n self._resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resources, "resources")\n )\n\n self._resources_cm = build_resources(self._resource_defs, instance=instance)\n resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(resources, IContainsGenerator)\n\n self._cm_scope_entered = False\n super(UnboundInitResourceContext, self).__init__(\n resource_config=resource_config,\n resources=resources,\n resource_def=None,\n instance=instance,\n dagster_run=None,\n 
log_manager=initialize_console_manager(None),\n )\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n self._resources_cm.__exit__(*exc)\n if self._instance_provided:\n self._instance_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n if self._instance_provided and not self._cm_scope_entered:\n self._instance_cm.__exit__(None, None, None)\n\n @property\n def resource_config(self) -> Any:\n return self._resource_config\n\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def resources(self) -> Resources:\n """The resources that are available to the resource that we are initalizing."""\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_init_resource_context(...) as context:`"\n )\n return self._resources\n\n @property\n def instance(self) -> Optional[DagsterInstance]:\n return self._instance\n\n @property\n def log(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n @property\n def run_id(self) -> Optional[str]:\n return None\n\n\n
[docs]def build_init_resource_context(\n config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> InitResourceContext:\n """Builds resource initialization context from provided parameters.\n\n ``build_init_resource_context`` can be used as either a function or context manager. If there is a\n provided resource to ``build_init_resource_context`` that is a context manager, then it must be\n used as a context manager. This function can be used to provide the context argument to the\n invocation of a resource.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n config (Optional[Any]): The resource config to provide to the context.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_init_resource_context()\n resource_to_init(context)\n\n with build_init_resource_context(\n resources={"foo": context_manager_resource}\n ) as context:\n resource_to_init(context)\n\n """\n return UnboundInitResourceContext(\n resource_config=check.opt_mapping_param(config, "config", key_type=str),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n resources=check.opt_mapping_param(resources, "resources", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/init", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.init"}, "input": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.input

\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import AssetKey, AssetObservation, CoercibleToAssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n)\nfrom dagster._core.definitions.partition import PartitionsSubset\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.time_window_partitions import TimeWindow, TimeWindowPartitionsSubset\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance, DynamicPartitionsStore\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import PartitionsDefinition\n    from dagster._core.definitions.op_definition import OpDefinition\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.log_manager import DagsterLogManager\n    from dagster._core.types.dagster_type import DagsterType\n\n    from .output import OutputContext\n\n\n
[docs]class InputContext:\n """The ``context`` object available to the load_input method of :py:class:`InputManager`.\n\n Users should not instantiate this object directly. In order to construct\n an `InputContext` for testing an IO Manager's `load_input` method, use\n :py:func:`dagster.build_input_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import IOManager, InputContext\n\n class MyIOManager(IOManager):\n def load_input(self, context: InputContext):\n ...\n """\n\n def __init__(\n self,\n *,\n name: Optional[str] = None,\n job_name: Optional[str] = None,\n op_def: Optional["OpDefinition"] = None,\n config: Optional[Any] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Union["Resources", Mapping[str, Any]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n asset_key: Optional[AssetKey] = None,\n partition_key: Optional[str] = None,\n asset_partitions_subset: Optional[PartitionsSubset] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n ):\n from dagster._core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster._core.execution.build_resources import build_resources\n\n self._name = name\n self._job_name = job_name\n self._op_def = op_def\n self._config = config\n self._metadata = metadata or {}\n self._upstream_output = upstream_output\n self._dagster_type = dagster_type\n self._log = log_manager\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_key = asset_key\n if self._step_context and self._step_context.has_partition_key:\n self._partition_key: Optional[str] = self._step_context.partition_key\n else:\n self._partition_key = partition_key\n\n 
self._asset_partitions_subset = asset_partitions_subset\n self._asset_partitions_def = asset_partitions_def\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_mapping_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events: List["DagsterEvent"] = []\n self._observations: List[AssetObservation] = []\n self._instance = instance\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n\n @property\n def instance(self) -> DagsterInstance:\n if self._instance is None:\n raise DagsterInvariantViolationError(\n "Attempting to access instance, "\n "but it was not provided when constructing the InputContext"\n )\n return self._instance\n\n @public\n @property\n def has_input_name(self) -> bool:\n """If we're the InputContext is being used to load the result of a run from outside the run,\n then it won't have an input name.\n """\n return self._name is not None\n\n @public\n @property\n def name(self) -> str:\n """The name of the input that we're loading."""\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._name\n\n @property\n def job_name(self) -> str:\n if self._job_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access job_name, "\n "but it was not provided when constructing the InputContext"\n )\n return self._job_name\n\n @public\n @property\n def op_def(self) 
-> "OpDefinition":\n """The definition of the op that's loading the input."""\n if self._op_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._op_def\n\n @public\n @property\n def config(self) -> Any:\n """The config attached to the input that we're loading."""\n return self._config\n\n @public\n @property\n def metadata(self) -> Optional[ArbitraryMetadataMapping]:\n """A dict of metadata that is assigned to the InputDefinition that we're loading for.\n This property only contains metadata passed in explicitly with :py:class:`AssetIn`\n or :py:class:`In`. To access metadata of an upstream asset or operation definition,\n use the metadata in :py:attr:`.InputContext.upstream_output`.\n """\n return self._metadata\n\n @public\n @property\n def upstream_output(self) -> Optional["OutputContext"]:\n """Info about the output that produced the object we're loading."""\n return self._upstream_output\n\n @public\n @property\n def dagster_type(self) -> "DagsterType":\n """The type of this input.\n Dagster types do not propagate from an upstream output to downstream inputs,\n and this property only captures type information for the input that is either\n passed in explicitly with :py:class:`AssetIn` or :py:class:`In`, or can be\n infered from type hints. 
For an asset input, the Dagster type from the upstream\n asset definition is ignored.\n """\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._dagster_type\n\n @public\n @property\n def log(self) -> "DagsterLogManager":\n """The log manager to use for this input."""\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._log\n\n @public\n @property\n def resource_config(self) -> Optional[Mapping[str, Any]]:\n """The config associated with the resource that initializes the InputManager."""\n return self._resource_config\n\n @public\n @property\n def resources(self) -> Any:\n """The resources required by the resource that initializes the\n input manager. If using the :py:func:`@input_manager` decorator, these resources\n correspond to those requested with the `required_resource_keys` parameter.\n """\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the InputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_input_context(...) as context:`"\n )\n return self._resources\n\n @public\n @property\n def has_asset_key(self) -> bool:\n """Returns True if an asset is being loaded as input, otherwise returns False. 
A return value of False\n indicates that an output from an op is being loaded as the input.\n """\n return self._asset_key is not None\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The ``AssetKey`` of the asset that is being loaded as an input."""\n if self._asset_key is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, but no asset is associated with this input"\n )\n\n return self._asset_key\n\n @public\n @property\n def asset_partitions_def(self) -> "PartitionsDefinition":\n """The PartitionsDefinition on the upstream asset corresponding to this input."""\n if self._asset_partitions_def is None:\n if self.asset_key:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {self.asset_key}, but it is not"\n " partitioned"\n )\n else:\n raise DagsterInvariantViolationError(\n "Attempting to access partitions def for asset, but input does not correspond"\n " to an asset"\n )\n\n return self._asset_partitions_def\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._step_context\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n return self._partition_key is not None\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._partition_key is None:\n check.failed(\n "Tried to access partition_key on a non-partitioned run.",\n )\n\n return self._partition_key\n\n @public\n @property\n def has_asset_partitions(self) -> bool:\n """Returns True if the asset being loaded as input is partitioned."""\n return self._asset_partitions_subset is not None\n\n @public\n @property\n def 
asset_partition_key(self) -> str:\n """The partition key for input asset.\n\n Raises an error if the input asset has no partitioning, or if the run covers a partition\n range for the input asset.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed("The input does not correspond to a partitioned asset.")\n\n partition_keys = list(subset.get_partition_keys())\n if len(partition_keys) == 1:\n return partition_keys[0]\n else:\n check.failed(\n f"Tried to access partition key for asset '{self.asset_key}', "\n f"but the number of input partitions != 1: '{subset}'."\n )\n\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed(\n "Tried to access asset_partition_key_range, but the asset is not partitioned.",\n )\n\n partition_key_ranges = subset.get_partition_key_ranges(\n dynamic_partitions_store=self.instance\n )\n if len(partition_key_ranges) != 1:\n check.failed(\n "Tried to access asset_partition_key_range, but there are "\n f"({len(partition_key_ranges)}) key ranges associated with this input.",\n )\n\n return partition_key_ranges[0]\n\n @public\n @property\n def asset_partition_keys(self) -> Sequence[str]:\n """The partition keys for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n if self._asset_partitions_subset is None:\n check.failed(\n "Tried to access asset_partition_keys, but the asset is not partitioned.",\n )\n\n return list(self._asset_partitions_subset.get_partition_keys())\n\n @public\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a 
TimeWindowPartitionsDefinition.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed(\n "Tried to access asset_partitions_time_window, but the asset is not partitioned.",\n )\n\n if not isinstance(subset, TimeWindowPartitionsSubset):\n check.failed(\n "Tried to access asset_partitions_time_window, but the asset is not partitioned"\n " with time windows.",\n )\n\n time_windows = subset.included_time_windows\n if len(time_windows) != 1:\n check.failed(\n "Tried to access asset_partitions_time_window, but there are "\n f"({len(time_windows)}) time windows associated with this input.",\n )\n\n return time_windows[0]\n\n
[docs] @public\n def get_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step input.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the input.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the input is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n if self.upstream_output is None:\n raise DagsterInvariantViolationError(\n "InputContext.upstream_output not defined. Cannot compute an identifier"\n )\n\n return self.upstream_output.get_identifier()
\n\n
[docs] @public\n def get_asset_identifier(self) -> Sequence[str]:\n """The sequence of strings making up the AssetKey for the asset being loaded as an input.\n If the asset is partitioned, the identifier contains the partition key as the final element in the\n sequence. For example, for the asset key ``AssetKey(["foo", "bar", "baz"])``, materialized with\n partition key "2023-06-01", ``get_asset_identifier`` will return ``["foo", "bar", "baz", "2023-06-01"]``.\n """\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return [*self.asset_key.path, self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset identifier for an input with no asset key")
\n\n def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_input`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n def add_input_metadata(\n self,\n metadata: Mapping[str, Any],\n description: Optional[str] = None,\n ) -> None:\n """Accepts a dictionary of metadata. Metadata entries will appear on the LOADED_INPUT event.\n If the input is an asset, metadata will be attached to an asset observation.\n\n The asset observation will be yielded from the run and appear in the event log.\n Only valid if the context has an asset key.\n """\n from dagster._core.definitions.metadata import normalize_metadata\n from dagster._core.events import DagsterEvent\n\n metadata = check.mapping_param(metadata, "metadata", key_type=str)\n self._metadata = {**self._metadata, **normalize_metadata(metadata)}\n if self.has_asset_key:\n check.opt_str_param(description, "description")\n\n observation = AssetObservation(\n asset_key=self.asset_key,\n description=description,\n partition=self.asset_partition_key if self.has_asset_partitions else None,\n metadata=metadata,\n )\n self._observations.append(observation)\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, observation))\n\n def get_observations(\n self,\n ) -> Sequence[AssetObservation]:\n """Retrieve the list of user-generated asset observations that were observed via the context.\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import IOManager, build_input_context, AssetObservation\n\n class MyIOManager(IOManager):\n def load_input(self, context, obj):\n ...\n\n def test_load_input():\n mgr = MyIOManager()\n context = build_input_context()\n mgr.load_input(context)\n observations = context.get_observations()\n ...\n """\n return self._observations\n\n def consume_metadata(self) -> Mapping[str, MetadataValue]:\n result = self._metadata\n self._metadata = {}\n return result
\n\n\n
[docs]def build_input_context(\n name: Optional[str] = None,\n config: Optional[Any] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n op_def: Optional["OpDefinition"] = None,\n step_context: Optional["StepExecutionContext"] = None,\n asset_key: Optional[CoercibleToAssetKey] = None,\n partition_key: Optional[str] = None,\n asset_partition_key_range: Optional[PartitionKeyRange] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n) -> "InputContext":\n """Builds input context from provided parameters.\n\n ``build_input_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_input_context`` must be used as a\n context manager.\n\n Args:\n name (Optional[str]): The name of the input that we're loading.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n resource_config (Optional[Dict[str, Any]]): The resource config to make available from the\n input context. 
This usually corresponds to the config provided to the resource that\n loads the input manager.\n resources (Optional[Dict[str, Any]]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n asset_key (Optional[Union[AssetKey, Sequence[str], str]]): The asset key attached to the InputDefinition.\n op_def (Optional[OpDefinition]): The definition of the op that's loading the input.\n step_context (Optional[StepExecutionContext]): For internal use.\n partition_key (Optional[str]): String value representing partition key to execute with.\n asset_partition_key_range (Optional[str]): The range of asset partition keys to load.\n asset_partitions_def: Optional[PartitionsDefinition]: The PartitionsDefinition of the asset\n being loaded.\n\n Examples:\n .. code-block:: python\n\n build_input_context()\n\n with build_input_context(resources={"foo": context_manager_resource}) as context:\n do_something\n """\n from dagster._core.definitions import OpDefinition, PartitionsDefinition\n from dagster._core.execution.context.output import OutputContext\n from dagster._core.execution.context.system import StepExecutionContext\n from dagster._core.execution.context_creation_job import initialize_console_manager\n from dagster._core.types.dagster_type import DagsterType\n\n name = check.opt_str_param(name, "name")\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n upstream_output = check.opt_inst_param(upstream_output, "upstream_output", OutputContext)\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n step_context = check.opt_inst_param(step_context, "step_context", StepExecutionContext)\n asset_key = 
AssetKey.from_coercible(asset_key) if asset_key else None\n partition_key = check.opt_str_param(partition_key, "partition_key")\n asset_partition_key_range = check.opt_inst_param(\n asset_partition_key_range, "asset_partition_key_range", PartitionKeyRange\n )\n asset_partitions_def = check.opt_inst_param(\n asset_partitions_def, "asset_partitions_def", PartitionsDefinition\n )\n if asset_partitions_def and asset_partition_key_range:\n asset_partitions_subset = asset_partitions_def.empty_subset().with_partition_key_range(\n asset_partition_key_range, dynamic_partitions_store=instance\n )\n elif asset_partition_key_range:\n asset_partitions_subset = KeyRangeNoPartitionsDefPartitionsSubset(asset_partition_key_range)\n else:\n asset_partitions_subset = None\n\n return InputContext(\n name=name,\n job_name=None,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n resource_config=resource_config,\n resources=resources,\n step_context=step_context,\n op_def=op_def,\n asset_key=asset_key,\n partition_key=partition_key,\n asset_partitions_subset=asset_partitions_subset,\n asset_partitions_def=asset_partitions_def,\n instance=instance,\n )
\n\n\nclass KeyRangeNoPartitionsDefPartitionsSubset(PartitionsSubset):\n """For build_input_context when no PartitionsDefinition has been provided."""\n\n def __init__(self, key_range: PartitionKeyRange):\n self._key_range = key_range\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n raise NotImplementedError()\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n if self._key_range.start == self._key_range.end:\n return self._key_range.start\n else:\n raise NotImplementedError()\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n return [self._key_range]\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "PartitionsSubset":\n raise NotImplementedError()\n\n def with_partition_key_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset":\n raise NotImplementedError()\n\n def serialize(self) -> str:\n raise NotImplementedError()\n\n @property\n def partitions_def(self) -> "PartitionsDefinition":\n raise NotImplementedError()\n\n def __len__(self) -> int:\n raise NotImplementedError()\n\n def __contains__(self, value) -> bool:\n raise NotImplementedError()\n\n @classmethod\n def from_serialized(\n cls, partitions_def: "PartitionsDefinition", serialized: str\n ) -> "PartitionsSubset":\n raise NotImplementedError()\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: "PartitionsDefinition",\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n raise NotImplementedError()\n\n @classmethod\n def empty_subset(cls, partitions_def: "PartitionsDefinition") 
-> "PartitionsSubset":\n raise NotImplementedError()\n
", "current_page_name": "_modules/dagster/_core/execution/context/input", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.input"}, "invocation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.invocation

\nfrom contextlib import ExitStack\nfrom typing import (\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.composition import PendingNodeInvocation\nfrom dagster._core.definitions.decorators.op_decorator import DecoratedOpFunction\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    UserEvent,\n)\nfrom dagster._core.definitions.hook_definition import HookDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster._core.definitions.resource_requirement import ensure_requirements_satisfied\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidInvocationError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.execution.build_resources import build_resources, wrap_resources_for_execution\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.types.dagster_type import 
DagsterType\nfrom dagster._utils.forked_pdb import ForkedPdb\nfrom dagster._utils.merger import merge_dicts\n\nfrom .compute import OpExecutionContext\nfrom .system import StepExecutionContext, TypeCheckContext\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set on the context when a solid is directly invoked."\n    )\n\n\nclass UnboundOpExecutionContext(OpExecutionContext):\n    """The ``context`` object available as the first argument to a solid's compute function when\n    being invoked directly. Can also be used as a context manager.\n    """\n\n    def __init__(\n        self,\n        op_config: Any,\n        resources_dict: Mapping[str, Any],\n        resources_config: Mapping[str, Any],\n        instance: Optional[DagsterInstance],\n        partition_key: Optional[str],\n        partition_key_range: Optional[PartitionKeyRange],\n        mapping_key: Optional[str],\n        assets_def: Optional[AssetsDefinition],\n    ):\n        from dagster._core.execution.api import ephemeral_instance_if_missing\n        from dagster._core.execution.context_creation_job import initialize_console_manager\n\n        self._op_config = op_config\n        self._mapping_key = mapping_key\n\n        self._exit_stack = ExitStack()\n\n        # Construct ephemeral instance if missing\n        self._instance = self._exit_stack.enter_context(ephemeral_instance_if_missing(instance))\n\n        self._resources_config = resources_config\n        # Open resource context manager\n        self._resources_contain_cm = False\n        self._resource_defs = wrap_resources_for_execution(resources_dict)\n        self._resources = self._exit_stack.enter_context(\n            build_resources(\n                resources=self._resource_defs,\n                instance=self._instance,\n                resource_config=resources_config,\n            )\n        )\n        self._resources_contain_cm = 
isinstance(self._resources, IContainsGenerator)\n\n        self._log = initialize_console_manager(None)\n        self._pdb: Optional[ForkedPdb] = None\n        self._cm_scope_entered = False\n        check.invariant(\n            not (partition_key and partition_key_range),\n            "Must supply at most one of partition_key or partition_key_range",\n        )\n        self._partition_key = partition_key\n        self._partition_key_range = partition_key_range\n        self._user_events: List[UserEvent] = []\n        self._output_metadata: Dict[str, Any] = {}\n\n        self._assets_def = check.opt_inst_param(assets_def, "assets_def", AssetsDefinition)\n\n    def __enter__(self):\n        self._cm_scope_entered = True\n        return self\n\n    def __exit__(self, *exc):\n        self._exit_stack.close()\n\n    def __del__(self):\n        self._exit_stack.close()\n\n    @property\n    def op_config(self) -> Any:\n        return self._op_config\n\n    @property\n    def resource_keys(self) -> AbstractSet[str]:\n        return self._resource_defs.keys()\n\n    @property\n    def resources(self) -> Resources:\n        if self._resources_contain_cm and not self._cm_scope_entered:\n            raise DagsterInvariantViolationError(\n                "At least one provided resource is a generator, but attempting to access "\n                "resources outside of context manager scope. You can use the following syntax to "\n                "open a context manager: `with build_op_context(...) as context:`"\n            )\n        return self._resources\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        Example:\n        .. 
code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> dict:\n        raise DagsterInvalidPropertyError(_property_msg("run_config", "property"))\n\n    @property\n    def job_def(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("job_def", "property"))\n\n    @property\n    def job_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("job_name", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def node_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("solid_handle", "property"))\n\n    @property\n    def op(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("op", "property"))\n\n    @property\n    def solid(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("solid", "property"))\n\n    @property\n    def op_def(self) -> OpDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("op_def", "property"))\n\n    @property\n    def assets_def(self) -> AssetsDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("assets_def", "property"))\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    @property\n    def partition_key(self) -> str:\n        if 
self._partition_key:\n            return self._partition_key\n        check.failed("Tried to access partition_key for a non-partitioned run")\n\n    @property\n    def partition_key_range(self) -> PartitionKeyRange:\n        """The range of partition keys for the current run.\n\n        If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n        end. Raises an error if the current run is not a partitioned run.\n        """\n        if self._partition_key_range:\n            return self._partition_key_range\n        elif self._partition_key:\n            return PartitionKeyRange(self._partition_key, self._partition_key)\n        else:\n            check.failed("Tried to access partition_key range for a non-partitioned run")\n\n    def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n        return self.partition_key\n\n    def has_tag(self, key: str) -> bool:\n        raise DagsterInvalidPropertyError(_property_msg("has_tag", "method"))\n\n    def get_tag(self, key: str) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("get_tag", "method"))\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def bind(\n        self,\n        op_def: OpDefinition,\n        pending_invocation: Optional[PendingNodeInvocation[OpDefinition]],\n        assets_def: Optional[AssetsDefinition],\n        config_from_args: Optional[Mapping[str, Any]],\n        resources_from_args: Optional[Mapping[str, Any]],\n    ) -> "BoundOpExecutionContext":\n        from dagster._core.definitions.resource_invocation import resolve_bound_config\n\n        if resources_from_args:\n            if self._resource_defs:\n                raise DagsterInvalidInvocationError(\n                    "Cannot provide resources in both context and kwargs"\n                )\n            resource_defs = 
wrap_resources_for_execution(resources_from_args)\n            # add new resources context to the stack to be cleared on exit\n            resources = self._exit_stack.enter_context(\n                build_resources(resource_defs, self.instance)\n            )\n        elif assets_def and assets_def.resource_defs:\n            for key in sorted(list(assets_def.resource_defs.keys())):\n                if key in self._resource_defs:\n                    raise DagsterInvalidInvocationError(\n                        f"Error when invoking {assets_def!s} resource '{key}' "\n                        "provided on both the definition and invocation context. Please "\n                        "provide on only one or the other."\n                    )\n            resource_defs = wrap_resources_for_execution(\n                {**self._resource_defs, **assets_def.resource_defs}\n            )\n            # add new resources context to the stack to be cleared on exit\n            resources = self._exit_stack.enter_context(\n                build_resources(resource_defs, self.instance, self._resources_config)\n            )\n        else:\n            resources = self.resources\n            resource_defs = self._resource_defs\n\n        _validate_resource_requirements(resource_defs, op_def)\n\n        if self.op_config and config_from_args:\n            raise DagsterInvalidInvocationError("Cannot provide config in both context and kwargs")\n        op_config = resolve_bound_config(config_from_args or self.op_config, op_def)\n\n        return BoundOpExecutionContext(\n            op_def=op_def,\n            op_config=op_config,\n            resources=resources,\n            resources_config=self._resources_config,\n            instance=self.instance,\n            log_manager=self.log,\n            pdb=self.pdb,\n            tags=(\n                pending_invocation.tags\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n          
  ),\n            hook_defs=(\n                pending_invocation.hook_defs\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n            ),\n            alias=(\n                pending_invocation.given_alias\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n            ),\n            user_events=self._user_events,\n            output_metadata=self._output_metadata,\n            mapping_key=self._mapping_key,\n            partition_key=self._partition_key,\n            partition_key_range=self._partition_key_range,\n            assets_def=assets_def,\n        )\n\n    def get_events(self) -> Sequence[UserEvent]:\n        """Retrieve the list of user-generated events that were logged via the context.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import op, build_op_context, AssetMaterialization, ExpectationResult\n\n            @op\n            def my_op(context):\n                ...\n\n            def test_my_op():\n                context = build_op_context()\n                my_op(context)\n                all_user_events = context.get_events()\n                materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n                expectation_results = [event for event in all_user_events if isinstance(event, ExpectationResult)]\n                ...\n        """\n        return self._user_events\n\n    def get_output_metadata(\n        self, output_name: str, mapping_key: Optional[str] = None\n    ) -> Optional[Mapping[str, Any]]:\n        """Retrieve metadata that was logged for an output and mapping_key, if it exists.\n\n        If metadata cannot be found for the particular output_name/mapping_key combination, None will be returned.\n\n        Args:\n            output_name (str): The name of the output to retrieve logged metadata for.\n            mapping_key 
(Optional[str]): The mapping key to retrieve metadata for (only applies when using dynamic outputs).\n\n        Returns:\n            Optional[Mapping[str, Any]]: The metadata values present for the output_name/mapping_key combination, if present.\n        """\n        metadata = self._output_metadata.get(output_name)\n        if mapping_key and metadata:\n            return metadata.get(mapping_key)\n        return metadata\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n\ndef _validate_resource_requirements(\n    resource_defs: Mapping[str, ResourceDefinition], op_def: OpDefinition\n) -> None:\n    """Validate correctness of resources against required resource keys."""\n    if cast(DecoratedOpFunction, op_def.compute_fn).has_context_arg():\n        for requirement in op_def.get_resource_requirements():\n            if not requirement.is_io_manager_requirement:\n                ensure_requirements_satisfied(resource_defs, [requirement])\n\n\nclass BoundOpExecutionContext(OpExecutionContext):\n    """The op execution context that is passed to the compute function during invocation.\n\n    This context is bound to a specific op definition, for which the resources and config have\n    been validated.\n    """\n\n    _op_def: OpDefinition\n    _op_config: Any\n    _resources: "Resources"\n    _resources_config: Mapping[str, Any]\n    _instance: DagsterInstance\n    _log_manager: DagsterLogManager\n    _pdb: Optional[ForkedPdb]\n    _tags: Mapping[str, str]\n    _hook_defs: Optional[AbstractSet[HookDefinition]]\n    _alias: str\n    _user_events: List[UserEvent]\n    _seen_outputs: Dict[str, Union[str, Set[str]]]\n    _output_metadata: Dict[str, Any]\n    _mapping_key: Optional[str]\n    _partition_key: Optional[str]\n    _partition_key_range: Optional[PartitionKeyRange]\n    _assets_def: Optional[AssetsDefinition]\n\n    def __init__(\n        self,\n        op_def: OpDefinition,\n        op_config: Any,\n        resources: 
"Resources",\n        resources_config: Mapping[str, Any],\n        instance: DagsterInstance,\n        log_manager: DagsterLogManager,\n        pdb: Optional[ForkedPdb],\n        tags: Optional[Mapping[str, str]],\n        hook_defs: Optional[AbstractSet[HookDefinition]],\n        alias: Optional[str],\n        user_events: List[UserEvent],\n        output_metadata: Dict[str, Any],\n        mapping_key: Optional[str],\n        partition_key: Optional[str],\n        partition_key_range: Optional[PartitionKeyRange],\n        assets_def: Optional[AssetsDefinition],\n    ):\n        self._op_def = op_def\n        self._op_config = op_config\n        self._resources = resources\n        self._instance = instance\n        self._log = log_manager\n        self._pdb = pdb\n        self._tags = merge_dicts(self._op_def.tags, tags) if tags else self._op_def.tags\n        self._hook_defs = hook_defs\n        self._alias = alias if alias else self._op_def.name\n        self._resources_config = resources_config\n        self._user_events = user_events\n        self._seen_outputs = {}\n        self._output_metadata = output_metadata\n        self._mapping_key = mapping_key\n        self._partition_key = partition_key\n        self._partition_key_range = partition_key_range\n        self._assets_def = assets_def\n        self._requires_typed_event_stream = False\n        self._typed_event_stream_error_message = None\n\n    @property\n    def op_config(self) -> Any:\n        return self._op_config\n\n    @property\n    def resources(self) -> Resources:\n        return self._resources\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        
Example:\n        .. code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        run_config: Dict[str, object] = {}\n        if self._op_config:\n            run_config["ops"] = {self._op_def.name: {"config": self._op_config}}\n        run_config["resources"] = self._resources_config\n        return run_config\n\n    @property\n    def job_def(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("job_def", "property"))\n\n    @property\n    def job_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("job_name", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def node_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("node_handle", "property"))\n\n    @property\n    def op(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("op", "property"))\n\n    @property\n    def op_def(self) -> OpDefinition:\n        return self._op_def\n\n    @property\n    def has_assets_def(self) -> bool:\n        return self._assets_def is not None\n\n    @property\n    def assets_def(self) -> AssetsDefinition:\n        if self._assets_def is None:\n            raise DagsterInvalidPropertyError(\n                f"Op {self.op_def.name} does not have an assets definition."\n  
          )\n        return self._assets_def\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    def has_tag(self, key: str) -> bool:\n        return key in self._tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        return self._tags.get(key)\n\n    @property\n    def alias(self) -> str:\n        return self._alias\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def for_type(self, dagster_type: DagsterType) -> TypeCheckContext:\n        resources = cast(NamedTuple, self.resources)\n        return TypeCheckContext(\n            self.run_id,\n            self.log,\n            ScopedResourcesBuilder(resources._asdict()),\n            dagster_type,\n        )\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n    def describe_op(self) -> str:\n        if isinstance(self.op_def, OpDefinition):\n            return f'op "{self.op_def.name}"'\n\n        return f'solid "{self.op_def.name}"'\n\n    def log_event(self, event: UserEvent) -> None:\n        check.inst_param(\n            event,\n            "event",\n            (AssetMaterialization, AssetObservation, ExpectationResult),\n        )\n        self._user_events.append(event)\n\n    def observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n        if mapping_key:\n            if output_name not in self._seen_outputs:\n                self._seen_outputs[output_name] = set()\n            cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n        else:\n            self._seen_outputs[output_name] = "seen"\n\n    def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n        if mapping_key:\n            return (\n                output_name in self._seen_outputs and mapping_key in 
self._seen_outputs[output_name]\n            )\n        return output_name in self._seen_outputs\n\n    @property\n    def partition_key(self) -> str:\n        if self._partition_key is not None:\n            return self._partition_key\n        check.failed("Tried to access partition_key for a non-partitioned asset")\n\n    @property\n    def partition_key_range(self) -> PartitionKeyRange:\n        """The range of partition keys for the current run.\n\n        If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n        end. Raises an error if the current run is not a partitioned run.\n        """\n        if self._partition_key_range:\n            return self._partition_key_range\n        elif self._partition_key:\n            return PartitionKeyRange(self._partition_key, self._partition_key)\n        else:\n            check.failed("Tried to access partition_key range for a non-partitioned run")\n\n    def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n        return self.partition_key\n\n    def asset_partitions_time_window_for_output(self, output_name: str = "result") -> TimeWindow:\n        partitions_def = self.assets_def.partitions_def\n        if partitions_def is None:\n            check.failed("Tried to access partition_key for a non-partitioned asset")\n\n        if not has_one_dimension_time_window_partitioning(partitions_def=partitions_def):\n            raise DagsterInvariantViolationError(\n                "Expected a TimeWindowPartitionsDefinition or MultiPartitionsDefinition with a"\n                f" single time dimension, but instead found {type(partitions_def)}"\n            )\n\n        return cast(\n            Union[MultiPartitionsDefinition, TimeWindowPartitionsDefinition], partitions_def\n        ).time_window_for_partition_key(self.partition_key)\n\n    def add_output_metadata(\n        self,\n        metadata: Mapping[str, Any],\n        output_name: Optional[str] = 
None,\n        mapping_key: Optional[str] = None,\n    ) -> None:\n        """Add metadata to one of the outputs of an op.\n\n        This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.\n\n        Args:\n            metadata (Mapping[str, Any]): The metadata to attach to the output\n            output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import Out, op\n            from typing import Tuple\n\n            @op\n            def add_metadata(context):\n                context.add_output_metadata({"foo", "bar"})\n                return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n            @op(out={"a": Out(), "b": Out()})\n            def add_metadata_two_outputs(context) -> Tuple[str, int]:\n                context.add_output_metadata({"foo": "bar"}, output_name="b")\n                context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n                return ("dog", 5)\n\n        """\n        metadata = check.mapping_param(metadata, "metadata", key_type=str)\n        output_name = check.opt_str_param(output_name, "output_name")\n        mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n        if output_name is None and len(self.op_def.output_defs) == 1:\n            output_def = self.op_def.output_defs[0]\n            output_name = output_def.name\n        elif output_name is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to log metadata without providing output_name, but multiple outputs"\n                " exist. 
Please provide an output_name to the invocation of"\n                " `context.add_output_metadata`."\n            )\n        else:\n            output_def = self.op_def.output_def_named(output_name)\n\n        if self.has_seen_output(output_name, mapping_key):\n            output_desc = (\n                f"output '{output_def.name}'"\n                if not mapping_key\n                else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n            )\n            raise DagsterInvariantViolationError(\n                f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log output"\n                f" metadata for {output_desc} which has already been yielded. Metadata must be"\n                " logged before the output is yielded."\n            )\n        if output_def.is_dynamic and not mapping_key:\n            raise DagsterInvariantViolationError(\n                f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log metadata"\n                f" for dynamic output '{output_def.name}' without providing a mapping key. 
When"\n                " logging metadata for a dynamic output, it is necessary to provide a mapping key."\n            )\n\n        output_name = output_def.name\n        if output_name in self._output_metadata:\n            if not mapping_key or mapping_key in self._output_metadata[output_name]:\n                raise DagsterInvariantViolationError(\n                    f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log"\n                    f" metadata for output '{output_name}' more than once."\n                )\n        if mapping_key:\n            if output_name not in self._output_metadata:\n                self._output_metadata[output_name] = {}\n            self._output_metadata[output_name][mapping_key] = metadata\n\n        else:\n            self._output_metadata[output_name] = metadata\n\n    # In this mode no conversion is done on returned values and missing but expected outputs are not\n    # allowed.\n    @property\n    def requires_typed_event_stream(self) -> bool:\n        return self._requires_typed_event_stream\n\n    @property\n    def typed_event_stream_error_message(self) -> Optional[str]:\n        return self._typed_event_stream_error_message\n\n    def set_requires_typed_event_stream(self, *, error_message: Optional[str]) -> None:\n        self._requires_typed_event_stream = True\n        self._typed_event_stream_error_message = error_message\n\n\n
[docs]def build_op_context(\n resources: Optional[Mapping[str, Any]] = None,\n op_config: Any = None,\n resources_config: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n config: Any = None,\n partition_key: Optional[str] = None,\n partition_key_range: Optional[PartitionKeyRange] = None,\n mapping_key: Optional[str] = None,\n _assets_def: Optional[AssetsDefinition] = None,\n) -> UnboundOpExecutionContext:\n """Builds op execution context from provided parameters.\n\n ``build_op_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_op_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking a op.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n op_config (Optional[Mapping[str, Any]]): The config to provide to the op.\n resources_config (Optional[Mapping[str, Any]]): The config to provide to the resources.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n mapping_key (Optional[str]): A key representing the mapping key from an upstream dynamic\n output. Can be accessed using ``context.get_mapping_key()``.\n partition_key (Optional[str]): String value representing partition key to execute with.\n partition_key_range (Optional[PartitionKeyRange]): Partition key range to execute with.\n _assets_def (Optional[AssetsDefinition]): Internal argument that populates the op's assets\n definition, not meant to be populated by users.\n\n Examples:\n .. 
code-block:: python\n\n context = build_op_context()\n op_to_invoke(context)\n\n with build_op_context(resources={"foo": context_manager_resource}) as context:\n op_to_invoke(context)\n """\n if op_config and config:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke ``build_op_context`` with both ``op_config``, and its "\n "legacy version, ``config``. Please provide one or the other."\n )\n\n op_config = op_config if op_config else config\n return UnboundOpExecutionContext(\n resources_dict=check.opt_mapping_param(resources, "resources", key_type=str),\n resources_config=check.opt_mapping_param(\n resources_config, "resources_config", key_type=str\n ),\n op_config=op_config,\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n partition_key_range=check.opt_inst_param(\n partition_key_range, "partition_key_range", PartitionKeyRange\n ),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n assets_def=check.opt_inst_param(_assets_def, "_assets_def", AssetsDefinition),\n )
\n\n\n
[docs]def build_asset_context(\n resources: Optional[Mapping[str, Any]] = None,\n resources_config: Optional[Mapping[str, Any]] = None,\n asset_config: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n partition_key_range: Optional[PartitionKeyRange] = None,\n):\n """Builds asset execution context from provided parameters.\n\n ``build_asset_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_asset_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking an asset.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n resources_config (Optional[Mapping[str, Any]]): The config to provide to the resources.\n asset_config (Optional[Mapping[str, Any]]): The config to provide to the asset.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n partition_key (Optional[str]): String value representing partition key to execute with.\n partition_key_range (Optional[PartitionKeyRange]): Partition key range to execute with.\n\n Examples:\n .. code-block:: python\n\n context = build_asset_context()\n asset_to_invoke(context)\n\n with build_asset_context(resources={"foo": context_manager_resource}) as context:\n asset_to_invoke(context)\n """\n return build_op_context(\n op_config=asset_config,\n resources=resources,\n resources_config=resources_config,\n partition_key=partition_key,\n partition_key_range=partition_key_range,\n instance=instance,\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/invocation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.invocation"}, "logger": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.logger

\nfrom typing import Any, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nfrom .output import RUN_ID_PLACEHOLDER\n\n\n
[docs]class InitLoggerContext:\n """The context object available as the argument to the initialization function of a :py:class:`dagster.LoggerDefinition`.\n\n Users should not instantiate this object directly. To construct an\n `InitLoggerContext` for testing purposes, use :py:func:`dagster.\n build_init_logger_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import logger, InitLoggerContext\n\n @logger\n def hello_world(init_context: InitLoggerContext):\n ...\n\n """\n\n def __init__(\n self,\n logger_config: Any,\n logger_def: Optional[LoggerDefinition] = None,\n job_def: Optional[JobDefinition] = None,\n run_id: Optional[str] = None,\n ):\n self._logger_config = logger_config\n self._job_def = check.opt_inst_param(job_def, "job_def", JobDefinition)\n self._logger_def = check.opt_inst_param(logger_def, "logger_def", LoggerDefinition)\n self._run_id = check.opt_str_param(run_id, "run_id")\n\n @public\n @property\n def logger_config(self) -> Any:\n """The configuration data provided by the run config. The\n schema for this data is defined by ``config_schema`` on the :py:class:`LoggerDefinition`.\n """\n return self._logger_config\n\n @property\n def job_def(self) -> Optional[JobDefinition]:\n """The job definition currently being executed."""\n return self._job_def\n\n @public\n @property\n def logger_def(self) -> Optional[LoggerDefinition]:\n """The logger definition for the logger being constructed."""\n return self._logger_def\n\n @public\n @property\n def run_id(self) -> Optional[str]:\n """The ID for this run of the job."""\n return self._run_id
\n\n\nclass UnboundInitLoggerContext(InitLoggerContext):\n """Logger initialization context outputted by ``build_init_logger_context``.\n\n Represents a context whose config has not yet been validated against a logger definition, hence\n the inability to access the `logger_def` attribute. When an instance of\n ``UnboundInitLoggerContext`` is passed to ``LoggerDefinition.initialize``, config is validated,\n and it is subsumed into an `InitLoggerContext`, which contains the logger_def validated against.\n """\n\n def __init__(self, logger_config: Any, job_def: Optional[JobDefinition]):\n super(UnboundInitLoggerContext, self).__init__(\n logger_config, logger_def=None, job_def=job_def, run_id=None\n )\n\n @property\n def logger_def(self) -> LoggerDefinition:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def run_id(self) -> Optional[str]:\n return RUN_ID_PLACEHOLDER\n
", "current_page_name": "_modules/dagster/_core/execution/context/logger", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.logger"}, "output": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.output

\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    ContextManager,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_layer import AssetOutputInfo\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    CoercibleToAssetKey,\n)\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n)\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.errors import DagsterInvalidMetadata, DagsterInvariantViolationError\nfrom dagster._core.execution.plan.utils import build_resources_for_manager\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import JobDefinition, PartitionsDefinition\n    from dagster._core.definitions.op_definition import OpDefinition\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.execution.plan.outputs import StepOutputHandle\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.log_manager import DagsterLogManager\n    from dagster._core.system_config.objects import ResolvedRunConfig\n    from dagster._core.types.dagster_type import DagsterType\n\nRUN_ID_PLACEHOLDER = "__EPHEMERAL_RUN_ID"\n\n\n
[docs]class OutputContext:\n """The context object that is available to the `handle_output` method of an :py:class:`IOManager`.\n\n Users should not instantiate this object directly. To construct an\n `OutputContext` for testing an IO Manager's `handle_output` method, use\n :py:func:`dagster.build_output_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import IOManager, OutputContext\n\n class MyIOManager(IOManager):\n def handle_output(self, context: OutputContext, obj):\n ...\n """\n\n _step_key: Optional[str]\n _name: Optional[str]\n _job_name: Optional[str]\n _run_id: Optional[str]\n _metadata: ArbitraryMetadataMapping\n _user_generated_metadata: Mapping[str, MetadataValue]\n _mapping_key: Optional[str]\n _config: object\n _op_def: Optional["OpDefinition"]\n _dagster_type: Optional["DagsterType"]\n _log: Optional["DagsterLogManager"]\n _version: Optional[str]\n _resource_config: Optional[Mapping[str, object]]\n _step_context: Optional["StepExecutionContext"]\n _asset_info: Optional[AssetOutputInfo]\n _warn_on_step_context_use: bool\n _resources: Optional["Resources"]\n _resources_cm: Optional[ContextManager["Resources"]]\n _resources_contain_cm: Optional[bool]\n _cm_scope_entered: Optional[bool]\n _events: List["DagsterEvent"]\n _user_events: List[Union[AssetMaterialization, AssetObservation]]\n\n def __init__(\n self,\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n job_name: Optional[str] = None,\n run_id: Optional[str] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n mapping_key: Optional[str] = None,\n config: object = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Mapping[str, object]] = None,\n resources: Optional[Union["Resources", Mapping[str, object]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_info: 
Optional[AssetOutputInfo] = None,\n warn_on_step_context_use: bool = False,\n partition_key: Optional[str] = None,\n ):\n from dagster._core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster._core.execution.build_resources import build_resources\n\n self._step_key = step_key\n self._name = name\n self._job_name = job_name\n self._run_id = run_id\n self._metadata = metadata or {}\n self._mapping_key = mapping_key\n self._config = config\n self._op_def = op_def\n self._dagster_type = dagster_type\n self._log = log_manager\n self._version = version\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_info = asset_info\n self._warn_on_step_context_use = warn_on_step_context_use\n if self._step_context and self._step_context.has_partition_key:\n self._partition_key: Optional[str] = self._step_context.partition_key\n else:\n self._partition_key = partition_key\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_mapping_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events = []\n self._user_events = []\n self._user_generated_metadata = {}\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if (\n hasattr(self, "_resources_cm")\n and self._resources_cm\n and self._resources_contain_cm\n and not self._cm_scope_entered\n ):\n self._resources_cm.__exit__(None, None, None)\n\n @public\n @property\n def step_key(self) -> str:\n """The step_key for the compute step that produced the output."""\n if self._step_key is None:\n raise DagsterInvariantViolationError(\n "Attempting 
to access step_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._step_key\n\n @public\n @property\n def name(self) -> str:\n """The name of the output that produced the output."""\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._name\n\n @property\n def job_name(self) -> str:\n if self._job_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access pipeline_name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._job_name\n\n @public\n @property\n def run_id(self) -> str:\n """The id of the run that produced the output."""\n if self._run_id is None:\n raise DagsterInvariantViolationError(\n "Attempting to access run_id, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._run_id\n\n @public\n @property\n def metadata(self) -> Optional[ArbitraryMetadataMapping]:\n """A dict of the metadata that is assigned to the OutputDefinition that produced\n the output.\n """\n return self._metadata\n\n @public\n @property\n def mapping_key(self) -> Optional[str]:\n """The key that identifies a unique mapped output. 
None for regular outputs."""\n return self._mapping_key\n\n @public\n @property\n def config(self) -> Any:\n """The configuration for the output."""\n return self._config\n\n @public\n @property\n def op_def(self) -> "OpDefinition":\n """The definition of the op that produced the output."""\n from dagster._core.definitions import OpDefinition\n\n if self._op_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return cast(OpDefinition, self._op_def)\n\n @public\n @property\n def dagster_type(self) -> "DagsterType":\n """The type of this output."""\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._dagster_type\n\n @public\n @property\n def log(self) -> "DagsterLogManager":\n """The log manager to use for this output."""\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._log\n\n @public\n @property\n def version(self) -> Optional[str]:\n """(Experimental) The version of the output."""\n return self._version\n\n @public\n @property\n def resource_config(self) -> Optional[Mapping[str, object]]:\n """The config associated with the resource that initializes the InputManager."""\n return self._resource_config\n\n @public\n @property\n def resources(self) -> Any:\n """The resources required by the output manager, specified by the `required_resource_keys`\n parameter.\n """\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the OutputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided 
resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_output_context(...) as context:`"\n )\n return self._resources\n\n @property\n def asset_info(self) -> Optional[AssetOutputInfo]:\n """(Experimental) Asset info corresponding to the output."""\n return self._asset_info\n\n @public\n @property\n def has_asset_key(self) -> bool:\n """Returns True if an asset is being stored, otherwise returns False. A return value of False\n indicates that an output from an op is being stored.\n """\n return self._asset_info is not None\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The ``AssetKey`` of the asset that is being stored as an output."""\n if self._asset_info is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._asset_info.key\n\n @public\n @property\n def asset_partitions_def(self) -> "PartitionsDefinition":\n """The PartitionsDefinition on the asset corresponding to this output."""\n asset_key = self.asset_key\n result = self.step_context.job_def.asset_layer.partitions_def_for_asset(asset_key)\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.step_context"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the 
OutputContext"\n )\n\n return self._step_context\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self._partition_key is not None\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._partition_key is None:\n check.failed(\n "Tried to access partition_key on a non-partitioned run.",\n )\n\n return self._partition_key\n\n @public\n @property\n def has_asset_partitions(self) -> bool:\n """Returns True if the asset being stored is partitioned."""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_asset_partitions"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is not None:\n return self._step_context.has_asset_partitions_for_output(self.name)\n else:\n return False\n\n @public\n @property\n def asset_partition_key(self) -> str:\n """The partition key for output asset.\n\n Raises an error if the output asset has no partitioning, or if the run covers a partition\n range for the output 
asset.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_for_output(self.name)\n\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key_range"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_range_for_output(self.name)\n\n @public\n @property\n def asset_partition_keys(self) -> Sequence[str]:\n """The partition keys for the output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_keys"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.asset_partitions_def.get_partition_keys_in_range(\n self.step_context.asset_partition_key_range_for_output(self.name),\n dynamic_partitions_store=self.step_context.instance,\n )\n\n @public\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset 
is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partitions_time_window"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partitions_time_window_for_output(self.name)\n\n def get_run_scoped_output_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n The unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n Returns:\n Sequence[str, ...]: A list of identifiers, i.e. run id, step key, and output name\n """\n warnings.warn(\n "`OutputContext.get_run_scoped_output_identifier` is deprecated. 
Use "\n "`OutputContext.get_identifier` instead."\n )\n # if run_id is None and this is a re-execution, it means we failed to find its source run id\n check.invariant(\n self.run_id is not None,\n "Unable to find the run scoped output identifier: run_id is None on OutputContext.",\n )\n check.invariant(\n self.step_key is not None,\n "Unable to find the run scoped output identifier: step_key is None on OutputContext.",\n )\n check.invariant(\n self.name is not None,\n "Unable to find the run scoped output identifier: name is None on OutputContext.",\n )\n run_id = cast(str, self.run_id)\n step_key = cast(str, self.step_key)\n name = cast(str, self.name)\n\n if self.mapping_key:\n return [run_id, step_key, name, self.mapping_key]\n\n return [run_id, step_key, name]\n\n
[docs] @public\n def get_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n Sequence[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n version = self.version\n step_key = self.step_key\n name = self.name\n if version is not None:\n check.invariant(\n self.mapping_key is None,\n f"Mapping key and version both provided for output '{name}' of step"\n f" '{step_key}'. Dynamic mapping is not supported when using versioning.",\n )\n identifier = ["versioned_outputs", version, step_key, name]\n else:\n run_id = self.run_id\n identifier = [run_id, step_key, name]\n if self.mapping_key:\n identifier.append(self.mapping_key)\n\n return identifier
\n\n def get_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_output_identifier` is deprecated. Use "\n "`OutputContext.get_identifier` instead."\n )\n\n return self.get_identifier()\n\n
[docs] @public\n def get_asset_identifier(self) -> Sequence[str]:\n """The sequence of strings making up the AssetKey for the asset being stored as an output.\n If the asset is partitioned, the identifier contains the partition key as the final element in the\n sequence. For example, for the asset key ``AssetKey(["foo", "bar", "baz"])`` materialized with\n partition key "2023-06-01", ``get_asset_identifier`` will return ``["foo", "bar", "baz", "2023-06-01"]``.\n """\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return [*self.asset_key.path, self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset output identifier for an output with no asset key")
\n\n def get_asset_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_asset_output_identifier` is deprecated. Use "\n "`OutputContext.get_asset_identifier` instead."\n )\n\n return self.get_asset_identifier()\n\n
[docs] @public\n def log_event(self, event: Union[AssetObservation, AssetMaterialization]) -> None:\n """Log an AssetMaterialization or AssetObservation from within the body of an io manager's `handle_output` method.\n\n Events logged with this method will appear in the event log.\n\n Args:\n event (Union[AssetMaterialization, AssetObservation]): The event to log.\n\n Examples:\n .. code-block:: python\n\n from dagster import IOManager, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.log_event(AssetMaterialization("foo"))\n """\n from dagster._core.events import DagsterEvent\n\n if isinstance(event, (AssetMaterialization)):\n if self._step_context:\n self._events.append(DagsterEvent.asset_materialization(self._step_context, event))\n self._user_events.append(event)\n elif isinstance(event, AssetObservation):\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, event))\n self._user_events.append(event)\n else:\n check.failed(f"Unexpected event {event}")
\n\n def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_output`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n def get_logged_events(\n self,\n ) -> Sequence[Union[AssetMaterialization, AssetObservation]]:\n """Retrieve the list of user-generated events that were logged via the context.\n\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import IOManager, build_output_context, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n ...\n\n def test_handle_output():\n mgr = MyIOManager()\n context = build_output_context()\n mgr.handle_output(context)\n all_user_events = context.get_logged_events()\n materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n ...\n """\n return self._user_events\n\n
[docs] @public\n def add_output_metadata(self, metadata: Mapping[str, RawMetadataValue]) -> None:\n """Add a dictionary of metadata to the handled output.\n\n Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.\n\n Args:\n metadata (Mapping[str, RawMetadataValue]): A metadata dictionary to log\n\n Examples:\n .. code-block:: python\n\n from dagster import IOManager\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.add_output_metadata({"foo": "bar"})\n """\n from dagster._core.definitions.metadata import normalize_metadata\n\n overlapping_labels = set(self._user_generated_metadata.keys()) & metadata.keys()\n if overlapping_labels:\n raise DagsterInvalidMetadata(\n f"Tried to add metadata for key(s) that already have metadata: {overlapping_labels}"\n )\n\n self._user_generated_metadata = {\n **self._user_generated_metadata,\n **normalize_metadata(metadata),\n }
\n\n def get_logged_metadata(\n self,\n ) -> Mapping[str, MetadataValue]:\n """Get the mapping of metadata entries that have been logged for use with this output."""\n return self._user_generated_metadata\n\n def consume_logged_metadata(\n self,\n ) -> Mapping[str, MetadataValue]:\n """Pops and yields all user-generated metadata entries that have been recorded from this context.\n\n If consume_logged_metadata has not yet been called, this will yield all logged events since\n the call to `handle_output`. If consume_logged_metadata has been called, it will yield all\n events since the last time consume_logged_metadata_entries was called. Designed for internal\n use. Users should never need to invoke this method.\n """\n result = self._user_generated_metadata\n self._user_generated_metadata = {}\n return result or {}
\n\n\ndef get_output_context(\n execution_plan: "ExecutionPlan",\n job_def: "JobDefinition",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n run_id: Optional[str],\n log_manager: Optional["DagsterLogManager"],\n step_context: Optional["StepExecutionContext"],\n resources: Optional["Resources"],\n version: Optional[str],\n warn_on_step_context_use: bool = False,\n) -> "OutputContext":\n """Args:\n run_id (str): The run ID of the run that produced the output, not necessarily the run that\n the context will be used in.\n """\n step = execution_plan.get_step_by_key(step_output_handle.step_key)\n # get config\n op_config = resolved_run_config.ops[step.node_handle.to_string()]\n outputs_config = op_config.outputs\n\n if outputs_config:\n output_config = outputs_config.get_output_manager_config(step_output_handle.output_name)\n else:\n output_config = None\n\n step_output = execution_plan.get_step_output(step_output_handle)\n output_def = job_def.get_node(step_output.node_handle).output_def_named(step_output.name)\n\n io_manager_key = output_def.io_manager_key\n resource_config = resolved_run_config.resources[io_manager_key].config\n\n node_handle = execution_plan.get_step_by_key(step.key).node_handle\n asset_info = job_def.asset_layer.asset_info_for_output(\n node_handle=node_handle, output_name=step_output.name\n )\n if asset_info is not None:\n metadata = job_def.asset_layer.metadata_for_asset(asset_info.key) or output_def.metadata\n else:\n metadata = output_def.metadata\n\n if step_context:\n check.invariant(\n not resources,\n "Expected either resources or step context to be set, but "\n "received both. 
If step context is provided, resources for IO manager will be "\n "retrieved off of that.",\n )\n resources = build_resources_for_manager(io_manager_key, step_context)\n\n return OutputContext(\n step_key=step_output_handle.step_key,\n name=step_output_handle.output_name,\n job_name=job_def.name,\n run_id=run_id,\n metadata=metadata,\n mapping_key=step_output_handle.mapping_key,\n config=output_config,\n op_def=job_def.get_node(step.node_handle).definition, # type: ignore # (should be OpDefinition not NodeDefinition)\n dagster_type=output_def.dagster_type,\n log_manager=log_manager,\n version=version,\n step_context=step_context,\n resource_config=resource_config,\n resources=resources,\n asset_info=asset_info,\n warn_on_step_context_use=warn_on_step_context_use,\n )\n\n\ndef step_output_version(\n job_def: "JobDefinition",\n execution_plan: "ExecutionPlan",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n) -> Optional[str]:\n from dagster._core.execution.resolve_versions import resolve_step_output_versions\n\n step_output_versions = resolve_step_output_versions(\n job_def, execution_plan, resolved_run_config\n )\n return (\n step_output_versions[step_output_handle]\n if step_output_handle in step_output_versions\n else None\n )\n\n\n
[docs]def build_output_context(\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n run_id: Optional[str] = None,\n mapping_key: Optional[str] = None,\n config: Optional[Any] = None,\n dagster_type: Optional["DagsterType"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Mapping[str, object]] = None,\n resources: Optional[Mapping[str, object]] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_key: Optional[CoercibleToAssetKey] = None,\n partition_key: Optional[str] = None,\n) -> "OutputContext":\n """Builds output context from provided parameters.\n\n ``build_output_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_output_context`` must be used as a\n context manager.\n\n Args:\n step_key (Optional[str]): The step_key for the compute step that produced the output.\n name (Optional[str]): The name of the output that produced the output.\n metadata (Optional[Mapping[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n version (Optional[str]): (Experimental) The version of the output.\n resource_config (Optional[Mapping[str, Any]]): The resource config to make available from the\n input context. 
This usually corresponds to the config provided to the resource that\n loads the output manager.\n resources (Optional[Resources]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n op_def (Optional[OpDefinition]): The definition of the op that produced the output.\n asset_key: Optional[Union[AssetKey, Sequence[str], str]]: The asset key corresponding to the\n output.\n partition_key: Optional[str]: String value representing partition key to execute with.\n\n Examples:\n .. code-block:: python\n\n build_output_context()\n\n with build_output_context(resources={"foo": context_manager_resource}) as context:\n do_something\n\n """\n from dagster._core.definitions import OpDefinition\n from dagster._core.execution.context_creation_job import initialize_console_manager\n from dagster._core.types.dagster_type import DagsterType\n\n step_key = check.opt_str_param(step_key, "step_key")\n name = check.opt_str_param(name, "name")\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n run_id = check.opt_str_param(run_id, "run_id", default=RUN_ID_PLACEHOLDER)\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n version = check.opt_str_param(version, "version")\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n asset_key = AssetKey.from_coercible(asset_key) if asset_key else None\n partition_key = check.opt_str_param(partition_key, "partition_key")\n\n return OutputContext(\n step_key=step_key,\n name=name,\n job_name=None,\n run_id=run_id,\n metadata=metadata,\n mapping_key=mapping_key,\n config=config,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n 
version=version,\n resource_config=resource_config,\n resources=resources,\n step_context=None,\n op_def=op_def,\n asset_info=AssetOutputInfo(key=asset_key) if asset_key else None,\n partition_key=partition_key,\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/output", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.output"}, "system": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.system

\n"""This module contains the execution context objects that are internal to the system.\nNot every property on these should be exposed to random Jane or Joe dagster user\nso we have a different layer of objects that encode the explicit public API\nin the user_context module.\n"""\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom hashlib import sha256\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.data_version import (\n    DATA_VERSION_TAG,\n    SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD,\n    extract_data_version_from_entry,\n)\nfrom dagster._core.definitions.dependency import OpNode\nfrom dagster._core.definitions.events import AssetKey, AssetLineageInfo\nfrom dagster._core.definitions.hook_definition import HookDefinition\nfrom dagster._core.definitions.job_base import IJob\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition, PartitionsSubset\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.partition_mapping import (\n    PartitionMapping,\n    infer_partition_mapping,\n)\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    
has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.execution.plan.step import ExecutionStep\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.io_manager import IOManager\nfrom dagster._core.storage.tags import (\n    ASSET_PARTITION_RANGE_END_TAG,\n    ASSET_PARTITION_RANGE_START_TAG,\n    MULTIDIMENSIONAL_PARTITION_PREFIX,\n    PARTITION_NAME_TAG,\n)\nfrom dagster._core.system_config.objects import ResolvedRunConfig\nfrom dagster._core.types.dagster_type import DagsterType\n\nfrom .input import InputContext\nfrom .output import OutputContext, get_output_context\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.data_version import (\n        DataVersion,\n    )\n    from dagster._core.definitions.dependency import NodeHandle\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.event_api import EventLogRecord\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.execution.plan.state import KnownExecutionState\n    from dagster._core.instance import DagsterInstance\n\n    from .hook import HookContext\n\n\ndef is_iterable(obj: Any) -> bool:\n    try:\n        iter(obj)\n    except:\n        return False\n    return True\n\n\nclass IPlanContext(ABC):\n    """Context interface to represent run information that does not require access to user code.\n\n    The information available via this interface is accessible to the system throughout a run.\n    """\n\n    @property\n    @abstractmethod\n    def plan_data(self) -> "PlanData":\n        raise 
NotImplementedError()\n\n    @property\n    def job(self) -> IJob:\n        return self.plan_data.job\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        return self.plan_data.dagster_run\n\n    @property\n    def run_id(self) -> str:\n        return self.dagster_run.run_id\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        return self.dagster_run.run_config\n\n    @property\n    def job_name(self) -> str:\n        return self.dagster_run.job_name\n\n    @property\n    def instance(self) -> "DagsterInstance":\n        return self.plan_data.instance\n\n    @property\n    def raise_on_error(self) -> bool:\n        return self.plan_data.raise_on_error\n\n    @property\n    def retry_mode(self) -> RetryMode:\n        return self.plan_data.retry_mode\n\n    @property\n    def execution_plan(self) -> "ExecutionPlan":\n        return self.plan_data.execution_plan\n\n    @property\n    @abstractmethod\n    def output_capture(self) -> Optional[Mapping[StepOutputHandle, Any]]:\n        raise NotImplementedError()\n\n    @property\n    def log(self) -> DagsterLogManager:\n        raise NotImplementedError()\n\n    @property\n    def logging_tags(self) -> Mapping[str, str]:\n        return self.log.logging_metadata.all_tags()\n\n    @property\n    def event_tags(self) -> Mapping[str, str]:\n        return self.log.logging_metadata.event_tags()\n\n    def has_tag(self, key: str) -> bool:\n        check.str_param(key, "key")\n        return key in self.dagster_run.tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        check.str_param(key, "key")\n        return self.dagster_run.tags.get(key)\n\n    @property\n    def run_tags(self) -> Mapping[str, str]:\n        return self.dagster_run.tags\n\n\nclass PlanData(NamedTuple):\n    """The data about a run that is available during both orchestration and execution.\n\n    This object does not contain any information that requires access to user code, such as the\n    pipeline 
definition and resources.\n    """\n\n    job: IJob\n    dagster_run: DagsterRun\n    instance: "DagsterInstance"\n    execution_plan: "ExecutionPlan"\n    raise_on_error: bool = False\n    retry_mode: RetryMode = RetryMode.DISABLED\n\n\nclass ExecutionData(NamedTuple):\n    """The data that is available to the system during execution.\n\n    This object contains information that requires access to user code, such as the pipeline\n    definition and resources.\n    """\n\n    scoped_resources_builder: ScopedResourcesBuilder\n    resolved_run_config: ResolvedRunConfig\n    job_def: JobDefinition\n\n\nclass IStepContext(IPlanContext):\n    """Interface to represent data to be available during either step orchestration or execution."""\n\n    @property\n    @abstractmethod\n    def step(self) -> ExecutionStep:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def node_handle(self) -> "NodeHandle":\n        raise NotImplementedError()\n\n\nclass PlanOrchestrationContext(IPlanContext):\n    """Context for the orchestration of a run.\n\n    This context assumes inability to run user code directly.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n        resume_from_failure: bool = False,\n    ):\n        self._plan_data = plan_data\n        self._log_manager = log_manager\n        self._executor = executor\n        self._output_capture = output_capture\n        self._resume_from_failure = resume_from_failure\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def reconstructable_job(self) -> ReconstructableJob:\n        if not isinstance(self.job, ReconstructableJob):\n            raise DagsterInvariantViolationError(\n                "reconstructable_pipeline property must be a ReconstructableJob"\n            )\n        
return self.job\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def executor(self) -> Executor:\n        return self._executor\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(self, step: ExecutionStep) -> "IStepContext":\n        return StepOrchestrationContext(\n            plan_data=self.plan_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            executor=self.executor,\n            step=step,\n            output_capture=self.output_capture,\n        )\n\n    @property\n    def resume_from_failure(self) -> bool:\n        return self._resume_from_failure\n\n\nclass StepOrchestrationContext(PlanOrchestrationContext, IStepContext):\n    """Context for the orchestration of a step.\n\n    This context assumes inability to run user code directly. Thus, it does not include any resource\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        step: ExecutionStep,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n    ):\n        super(StepOrchestrationContext, self).__init__(\n            plan_data, log_manager, executor, output_capture\n        )\n        self._step = step\n\n    @property\n    def step(self) -> ExecutionStep:\n        return self._step\n\n    @property\n    def node_handle(self) -> "NodeHandle":\n        return self.step.node_handle\n\n\nclass PlanExecutionContext(IPlanContext):\n    """Context for the execution of a plan.\n\n    This context assumes that user code can be run directly, and thus includes resource and\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        execution_data: ExecutionData,\n        log_manager: DagsterLogManager,\n        output_capture: 
Optional[Dict[StepOutputHandle, Any]] = None,\n    ):\n        self._plan_data = plan_data\n        self._execution_data = execution_data\n        self._log_manager = log_manager\n        self._output_capture = output_capture\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(\n        self,\n        step: ExecutionStep,\n        known_state: Optional["KnownExecutionState"] = None,\n    ) -> IStepContext:\n        return StepExecutionContext(\n            plan_data=self.plan_data,\n            execution_data=self._execution_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            step=step,\n            output_capture=self.output_capture,\n            known_state=known_state,\n        )\n\n    @property\n    def job_def(self) -> JobDefinition:\n        return self._execution_data.job_def\n\n    @property\n    def resolved_run_config(self) -> ResolvedRunConfig:\n        return self._execution_data.resolved_run_config\n\n    @property\n    def scoped_resources_builder(self) -> ScopedResourcesBuilder:\n        return self._execution_data.scoped_resources_builder\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def partitions_def(self) -> Optional[PartitionsDefinition]:\n        from dagster._core.definitions.job_definition import JobDefinition\n\n        job_def = self._execution_data.job_def\n        if not isinstance(job_def, JobDefinition):\n            check.failed(\n                "Can only call 'partitions_def', when using jobs, not legacy pipelines",\n            )\n        partitions_def = job_def.partitions_def\n        return partitions_def\n\n    @property\n    def has_partitions(self) -> bool:\n        tags = self._plan_data.dagster_run.tags\n        return bool(\n            
PARTITION_NAME_TAG in tags\n            or any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()])\n            or (\n                tags.get(ASSET_PARTITION_RANGE_START_TAG)\n                and tags.get(ASSET_PARTITION_RANGE_END_TAG)\n            )\n        )\n\n    @property\n    def partition_key(self) -> str:\n        from dagster._core.definitions.multi_dimensional_partitions import (\n            MultiPartitionsDefinition,\n            get_multipartition_key_from_tags,\n        )\n\n        if not self.has_partitions:\n            raise DagsterInvariantViolationError(\n                "Cannot access partition_key for a non-partitioned run"\n            )\n\n        tags = self._plan_data.dagster_run.tags\n        if any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()]):\n            return get_multipartition_key_from_tags(tags)\n        elif PARTITION_NAME_TAG in tags:\n            return tags[PARTITION_NAME_TAG]\n        else:\n            range_start = tags[ASSET_PARTITION_RANGE_START_TAG]\n            range_end = tags[ASSET_PARTITION_RANGE_END_TAG]\n\n            if range_start != range_end:\n                raise DagsterInvariantViolationError(\n                    "Cannot access partition_key for a partitioned run with a range of partitions."\n                    " Call partition_key_range instead."\n                )\n            else:\n                if isinstance(self.partitions_def, MultiPartitionsDefinition):\n                    return self.partitions_def.get_partition_key_from_str(cast(str, range_start))\n                return cast(str, range_start)\n\n    @property\n    def asset_partition_key_range(self) -> PartitionKeyRange:\n        from dagster._core.definitions.multi_dimensional_partitions import (\n            MultiPartitionsDefinition,\n            get_multipartition_key_from_tags,\n        )\n\n        if not self.has_partitions:\n            raise DagsterInvariantViolationError(\n     
           "Cannot access partition_key for a non-partitioned run"\n            )\n\n        tags = self._plan_data.dagster_run.tags\n        if any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()]):\n            multipartition_key = get_multipartition_key_from_tags(tags)\n            return PartitionKeyRange(multipartition_key, multipartition_key)\n        elif PARTITION_NAME_TAG in tags:\n            partition_key = tags[PARTITION_NAME_TAG]\n            return PartitionKeyRange(partition_key, partition_key)\n        else:\n            partition_key_range_start = tags[ASSET_PARTITION_RANGE_START_TAG]\n            if partition_key_range_start is not None:\n                if isinstance(self.partitions_def, MultiPartitionsDefinition):\n                    return PartitionKeyRange(\n                        self.partitions_def.get_partition_key_from_str(partition_key_range_start),\n                        self.partitions_def.get_partition_key_from_str(\n                            tags[ASSET_PARTITION_RANGE_END_TAG]\n                        ),\n                    )\n            return PartitionKeyRange(partition_key_range_start, tags[ASSET_PARTITION_RANGE_END_TAG])\n\n    @property\n    def partition_time_window(self) -> TimeWindow:\n        partitions_def = self.partitions_def\n\n        if partitions_def is None:\n            raise DagsterInvariantViolationError("Partitions definition is not defined")\n\n        if not has_one_dimension_time_window_partitioning(partitions_def=partitions_def):\n            raise DagsterInvariantViolationError(\n                "Expected a TimeWindowPartitionsDefinition or MultiPartitionsDefinition with a"\n                f" single time dimension, but instead found {type(partitions_def)}"\n            )\n\n        if self.has_partition_key:\n            return cast(\n                Union[MultiPartitionsDefinition, TimeWindowPartitionsDefinition], partitions_def\n            
).time_window_for_partition_key(self.partition_key)\n        elif self.has_partition_key_range:\n            partition_key_range = self.asset_partition_key_range\n            partitions_def = cast(\n                Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def\n            )\n            return TimeWindow(\n                partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n                partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n            )\n\n        else:\n            check.failed(\n                "Has a PartitionsDefinition, so should either have a partition key or a partition"\n                " key range"\n            )\n\n    @property\n    def has_partition_key(self) -> bool:\n        return PARTITION_NAME_TAG in self._plan_data.dagster_run.tags\n\n    @property\n    def has_partition_key_range(self) -> bool:\n        return ASSET_PARTITION_RANGE_START_TAG in self._plan_data.dagster_run.tags\n\n    def for_type(self, dagster_type: DagsterType) -> "TypeCheckContext":\n        return TypeCheckContext(\n            self.run_id, self.log, self._execution_data.scoped_resources_builder, dagster_type\n        )\n\n\n@dataclass\nclass InputAssetVersionInfo:\n    # This is the storage id of the last materialization of any partition of an asset. Thus it is\n    # computed the same way for both partitioned and non-partitioned assets.\n    storage_id: int\n\n    # If the input asset is partitioned, this is a hash of the sorted data versions of each dependency\n    # partition. If the input asset is not partitioned, this is the data version of the asset. It\n    # can be none if we are sourcing a materialization from before data versions.\n    data_version: Optional["DataVersion"]\n\n    # This is the run_id on the event that the storage_id references\n    run_id: str\n\n    # This is the timestamp on the event that the storage_id references\n    timestamp: float\n\n\n
[docs]class StepExecutionContext(PlanExecutionContext, IStepContext):\n """Context for the execution of a step. Users should not instantiate this class directly.\n\n This context assumes that user code can be run directly, and thus includes resource and information.\n """\n\n def __init__(\n self,\n plan_data: PlanData,\n execution_data: ExecutionData,\n log_manager: DagsterLogManager,\n step: ExecutionStep,\n output_capture: Optional[Dict[StepOutputHandle, Any]],\n known_state: Optional["KnownExecutionState"],\n ):\n from dagster._core.execution.resources_init import get_required_resource_keys_for_step\n\n super(StepExecutionContext, self).__init__(\n plan_data=plan_data,\n execution_data=execution_data,\n log_manager=log_manager,\n output_capture=output_capture,\n )\n self._step = step\n self._required_resource_keys = get_required_resource_keys_for_step(\n plan_data.job.get_definition(),\n step,\n plan_data.execution_plan,\n )\n self._resources = execution_data.scoped_resources_builder.build(\n self._required_resource_keys\n )\n self._known_state = known_state\n self._input_lineage: List[AssetLineageInfo] = []\n\n resources_iter = cast(Iterable, self._resources)\n\n step_launcher_resources = [\n resource for resource in resources_iter if isinstance(resource, StepLauncher)\n ]\n\n self._step_launcher: Optional[StepLauncher] = None\n if len(step_launcher_resources) > 1:\n raise DagsterInvariantViolationError(\n "Multiple required resources for {described_op} have inherited StepLauncher"\n "There should be at most one step launcher resource per {node_type}.".format(\n described_op=self.describe_op(), node_type=self.op_def.node_type_str\n )\n )\n elif len(step_launcher_resources) == 1:\n self._step_launcher = step_launcher_resources[0]\n\n self._step_exception: Optional[BaseException] = None\n\n self._step_output_capture: Optional[Dict[StepOutputHandle, Any]] = None\n # Enable step output capture if there are any hooks which will receive them.\n # Expect in the 
future that hooks may control whether or not they get outputs,\n # but for now presence of any will cause output capture.\n if self.job_def.get_all_hooks_for_handle(self.node_handle):\n self._step_output_capture = {}\n\n self._output_metadata: Dict[str, Any] = {}\n self._seen_outputs: Dict[str, Union[str, Set[str]]] = {}\n\n self._input_asset_version_info: Dict[AssetKey, Optional["InputAssetVersionInfo"]] = {}\n self._is_external_input_asset_version_info_loaded = False\n self._data_version_cache: Dict[AssetKey, "DataVersion"] = {}\n\n self._requires_typed_event_stream = False\n self._typed_event_stream_error_message = None\n\n # In this mode no conversion is done on returned values and missing but expected outputs are not\n # allowed.\n @property\n def requires_typed_event_stream(self) -> bool:\n return self._requires_typed_event_stream\n\n @property\n def typed_event_stream_error_message(self) -> Optional[str]:\n return self._typed_event_stream_error_message\n\n # Error message will be appended to the default error message.\n def set_requires_typed_event_stream(self, *, error_message: Optional[str] = None):\n self._requires_typed_event_stream = True\n self._typed_event_stream_error_message = error_message\n\n @property\n def step(self) -> ExecutionStep:\n return self._step\n\n @property\n def node_handle(self) -> "NodeHandle":\n return self.step.node_handle\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n return self._step_launcher\n\n @property\n def op_def(self) -> OpDefinition:\n return self.op.definition\n\n @property\n def job_def(self) -> "JobDefinition":\n return self._execution_data.job_def\n\n @property\n def op(self) -> OpNode:\n return self.job_def.get_op(self._step.node_handle)\n\n @property\n def op_retry_policy(self) -> Optional[RetryPolicy]:\n 
return self.job_def.get_retry_policy_for_handle(self.node_handle)\n\n def describe_op(self) -> str:\n return f'op "{self.node_handle}"'\n\n def get_io_manager(self, step_output_handle: StepOutputHandle) -> IOManager:\n step_output = self.execution_plan.get_step_output(step_output_handle)\n io_manager_key = (\n self.job_def.get_node(step_output.node_handle)\n .output_def_named(step_output.name)\n .io_manager_key\n )\n\n output_manager = getattr(self.resources, io_manager_key)\n return check.inst(output_manager, IOManager)\n\n def get_output_context(self, step_output_handle: StepOutputHandle) -> OutputContext:\n return get_output_context(\n self.execution_plan,\n self.job_def,\n self.resolved_run_config,\n step_output_handle,\n self._get_source_run_id(step_output_handle),\n log_manager=self.log,\n step_context=self,\n resources=None,\n version=self.execution_plan.get_version_for_step_output_handle(step_output_handle),\n )\n\n def for_input_manager(\n self,\n name: str,\n config: Any,\n metadata: Any,\n dagster_type: DagsterType,\n source_handle: Optional[StepOutputHandle] = None,\n resource_config: Any = None,\n resources: Optional["Resources"] = None,\n artificial_output_context: Optional["OutputContext"] = None,\n ) -> InputContext:\n if source_handle and artificial_output_context:\n check.failed("Cannot specify both source_handle and artificial_output_context.")\n\n upstream_output: Optional[OutputContext] = None\n\n if source_handle is not None:\n version = self.execution_plan.get_version_for_step_output_handle(source_handle)\n\n # NOTE: this is using downstream step_context for upstream OutputContext. 
step_context\n # will be set to None for 0.15 release.\n upstream_output = get_output_context(\n self.execution_plan,\n self.job_def,\n self.resolved_run_config,\n source_handle,\n self._get_source_run_id(source_handle),\n log_manager=self.log,\n step_context=self,\n resources=None,\n version=version,\n warn_on_step_context_use=True,\n )\n else:\n upstream_output = artificial_output_context\n\n asset_key = self.job_def.asset_layer.asset_key_for_input(\n node_handle=self.node_handle, input_name=name\n )\n asset_partitions_subset = (\n self.asset_partitions_subset_for_input(name)\n if self.has_asset_partitions_for_input(name)\n else None\n )\n\n asset_partitions_def = (\n self.job_def.asset_layer.partitions_def_for_asset(asset_key) if asset_key else None\n )\n return InputContext(\n job_name=self.job_def.name,\n name=name,\n op_def=self.op_def,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=self.log,\n step_context=self,\n resource_config=resource_config,\n resources=resources,\n asset_key=asset_key,\n asset_partitions_subset=asset_partitions_subset,\n asset_partitions_def=asset_partitions_def,\n instance=self.instance,\n )\n\n def for_hook(self, hook_def: HookDefinition) -> "HookContext":\n from .hook import HookContext\n\n return HookContext(self, hook_def)\n\n def get_known_state(self) -> "KnownExecutionState":\n if not self._known_state:\n check.failed(\n "Attempted to access KnownExecutionState but it was not provided at context"\n " creation"\n )\n return self._known_state\n\n def can_load(\n self,\n step_output_handle: StepOutputHandle,\n ) -> bool:\n # can load from upstream in the same run\n if step_output_handle in self.get_known_state().ready_outputs:\n return True\n\n if (\n self._should_load_from_previous_runs(step_output_handle)\n # should and can load from a previous run\n and self._get_source_run_id_from_logs(step_output_handle)\n ):\n return True\n\n return False\n\n def 
observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n if mapping_key:\n if output_name not in self._seen_outputs:\n self._seen_outputs[output_name] = set()\n cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n else:\n self._seen_outputs[output_name] = "seen"\n\n def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n if mapping_key:\n return (\n output_name in self._seen_outputs and mapping_key in self._seen_outputs[output_name]\n )\n return output_name in self._seen_outputs\n\n def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n if output_name is None and len(self.op_def.output_defs) == 1:\n output_def = self.op_def.output_defs[0]\n output_name = output_def.name\n elif output_name is None:\n raise DagsterInvariantViolationError(\n "Attempted to log metadata without providing output_name, but multiple outputs"\n " exist. Please provide an output_name to the invocation of"\n " `context.add_output_metadata`."\n )\n else:\n output_def = self.op_def.output_def_named(output_name)\n\n if self.has_seen_output(output_name, mapping_key):\n output_desc = (\n f"output '{output_def.name}'"\n if not mapping_key\n else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n )\n raise DagsterInvariantViolationError(\n f"In {self.op_def.node_type_str} '{self.op.name}', attempted to log output"\n f" metadata for {output_desc} which has already been yielded. Metadata must be"\n " logged before the output is yielded."\n )\n if output_def.is_dynamic and not mapping_key:\n raise DagsterInvariantViolationError(\n f"In {self.op_def.node_type_str} '{self.op.name}', attempted to log metadata"\n f" for dynamic output '{output_def.name}' without providing a mapping key. 
When"\n " logging metadata for a dynamic output, it is necessary to provide a mapping key."\n )\n\n if mapping_key:\n if output_name not in self._output_metadata:\n self._output_metadata[output_name] = {}\n if mapping_key in self._output_metadata[output_name]:\n self._output_metadata[output_name][mapping_key].update(metadata)\n else:\n self._output_metadata[output_name][mapping_key] = metadata\n else:\n if output_name in self._output_metadata:\n self._output_metadata[output_name].update(metadata)\n else:\n self._output_metadata[output_name] = metadata\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n metadata = self._output_metadata.get(output_name)\n if mapping_key and metadata:\n return metadata.get(mapping_key)\n return metadata\n\n def _get_source_run_id_from_logs(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n # walk through event logs to find the right run_id based on the run lineage\n\n parent_state = self.get_known_state().parent_state\n while parent_state:\n # if the parent run has yielded an StepOutput event for the given step output,\n # we find the source run id\n if step_output_handle in parent_state.produced_outputs:\n return parent_state.run_id\n\n # else, keep looking backwards\n parent_state = parent_state.get_parent_state()\n\n # When a fixed path is provided via io manager, it's able to run step subset using an execution\n # plan when the ascendant outputs were not previously created by dagster-controlled\n # computations. for example, in backfills, with fixed path io manager, we allow users to\n # "re-execute" runs with steps where the outputs weren't previously stored by dagster.\n\n # Warn about this special case because it will also reach here when all previous runs have\n # skipped yielding this output. 
From the logs, we have no easy way to differentiate the fixed\n # path case and the skipping case, until we record the skipping info in KnownExecutionState,\n # i.e. resolve https://github.com/dagster-io/dagster/issues/3511\n self.log.warning(\n f"No previously stored outputs found for source {step_output_handle}. "\n "This is either because you are using an IO Manager that does not depend on run ID, "\n "or because all the previous runs have skipped the output in conditional execution."\n )\n return None\n\n def _should_load_from_previous_runs(self, step_output_handle: StepOutputHandle) -> bool:\n # should not load if not a re-execution\n if self.dagster_run.parent_run_id is None:\n return False\n # should not load if re-executing the entire pipeline\n if self.dagster_run.step_keys_to_execute is None:\n return False\n\n # should not load if the entire dynamic step is being executed in the current run\n handle = StepHandle.parse_from_key(step_output_handle.step_key)\n if (\n isinstance(handle, ResolvedFromDynamicStepHandle)\n and handle.unresolved_form.to_key() in self.dagster_run.step_keys_to_execute\n ):\n return False\n\n # should not load if this step is being executed in the current run\n return step_output_handle.step_key not in self.dagster_run.step_keys_to_execute\n\n def _get_source_run_id(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n if self._should_load_from_previous_runs(step_output_handle):\n return self._get_source_run_id_from_logs(step_output_handle)\n else:\n return self.dagster_run.run_id\n\n def capture_step_exception(self, exception: BaseException):\n self._step_exception = check.inst_param(exception, "exception", BaseException)\n\n @property\n def step_exception(self) -> Optional[BaseException]:\n return self._step_exception\n\n @property\n def step_output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n return self._step_output_capture\n\n @property\n def previous_attempt_count(self) -> int:\n return 
self.get_known_state().get_retry_state().get_attempt_count(self._step.key)\n\n @property\n def op_config(self) -> Any:\n op_config = self.resolved_run_config.ops.get(str(self.node_handle))\n return op_config.config if op_config else None\n\n @property\n def is_op_in_graph(self) -> bool:\n """Whether this step corresponds to an op within a graph (either @graph, or @graph_asset)."""\n return self.step.node_handle.parent is not None\n\n @property\n def is_sda_step(self) -> bool:\n """Whether this step corresponds to a software define asset, inferred by presence of asset info on outputs.\n\n note: ops can materialize assets as well.\n """\n for output in self.step.step_outputs:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, output.name\n )\n if asset_info is not None:\n return True\n return False\n\n def set_data_version(self, asset_key: AssetKey, data_version: "DataVersion") -> None:\n self._data_version_cache[asset_key] = data_version\n\n def has_data_version(self, asset_key: AssetKey) -> bool:\n return asset_key in self._data_version_cache\n\n def get_data_version(self, asset_key: AssetKey) -> "DataVersion":\n return self._data_version_cache[asset_key]\n\n @property\n def input_asset_records(self) -> Optional[Mapping[AssetKey, Optional["InputAssetVersionInfo"]]]:\n return self._input_asset_version_info\n\n @property\n def is_external_input_asset_version_info_loaded(self) -> bool:\n return self._is_external_input_asset_version_info_loaded\n\n def get_input_asset_version_info(self, key: AssetKey) -> Optional["InputAssetVersionInfo"]:\n if key not in self._input_asset_version_info:\n self._fetch_input_asset_version_info(key)\n return self._input_asset_version_info[key]\n\n # "external" refers to records for inputs generated outside of this step\n def fetch_external_input_asset_version_info(self) -> None:\n output_keys = self.get_output_asset_keys()\n\n all_dep_keys: List[AssetKey] = []\n for output_key in output_keys:\n if output_key 
not in self.job_def.asset_layer.asset_deps:\n continue\n dep_keys = self.job_def.asset_layer.upstream_assets_for_asset(output_key)\n for key in dep_keys:\n if key not in all_dep_keys and key not in output_keys:\n all_dep_keys.append(key)\n\n self._input_asset_version_info = {}\n for key in all_dep_keys:\n self._fetch_input_asset_version_info(key)\n self._is_external_input_asset_version_info_loaded = True\n\n def _fetch_input_asset_version_info(self, key: AssetKey) -> None:\n from dagster._core.definitions.data_version import (\n extract_data_version_from_entry,\n )\n\n event = self._get_input_asset_event(key)\n if event is None:\n self._input_asset_version_info[key] = None\n else:\n storage_id = event.storage_id\n # Input name will be none if this is an internal dep\n input_name = self.job_def.asset_layer.input_for_asset_key(self.node_handle, key)\n # Exclude AllPartitionMapping for now to avoid huge queries\n if input_name and self.has_asset_partitions_for_input(input_name):\n subset = self.asset_partitions_subset_for_input(\n input_name, require_valid_partitions=False\n )\n input_keys = list(subset.get_partition_keys())\n\n # This check represents a temporary constraint that prevents huge query results for upstream\n # partition data versions from timing out runs. If a partitioned dependency (a) uses an\n # AllPartitionMapping; and (b) has greater than or equal to\n # SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD dependency partitions, then we\n # process it as a non-partitioned dependency (note that this was the behavior for\n # all partition dependencies prior to 2023-08). This means that stale status\n # results cannot be accurately computed for the dependency, and there is thus\n # corresponding logic in the CachingStaleStatusResolver to account for this. 
This\n # constraint should be removed when we have thoroughly examined the performance of\n # the data version retrieval query and can guarantee decent performance.\n if len(input_keys) < SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD:\n data_version = self._get_partitions_data_version_from_keys(key, input_keys)\n else:\n data_version = extract_data_version_from_entry(event.event_log_entry)\n else:\n data_version = extract_data_version_from_entry(event.event_log_entry)\n self._input_asset_version_info[key] = InputAssetVersionInfo(\n storage_id, data_version, event.run_id, event.timestamp\n )\n\n def partition_mapping_for_input(self, input_name: str) -> Optional[PartitionMapping]:\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n if upstream_asset_key:\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n assets_def = asset_layer.assets_def_for_node(self.node_handle)\n partitions_def = assets_def.partitions_def if assets_def else None\n explicit_partition_mapping = self.job_def.asset_layer.partition_mapping_for_node_input(\n self.node_handle, upstream_asset_key\n )\n return infer_partition_mapping(\n explicit_partition_mapping,\n partitions_def,\n upstream_asset_partitions_def,\n )\n else:\n return None\n\n def _get_input_asset_event(self, key: AssetKey) -> Optional["EventLogRecord"]:\n event = self.instance.get_latest_data_version_record(key)\n if event:\n self._check_input_asset_event(key, event)\n return event\n\n def _check_input_asset_event(self, key: AssetKey, event: "EventLogRecord") -> None:\n assert event.event_log_entry\n event_data_version = extract_data_version_from_entry(event.event_log_entry)\n if key in self._data_version_cache and self._data_version_cache[key] != event_data_version:\n self.log.warning(\n f"Data version mismatch for asset {key}. 
Data version from materialization within"\n f" current step is `{self._data_version_cache[key]}`. Data version from most recent"\n f" materialization is `{event_data_version}`. Most recent materialization will be"\n " used for provenance tracking."\n )\n\n def _get_partitions_data_version_from_keys(\n self, key: AssetKey, partition_keys: Sequence[str]\n ) -> "DataVersion":\n from dagster._core.definitions.data_version import (\n DataVersion,\n )\n from dagster._core.events import DagsterEventType\n\n # TODO: this needs to account for observations also\n event_type = DagsterEventType.ASSET_MATERIALIZATION\n tags_by_partition = (\n self.instance._event_storage.get_latest_tags_by_partition( # noqa: SLF001\n key, event_type, [DATA_VERSION_TAG], asset_partitions=list(partition_keys)\n )\n )\n partition_data_versions = [\n pair[1][DATA_VERSION_TAG]\n for pair in sorted(tags_by_partition.items(), key=lambda x: x[0])\n ]\n hash_sig = sha256()\n hash_sig.update(bytearray("".join(partition_data_versions), "utf8"))\n return DataVersion(hash_sig.hexdigest())\n\n # Call this to clear the cache for an input asset record. This is necessary when an old\n # materialization for an asset was loaded during `fetch_external_input_asset_records` because an\n # intrastep asset is not required, but then that asset is materialized during the step. 
If we\n # don't clear the cache for this asset, then we won't use the most up-to-date asset record.\n def wipe_input_asset_version_info(self, key: AssetKey) -> None:\n if key in self._input_asset_version_info:\n del self._input_asset_version_info[key]\n\n def get_output_asset_keys(self) -> AbstractSet[AssetKey]:\n output_keys: Set[AssetKey] = set()\n for step_output in self.step.step_outputs:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, step_output.name\n )\n if asset_info is None or not asset_info.is_required:\n continue\n output_keys.add(asset_info.key)\n return output_keys\n\n def has_asset_partitions_for_input(self, input_name: str) -> bool:\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n return (\n upstream_asset_key is not None\n and asset_layer.partitions_def_for_asset(upstream_asset_key) is not None\n )\n\n def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n subset = self.asset_partitions_subset_for_input(input_name)\n partition_key_ranges = subset.get_partition_key_ranges(\n dynamic_partitions_store=self.instance\n )\n\n if len(partition_key_ranges) != 1:\n check.failed(\n "Tried to access asset partition key range, but there are "\n f"({len(partition_key_ranges)}) key ranges associated with this input.",\n )\n\n return partition_key_ranges[0]\n\n def asset_partitions_subset_for_input(\n self, input_name: str, *, require_valid_partitions: bool = True\n ) -> PartitionsSubset:\n asset_layer = self.job_def.asset_layer\n assets_def = asset_layer.assets_def_for_node(self.node_handle)\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n if upstream_asset_key is not None:\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n\n if upstream_asset_partitions_def is not None:\n partitions_def = assets_def.partitions_def if assets_def else 
None\n partitions_subset = (\n partitions_def.empty_subset().with_partition_key_range(\n self.asset_partition_key_range, dynamic_partitions_store=self.instance\n )\n if partitions_def\n else None\n )\n partition_mapping = infer_partition_mapping(\n asset_layer.partition_mapping_for_node_input(\n self.node_handle, upstream_asset_key\n ),\n partitions_def,\n upstream_asset_partitions_def,\n )\n mapped_partitions_result = (\n partition_mapping.get_upstream_mapped_partitions_result_for_partitions(\n partitions_subset,\n upstream_asset_partitions_def,\n dynamic_partitions_store=self.instance,\n )\n )\n\n if (\n require_valid_partitions\n and mapped_partitions_result.required_but_nonexistent_partition_keys\n ):\n raise DagsterInvariantViolationError(\n f"Partition key range {self.asset_partition_key_range} in"\n f" {self.node_handle.name} depends on invalid partition keys"\n f" {mapped_partitions_result.required_but_nonexistent_partition_keys} in"\n f" upstream asset {upstream_asset_key}"\n )\n\n return mapped_partitions_result.partitions_subset\n\n check.failed("The input has no asset partitions")\n\n def asset_partition_key_for_input(self, input_name: str) -> str:\n start, end = self.asset_partition_key_range_for_input(input_name)\n if start == end:\n return start\n else:\n check.failed(\n f"Tried to access partition key for input '{input_name}' of step '{self.step.key}',"\n f" but the step input has a partition range: '{start}' to '{end}'."\n )\n\n def _partitions_def_for_output(self, output_name: str) -> Optional[PartitionsDefinition]:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n node_handle=self.node_handle, output_name=output_name\n )\n if asset_info:\n return asset_info.partitions_def\n else:\n return None\n\n def partitions_def_for_output(self, output_name: str) -> Optional[PartitionsDefinition]:\n return self._partitions_def_for_output(output_name)\n\n def has_asset_partitions_for_output(self, output_name: str) -> bool:\n return 
self._partitions_def_for_output(output_name) is not None\n\n def asset_partition_key_range_for_output(self, output_name: str) -> PartitionKeyRange:\n if self._partitions_def_for_output(output_name) is not None:\n return self.asset_partition_key_range\n\n check.failed("The output has no asset partitions")\n\n def asset_partition_key_for_output(self, output_name: str) -> str:\n start, end = self.asset_partition_key_range_for_output(output_name)\n if start == end:\n return start\n else:\n check.failed(\n f"Tried to access partition key for output '{output_name}' of step"\n f" '{self.step.key}', but the step output has a partition range: '{start}' to"\n f" '{end}'."\n )\n\n def asset_partitions_time_window_for_output(self, output_name: str) -> TimeWindow:\n """The time window for the partitions of the asset correponding to the given output.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n partitions_def = self._partitions_def_for_output(output_name)\n\n if not partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an output that does not correspond to a "\n "partitioned asset."\n )\n\n if not has_one_dimension_time_window_partitioning(partitions_def):\n raise ValueError(\n "Tried to get asset partitions for an output that correponds to a partitioned "\n "asset that is not time-partitioned."\n )\n\n partitions_def = cast(\n Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def\n )\n partition_key_range = self.asset_partition_key_range_for_output(output_name)\n return TimeWindow(\n # mypy thinks partitions_def is <nothing> here because ????\n partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n )\n\n def 
asset_partitions_time_window_for_input(self, input_name: str) -> TimeWindow:\n """The time window for the partitions of the asset correponding to the given input.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n if upstream_asset_key is None:\n raise ValueError("The input has no corresponding asset")\n\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n\n if not upstream_asset_partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an input that does not correspond to a "\n "partitioned asset."\n )\n\n if not has_one_dimension_time_window_partitioning(upstream_asset_partitions_def):\n raise ValueError(\n "Tried to get asset partitions for an input that correponds to a partitioned "\n "asset that is not time-partitioned."\n )\n\n upstream_asset_partitions_def = cast(\n Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition],\n upstream_asset_partitions_def,\n )\n partition_key_range = self.asset_partition_key_range_for_input(input_name)\n\n return TimeWindow(\n upstream_asset_partitions_def.time_window_for_partition_key(\n partition_key_range.start\n ).start,\n upstream_asset_partitions_def.time_window_for_partition_key(\n partition_key_range.end\n ).end,\n )\n\n def get_type_loader_context(self) -> "DagsterTypeLoaderContext":\n return DagsterTypeLoaderContext(\n plan_data=self.plan_data,\n execution_data=self._execution_data,\n log_manager=self._log_manager,\n step=self.step,\n output_capture=self._output_capture,\n known_state=self._known_state,\n )\n\n def output_observes_source_asset(self, output_name: str) -> bool:\n """Returns True if this step observes a source 
asset."""\n asset_layer = self.job_def.asset_layer\n if asset_layer is None:\n return False\n asset_key = asset_layer.asset_key_for_output(self.node_handle, output_name)\n if asset_key is None:\n return False\n return asset_layer.is_observable_for_asset(asset_key)
\n\n\n
[docs]class TypeCheckContext:\n """The ``context`` object available to a type check function on a DagsterType."""\n\n def __init__(\n self,\n run_id: str,\n log_manager: DagsterLogManager,\n scoped_resources_builder: ScopedResourcesBuilder,\n dagster_type: DagsterType,\n ):\n self._run_id = run_id\n self._log = log_manager\n self._resources = scoped_resources_builder.build(dagster_type.required_resource_keys)\n\n @public\n @property\n def resources(self) -> "Resources":\n """An object whose attributes contain the resources available to this op."""\n return self._resources\n\n @public\n @property\n def run_id(self) -> str:\n """The id of this job run."""\n return self._run_id\n\n @public\n @property\n def log(self) -> DagsterLogManager:\n """Centralized log dispatch from user code."""\n return self._log
\n\n\n
[docs]class DagsterTypeLoaderContext(StepExecutionContext):\n """The context object provided to a :py:class:`@dagster_type_loader <dagster_type_loader>`-decorated function during execution.\n\n Users should not construct this object directly.\n """\n\n @public\n @property\n def resources(self) -> "Resources":\n """The resources available to the type loader, specified by the `required_resource_keys` argument of the decorator."""\n return super(DagsterTypeLoaderContext, self).resources\n\n @public\n @property\n def job_def(self) -> "JobDefinition":\n """The underlying job definition being executed."""\n return super(DagsterTypeLoaderContext, self).job_def\n\n @public\n @property\n def op_def(self) -> "OpDefinition":\n """The op for which type loading is occurring."""\n return super(DagsterTypeLoaderContext, self).op_def
\n
", "current_page_name": "_modules/dagster/_core/execution/context/system", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.system"}}, "execute_in_process_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.execute_in_process_result

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import JobDefinition, NodeHandle\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nfrom .execution_result import ExecutionResult\n\n\n
[docs]class ExecuteInProcessResult(ExecutionResult):\n """Result object returned by in-process testing APIs.\n\n Users should not instantiate this object directly. Used for retrieving run success, events, and outputs from execution methods that return this object.\n\n This object is returned by:\n - :py:meth:`dagster.GraphDefinition.execute_in_process`\n - :py:meth:`dagster.JobDefinition.execute_in_process`\n - :py:meth:`dagster.materialize_to_memory`\n - :py:meth:`dagster.materialize`\n """\n\n _handle: NodeHandle\n _event_list: Sequence[DagsterEvent]\n _dagster_run: DagsterRun\n _output_capture: Mapping[StepOutputHandle, Any]\n _job_def: JobDefinition\n\n def __init__(\n self,\n event_list: Sequence[DagsterEvent],\n dagster_run: DagsterRun,\n output_capture: Optional[Mapping[StepOutputHandle, Any]],\n job_def: JobDefinition,\n ):\n self._job_def = job_def\n\n self._event_list = event_list\n self._dagster_run = dagster_run\n\n self._output_capture = check.opt_mapping_param(\n output_capture, "output_capture", key_type=StepOutputHandle\n )\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The job definition that was executed."""\n return self._job_def\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: The Dagster run that was executed."""\n return self._dagster_run\n\n @public\n @property\n def all_events(self) -> Sequence[DagsterEvent]:\n """List[DagsterEvent]: All dagster events emitted during execution."""\n return self._event_list\n\n @public\n @property\n def run_id(self) -> str:\n """str: The run ID of the executed :py:class:`DagsterRun`."""\n return self.dagster_run.run_id\n\n def _get_output_for_handle(self, handle: NodeHandle, output_name: str) -> Any:\n mapped_outputs = {}\n step_key = str(handle)\n output_found = False\n for step_output_handle, value in self._output_capture.items():\n # For the mapped output case, where step keys are in the format\n # "step_key[upstream_mapped_output_name]" 
within the step output handle.\n if (\n step_output_handle.step_key.startswith(f"{step_key}[")\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n key_start = step_output_handle.step_key.find("[")\n key_end = step_output_handle.step_key.find("]")\n upstream_mapped_output_name = step_output_handle.step_key[key_start + 1 : key_end]\n mapped_outputs[upstream_mapped_output_name] = value\n\n # For all other cases, search for exact match.\n elif (\n step_key == step_output_handle.step_key\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n if not step_output_handle.mapping_key:\n return self._output_capture[step_output_handle]\n mapped_outputs[step_output_handle.mapping_key] = value\n\n if not output_found:\n raise DagsterInvariantViolationError(\n f"No outputs found for output '{output_name}' from node '{handle}'."\n )\n return mapped_outputs\n\n
[docs] @public\n def output_for_node(self, node_str: str, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the in-process run of the job.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(ExecuteInProcessResult, self).output_for_node(\n node_str, output_name=output_name\n )
\n\n
[docs] @public\n def asset_value(self, asset_key: CoercibleToAssetKey) -> Any:\n """Retrieves the value of an asset that was materialized during the execution of the job.\n\n Args:\n asset_key (CoercibleToAssetKey): The key of the asset to retrieve.\n\n Returns:\n Any: The value of the retrieved asset.\n """\n node_output_handle = self._job_def.asset_layer.node_output_handle_for_asset(\n AssetKey.from_coercible(asset_key)\n )\n return self.output_for_node(\n node_str=str(node_output_handle.node_handle), output_name=node_output_handle.output_name\n )
\n\n
[docs] @public\n def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(ExecuteInProcessResult, self).output_value(output_name=output_name)
\n
", "current_page_name": "_modules/dagster/_core/execution/execute_in_process_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.execute_in_process_result"}, "job_execution_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.job_execution_result

\nfrom typing import Any, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import JobDefinition, NodeHandle\nfrom dagster._core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.plan.utils import build_resources_for_manager\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nfrom .execution_result import ExecutionResult\n\n\n
[docs]class JobExecutionResult(ExecutionResult):\n """Result object returned by :py:func:`dagster.execute_job`.\n\n Used for retrieving run success, events, and outputs from `execute_job`.\n Users should not directly instantiate this class.\n\n Events and run information can be retrieved off of the object directly. In\n order to access outputs, the `ExecuteJobResult` object needs to be opened\n as a context manager, which will re-initialize the resources from\n execution.\n """\n\n def __init__(self, job_def, reconstruct_context, event_list, dagster_run):\n self._job_def = job_def\n self._reconstruct_context = reconstruct_context\n self._context = None\n self._event_list = event_list\n self._dagster_run = dagster_run\n\n def __enter__(self) -> "JobExecutionResult":\n context = self._reconstruct_context.__enter__()\n self._context = context\n return self\n\n def __exit__(self, *exc):\n exit_result = self._reconstruct_context.__exit__(*exc)\n self._context = None\n return exit_result\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The job definition that was executed."""\n return self._job_def\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: The Dagster run that was executed."""\n return self._dagster_run\n\n @public\n @property\n def all_events(self) -> Sequence[DagsterEvent]:\n """Sequence[DagsterEvent]: List of all events yielded by the job execution."""\n return self._event_list\n\n @public\n @property\n def run_id(self) -> str:\n """str: The id of the Dagster run that was executed."""\n return self.dagster_run.run_id\n\n
[docs] @public\n def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n In order to use this method, the `ExecuteJobResult` object must be opened as a context manager. If this method is used without opening the context manager, it will result in a :py:class:`DagsterInvariantViolationError`. If the top-level job has no output, calling this method will also result in a :py:class:`DagsterInvariantViolationError`.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(JobExecutionResult, self).output_value(output_name=output_name)
\n\n
[docs] @public\n def output_for_node(self, node_str: str, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the run of the job.\n\n In order to use this method, the `ExecuteJobResult` object must be opened as a context manager. If this method is used without opening the context manager, it will result in a :py:class:`DagsterInvariantViolationError`.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(JobExecutionResult, self).output_for_node(node_str, output_name=output_name)
\n\n def _get_output_for_handle(self, handle: NodeHandle, output_name: str) -> Any:\n if not self._context:\n raise DagsterInvariantViolationError(\n "In order to access output objects, the result of `execute_job` must be opened as a"\n " context manager: 'with execute_job(...) as result:"\n )\n found = False\n result = None\n for compute_step_event in self.compute_events_for_handle(handle):\n if (\n compute_step_event.is_successful_output\n and compute_step_event.step_output_data.output_name == output_name\n ):\n found = True\n output = compute_step_event.step_output_data\n step = self._context.execution_plan.get_step_by_key(compute_step_event.step_key)\n dagster_type = (\n self.job_def.get_node(handle).output_def_named(output_name).dagster_type\n )\n value = self._get_value(self._context.for_step(step), output, dagster_type)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if result is None:\n result = {mapping_key: value}\n else:\n result[mapping_key] = (\n value # pylint:disable=unsupported-assignment-operation\n )\n else:\n result = value\n\n if found:\n return result\n\n node = self.job_def.get_node(handle)\n raise DagsterInvariantViolationError(\n f"Did not find result {output_name} in {node.describe_node()}"\n )\n\n def _get_value(self, context, step_output_data, dagster_type):\n step_output_handle = step_output_data.step_output_handle\n manager = context.get_io_manager(step_output_handle)\n manager_key = context.execution_plan.get_manager_key(step_output_handle, self.job_def)\n res = manager.load_input(\n context.for_input_manager(\n name=None,\n config=None,\n metadata=None,\n dagster_type=dagster_type,\n source_handle=step_output_handle,\n resource_config=context.resolved_run_config.resources[manager_key].config,\n resources=build_resources_for_manager(manager_key, context),\n 
)\n )\n return res
\n
", "current_page_name": "_modules/dagster/_core/execution/job_execution_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.job_execution_result"}, "validate_run_config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.validate_run_config

\nfrom typing import Any, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions import JobDefinition\nfrom dagster._core.definitions.run_config import RunConfig, convert_config_input\nfrom dagster._core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]def validate_run_config(\n job_def: JobDefinition,\n run_config: Optional[Union[Mapping[str, Any], RunConfig]] = None,\n) -> Mapping[str, Any]:\n """Function to validate a provided run config blob against a given job.\n\n If validation is successful, this function will return a dictionary representation of the\n validated config actually used during execution.\n\n Args:\n job_def (JobDefinition): The job definition to validate run\n config against\n run_config (Optional[Dict[str, Any]]): The run config to validate\n\n Returns:\n Dict[str, Any]: A dictionary representation of the validated config.\n """\n check.inst_param(job_def, "job_def", JobDefinition)\n run_config = check.opt_mapping_param(\n convert_config_input(run_config), "run_config", key_type=str\n )\n\n return ResolvedRunConfig.build(job_def, run_config).to_dict()
\n
", "current_page_name": "_modules/dagster/_core/execution/validate_run_config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.validate_run_config"}, "with_resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.with_resources

\nfrom typing import Any, Iterable, List, Mapping, Optional, Sequence, TypeVar, cast\n\nfrom dagster import _check as check\nfrom dagster._core.execution.build_resources import wrap_resources_for_execution\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..._config import Shape\nfrom ..definitions.resource_requirement import ResourceAddable\nfrom ..definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom ..errors import DagsterInvalidConfigError, DagsterInvalidInvocationError\n\nT = TypeVar("T", bound=ResourceAddable)\n\n\n
[docs]def with_resources(\n definitions: Iterable[T],\n resource_defs: Mapping[str, object],\n resource_config_by_key: Optional[Mapping[str, Any]] = None,\n) -> Sequence[T]:\n """Adds dagster resources to copies of resource-requiring dagster definitions.\n\n An error will be thrown if any provided definitions have a conflicting\n resource definition provided for a key provided to resource_defs. Resource\n config can be provided, with keys in the config dictionary corresponding to\n the keys for each resource definition. If any definition has unsatisfied\n resource keys after applying with_resources, an error will be thrown.\n\n Args:\n definitions (Iterable[ResourceAddable]): Dagster definitions to provide resources to.\n resource_defs (Mapping[str, object]):\n Mapping of resource keys to objects to satisfy\n resource requirements of provided dagster definitions.\n resource_config_by_key (Optional[Mapping[str, Any]]):\n Specifies config for provided resources. The key in this dictionary\n corresponds to configuring the same key in the resource_defs\n dictionary.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import asset, resource, with_resources\n\n @resource(config_schema={"bar": str})\n def foo_resource():\n ...\n\n @asset(required_resource_keys={"foo"})\n def asset1(context):\n foo = context.resources.foo\n ...\n\n @asset(required_resource_keys={"foo"})\n def asset2(context):\n foo = context.resources.foo\n ...\n\n asset1_with_foo, asset2_with_foo = with_resources(\n [the_asset, other_asset],\n resource_config_by_key={\n "foo": {\n "config": {"bar": ...}\n }\n }\n )\n """\n from dagster._config import validate_config\n from dagster._core.definitions.job_definition import (\n default_job_io_manager_with_fs_io_manager_schema,\n )\n\n check.mapping_param(resource_defs, "resource_defs")\n resource_config_by_key = check.opt_mapping_param(\n resource_config_by_key, "resource_config_by_key"\n )\n\n resource_defs = wrap_resources_for_execution(\n merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: default_job_io_manager_with_fs_io_manager_schema},\n resource_defs,\n )\n )\n\n for key, resource_def in resource_defs.items():\n if key in resource_config_by_key:\n resource_config = resource_config_by_key[key]\n if not isinstance(resource_config, dict) or "config" not in resource_config:\n raise DagsterInvalidInvocationError(\n f"Error with config for resource key '{key}': Expected a "\n "dictionary of the form {'config': ...}, but received "\n f"{resource_config}"\n )\n\n outer_config_shape = Shape({"config": resource_def.get_config_field()})\n config_evr = validate_config(outer_config_shape, resource_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error when applying config for resource with key '{key}' ",\n config_evr.errors,\n resource_config,\n )\n resource_defs[key] = resource_defs[key].configured(resource_config["config"])\n\n transformed_defs: List[T] = []\n for definition in definitions:\n transformed_defs.append(cast(T, definition.with_resources(resource_defs)))\n\n return transformed_defs
\n
", "current_page_name": "_modules/dagster/_core/execution/with_resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.with_resources"}}, "executor": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.executor.base

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Iterator\n\nfrom dagster._annotations import public\nfrom dagster._core.execution.retries import RetryMode\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import PlanOrchestrationContext\n    from dagster._core.execution.plan.plan import ExecutionPlan\n\n\n
[docs]class Executor(ABC):\n
[docs] @public\n @abstractmethod\n def execute(\n self, plan_context: "PlanOrchestrationContext", execution_plan: "ExecutionPlan"\n ) -> Iterator["DagsterEvent"]:\n """For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.\n\n Args:\n plan_context (PlanOrchestrationContext): The plan's orchestration context.\n execution_plan (ExecutionPlan): The plan to execute.\n\n Returns:\n A stream of dagster events.\n """
\n\n @public\n @property\n @abstractmethod\n def retries(self) -> RetryMode:\n """Whether retries are enabled or disabled for this instance of the executor.\n\n Executors should allow this to be controlled via configuration if possible.\n\n Returns: RetryMode\n """
\n
", "current_page_name": "_modules/dagster/_core/executor/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.executor.base"}, "init": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.executor.init

\nfrom typing import Mapping, NamedTuple\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions import ExecutorDefinition, IJob\nfrom dagster._core.instance import DagsterInstance\n\n\n
[docs]class InitExecutorContext(\n NamedTuple(\n "InitExecutorContext",\n [\n ("job", PublicAttr[IJob]),\n ("executor_def", PublicAttr[ExecutorDefinition]),\n ("executor_config", PublicAttr[Mapping[str, object]]),\n ("instance", PublicAttr[DagsterInstance]),\n ],\n )\n):\n """Executor-specific initialization context.\n\n Attributes:\n job (IJob): The job to be executed.\n executor_def (ExecutorDefinition): The definition of the executor currently being\n constructed.\n executor_config (dict): The parsed config passed to the executor.\n instance (DagsterInstance): The current instance.\n """\n\n def __new__(\n cls,\n job: IJob,\n executor_def: ExecutorDefinition,\n executor_config: Mapping[str, object],\n instance: DagsterInstance,\n ):\n return super(InitExecutorContext, cls).__new__(\n cls,\n job=check.inst_param(job, "job", IJob),\n executor_def=check.inst_param(executor_def, "executor_def", ExecutorDefinition),\n executor_config=check.mapping_param(executor_config, "executor_config", key_type=str),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )
\n
", "current_page_name": "_modules/dagster/_core/executor/init", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.executor.init"}}, "instance": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance

\nimport logging\nimport logging.config\nimport os\nimport sys\nimport time\nimport weakref\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom tempfile import TemporaryDirectory\nfrom types import TracebackType\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport yaml\nfrom typing_extensions import Protocol, Self, TypeAlias, TypeVar, runtime_checkable\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.data_version import extract_data_provenance_from_entry\nfrom dagster._core.definitions.events import AssetKey, AssetObservation\nfrom dagster._core.errors import (\n    DagsterHomeNotSetError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunConflict,\n)\nfrom dagster._core.log_manager import DagsterLogRecord\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import (\n    IN_PROGRESS_RUN_STATUSES,\n    DagsterRun,\n    DagsterRunStatsSnapshot,\n    DagsterRunStatus,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster._core.storage.tags import (\n    ASSET_PARTITION_RANGE_END_TAG,\n    ASSET_PARTITION_RANGE_START_TAG,\n    PARENT_RUN_ID_TAG,\n    PARTITION_NAME_TAG,\n    RESUME_RETRY_TAG,\n    ROOT_RUN_ID_TAG,\n)\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._seven import get_current_datetime_in_utc\nfrom dagster._utils import PrintFn, traced\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.merger import 
merge_dicts\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    experimental_warning,\n)\n\nfrom .config import (\n    DAGSTER_CONFIG_YAML_FILENAME,\n    DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT,\n    get_default_tick_retention_settings,\n    get_tick_retention_settings,\n)\nfrom .ref import InstanceRef\n\n# 'airflow_execution_date' and 'is_airflow_ingest_pipeline' are hardcoded tags used in the\n# airflow ingestion logic (see: dagster_pipeline_factory.py). 'airflow_execution_date' stores the\n# 'execution_date' used in Airflow operator execution and 'is_airflow_ingest_pipeline' determines\n# whether 'airflow_execution_date' is needed.\n# https://github.com/dagster-io/dagster/issues/2403\nAIRFLOW_EXECUTION_DATE_STR = "airflow_execution_date"\nIS_AIRFLOW_INGEST_PIPELINE_STR = "is_airflow_ingest_pipeline"\n\n# Our internal guts can handle empty strings for job name and run id\n# However making these named constants for documentation, to encode where we are making the assumption,\n# and to allow us to change this more easily in the future, provided we are disciplined about\n# actually using this constants.\nRUNLESS_RUN_ID = ""\nRUNLESS_JOB_NAME = ""\n\nif TYPE_CHECKING:\n    from dagster._core.debug import DebugRunPayload\n    from dagster._core.definitions.asset_check_spec import AssetCheckKey\n    from dagster._core.definitions.job_definition import (\n        JobDefinition,\n    )\n    from dagster._core.definitions.partition import PartitionsDefinition\n    from dagster._core.definitions.repository_definition.repository_definition import (\n        RepositoryLoadData,\n    )\n    from dagster._core.definitions.run_request import InstigatorType\n    from dagster._core.event_api import EventHandlerFn\n    from dagster._core.events import (\n        AssetMaterialization,\n        DagsterEvent,\n        DagsterEventType,\n        EngineEventData,\n    )\n    from dagster._core.events.log import EventLogEntry\n    from 
dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.execution.plan.resume_retry import ReexecutionStrategy\n    from dagster._core.execution.stats import RunStepKeyStatsSnapshot\n    from dagster._core.host_representation import (\n        CodeLocation,\n        ExternalJob,\n        ExternalJobOrigin,\n        ExternalSensor,\n        HistoricalJob,\n    )\n    from dagster._core.host_representation.external import ExternalSchedule\n    from dagster._core.launcher import RunLauncher\n    from dagster._core.run_coordinator import RunCoordinator\n    from dagster._core.scheduler import Scheduler, SchedulerDebugInfo\n    from dagster._core.scheduler.instigation import (\n        InstigatorState,\n        InstigatorStatus,\n        InstigatorTick,\n        TickData,\n        TickStatus,\n    )\n    from dagster._core.secrets import SecretsLoader\n    from dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\n    from dagster._core.storage.asset_check_execution_record import AssetCheckInstanceSupport\n    from dagster._core.storage.compute_log_manager import ComputeLogManager\n    from dagster._core.storage.daemon_cursor import DaemonCursorStorage\n    from dagster._core.storage.event_log import EventLogStorage\n    from dagster._core.storage.event_log.base import (\n        AssetRecord,\n        EventLogConnection,\n        EventLogRecord,\n        EventRecordsFilter,\n    )\n    from dagster._core.storage.partition_status_cache import (\n        AssetPartitionStatus,\n        AssetStatusCacheValue,\n    )\n    from dagster._core.storage.root import LocalArtifactStorage\n    from dagster._core.storage.runs import RunStorage\n    from dagster._core.storage.schedules import ScheduleStorage\n    from dagster._core.storage.sql import AlembicVersion\n    from dagster._core.workspace.workspace import IWorkspace\n    from dagster._daemon.types import 
DaemonHeartbeat, DaemonStatus\n\n\nDagsterInstanceOverrides: TypeAlias = Mapping[str, Any]\n\n\ndef _check_run_equality(\n    pipeline_run: DagsterRun, candidate_run: DagsterRun\n) -> Mapping[str, Tuple[Any, Any]]:\n    field_diff: Dict[str, Tuple[Any, Any]] = {}\n    for field in pipeline_run._fields:\n        expected_value = getattr(pipeline_run, field)\n        candidate_value = getattr(candidate_run, field)\n        if expected_value != candidate_value:\n            field_diff[field] = (expected_value, candidate_value)\n\n    return field_diff\n\n\ndef _format_field_diff(field_diff: Mapping[str, Tuple[Any, Any]]) -> str:\n    return "\\n".join(\n        [\n            (\n                "    {field_name}:\\n"\n                + "        Expected: {expected_value}\\n"\n                + "        Received: {candidate_value}"\n            ).format(\n                field_name=field_name,\n                expected_value=expected_value,\n                candidate_value=candidate_value,\n            )\n            for field_name, (\n                expected_value,\n                candidate_value,\n            ) in field_diff.items()\n        ]\n    )\n\n\nclass _EventListenerLogHandler(logging.Handler):\n    def __init__(self, instance: "DagsterInstance"):\n        self._instance = instance\n        super(_EventListenerLogHandler, self).__init__()\n\n    def emit(self, record: DagsterLogRecord) -> None:\n        from dagster._core.events import EngineEventData\n        from dagster._core.events.log import StructuredLoggerMessage, construct_event_record\n\n        event = construct_event_record(\n            StructuredLoggerMessage(\n                name=record.name,\n                message=record.msg,\n                level=record.levelno,\n                meta=record.dagster_meta,  # type: ignore\n                record=record,\n            )\n        )\n\n        try:\n            self._instance.handle_new_event(event)\n        except Exception as e:\n           
 sys.stderr.write(f"Exception while writing logger call to event log: {e}\\n")\n            if event.dagster_event:\n                # Swallow user-generated log failures so that the entire step/run doesn't fail, but\n                # raise failures writing system-generated log events since they are the source of\n                # truth for the state of the run\n                raise\n            elif event.run_id:\n                self._instance.report_engine_event(\n                    "Exception while writing logger call to event log",\n                    job_name=event.job_name,\n                    run_id=event.run_id,\n                    step_key=event.step_key,\n                    engine_event_data=EngineEventData(\n                        error=serializable_error_info_from_exc_info(sys.exc_info()),\n                    ),\n                )\n\n\nclass InstanceType(Enum):\n    PERSISTENT = "PERSISTENT"\n    EPHEMERAL = "EPHEMERAL"\n\n\nT_DagsterInstance = TypeVar("T_DagsterInstance", bound="DagsterInstance", default="DagsterInstance")\n\n\nclass MayHaveInstanceWeakref(Generic[T_DagsterInstance]):\n    """Mixin for classes that can have a weakref back to a Dagster instance."""\n\n    _instance_weakref: "Optional[weakref.ReferenceType[T_DagsterInstance]]"\n\n    def __init__(self):\n        self._instance_weakref = None\n\n    @property\n    def has_instance(self) -> bool:\n        return hasattr(self, "_instance_weakref") and (self._instance_weakref is not None)\n\n    @property\n    def _instance(self) -> T_DagsterInstance:\n        instance = (\n            self._instance_weakref()\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            if (hasattr(self, "_instance_weakref") and self._instance_weakref is not None)\n            else None\n        )\n        if instance is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to 
resolve undefined DagsterInstance weakref."\n            )\n        else:\n            return instance\n\n    def register_instance(self, instance: T_DagsterInstance) -> None:\n        check.invariant(\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            (not hasattr(self, "_instance_weakref") or self._instance_weakref is None),\n            "Must only call initialize once",\n        )\n\n        # Store a weakref to avoid a circular reference / enable GC\n        self._instance_weakref = weakref.ref(instance)\n\n\n@runtime_checkable\nclass DynamicPartitionsStore(Protocol):\n    @abstractmethod\n    def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]: ...\n\n    @abstractmethod\n    def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool: ...\n\n\n
[docs]class DagsterInstance(DynamicPartitionsStore):\n """Core abstraction for managing Dagster's access to storage and other resources.\n\n Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\n the values in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Alternatively, DagsterInstance.ephemeral() can use used which provides a set of\n transient in-memory components.\n\n Configuration of this class should be done by setting values in ``$DAGSTER_HOME/dagster.yaml``.\n For example, to use Postgres for dagster storage, you can write a ``dagster.yaml`` such as the\n following:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :language: YAML\n\n Args:\n instance_type (InstanceType): Indicates whether the instance is ephemeral or persistent.\n Users should not attempt to set this value directly or in their ``dagster.yaml`` files.\n local_artifact_storage (LocalArtifactStorage): The local artifact storage is used to\n configure storage for any artifacts that require a local disk, such as schedules, or\n when using the filesystem system storage to manage files and intermediates. By default,\n this will be a :py:class:`dagster._core.storage.root.LocalArtifactStorage`. Configurable\n in ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass`\n machinery.\n run_storage (RunStorage): The run storage is used to store metadata about ongoing and past\n pipeline runs. By default, this will be a\n :py:class:`dagster._core.storage.runs.SqliteRunStorage`. Configurable in ``dagster.yaml``\n using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n event_storage (EventLogStorage): Used to store the structured event logs generated by\n pipeline runs. By default, this will be a\n :py:class:`dagster._core.storage.event_log.SqliteEventLogStorage`. 
Configurable in\n ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n compute_log_manager (Optional[ComputeLogManager]): The compute log manager handles stdout\n and stderr logging for op compute functions. By default, this will be a\n :py:class:`dagster._core.storage.local_compute_log_manager.LocalComputeLogManager`.\n Configurable in ``dagster.yaml`` using the\n :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n run_coordinator (Optional[RunCoordinator]): A runs coordinator may be used to manage the execution\n of pipeline runs.\n run_launcher (Optional[RunLauncher]): Optionally, a run launcher may be used to enable\n a Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\n addition to running them locally.\n settings (Optional[Dict]): Specifies certain per-instance settings,\n such as feature flags. These are set in the ``dagster.yaml`` under a set of whitelisted\n keys.\n ref (Optional[InstanceRef]): Used by internal machinery to pass instances across process\n boundaries.\n """\n\n # Stores TemporaryDirectory instances that were created for DagsterInstance.local_temp() calls\n # to be removed once the instance is garbage collected.\n _TEMP_DIRS: "weakref.WeakKeyDictionary[DagsterInstance, TemporaryDirectory]" = (\n weakref.WeakKeyDictionary()\n )\n\n def __init__(\n self,\n instance_type: InstanceType,\n local_artifact_storage: "LocalArtifactStorage",\n run_storage: "RunStorage",\n event_storage: "EventLogStorage",\n run_coordinator: Optional["RunCoordinator"],\n compute_log_manager: Optional["ComputeLogManager"],\n run_launcher: Optional["RunLauncher"],\n scheduler: Optional["Scheduler"] = None,\n schedule_storage: Optional["ScheduleStorage"] = None,\n settings: Optional[Mapping[str, Any]] = None,\n secrets_loader: Optional["SecretsLoader"] = None,\n ref: Optional[InstanceRef] = None,\n **_kwargs: Any, # we accept kwargs for forward-compat of custom instances\n ):\n from 
dagster._core.launcher import RunLauncher\n from dagster._core.run_coordinator import RunCoordinator\n from dagster._core.scheduler import Scheduler\n from dagster._core.secrets import SecretsLoader\n from dagster._core.storage.captured_log_manager import CapturedLogManager\n from dagster._core.storage.compute_log_manager import ComputeLogManager\n from dagster._core.storage.event_log import EventLogStorage\n from dagster._core.storage.root import LocalArtifactStorage\n from dagster._core.storage.runs import RunStorage\n from dagster._core.storage.schedules import ScheduleStorage\n\n self._instance_type = check.inst_param(instance_type, "instance_type", InstanceType)\n self._local_artifact_storage = check.inst_param(\n local_artifact_storage, "local_artifact_storage", LocalArtifactStorage\n )\n self._event_storage = check.inst_param(event_storage, "event_storage", EventLogStorage)\n self._event_storage.register_instance(self)\n\n self._run_storage = check.inst_param(run_storage, "run_storage", RunStorage)\n self._run_storage.register_instance(self)\n\n if compute_log_manager:\n self._compute_log_manager = check.inst_param(\n compute_log_manager, "compute_log_manager", ComputeLogManager\n )\n if not isinstance(self._compute_log_manager, CapturedLogManager):\n deprecation_warning(\n "ComputeLogManager",\n "1.2.0",\n "Implement the CapturedLogManager interface instead.",\n )\n self._compute_log_manager.register_instance(self)\n else:\n check.invariant(\n ref, "Compute log manager must be provided if instance is not from a ref"\n )\n self._compute_log_manager = None\n\n self._scheduler = check.opt_inst_param(scheduler, "scheduler", Scheduler)\n\n self._schedule_storage = check.opt_inst_param(\n schedule_storage, "schedule_storage", ScheduleStorage\n )\n if self._schedule_storage:\n self._schedule_storage.register_instance(self)\n\n if run_coordinator:\n self._run_coordinator = check.inst_param(\n run_coordinator, "run_coordinator", RunCoordinator\n )\n 
self._run_coordinator.register_instance(self)\n else:\n check.invariant(ref, "Run coordinator must be provided if instance is not from a ref")\n self._run_coordinator = None\n\n if run_launcher:\n self._run_launcher: Optional[RunLauncher] = check.inst_param(\n run_launcher, "run_launcher", RunLauncher\n )\n run_launcher.register_instance(self)\n else:\n check.invariant(ref, "Run launcher must be provided if instance is not from a ref")\n self._run_launcher = None\n\n self._settings = check.opt_mapping_param(settings, "settings")\n\n self._secrets_loader = check.opt_inst_param(secrets_loader, "secrets_loader", SecretsLoader)\n\n if self._secrets_loader:\n self._secrets_loader.register_instance(self)\n\n self._ref = check.opt_inst_param(ref, "ref", InstanceRef)\n\n self._subscribers: Dict[str, List[Callable]] = defaultdict(list)\n\n run_monitoring_enabled = self.run_monitoring_settings.get("enabled", False)\n self._run_monitoring_enabled = run_monitoring_enabled\n if self.run_monitoring_enabled and self.run_monitoring_max_resume_run_attempts:\n check.invariant(\n self.run_launcher.supports_resume_run,\n "The configured run launcher does not support resuming runs. Set"\n " max_resume_run_attempts to 0 to use run monitoring. Any runs with a failed"\n " run worker will be marked as failed, but will not be resumed.",\n )\n\n if self.run_retries_enabled:\n check.invariant(\n self.event_log_storage.supports_event_consumer_queries(),\n "Run retries are enabled, but the configured event log storage does not support"\n " them. Consider switching to Postgres or Mysql.",\n )\n\n # ctors\n\n
[docs] @public\n @staticmethod\n def ephemeral(\n tempdir: Optional[str] = None,\n preload: Optional[Sequence["DebugRunPayload"]] = None,\n settings: Optional[Dict] = None,\n ) -> "DagsterInstance":\n """Create a `DagsterInstance` suitable for ephemeral execution, useful in test contexts. An\n ephemeral instance uses mostly in-memory components. Use `local_temp` to create a test\n instance that is fully persistent.\n\n Args:\n tempdir (Optional[str]): The path of a directory to be used for local artifact storage.\n preload (Optional[Sequence[DebugRunPayload]]): A sequence of payloads to load into the\n instance's run storage. Useful for debugging.\n settings (Optional[Dict]): Settings for the instance.\n\n Returns:\n DagsterInstance: An ephemeral DagsterInstance.\n """\n from dagster._core.launcher.sync_in_memory_run_launcher import SyncInMemoryRunLauncher\n from dagster._core.run_coordinator import DefaultRunCoordinator\n from dagster._core.storage.event_log import InMemoryEventLogStorage\n from dagster._core.storage.noop_compute_log_manager import NoOpComputeLogManager\n from dagster._core.storage.root import LocalArtifactStorage, TemporaryLocalArtifactStorage\n from dagster._core.storage.runs import InMemoryRunStorage\n\n if tempdir is not None:\n local_storage = LocalArtifactStorage(tempdir)\n else:\n local_storage = TemporaryLocalArtifactStorage()\n\n return DagsterInstance(\n instance_type=InstanceType.EPHEMERAL,\n local_artifact_storage=local_storage,\n run_storage=InMemoryRunStorage(preload=preload),\n event_storage=InMemoryEventLogStorage(preload=preload),\n compute_log_manager=NoOpComputeLogManager(),\n run_coordinator=DefaultRunCoordinator(),\n run_launcher=SyncInMemoryRunLauncher(),\n settings=settings,\n )
\n\n
[docs] @public\n @staticmethod\n def get() -> "DagsterInstance":\n """Get the current `DagsterInstance` as specified by the ``DAGSTER_HOME`` environment variable.\n\n Returns:\n DagsterInstance: The current DagsterInstance.\n """\n dagster_home_path = os.getenv("DAGSTER_HOME")\n\n if not dagster_home_path:\n raise DagsterHomeNotSetError(\n "The environment variable $DAGSTER_HOME is not set. \\nDagster requires this"\n " environment variable to be set to an existing directory in your filesystem. This"\n " directory is used to store metadata across sessions, or load the dagster.yaml"\n " file which can configure storing metadata in an external database.\\nYou can"\n " resolve this error by exporting the environment variable. For example, you can"\n " run the following command in your shell or include it in your shell configuration"\n ' file:\\n\\texport DAGSTER_HOME=~"/dagster_home"\\nor PowerShell\\n$env:DAGSTER_HOME'\n " = ($home + '\\\\dagster_home')or batchset"\n " DAGSTER_HOME=%UserProfile%/dagster_homeAlternatively, DagsterInstance.ephemeral()"\n " can be used for a transient instance.\\n"\n )\n\n dagster_home_path = os.path.expanduser(dagster_home_path)\n\n if not os.path.isabs(dagster_home_path):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" must be an absolute path. Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem."\n ).format(dagster_home_path)\n )\n\n if not (os.path.exists(dagster_home_path) and os.path.isdir(dagster_home_path)):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" is not a directory or does not exist. Dagster requires this'\n " environment variable to be set to an existing directory in your filesystem"\n ).format(dagster_home_path)\n )\n\n return DagsterInstance.from_config(dagster_home_path)
\n\n
[docs] @public\n @staticmethod\n def local_temp(\n tempdir: Optional[str] = None,\n overrides: Optional[DagsterInstanceOverrides] = None,\n ) -> "DagsterInstance":\n """Create a DagsterInstance that uses a temporary directory for local storage. This is a\n regular, fully persistent instance. Use `ephemeral` to get an ephemeral instance with\n in-memory components.\n\n Args:\n tempdir (Optional[str]): The path of a directory to be used for local artifact storage.\n overrides (Optional[DagsterInstanceOverrides]): Override settings for the instance.\n\n Returns:\n DagsterInstance\n """\n if tempdir is None:\n created_dir = TemporaryDirectory()\n i = DagsterInstance.from_ref(\n InstanceRef.from_dir(created_dir.name, overrides=overrides)\n )\n DagsterInstance._TEMP_DIRS[i] = created_dir\n return i\n\n return DagsterInstance.from_ref(InstanceRef.from_dir(tempdir, overrides=overrides))
\n\n @staticmethod\n def from_config(\n config_dir: str,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n ) -> "DagsterInstance":\n instance_ref = InstanceRef.from_dir(config_dir, config_filename=config_filename)\n return DagsterInstance.from_ref(instance_ref)\n\n @staticmethod\n def from_ref(instance_ref: InstanceRef) -> "DagsterInstance":\n check.inst_param(instance_ref, "instance_ref", InstanceRef)\n\n # DagsterInstance doesn't implement ConfigurableClass, but we may still sometimes want to\n # have custom subclasses of DagsterInstance. This machinery allows for those custom\n # subclasses to receive additional keyword arguments passed through the config YAML.\n klass = instance_ref.custom_instance_class or DagsterInstance\n kwargs = instance_ref.custom_instance_class_config\n\n unified_storage = instance_ref.storage\n run_storage = unified_storage.run_storage if unified_storage else instance_ref.run_storage\n event_storage = (\n unified_storage.event_log_storage if unified_storage else instance_ref.event_storage\n )\n schedule_storage = (\n unified_storage.schedule_storage if unified_storage else instance_ref.schedule_storage\n )\n\n return klass(\n instance_type=InstanceType.PERSISTENT,\n local_artifact_storage=instance_ref.local_artifact_storage,\n run_storage=run_storage, # type: ignore # (possible none)\n event_storage=event_storage, # type: ignore # (possible none)\n schedule_storage=schedule_storage,\n compute_log_manager=None, # lazy load\n scheduler=instance_ref.scheduler,\n run_coordinator=None, # lazy load\n run_launcher=None, # lazy load\n settings=instance_ref.settings,\n secrets_loader=instance_ref.secrets_loader,\n ref=instance_ref,\n **kwargs,\n )\n\n # flags\n\n @property\n def is_persistent(self) -> bool:\n return self._instance_type == InstanceType.PERSISTENT\n\n @property\n def is_ephemeral(self) -> bool:\n return self._instance_type == InstanceType.EPHEMERAL\n\n def get_ref(self) -> InstanceRef:\n if self._ref:\n return self._ref\n\n 
check.failed(\n "Attempted to prepare an ineligible DagsterInstance ({inst_type}) for cross "\n "process communication.{dagster_home_msg}".format(\n inst_type=self._instance_type,\n dagster_home_msg=(\n "\\nDAGSTER_HOME environment variable is not set, set it to "\n "a directory on the filesystem for dagster to use for storage and cross "\n "process coordination."\n if os.getenv("DAGSTER_HOME") is None\n else ""\n ),\n )\n )\n\n @property\n def root_directory(self) -> str:\n return self._local_artifact_storage.base_dir\n\n def _info(self, component: object) -> Union[str, Mapping[Any, Any]]:\n # ConfigurableClass may not have inst_data if it's a direct instantiation\n # which happens for ephemeral instances\n if isinstance(component, ConfigurableClass) and component.inst_data:\n return component.inst_data.info_dict()\n if type(component) is dict:\n return component\n return component.__class__.__name__\n\n def _info_str_for_component(self, component_name: str, component: object) -> str:\n return yaml.dump(\n {component_name: self._info(component)}, default_flow_style=False, sort_keys=False\n )\n\n def info_dict(self) -> Mapping[str, object]:\n settings: Mapping[str, object] = self._settings if self._settings else {}\n\n ret = {\n "local_artifact_storage": self._info(self._local_artifact_storage),\n "run_storage": self._info(self._run_storage),\n "event_log_storage": self._info(self._event_storage),\n "compute_logs": self._info(self._compute_log_manager),\n "schedule_storage": self._info(self._schedule_storage),\n "scheduler": self._info(self._scheduler),\n "run_coordinator": self._info(self._run_coordinator),\n "run_launcher": self._info(self.run_launcher),\n }\n ret.update(\n {\n settings_key: self._info(settings_value)\n for settings_key, settings_value in settings.items()\n }\n )\n\n return ret\n\n def info_str(self) -> str:\n return yaml.dump(self.info_dict(), default_flow_style=False, sort_keys=False)\n\n def schema_str(self) -> str:\n def 
_schema_dict(alembic_version: "AlembicVersion") -> Optional[Mapping[str, object]]:\n if not alembic_version:\n return None\n db_revision, head_revision = alembic_version\n return {\n "current": db_revision,\n "latest": head_revision,\n }\n\n return yaml.dump(\n {\n "schema": {\n "event_log_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n "run_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n "schedule_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n }\n },\n default_flow_style=False,\n sort_keys=False,\n )\n\n @property\n def run_storage(self) -> "RunStorage":\n return self._run_storage\n\n @property\n def event_log_storage(self) -> "EventLogStorage":\n return self._event_storage\n\n @property\n def daemon_cursor_storage(self) -> "DaemonCursorStorage":\n return self._run_storage\n\n # schedule storage\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n return self._schedule_storage\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n return self._scheduler\n\n @property\n def scheduler_class(self) -> Optional[str]:\n return self.scheduler.__class__.__name__ if self.scheduler else None\n\n # run coordinator\n\n @property\n def run_coordinator(self) -> "RunCoordinator":\n # Lazily load in case the run coordinator requires dependencies that are not available\n # everywhere that loads the instance\n if not self._run_coordinator:\n check.invariant(\n self._ref, "Run coordinator not provided, and no instance ref available"\n )\n run_coordinator = cast(InstanceRef, self._ref).run_coordinator\n check.invariant(run_coordinator, "Run coordinator not configured in instance ref")\n self._run_coordinator = cast("RunCoordinator", run_coordinator)\n self._run_coordinator.register_instance(self)\n return self._run_coordinator\n\n # run launcher\n\n @property\n def run_launcher(self) -> "RunLauncher":\n 
# Lazily load in case the launcher requires dependencies that are not available everywhere\n # that loads the instance (e.g. The EcsRunLauncher requires boto3)\n if not self._run_launcher:\n check.invariant(self._ref, "Run launcher not provided, and no instance ref available")\n launcher = cast(InstanceRef, self._ref).run_launcher\n check.invariant(launcher, "Run launcher not configured in instance ref")\n self._run_launcher = cast("RunLauncher", launcher)\n self._run_launcher.register_instance(self)\n return self._run_launcher\n\n # compute logs\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n if not self._compute_log_manager:\n check.invariant(\n self._ref, "Compute log manager not provided, and no instance ref available"\n )\n compute_log_manager = cast(InstanceRef, self._ref).compute_log_manager\n check.invariant(\n compute_log_manager, "Compute log manager not configured in instance ref"\n )\n self._compute_log_manager = cast("ComputeLogManager", compute_log_manager)\n self._compute_log_manager.register_instance(self)\n return self._compute_log_manager\n\n def get_settings(self, settings_key: str) -> Any:\n check.str_param(settings_key, "settings_key")\n if self._settings and settings_key in self._settings:\n return self._settings.get(settings_key)\n return {}\n\n @property\n def telemetry_enabled(self) -> bool:\n if self.is_ephemeral:\n return False\n\n dagster_telemetry_enabled_default = True\n\n telemetry_settings = self.get_settings("telemetry")\n\n if not telemetry_settings:\n return dagster_telemetry_enabled_default\n\n if "enabled" in telemetry_settings:\n return telemetry_settings["enabled"]\n else:\n return dagster_telemetry_enabled_default\n\n @property\n def nux_enabled(self) -> bool:\n if self.is_ephemeral:\n return False\n\n nux_enabled_by_default = True\n\n nux_settings = self.get_settings("nux")\n if not nux_settings:\n return nux_enabled_by_default\n\n if "enabled" in nux_settings:\n return nux_settings["enabled"]\n 
else:\n return nux_enabled_by_default\n\n # run monitoring\n\n @property\n def run_monitoring_enabled(self) -> bool:\n return self._run_monitoring_enabled\n\n @property\n def run_monitoring_settings(self) -> Any:\n return self.get_settings("run_monitoring")\n\n @property\n def run_monitoring_start_timeout_seconds(self) -> int:\n return self.run_monitoring_settings.get("start_timeout_seconds", 180)\n\n @property\n def run_monitoring_cancel_timeout_seconds(self) -> int:\n return self.run_monitoring_settings.get("cancel_timeout_seconds", 180)\n\n @property\n def code_server_settings(self) -> Any:\n return self.get_settings("code_servers")\n\n @property\n def code_server_process_startup_timeout(self) -> int:\n return self.code_server_settings.get(\n "local_startup_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT\n )\n\n @property\n def code_server_reload_timeout(self) -> int:\n return self.code_server_settings.get(\n "reload_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT\n )\n\n @property\n def wait_for_local_code_server_processes_on_shutdown(self) -> bool:\n return self.code_server_settings.get("wait_for_local_processes_on_shutdown", False)\n\n @property\n def run_monitoring_max_resume_run_attempts(self) -> int:\n return self.run_monitoring_settings.get("max_resume_run_attempts", 0)\n\n @property\n def run_monitoring_poll_interval_seconds(self) -> int:\n return self.run_monitoring_settings.get("poll_interval_seconds", 120)\n\n @property\n def cancellation_thread_poll_interval_seconds(self) -> int:\n return self.get_settings("run_monitoring").get(\n "cancellation_thread_poll_interval_seconds", 10\n )\n\n @property\n def run_retries_enabled(self) -> bool:\n return self.get_settings("run_retries").get("enabled", False)\n\n @property\n def run_retries_max_retries(self) -> int:\n return self.get_settings("run_retries").get("max_retries")\n\n @property\n def auto_materialize_enabled(self) -> bool:\n return self.get_settings("auto_materialize").get("enabled", 
True)\n\n @property\n def auto_materialize_minimum_interval_seconds(self) -> int:\n return self.get_settings("auto_materialize").get("minimum_interval_seconds")\n\n @property\n def auto_materialize_run_tags(self) -> Dict[str, str]:\n return self.get_settings("auto_materialize").get("run_tags", {})\n\n @property\n def auto_materialize_respect_materialization_data_versions(self) -> bool:\n return self.get_settings("auto_materialize").get(\n "respect_materialization_data_versions", False\n )\n\n # python logs\n\n @property\n def managed_python_loggers(self) -> Sequence[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n loggers: Sequence[str] = python_log_settings.get("managed_python_loggers", [])\n return loggers\n\n @property\n def python_log_level(self) -> Optional[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n return python_log_settings.get("python_log_level")\n\n def upgrade(self, print_fn: Optional[PrintFn] = None) -> None:\n from dagster._core.storage.migration.utils import upgrading_instance\n\n with upgrading_instance(self):\n if print_fn:\n print_fn("Updating run storage...")\n self._run_storage.upgrade() # type: ignore # (unknown method on run storage)\n self._run_storage.migrate(print_fn)\n\n if print_fn:\n print_fn("Updating event storage...")\n self._event_storage.upgrade()\n self._event_storage.reindex_assets(print_fn=print_fn)\n\n if print_fn:\n print_fn("Updating schedule storage...")\n self._schedule_storage.upgrade() # type: ignore # (possible none)\n self._schedule_storage.migrate(print_fn) # type: ignore # (possible none)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n if self._schedule_storage:\n self._schedule_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n self._run_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n 
self._event_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n\n def reindex(self, print_fn: PrintFn = lambda _: None) -> None:\n print_fn("Checking for reindexing...")\n self._event_storage.reindex_events(print_fn)\n self._event_storage.reindex_assets(print_fn)\n self._run_storage.optimize(print_fn)\n self._schedule_storage.optimize(print_fn) # type: ignore # (possible none)\n print_fn("Done.")\n\n def dispose(self) -> None:\n self._local_artifact_storage.dispose()\n self._run_storage.dispose()\n if self._run_coordinator:\n self._run_coordinator.dispose()\n if self._run_launcher:\n self._run_launcher.dispose()\n self._event_storage.dispose()\n if self._compute_log_manager:\n self._compute_log_manager.dispose()\n if self._secrets_loader:\n self._secrets_loader.dispose()\n\n if self in DagsterInstance._TEMP_DIRS:\n DagsterInstance._TEMP_DIRS[self].cleanup()\n del DagsterInstance._TEMP_DIRS[self]\n\n # run storage\n
[docs] @public\n def get_run_by_id(self, run_id: str) -> Optional[DagsterRun]:\n """Get a :py:class:`DagsterRun` matching the provided `run_id`.\n\n Args:\n run_id (str): The id of the run to retrieve.\n\n Returns:\n Optional[DagsterRun]: The run corresponding to the given id. If no run matching the id\n is found, return `None`.\n """\n record = self.get_run_record_by_id(run_id)\n if record is None:\n return None\n return record.dagster_run
\n\n
[docs] @public\n @traced\n def get_run_record_by_id(self, run_id: str) -> Optional[RunRecord]:\n """Get a :py:class:`RunRecord` matching the provided `run_id`.\n\n Args:\n run_id (str): The id of the run record to retrieve.\n\n Returns:\n Optional[RunRecord]: The run record corresponding to the given id. If no run matching\n the id is found, return `None`.\n """\n records = self._run_storage.get_run_records(RunsFilter(run_ids=[run_id]))\n if not records:\n return None\n return records[0]
\n\n @traced\n def get_job_snapshot(self, snapshot_id: str) -> "JobSnapshot":\n return self._run_storage.get_job_snapshot(snapshot_id)\n\n @traced\n def has_job_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_job_snapshot(snapshot_id)\n\n @traced\n def has_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_snapshot(snapshot_id)\n\n @traced\n def get_historical_job(self, snapshot_id: str) -> "HistoricalJob":\n from dagster._core.host_representation import HistoricalJob\n\n snapshot = self._run_storage.get_job_snapshot(snapshot_id)\n parent_snapshot = (\n self._run_storage.get_job_snapshot(snapshot.lineage_snapshot.parent_snapshot_id)\n if snapshot.lineage_snapshot\n else None\n )\n return HistoricalJob(snapshot, snapshot_id, parent_snapshot)\n\n @traced\n def has_historical_job(self, snapshot_id: str) -> bool:\n return self._run_storage.has_job_snapshot(snapshot_id)\n\n @traced\n def get_execution_plan_snapshot(self, snapshot_id: str) -> "ExecutionPlanSnapshot":\n return self._run_storage.get_execution_plan_snapshot(snapshot_id)\n\n @traced\n def get_run_stats(self, run_id: str) -> DagsterRunStatsSnapshot:\n return self._event_storage.get_stats_for_run(run_id)\n\n @traced\n def get_run_step_stats(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence["RunStepKeyStatsSnapshot"]:\n return self._event_storage.get_step_stats_for_run(run_id, step_keys)\n\n @traced\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n return self._run_storage.get_run_tags(\n tag_keys=tag_keys, value_prefix=value_prefix, limit=limit\n )\n\n @traced\n def get_run_tag_keys(self) -> Sequence[str]:\n return self._run_storage.get_run_tag_keys()\n\n @traced\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Sequence[DagsterRun]]]:\n return self._run_storage.get_run_group(run_id)\n\n def 
create_run_for_job(\n self,\n job_def: "JobDefinition",\n execution_plan: Optional["ExecutionPlan"] = None,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n resolved_op_selection: Optional[AbstractSet[str]] = None,\n status: Optional[Union[DagsterRunStatus, str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n op_selection: Optional[Sequence[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n repository_load_data: Optional["RepositoryLoadData"] = None,\n ) -> DagsterRun:\n from dagster._core.definitions.job_definition import JobDefinition\n from dagster._core.execution.api import create_execution_plan\n from dagster._core.execution.plan.plan import ExecutionPlan\n from dagster._core.snap import snapshot_from_execution_plan\n\n check.inst_param(job_def, "pipeline_def", JobDefinition)\n check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)\n\n # note that op_selection is required to execute the solid subset, which is the\n # frozenset version of the previous solid_subset.\n # op_selection is not required and will not be converted to op_selection here.\n # i.e. 
this function doesn't handle solid queries.\n # op_selection is only used to pass the user queries further down.\n check.opt_set_param(resolved_op_selection, "resolved_op_selection", of_type=str)\n check.opt_list_param(op_selection, "op_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n # op_selection never provided\n if asset_selection or op_selection:\n # for cases when `create_run_for_pipeline` is directly called\n job_def = job_def.get_subset(\n asset_selection=asset_selection,\n op_selection=op_selection,\n )\n step_keys_to_execute = None\n\n if execution_plan:\n step_keys_to_execute = execution_plan.step_keys_to_execute\n\n else:\n execution_plan = create_execution_plan(\n job=job_def,\n run_config=run_config,\n instance_ref=self.get_ref() if self.is_persistent else None,\n tags=tags,\n repository_load_data=repository_load_data,\n )\n\n return self.create_run(\n job_name=job_def.name,\n run_id=run_id,\n run_config=run_config,\n op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=None,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus(status) if status else None,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_def.get_job_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan,\n job_def.get_job_snapshot_id(),\n ),\n parent_job_snapshot=job_def.get_parent_job_snapshot(),\n external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n )\n\n def _construct_run_with_snapshots(\n self,\n job_name: str,\n run_id: str,\n run_config: Optional[Mapping[str, object]],\n resolved_op_selection: Optional[AbstractSet[str]],\n step_keys_to_execute: Optional[Sequence[str]],\n status: Optional[DagsterRunStatus],\n tags: Mapping[str, str],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n job_snapshot: 
Optional["JobSnapshot"],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet["AssetCheckKey"]] = None,\n op_selection: Optional[Sequence[str]] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n ) -> DagsterRun:\n # https://github.com/dagster-io/dagster/issues/2403\n if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:\n if AIRFLOW_EXECUTION_DATE_STR not in tags:\n tags = {\n **tags,\n AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat(),\n }\n\n check.invariant(\n not (not job_snapshot and execution_plan_snapshot),\n "It is illegal to have an execution plan snapshot and not have a pipeline snapshot."\n " It is possible to have no execution plan snapshot since we persist runs that do"\n " not successfully compile execution plans in the scheduled case.",\n )\n\n job_snapshot_id = (\n self._ensure_persisted_job_snapshot(job_snapshot, parent_job_snapshot)\n if job_snapshot\n else None\n )\n\n execution_plan_snapshot_id = (\n self._ensure_persisted_execution_plan_snapshot(\n execution_plan_snapshot, job_snapshot_id, step_keys_to_execute\n )\n if execution_plan_snapshot and job_snapshot_id\n else None\n )\n\n return DagsterRun(\n job_name=job_name,\n run_id=run_id,\n run_config=run_config,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot_id=job_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n has_repository_load_data=execution_plan_snapshot is not None\n and 
execution_plan_snapshot.repository_load_data is not None,\n )\n\n def _ensure_persisted_job_snapshot(\n self,\n job_snapshot: "JobSnapshot",\n parent_job_snapshot: "Optional[JobSnapshot]",\n ) -> str:\n from dagster._core.snap import JobSnapshot, create_job_snapshot_id\n\n check.inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_inst_param(parent_job_snapshot, "parent_job_snapshot", JobSnapshot)\n\n if job_snapshot.lineage_snapshot:\n if not self._run_storage.has_job_snapshot(\n job_snapshot.lineage_snapshot.parent_snapshot_id\n ):\n check.invariant(\n create_job_snapshot_id(parent_job_snapshot) # type: ignore # (possible none)\n == job_snapshot.lineage_snapshot.parent_snapshot_id,\n "Parent pipeline snapshot id out of sync with passed parent pipeline snapshot",\n )\n\n returned_job_snapshot_id = self._run_storage.add_job_snapshot(\n parent_job_snapshot # type: ignore # (possible none)\n )\n check.invariant(\n job_snapshot.lineage_snapshot.parent_snapshot_id == returned_job_snapshot_id\n )\n\n job_snapshot_id = create_job_snapshot_id(job_snapshot)\n if not self._run_storage.has_job_snapshot(job_snapshot_id):\n returned_job_snapshot_id = self._run_storage.add_job_snapshot(job_snapshot)\n check.invariant(job_snapshot_id == returned_job_snapshot_id)\n\n return job_snapshot_id\n\n def _ensure_persisted_execution_plan_snapshot(\n self,\n execution_plan_snapshot: "ExecutionPlanSnapshot",\n job_snapshot_id: str,\n step_keys_to_execute: Optional[Sequence[str]],\n ) -> str:\n from dagster._core.snap.execution_plan_snapshot import (\n ExecutionPlanSnapshot,\n create_execution_plan_snapshot_id,\n )\n\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.str_param(job_snapshot_id, "job_snapshot_id")\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n check.invariant(\n execution_plan_snapshot.job_snapshot_id == job_snapshot_id,\n "Snapshot mismatch: Snapshot ID in 
execution plan snapshot is "\n f'"{execution_plan_snapshot.job_snapshot_id}" and snapshot_id created in memory is '\n f'"{job_snapshot_id}"',\n )\n\n execution_plan_snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n if not self._run_storage.has_execution_plan_snapshot(execution_plan_snapshot_id):\n returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(\n execution_plan_snapshot\n )\n\n check.invariant(execution_plan_snapshot_id == returned_execution_plan_snapshot_id)\n\n return execution_plan_snapshot_id\n\n def _log_asset_planned_events(\n self, dagster_run: DagsterRun, execution_plan_snapshot: "ExecutionPlanSnapshot"\n ) -> None:\n from dagster._core.events import (\n AssetMaterializationPlannedData,\n DagsterEvent,\n DagsterEventType,\n )\n\n job_name = dagster_run.job_name\n\n for step in execution_plan_snapshot.steps:\n if step.key in execution_plan_snapshot.step_keys_to_execute:\n for output in step.outputs:\n asset_key = check.not_none(output.properties).asset_key\n if asset_key:\n # Logs and stores asset_materialization_planned event\n partition_tag = dagster_run.tags.get(PARTITION_NAME_TAG)\n partition_range_start, partition_range_end = dagster_run.tags.get(\n ASSET_PARTITION_RANGE_START_TAG\n ), dagster_run.tags.get(ASSET_PARTITION_RANGE_END_TAG)\n\n if partition_tag and (partition_range_start or partition_range_end):\n raise DagsterInvariantViolationError(\n f"Cannot have {ASSET_PARTITION_RANGE_START_TAG} or"\n f" {ASSET_PARTITION_RANGE_END_TAG} set along with"\n f" {PARTITION_NAME_TAG}"\n )\n\n if partition_range_start or partition_range_end:\n if not partition_range_start or not partition_range_end:\n raise DagsterInvariantViolationError(\n f"Cannot have {ASSET_PARTITION_RANGE_START_TAG} or"\n f" {ASSET_PARTITION_RANGE_END_TAG} set without the other"\n )\n\n # TODO: resolve which partitions are in the range, and emit an event for each\n\n partition = (\n partition_tag\n if 
check.not_none(output.properties).is_asset_partitioned\n else None\n )\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value,\n job_name=job_name,\n message=(\n f"{job_name} intends to materialize asset {asset_key.to_string()}"\n ),\n event_specific_data=AssetMaterializationPlannedData(\n asset_key, partition=partition\n ),\n step_key=step.key,\n )\n self.report_dagster_event(event, dagster_run.run_id, logging.DEBUG)\n\n if check.not_none(output.properties).asset_check_key:\n asset_check_key = check.not_none(\n check.not_none(output.properties).asset_check_key\n )\n target_asset_key = asset_check_key.asset_key\n check_name = asset_check_key.name\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED.value,\n job_name=job_name,\n message=(\n f"{job_name} intends to execute asset check {check_name} on"\n f" asset {target_asset_key.to_string()}"\n ),\n event_specific_data=AssetCheckEvaluationPlanned(\n target_asset_key,\n check_name=check_name,\n ),\n step_key=step.key,\n )\n self.report_dagster_event(event, dagster_run.run_id, logging.DEBUG)\n\n def create_run(\n self,\n *,\n job_name: str,\n run_id: Optional[str],\n run_config: Optional[Mapping[str, object]],\n status: Optional[DagsterRunStatus],\n tags: Optional[Mapping[str, Any]],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n step_keys_to_execute: Optional[Sequence[str]],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n job_snapshot: Optional["JobSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n asset_selection: Optional[AbstractSet[AssetKey]],\n asset_check_selection: Optional[AbstractSet["AssetCheckKey"]],\n resolved_op_selection: Optional[AbstractSet[str]],\n op_selection: Optional[Sequence[str]],\n external_job_origin: Optional["ExternalJobOrigin"],\n job_code_origin: Optional[JobPythonOrigin],\n ) -> DagsterRun:\n from dagster._core.definitions.asset_check_spec import AssetCheckKey\n 
from dagster._core.definitions.utils import validate_tags\n from dagster._core.host_representation.origin import ExternalJobOrigin\n from dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\n\n check.str_param(job_name, "job_name")\n check.opt_str_param(\n run_id, "run_id"\n ) # will be assigned to make_new_run_id() lower in callstack\n check.opt_mapping_param(run_config, "run_config", key_type=str)\n\n check.opt_inst_param(status, "status", DagsterRunStatus)\n check.opt_mapping_param(tags, "tags", key_type=str)\n\n validated_tags = validate_tags(tags)\n\n check.opt_str_param(root_run_id, "root_run_id")\n check.opt_str_param(parent_run_id, "parent_run_id")\n\n # If step_keys_to_execute is None, then everything is executed. In some cases callers\n # are still exploding and sending the full list of step keys even though that is\n # unnecessary.\n\n check.opt_sequence_param(step_keys_to_execute, "step_keys_to_execute")\n check.opt_inst_param(\n execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot\n )\n\n if root_run_id or parent_run_id:\n check.invariant(\n root_run_id and parent_run_id,\n "If root_run_id or parent_run_id is passed, this is a re-execution scenario and"\n " root_run_id and parent_run_id must both be passed.",\n )\n\n # The job_snapshot should always be set in production scenarios. In tests\n # we have sometimes omitted it out of convenience.\n\n check.opt_inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_inst_param(parent_job_snapshot, "parent_job_snapshot", JobSnapshot)\n\n if parent_job_snapshot:\n check.invariant(\n job_snapshot,\n "If parent_job_snapshot is set, job_snapshot should also be.",\n )\n\n # op_selection is a sequence of selection queries assigned by the user.\n # *Most* callers expand the op_selection into an explicit set of\n # resolved_op_selection via accessing external_job.resolved_op_selection\n # but not all do. 
Some (launch execution mutation in graphql and backfill run\n # creation, for example) actually pass the solid *selection* into the\n # resolved_op_selection parameter, but just as a frozen set, rather than\n # fully resolving the selection, as the daemon launchers do. Given the\n # state of callers we just check to ensure that the arguments are well-formed.\n #\n # asset_selection adds another dimension to this lovely dance. op_selection\n # and asset_selection are mutually exclusive and should never both be set.\n # This is invariant is checked in a sporadic fashion around\n # the codebase, but is never enforced in a typed fashion.\n #\n # Additionally, the way that callsites currently behave *if* asset selection\n # is set (i.e., not None) then *neither* op_selection *nor*\n # resolved_op_selection is passed. In the asset selection case resolving\n # the set of assets into the canonical resolved_op_selection is done in\n # the user process, and the exact resolution is never persisted in the run.\n # We are asserting that invariant here to maintain that behavior.\n #\n # Finally, asset_check_selection can be passed along with asset_selection. It\n # is mutually exclusive with op_selection and resolved_op_selection. A `None`\n # value will include any asset checks that target selected assets. 
An empty set\n # will include no asset checks.\n\n check.opt_set_param(resolved_op_selection, "resolved_op_selection", of_type=str)\n check.opt_sequence_param(op_selection, "op_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n check.opt_set_param(asset_check_selection, "asset_check_selection", of_type=AssetCheckKey)\n\n if asset_selection is not None or asset_check_selection is not None:\n check.invariant(\n op_selection is None,\n "Cannot pass op_selection with either of asset_selection or asset_check_selection",\n )\n\n check.invariant(\n resolved_op_selection is None,\n "Cannot pass resolved_op_selection with either of asset_selection or"\n " asset_check_selection",\n )\n\n # The "python origin" arguments exist so a job can be reconstructed in memory\n # after a DagsterRun has been fetched from the database.\n #\n # There are cases (notably in _logged_execute_job with Reconstructable jobs)\n # where job_code_origin and is not. In some cloud test cases only\n # external_job_origin is passed But they are almost always passed together.\n # If these are not set the created run will never be able to be relaunched from\n # the information just in the run or in another process.\n\n check.opt_inst_param(external_job_origin, "external_job_origin", ExternalJobOrigin)\n check.opt_inst_param(job_code_origin, "job_code_origin", JobPythonOrigin)\n\n dagster_run = self._construct_run_with_snapshots(\n job_name=job_name,\n run_id=run_id, # type: ignore # (possible none)\n run_config=run_config,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=validated_tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_job_snapshot=parent_job_snapshot,\n 
external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n )\n\n dagster_run = self._run_storage.add_run(dagster_run)\n\n if execution_plan_snapshot:\n self._log_asset_planned_events(dagster_run, execution_plan_snapshot)\n\n return dagster_run\n\n def create_reexecuted_run(\n self,\n *,\n parent_run: DagsterRun,\n code_location: "CodeLocation",\n external_job: "ExternalJob",\n strategy: "ReexecutionStrategy",\n extra_tags: Optional[Mapping[str, Any]] = None,\n run_config: Optional[Mapping[str, Any]] = None,\n use_parent_run_tags: bool = False,\n ) -> DagsterRun:\n from dagster._core.execution.plan.resume_retry import (\n ReexecutionStrategy,\n )\n from dagster._core.execution.plan.state import KnownExecutionState\n from dagster._core.host_representation import CodeLocation, ExternalJob\n\n check.inst_param(parent_run, "parent_run", DagsterRun)\n check.inst_param(code_location, "code_location", CodeLocation)\n check.inst_param(external_job, "external_job", ExternalJob)\n check.inst_param(strategy, "strategy", ReexecutionStrategy)\n check.opt_mapping_param(extra_tags, "extra_tags", key_type=str)\n check.opt_mapping_param(run_config, "run_config", key_type=str)\n\n check.bool_param(use_parent_run_tags, "use_parent_run_tags")\n\n root_run_id = parent_run.root_run_id or parent_run.run_id\n parent_run_id = parent_run.run_id\n\n tags = merge_dicts(\n external_job.tags,\n (\n # these can differ from external_job.tags if tags were added at launch time\n parent_run.tags\n if use_parent_run_tags\n else {}\n ),\n extra_tags or {},\n {\n PARENT_RUN_ID_TAG: parent_run_id,\n ROOT_RUN_ID_TAG: root_run_id,\n },\n )\n\n run_config = run_config if run_config is not None else parent_run.run_config\n\n if strategy == ReexecutionStrategy.FROM_FAILURE:\n check.invariant(\n parent_run.status == DagsterRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n\n (\n step_keys_to_execute,\n known_state,\n ) = 
KnownExecutionState.build_resume_retry_reexecution(\n self,\n parent_run=parent_run,\n )\n tags[RESUME_RETRY_TAG] = "true"\n elif strategy == ReexecutionStrategy.ALL_STEPS:\n step_keys_to_execute = None\n known_state = None\n else:\n raise DagsterInvariantViolationError(f"Unknown reexecution strategy: {strategy}")\n\n external_execution_plan = code_location.get_external_execution_plan(\n external_job,\n run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance=self,\n )\n\n return self.create_run(\n job_name=parent_run.job_name,\n run_id=None,\n run_config=run_config,\n resolved_op_selection=parent_run.resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus.NOT_STARTED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=external_job.job_snapshot,\n execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,\n parent_job_snapshot=external_job.parent_job_snapshot,\n op_selection=parent_run.op_selection,\n asset_selection=parent_run.asset_selection,\n asset_check_selection=parent_run.asset_check_selection,\n external_job_origin=external_job.get_external_origin(),\n job_code_origin=external_job.get_python_origin(),\n )\n\n def register_managed_run(\n self,\n job_name: str,\n run_id: str,\n run_config: Optional[Mapping[str, object]],\n resolved_op_selection: Optional[AbstractSet[str]],\n step_keys_to_execute: Optional[Sequence[str]],\n tags: Mapping[str, str],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n job_snapshot: Optional["JobSnapshot"],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n op_selection: Optional[Sequence[str]] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n ) -> DagsterRun:\n # The usage of this method is limited to dagster-airflow, specifically in Dagster\n # Operators that are executed in Airflow. 
Because a common workflow in Airflow is to\n # retry dags from arbitrary tasks, we need any node to be capable of creating a\n # DagsterRun.\n #\n # The try-except DagsterRunAlreadyExists block handles the race when multiple "root" tasks\n # simultaneously execute self._run_storage.add_run(dagster_run). When this happens, only\n # one task succeeds in creating the run, while the others get DagsterRunAlreadyExists\n # error; at this point, the failed tasks try again to fetch the existing run.\n # https://github.com/dagster-io/dagster/issues/2412\n\n dagster_run = self._construct_run_with_snapshots(\n job_name=job_name,\n run_id=run_id,\n run_config=run_config,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus.MANAGED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_job_snapshot=parent_job_snapshot,\n job_code_origin=job_code_origin,\n )\n\n def get_run() -> DagsterRun:\n candidate_run = self.get_run_by_id(dagster_run.run_id)\n\n field_diff = _check_run_equality(dagster_run, candidate_run) # type: ignore # (possible none)\n\n if field_diff:\n raise DagsterRunConflict(\n "Found conflicting existing run with same id {run_id}. 
Runs differ in:"\n "\\n{field_diff}".format(\n run_id=dagster_run.run_id,\n field_diff=_format_field_diff(field_diff),\n ),\n )\n return candidate_run # type: ignore # (possible none)\n\n if self.has_run(dagster_run.run_id):\n return get_run()\n\n try:\n return self._run_storage.add_run(dagster_run)\n except DagsterRunAlreadyExists:\n return get_run()\n\n @traced\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n return self._run_storage.add_run(dagster_run)\n\n @traced\n def add_snapshot(\n self,\n snapshot: Union["JobSnapshot", "ExecutionPlanSnapshot"],\n snapshot_id: Optional[str] = None,\n ) -> None:\n return self._run_storage.add_snapshot(snapshot, snapshot_id)\n\n @traced\n def handle_run_event(self, run_id: str, event: "DagsterEvent") -> None:\n return self._run_storage.handle_run_event(run_id, event)\n\n @traced\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n return self._run_storage.add_run_tags(run_id, new_tags)\n\n @traced\n def has_run(self, run_id: str) -> bool:\n return self._run_storage.has_run(run_id)\n\n @traced\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n return self._run_storage.get_runs(filters, cursor, limit, bucket_by)\n\n @traced\n def get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n return self._run_storage.get_run_ids(filters, cursor=cursor, limit=limit)\n\n @traced\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n return self._run_storage.get_runs_count(filters)\n\n
[docs] @public\n @traced\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n return self._run_storage.get_run_records(\n filters, limit, order_by, ascending, cursor, bucket_by\n )
\n\n @traced\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n return self._run_storage.get_run_partition_data(runs_filter)\n\n def wipe(self) -> None:\n self._run_storage.wipe()\n self._event_storage.wipe()\n\n
[docs] @public\n @traced\n def delete_run(self, run_id: str) -> None:\n """Delete a run and all events generated by that from storage.\n\n Args:\n run_id (str): The id of the run to delete.\n """\n self._run_storage.delete_run(run_id)\n self._event_storage.delete_events(run_id)
\n\n # event storage\n @traced\n def logs_after(\n self,\n run_id: str,\n cursor: Optional[int] = None,\n of_type: Optional["DagsterEventType"] = None,\n limit: Optional[int] = None,\n ) -> Sequence["EventLogEntry"]:\n return self._event_storage.get_logs_for_run(\n run_id,\n cursor=cursor,\n of_type=of_type,\n limit=limit,\n )\n\n @traced\n def all_logs(\n self,\n run_id: str,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n ) -> Sequence["EventLogEntry"]:\n return self._event_storage.get_logs_for_run(run_id, of_type=of_type)\n\n @traced\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> "EventLogConnection":\n return self._event_storage.get_records_for_run(run_id, cursor, of_type, limit, ascending)\n\n def watch_event_logs(self, run_id: str, cursor: Optional[str], cb: "EventHandlerFn") -> None:\n return self._event_storage.watch(run_id, cursor, cb)\n\n def end_watch_event_logs(self, run_id: str, cb: "EventHandlerFn") -> None:\n return self._event_storage.end_watch(run_id, cb)\n\n # asset storage\n\n @traced\n def can_cache_asset_status_data(self) -> bool:\n return self._event_storage.can_cache_asset_status_data()\n\n @traced\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n self._event_storage.update_asset_cached_status_data(asset_key, cache_values)\n\n @traced\n def wipe_asset_cached_status(self, asset_keys: Sequence[AssetKey]) -> None:\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset_cached_status(asset_key)\n\n @traced\n def all_asset_keys(self) -> Sequence[AssetKey]:\n return self._event_storage.all_asset_keys()\n\n
[docs] @public\n @traced\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n """Return a filtered subset of asset keys managed by this instance.\n\n Args:\n prefix (Optional[Sequence[str]]): Return only assets having this key prefix.\n limit (Optional[int]): Maximum number of keys to return.\n cursor (Optional[str]): Cursor to use for pagination.\n\n Returns:\n Sequence[AssetKey]: List of asset keys.\n """\n return self._event_storage.get_asset_keys(prefix=prefix, limit=limit, cursor=cursor)
\n\n
[docs] @public\n @traced\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n """Return true if this instance manages the given asset key.\n\n Args:\n asset_key (AssetKey): Asset key to check.\n """\n return self._event_storage.has_asset_key(asset_key)
\n\n @traced\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n return self._event_storage.get_latest_materialization_events(asset_keys)\n\n
[docs] @public\n @traced\n def get_latest_materialization_event(self, asset_key: AssetKey) -> Optional["EventLogEntry"]:\n """Fetch the latest materialization event for the given asset key.\n\n Args:\n asset_key (AssetKey): Asset key to return materialization for.\n\n Returns:\n Optional[AssetMaterialization]: The latest materialization event for the given asset\n key, or `None` if the asset has not been materialized.\n """\n return self._event_storage.get_latest_materialization_events([asset_key]).get(asset_key)
\n\n
[docs] @public\n @traced\n def get_event_records(\n self,\n event_records_filter: "EventRecordsFilter",\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence["EventLogRecord"]:\n """Return a list of event records stored in the event log storage.\n\n Args:\n event_records_filter (Optional[EventRecordsFilter]): the filter by which to filter event\n records.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[EventLogRecord]: List of event log records stored in the event log storage.\n """\n return self._event_storage.get_event_records(event_records_filter, limit, ascending)
\n\n
[docs] @public\n @traced\n def get_status_by_partition(\n self,\n asset_key: AssetKey,\n partition_keys: Sequence[str],\n partitions_def: "PartitionsDefinition",\n ) -> Optional[Mapping[str, "AssetPartitionStatus"]]:\n """Get the current status of provided partition_keys for the provided asset.\n\n Args:\n asset_key (AssetKey): The asset to get per-partition status for.\n partition_keys (Sequence[str]): The partitions to get status for.\n partitions_def (PartitionsDefinition): The PartitionsDefinition of the asset to get\n per-partition status for.\n\n Returns:\n Optional[Mapping[str, AssetPartitionStatus]]: status for each partition key\n\n """\n from dagster._core.storage.partition_status_cache import (\n AssetPartitionStatus,\n AssetStatusCacheValue,\n get_and_update_asset_status_cache_value,\n )\n\n cached_value = get_and_update_asset_status_cache_value(self, asset_key, partitions_def)\n\n if isinstance(cached_value, AssetStatusCacheValue):\n materialized_partitions = cached_value.deserialize_materialized_partition_subsets(\n partitions_def\n )\n failed_partitions = cached_value.deserialize_failed_partition_subsets(partitions_def)\n in_progress_partitions = cached_value.deserialize_in_progress_partition_subsets(\n partitions_def\n )\n\n status_by_partition = {}\n\n for partition_key in partition_keys:\n if partition_key in in_progress_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.IN_PROGRESS\n elif partition_key in failed_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.FAILED\n elif partition_key in materialized_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.MATERIALIZED\n else:\n status_by_partition[partition_key] = None\n\n return status_by_partition
\n\n
[docs] @public\n @traced\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence["AssetRecord"]:\n """Return an `AssetRecord` for each of the given asset keys.\n\n Args:\n asset_keys (Optional[Sequence[AssetKey]]): List of asset keys to retrieve records for.\n\n Returns:\n Sequence[AssetRecord]: List of asset records.\n """\n return self._event_storage.get_asset_records(asset_keys)
\n\n @traced\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n """Fetches asset event tags for the given asset key.\n\n If filter_tags is provided, searches for events containing all of the filter tags. Then,\n returns all tags for those events. This enables searching for multipartitioned asset\n partition tags with a fixed dimension value, e.g. all of the tags for events where\n "country" == "US".\n\n If filter_event_id is provided, searches for the event with the provided event_id.\n\n Returns a list of dicts, where each dict is a mapping of tag key to tag value for a\n single event.\n """\n return self._event_storage.get_event_tags_for_asset(asset_key, filter_tags, filter_event_id)\n\n
[docs] @public\n @traced\n def wipe_assets(self, asset_keys: Sequence[AssetKey]) -> None:\n """Wipes asset event history from the event log for the given asset keys.\n\n Args:\n asset_keys (Sequence[AssetKey]): Asset keys to wipe.\n """\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset(asset_key)
\n\n @traced\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey], after_cursor: Optional[int] = None\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n return self._event_storage.get_materialization_count_by_partition(asset_keys, after_cursor)\n\n @traced\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n return self._event_storage.get_materialized_partitions(\n asset_key, before_cursor=before_cursor, after_cursor=after_cursor\n )\n\n @traced\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: "DagsterEventType"\n ) -> Mapping[str, int]:\n """Fetch the latest materialzation storage id for each partition for a given asset key.\n\n Returns a mapping of partition to storage id.\n """\n return self._event_storage.get_latest_storage_id_by_partition(asset_key, event_type)\n\n
[docs] @public\n @traced\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the set of partition keys for the specified :py:class:`DynamicPartitionsDefinition`.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n return self._event_storage.get_dynamic_partitions(partitions_def_name)
\n\n
[docs] @public\n @traced\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n """Add partitions to the specified :py:class:`DynamicPartitionsDefinition` idempotently.\n Does not add any partitions that already exist.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_keys (Sequence[str]): Partition keys to add.\n """\n from dagster._core.definitions.partition import (\n raise_error_on_invalid_partition_key_substring,\n )\n\n check.str_param(partitions_def_name, "partitions_def_name")\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n if isinstance(partition_keys, str):\n # Guard against a single string being passed in `partition_keys`\n raise DagsterInvalidInvocationError("partition_keys must be a sequence of strings")\n raise_error_on_invalid_partition_key_substring(partition_keys)\n return self._event_storage.add_dynamic_partitions(partitions_def_name, partition_keys)
\n\n
[docs] @public\n @traced\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n """Delete a partition for the specified :py:class:`DynamicPartitionsDefinition`.\n If the partition does not exist, exits silently.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_key (Sequence[str]): Partition key to delete.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n check.sequence_param(partition_key, "partition_key", of_type=str)\n self._event_storage.delete_dynamic_partition(partitions_def_name, partition_key)
\n\n
[docs] @public\n @traced\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n """Check if a partition key exists for the :py:class:`DynamicPartitionsDefinition`.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_key (Sequence[str]): Partition key to check.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n check.str_param(partition_key, "partition_key")\n return self._event_storage.has_dynamic_partition(partitions_def_name, partition_key)
\n\n # event subscriptions\n\n def _get_yaml_python_handlers(self) -> Sequence[logging.Handler]:\n if self._settings:\n logging_config = self.get_settings("python_logs").get("dagster_handler_config", {})\n\n if logging_config:\n experimental_warning("Handling yaml-defined logging configuration")\n\n # Handlers can only be retrieved from dictConfig configuration if they are attached\n # to a logger. We add a dummy logger to the configuration that allows us to access user\n # defined handlers.\n handler_names = logging_config.get("handlers", {}).keys()\n\n dagster_dummy_logger_name = "dagster_dummy_logger"\n\n processed_dict_conf = {\n "version": 1,\n "disable_existing_loggers": False,\n "loggers": {dagster_dummy_logger_name: {"handlers": handler_names}},\n }\n processed_dict_conf.update(logging_config)\n\n logging.config.dictConfig(processed_dict_conf)\n\n dummy_logger = logging.getLogger(dagster_dummy_logger_name)\n return dummy_logger.handlers\n return []\n\n def _get_event_log_handler(self) -> _EventListenerLogHandler:\n event_log_handler = _EventListenerLogHandler(self)\n event_log_handler.setLevel(10)\n return event_log_handler\n\n def get_handlers(self) -> Sequence[logging.Handler]:\n handlers: List[logging.Handler] = [self._get_event_log_handler()]\n handlers.extend(self._get_yaml_python_handlers())\n return handlers\n\n def store_event(self, event: "EventLogEntry") -> None:\n self._event_storage.store_event(event)\n\n def handle_new_event(self, event: "EventLogEntry") -> None:\n run_id = event.run_id\n\n self._event_storage.store_event(event)\n\n if event.is_dagster_event and event.get_dagster_event().is_job_event:\n self._run_storage.handle_run_event(run_id, event.get_dagster_event())\n\n for sub in self._subscribers[run_id]:\n sub(event)\n\n def add_event_listener(self, run_id: str, cb) -> None:\n self._subscribers[run_id].append(cb)\n\n def report_engine_event(\n self,\n message: str,\n dagster_run: Optional[DagsterRun] = None,\n engine_event_data: 
Optional["EngineEventData"] = None,\n cls: Optional[Type[object]] = None,\n step_key: Optional[str] = None,\n job_name: Optional[str] = None,\n run_id: Optional[str] = None,\n ) -> "DagsterEvent":\n """Report a EngineEvent that occurred outside of a job execution context."""\n from dagster._core.events import DagsterEvent, DagsterEventType, EngineEventData\n\n check.opt_class_param(cls, "cls")\n check.str_param(message, "message")\n check.opt_inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(run_id, "run_id")\n check.opt_str_param(job_name, "job_name")\n\n check.invariant(\n dagster_run or (job_name and run_id),\n "Must include either dagster_run or job_name and run_id",\n )\n\n run_id = run_id if run_id else dagster_run.run_id # type: ignore\n job_name = job_name if job_name else dagster_run.job_name # type: ignore\n\n engine_event_data = check.opt_inst_param(\n engine_event_data,\n "engine_event_data",\n EngineEventData,\n EngineEventData({}),\n )\n\n if cls:\n message = f"[{cls.__name__}] {message}"\n\n log_level = logging.INFO\n if engine_event_data and engine_event_data.error:\n log_level = logging.ERROR\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n job_name=job_name,\n message=message,\n event_specific_data=engine_event_data,\n step_key=step_key,\n )\n self.report_dagster_event(dagster_event, run_id=run_id, log_level=log_level)\n return dagster_event\n\n def report_dagster_event(\n self,\n dagster_event: "DagsterEvent",\n run_id: str,\n log_level: Union[str, int] = logging.INFO,\n ) -> None:\n """Takes a DagsterEvent and stores it in persistent storage for the corresponding DagsterRun."""\n from dagster._core.events.log import EventLogEntry\n\n event_record = EventLogEntry(\n user_message="",\n level=log_level,\n job_name=dagster_event.job_name,\n run_id=run_id,\n error_info=None,\n timestamp=time.time(),\n step_key=dagster_event.step_key,\n dagster_event=dagster_event,\n )\n 
self.handle_new_event(event_record)\n\n def report_run_canceling(self, run: DagsterRun, message: Optional[str] = None):\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(run, "run", DagsterRun)\n message = check.opt_str_param(\n message,\n "message",\n "Sending run termination request.",\n )\n canceling_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELING.value,\n job_name=run.job_name,\n message=message,\n )\n self.report_dagster_event(canceling_event, run_id=run.run_id)\n\n def report_run_canceled(\n self,\n dagster_run: DagsterRun,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n message = check.opt_str_param(\n message,\n "mesage",\n "This run has been marked as canceled from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELED.value,\n job_name=dagster_run.job_name,\n message=message,\n )\n self.report_dagster_event(dagster_event, run_id=dagster_run.run_id, log_level=logging.ERROR)\n return dagster_event\n\n def report_run_failed(\n self, dagster_run: DagsterRun, message: Optional[str] = None\n ) -> "DagsterEvent":\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n message = check.opt_str_param(\n message,\n "message",\n "This run has been marked as failed from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_FAILURE.value,\n job_name=dagster_run.job_name,\n message=message,\n )\n self.report_dagster_event(dagster_event, run_id=dagster_run.run_id, log_level=logging.ERROR)\n return dagster_event\n\n # directories\n\n def file_manager_directory(self, run_id: str) -> str:\n return self._local_artifact_storage.file_manager_dir(run_id)\n\n def 
storage_directory(self) -> str:\n return self._local_artifact_storage.storage_dir\n\n def schedules_directory(self) -> str:\n return self._local_artifact_storage.schedules_dir\n\n # Runs coordinator\n\n def submit_run(self, run_id: str, workspace: "IWorkspace") -> DagsterRun:\n """Submit a pipeline run to the coordinator.\n\n This method delegates to the ``RunCoordinator``, configured on the instance, and will\n call its implementation of ``RunCoordinator.submit_run()`` to send the run to the\n coordinator for execution. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.NOT_STARTED`` state. They also must have a non-null\n ExternalPipelineOrigin.\n\n Args:\n run_id (str): The id of the run.\n """\n from dagster._core.host_representation import ExternalJobOrigin\n from dagster._core.run_coordinator import SubmitRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to submit_run"\n )\n\n check.inst(\n run.external_job_origin,\n ExternalJobOrigin,\n "External pipeline origin must be set for submitted runs",\n )\n check.inst(\n run.job_code_origin,\n JobPythonOrigin,\n "Python origin must be set for submitted runs",\n )\n\n try:\n submitted_run = self.run_coordinator.submit_run(\n SubmitRunContext(run, workspace=workspace)\n )\n except:\n from dagster._core.events import EngineEventData\n\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return submitted_run\n\n # Run launcher\n\n def launch_run(self, run_id: str, workspace: "IWorkspace") -> DagsterRun:\n """Launch a pipeline run.\n\n This method is typically called using `instance.submit_run` rather than being invoked\n directly. 
This method delegates to the ``RunLauncher``, if any, configured on the instance,\n and will call its implementation of ``RunLauncher.launch_run()`` to begin the execution of\n the specified run. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and should be in the\n ``PipelineRunStatus.NOT_STARTED`` state.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n from dagster._core.events import DagsterEvent, DagsterEventType, EngineEventData\n from dagster._core.launcher import LaunchRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to launch_run"\n )\n\n launch_started_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_STARTING.value,\n job_name=run.job_name,\n )\n self.report_dagster_event(launch_started_event, run_id=run.run_id)\n\n run = self.get_run_by_id(run_id)\n if run is None:\n check.failed(f"Failed to reload run {run_id}")\n\n try:\n self.run_launcher.launch_run(LaunchRunContext(dagster_run=run, workspace=workspace))\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n def resume_run(self, run_id: str, workspace: "IWorkspace", attempt_number: int) -> DagsterRun:\n """Resume a pipeline run.\n\n This method should be called on runs which have already been launched, but whose run workers\n have died.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n from dagster._core.events import EngineEventData\n from dagster._core.launcher import ResumeRunContext\n from dagster._daemon.monitoring import RESUME_RUN_LOG_MESSAGE\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to resume_run"\n )\n 
if run.status not in IN_PROGRESS_RUN_STATUSES:\n raise DagsterInvariantViolationError(\n f"Run {run_id} is not in a state that can be resumed"\n )\n\n self.report_engine_event(\n RESUME_RUN_LOG_MESSAGE,\n run,\n )\n\n try:\n self.run_launcher.resume_run(\n ResumeRunContext(\n dagster_run=run,\n workspace=workspace,\n resume_attempt_number=attempt_number,\n )\n )\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n def count_resume_run_attempts(self, run_id: str) -> int:\n from dagster._daemon.monitoring import count_resume_run_attempts\n\n return count_resume_run_attempts(self, run_id)\n\n def run_will_resume(self, run_id: str) -> bool:\n if not self.run_monitoring_enabled:\n return False\n return self.count_resume_run_attempts(run_id) < self.run_monitoring_max_resume_run_attempts\n\n # Scheduler\n\n def start_schedule(self, external_schedule: "ExternalSchedule") -> "InstigatorState":\n return self._scheduler.start_schedule(self, external_schedule) # type: ignore\n\n def stop_schedule(\n self,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional["ExternalSchedule"],\n ) -> "InstigatorState":\n return self._scheduler.stop_schedule( # type: ignore\n self, schedule_origin_id, schedule_selector_id, external_schedule\n )\n\n def scheduler_debug_info(self) -> "SchedulerDebugInfo":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler import SchedulerDebugInfo\n\n errors = []\n\n schedules: List[str] = []\n for schedule_state in self.all_instigator_state(instigator_type=InstigatorType.SCHEDULE):\n schedule_info: Mapping[str, Mapping[str, object]] = {\n schedule_state.instigator_name: {\n "status": schedule_state.status.value,\n "cron_schedule": schedule_state.instigator_data.cron_schedule,\n "schedule_origin_id": 
schedule_state.instigator_origin_id,\n "repository_origin_id": schedule_state.repository_origin_id,\n }\n }\n\n schedules.append(yaml.safe_dump(schedule_info, default_flow_style=False))\n\n return SchedulerDebugInfo(\n scheduler_config_info=self._info_str_for_component("Scheduler", self.scheduler),\n scheduler_info=self.scheduler.debug_info(), # type: ignore\n schedule_storage=schedules,\n errors=errors,\n )\n\n # Schedule / Sensor Storage\n\n def start_sensor(self, external_sensor: "ExternalSensor") -> "InstigatorState":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(\n external_sensor.get_external_origin_id(), external_sensor.selector_id\n )\n\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n if not stored_state:\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.RUNNING,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.RUNNING))\n\n def stop_sensor(\n self,\n instigator_origin_id: str,\n selector_id: str,\n external_sensor: Optional["ExternalSensor"],\n ) -> "InstigatorState":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(instigator_origin_id, selector_id)\n computed_state: InstigatorState\n if external_sensor:\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n else:\n computed_state = check.not_none(stored_state)\n\n if not computed_state.is_running:\n return computed_state\n\n 
if not stored_state:\n assert external_sensor\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.STOPPED,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.STOPPED))\n\n @traced\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional["InstigatorType"] = None,\n instigator_statuses: Optional[Set["InstigatorStatus"]] = None,\n ):\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.all_instigator_state(\n repository_origin_id, repository_selector_id, instigator_type, instigator_statuses\n )\n\n @traced\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional["InstigatorState"]:\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.get_instigator_state(origin_id, selector_id)\n\n def add_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.add_instigator_state(state)\n\n def update_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.update_instigator_state(state)\n\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n return self._schedule_storage.delete_instigator_state(origin_id, selector_id) # type: ignore # (possible none)\n\n @property\n def supports_batch_tick_queries(self) -> bool:\n return self._schedule_storage and self._schedule_storage.supports_batch_queries # type: ignore # (possible none)\n\n @traced\n def 
get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Mapping[str, Sequence["InstigatorTick"]]:\n if not self._schedule_storage:\n return {}\n return self._schedule_storage.get_batch_ticks(selector_ids, limit, statuses)\n\n @traced\n def get_tick(\n self, origin_id: str, selector_id: str, timestamp: float\n ) -> Optional["InstigatorTick"]:\n matches = self._schedule_storage.get_ticks( # type: ignore # (possible none)\n origin_id, selector_id, before=timestamp + 1, after=timestamp - 1, limit=1\n )\n return matches[0] if len(matches) else None\n\n @traced\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Sequence["InstigatorTick"]:\n return self._schedule_storage.get_ticks( # type: ignore # (possible none)\n origin_id, selector_id, before=before, after=after, limit=limit, statuses=statuses\n )\n\n def create_tick(self, tick_data: "TickData") -> "InstigatorTick":\n return check.not_none(self._schedule_storage).create_tick(tick_data)\n\n def update_tick(self, tick: "InstigatorTick"):\n return check.not_none(self._schedule_storage).update_tick(tick)\n\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> None:\n self._schedule_storage.purge_ticks(origin_id, selector_id, before, tick_statuses) # type: ignore # (possible none)\n\n def wipe_all_schedules(self) -> None:\n if self._scheduler:\n self._scheduler.wipe(self) # type: ignore # (possible none)\n\n self._schedule_storage.wipe() # type: ignore # (possible none)\n\n def logs_path_for_schedule(self, schedule_origin_id: str) -> str:\n return self._scheduler.get_logs_path(self, schedule_origin_id) # type: ignore # (possible none)\n\n def __enter__(self) -> Self:\n return 
self\n\n def __exit__(\n self,\n exception_type: Optional[Type[BaseException]],\n exception_value: Optional[BaseException],\n traceback: Optional[TracebackType],\n ) -> None:\n self.dispose()\n\n # dagster daemon\n def add_daemon_heartbeat(self, daemon_heartbeat: "DaemonHeartbeat") -> None:\n """Called on a regular interval by the daemon."""\n self._run_storage.add_daemon_heartbeat(daemon_heartbeat)\n\n def get_daemon_heartbeats(self) -> Mapping[str, "DaemonHeartbeat"]:\n """Latest heartbeats of all daemon types."""\n return self._run_storage.get_daemon_heartbeats()\n\n def wipe_daemon_heartbeats(self) -> None:\n self._run_storage.wipe_daemon_heartbeats()\n\n def get_required_daemon_types(self) -> Sequence[str]:\n from dagster._core.run_coordinator import QueuedRunCoordinator\n from dagster._core.scheduler import DagsterDaemonScheduler\n from dagster._daemon.asset_daemon import AssetDaemon\n from dagster._daemon.auto_run_reexecution.event_log_consumer import EventLogConsumerDaemon\n from dagster._daemon.daemon import (\n BackfillDaemon,\n MonitoringDaemon,\n SchedulerDaemon,\n SensorDaemon,\n )\n from dagster._daemon.run_coordinator.queued_run_coordinator_daemon import (\n QueuedRunCoordinatorDaemon,\n )\n\n if self.is_ephemeral:\n return []\n\n daemons = [SensorDaemon.daemon_type(), BackfillDaemon.daemon_type()]\n if isinstance(self.scheduler, DagsterDaemonScheduler):\n daemons.append(SchedulerDaemon.daemon_type())\n if isinstance(self.run_coordinator, QueuedRunCoordinator):\n daemons.append(QueuedRunCoordinatorDaemon.daemon_type())\n if self.run_monitoring_enabled:\n daemons.append(MonitoringDaemon.daemon_type())\n if self.run_retries_enabled:\n daemons.append(EventLogConsumerDaemon.daemon_type())\n if self.auto_materialize_enabled:\n daemons.append(AssetDaemon.daemon_type())\n return daemons\n\n def get_daemon_statuses(\n self, daemon_types: Optional[Sequence[str]] = None\n ) -> Mapping[str, "DaemonStatus"]:\n """Get the current status of the daemons. 
If daemon_types aren't provided, defaults to all\n required types. Returns a dict of daemon type to status.\n """\n from dagster._daemon.controller import get_daemon_statuses\n\n check.opt_sequence_param(daemon_types, "daemon_types", of_type=str)\n return get_daemon_statuses(\n self, daemon_types=daemon_types or self.get_required_daemon_types(), ignore_errors=True\n )\n\n @property\n def daemon_skip_heartbeats_without_errors(self) -> bool:\n # If enabled, daemon threads won't write heartbeats unless they encounter an error. This is\n # enabled in cloud, where we don't need to use heartbeats to check if daemons are running, but\n # do need to surface errors to users. This is an optimization to reduce DB writes.\n return False\n\n # backfill\n def get_backfills(\n self,\n status: Optional["BulkActionStatus"] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence["PartitionBackfill"]:\n return self._run_storage.get_backfills(status=status, cursor=cursor, limit=limit)\n\n def get_backfill(self, backfill_id: str) -> Optional["PartitionBackfill"]:\n return self._run_storage.get_backfill(backfill_id)\n\n def add_backfill(self, partition_backfill: "PartitionBackfill") -> None:\n self._run_storage.add_backfill(partition_backfill)\n\n def update_backfill(self, partition_backfill: "PartitionBackfill") -> None:\n self._run_storage.update_backfill(partition_backfill)\n\n @property\n def should_start_background_run_thread(self) -> bool:\n """Gate on an experimental feature to start a thread that monitors for if the run should be canceled."""\n return False\n\n def get_tick_retention_settings(\n self, instigator_type: "InstigatorType"\n ) -> Mapping["TickStatus", int]:\n from dagster._core.definitions.run_request import InstigatorType\n\n retention_settings = self.get_settings("retention")\n\n if instigator_type == InstigatorType.SCHEDULE:\n tick_settings = retention_settings.get("schedule")\n elif instigator_type == InstigatorType.SENSOR:\n 
tick_settings = retention_settings.get("sensor")\n elif instigator_type == InstigatorType.AUTO_MATERIALIZE:\n tick_settings = retention_settings.get("auto_materialize")\n else:\n raise Exception(f"Unexpected instigator type {instigator_type}")\n\n default_tick_settings = get_default_tick_retention_settings(instigator_type)\n return get_tick_retention_settings(tick_settings, default_tick_settings)\n\n def inject_env_vars(self, location_name: Optional[str]) -> None:\n if not self._secrets_loader:\n return\n\n new_env = self._secrets_loader.get_secrets_for_environment(location_name)\n for k, v in new_env.items():\n os.environ[k] = v\n\n def get_latest_data_version_record(\n self,\n key: AssetKey,\n is_source: Optional[bool] = None,\n partition_key: Optional[str] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Optional["EventLogRecord"]:\n from dagster._core.event_api import EventRecordsFilter\n from dagster._core.events import DagsterEventType\n\n # When we cant don't know whether the requested key corresponds to a source or regular\n # asset, we need to retrieve both the latest observation and materialization for all assets.\n # If there is a materialization, it's a regular asset and we can ignore the observation.\n\n observation: Optional[EventLogRecord] = None\n if is_source or is_source is None:\n observations = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n asset_key=key,\n asset_partitions=[partition_key] if partition_key else None,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n ),\n limit=1,\n )\n observation = next(iter(observations), None)\n\n materialization: Optional[EventLogRecord] = None\n if not is_source:\n materializations = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=key,\n asset_partitions=[partition_key] if partition_key else None,\n before_cursor=before_cursor,\n 
after_cursor=after_cursor,\n ),\n limit=1,\n )\n materialization = next(iter(materializations), None)\n\n return materialization or observation\n\n
[docs] @public\n def get_latest_materialization_code_versions(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional[str]]:\n """Returns the code version used for the latest materialization of each of the provided\n assets.\n\n Args:\n asset_keys (Iterable[AssetKey]): The asset keys to find latest materialization code\n versions for.\n\n Returns:\n Mapping[AssetKey, Optional[str]]: A dictionary with a key for each of the provided asset\n keys. The values will be None if the asset has no materializations. If an asset does\n not have a code version explicitly assigned to its definitions, but was\n materialized, Dagster assigns the run ID as its code version.\n """\n result: Dict[AssetKey, Optional[str]] = {}\n latest_materialization_events = self.get_latest_materialization_events(asset_keys)\n for asset_key in asset_keys:\n event_log_entry = latest_materialization_events.get(asset_key)\n if event_log_entry is None:\n result[asset_key] = None\n else:\n data_provenance = extract_data_provenance_from_entry(event_log_entry)\n result[asset_key] = data_provenance.code_version if data_provenance else None\n\n return result
\n\n @experimental\n def report_runless_asset_event(\n self,\n asset_event: Union["AssetMaterialization", "AssetObservation", "AssetCheckEvaluation"],\n ):\n """Record an event log entry related to assets that does not belong to a Dagster run."""\n from dagster._core.events import (\n AssetMaterialization,\n AssetObservationData,\n DagsterEvent,\n DagsterEventType,\n StepMaterializationData,\n )\n\n if isinstance(asset_event, AssetMaterialization):\n event_type_value = DagsterEventType.ASSET_MATERIALIZATION.value\n data_payload = StepMaterializationData(asset_event)\n elif isinstance(asset_event, AssetCheckEvaluation):\n event_type_value = DagsterEventType.ASSET_CHECK_EVALUATION.value\n data_payload = asset_event\n elif isinstance(asset_event, AssetObservation):\n event_type_value = DagsterEventType.ASSET_OBSERVATION.value\n data_payload = AssetObservationData(asset_event)\n else:\n raise DagsterInvariantViolationError(\n f"Received unexpected asset event type {asset_event}, expected"\n " AssetMaterialization, AssetObservation or AssetCheckEvaluation"\n )\n\n return self.report_dagster_event(\n run_id=RUNLESS_RUN_ID,\n dagster_event=DagsterEvent(\n event_type_value=event_type_value,\n event_specific_data=data_payload,\n job_name=RUNLESS_JOB_NAME,\n ),\n )\n\n def get_asset_check_support(self) -> "AssetCheckInstanceSupport":\n from dagster._core.storage.asset_check_execution_record import AssetCheckInstanceSupport\n\n return (\n AssetCheckInstanceSupport.SUPPORTED\n if self.event_log_storage.supports_asset_checks\n else AssetCheckInstanceSupport.NEEDS_MIGRATION\n )
\n
", "current_page_name": "_modules/dagster/_core/instance", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "ref": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance.ref

\nimport os\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Sequence, Type\n\nimport yaml\n\nimport dagster._check as check\nfrom dagster._serdes import ConfigurableClassData, class_from_code_pointer, whitelist_for_serdes\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME, dagster_instance_config\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance, DagsterInstanceOverrides\n    from dagster._core.launcher.base import RunLauncher\n    from dagster._core.run_coordinator.base import RunCoordinator\n    from dagster._core.scheduler.scheduler import Scheduler\n    from dagster._core.secrets.loader import SecretsLoader\n    from dagster._core.storage.base_storage import DagsterStorage\n    from dagster._core.storage.compute_log_manager import ComputeLogManager\n    from dagster._core.storage.event_log.base import EventLogStorage\n    from dagster._core.storage.root import LocalArtifactStorage\n    from dagster._core.storage.runs.base import RunStorage\n    from dagster._core.storage.schedules.base import ScheduleStorage\n\n\ndef compute_logs_directory(base: str) -> str:\n    return os.path.join(base, "storage")\n\n\ndef _runs_directory(base: str) -> str:\n    return os.path.join(base, "history", "")\n\n\ndef _event_logs_directory(base: str) -> str:\n    return os.path.join(base, "history", "runs", "")\n\n\ndef _schedule_directory(base: str) -> str:\n    return os.path.join(base, "schedules")\n\n\ndef configurable_class_data(config_field: Mapping[str, Any]) -> ConfigurableClassData:\n    return ConfigurableClassData(\n        check.str_elem(config_field, "module"),\n        check.str_elem(config_field, "class"),\n        yaml.dump(check.opt_dict_elem(config_field, "config"), default_flow_style=False),\n    )\n\n\ndef configurable_class_data_or_default(\n    config_value: Mapping[str, Any], field_name: str, default: Optional[ConfigurableClassData]\n) -> Optional[ConfigurableClassData]:\n    return (\n        
configurable_class_data(config_value[field_name])\n        if config_value.get(field_name)\n        else default\n    )\n\n\ndef configurable_secrets_loader_data(\n    config_field: Mapping[str, Any], default: Optional[ConfigurableClassData]\n) -> Optional[ConfigurableClassData]:\n    if not config_field:\n        return default\n    elif "custom" in config_field:\n        return configurable_class_data(config_field["custom"])\n    else:\n        return None\n\n\ndef configurable_storage_data(\n    config_field: Mapping[str, Any], defaults: Mapping[str, Optional[ConfigurableClassData]]\n) -> Sequence[Optional[ConfigurableClassData]]:\n    storage_data: ConfigurableClassData\n    run_storage_data: Optional[ConfigurableClassData]\n    event_storage_data: Optional[ConfigurableClassData]\n    schedule_storage_data: Optional[ConfigurableClassData]\n\n    if not config_field:\n        storage_data = check.not_none(defaults.get("storage"))\n        run_storage_data = check.not_none(defaults.get("run_storage"))\n        event_storage_data = check.not_none(defaults.get("event_log_storage"))\n        schedule_storage_data = check.not_none(defaults.get("schedule_storage"))\n    elif "postgres" in config_field:\n        config_yaml = yaml.dump(config_field["postgres"], default_flow_style=False)\n        storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="DagsterPostgresStorage",\n            config_yaml=config_yaml,\n        )\n        # for backwards compatibility\n        run_storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="PostgresRunStorage",\n            config_yaml=config_yaml,\n        )\n        event_storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="PostgresEventLogStorage",\n            config_yaml=config_yaml,\n        )\n        schedule_storage_data = ConfigurableClassData(\n            
module_name="dagster_postgres",\n            class_name="PostgresScheduleStorage",\n            config_yaml=config_yaml,\n        )\n\n    elif "mysql" in config_field:\n        config_yaml = yaml.dump(config_field["mysql"], default_flow_style=False)\n        storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="DagsterMySQLStorage",\n            config_yaml=config_yaml,\n        )\n        # for backwards compatibility\n        run_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLRunStorage",\n            config_yaml=config_yaml,\n        )\n        event_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLEventLogStorage",\n            config_yaml=config_yaml,\n        )\n        schedule_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLScheduleStorage",\n            config_yaml=config_yaml,\n        )\n\n    elif "sqlite" in config_field:\n        base_dir = config_field["sqlite"]["base_dir"]\n        storage_data = ConfigurableClassData(\n            "dagster._core.storage.sqlite_storage",\n            "DagsterSqliteStorage",\n            yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n        )\n\n        # Back-compat fo the legacy storage field only works if the base_dir is a string\n        # (env var doesn't work since each storage has a different value for the base_dir field)\n        if isinstance(base_dir, str):\n            run_storage_data = ConfigurableClassData(\n                "dagster._core.storage.runs",\n                "SqliteRunStorage",\n                yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n            )\n\n            event_storage_data = ConfigurableClassData(\n                "dagster._core.storage.event_log",\n                "SqliteEventLogStorage",\n      
          yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n            )\n\n            schedule_storage_data = ConfigurableClassData(\n                "dagster._core.storage.schedules",\n                "SqliteScheduleStorage",\n                yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n            )\n        else:\n            run_storage_data = None\n            event_storage_data = None\n            schedule_storage_data = None\n    else:\n        storage_data = configurable_class_data(config_field["custom"])\n        storage_config_yaml = yaml.dump(\n            {\n                "module_name": storage_data.module_name,\n                "class_name": storage_data.class_name,\n                "config_yaml": storage_data.config_yaml,\n            },\n            default_flow_style=False,\n        )\n        run_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyRunStorage", storage_config_yaml\n        )\n        event_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyEventLogStorage", storage_config_yaml\n        )\n        schedule_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyScheduleStorage", storage_config_yaml\n        )\n\n    return [storage_data, run_storage_data, event_storage_data, schedule_storage_data]\n\n\n
[docs]@whitelist_for_serdes\nclass InstanceRef(\n NamedTuple(\n "_InstanceRef",\n [\n ("local_artifact_storage_data", ConfigurableClassData),\n ("compute_logs_data", ConfigurableClassData),\n ("scheduler_data", Optional[ConfigurableClassData]),\n ("run_coordinator_data", Optional[ConfigurableClassData]),\n ("run_launcher_data", Optional[ConfigurableClassData]),\n ("settings", Mapping[str, object]),\n # Required for backwards compatibility, but going forward will be unused by new versions\n # of DagsterInstance, which instead will instead grab the constituent storages from the\n # unified `storage_data`, if it is populated.\n ("run_storage_data", Optional[ConfigurableClassData]),\n ("event_storage_data", Optional[ConfigurableClassData]),\n ("schedule_storage_data", Optional[ConfigurableClassData]),\n ("custom_instance_class_data", Optional[ConfigurableClassData]),\n # unified storage field\n ("storage_data", Optional[ConfigurableClassData]),\n ("secrets_loader_data", Optional[ConfigurableClassData]),\n ],\n )\n):\n """Serializable representation of a :py:class:`DagsterInstance`.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n local_artifact_storage_data: ConfigurableClassData,\n compute_logs_data: ConfigurableClassData,\n scheduler_data: Optional[ConfigurableClassData],\n run_coordinator_data: Optional[ConfigurableClassData],\n run_launcher_data: Optional[ConfigurableClassData],\n settings: Mapping[str, object],\n run_storage_data: Optional[ConfigurableClassData],\n event_storage_data: Optional[ConfigurableClassData],\n schedule_storage_data: Optional[ConfigurableClassData],\n custom_instance_class_data: Optional[ConfigurableClassData] = None,\n storage_data: Optional[ConfigurableClassData] = None,\n secrets_loader_data: Optional[ConfigurableClassData] = None,\n ):\n return super(cls, InstanceRef).__new__(\n cls,\n local_artifact_storage_data=check.inst_param(\n local_artifact_storage_data, "local_artifact_storage_data", 
ConfigurableClassData\n ),\n compute_logs_data=check.inst_param(\n compute_logs_data, "compute_logs_data", ConfigurableClassData\n ),\n scheduler_data=check.opt_inst_param(\n scheduler_data, "scheduler_data", ConfigurableClassData\n ),\n run_coordinator_data=check.opt_inst_param(\n run_coordinator_data, "run_coordinator_data", ConfigurableClassData\n ),\n run_launcher_data=check.opt_inst_param(\n run_launcher_data, "run_launcher_data", ConfigurableClassData\n ),\n settings=check.opt_mapping_param(settings, "settings", key_type=str),\n run_storage_data=check.opt_inst_param(\n run_storage_data, "run_storage_data", ConfigurableClassData\n ),\n event_storage_data=check.opt_inst_param(\n event_storage_data, "event_storage_data", ConfigurableClassData\n ),\n schedule_storage_data=check.opt_inst_param(\n schedule_storage_data, "schedule_storage_data", ConfigurableClassData\n ),\n custom_instance_class_data=check.opt_inst_param(\n custom_instance_class_data,\n "instance_class",\n ConfigurableClassData,\n ),\n storage_data=check.opt_inst_param(storage_data, "storage_data", ConfigurableClassData),\n secrets_loader_data=check.opt_inst_param(\n secrets_loader_data, "secrets_loader_data", ConfigurableClassData\n ),\n )\n\n @staticmethod\n def config_defaults(base_dir: str) -> Mapping[str, Optional[ConfigurableClassData]]:\n default_run_storage_data = ConfigurableClassData(\n "dagster._core.storage.runs",\n "SqliteRunStorage",\n yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n )\n default_event_log_storage_data = ConfigurableClassData(\n "dagster._core.storage.event_log",\n "SqliteEventLogStorage",\n yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n )\n default_schedule_storage_data = ConfigurableClassData(\n "dagster._core.storage.schedules",\n "SqliteScheduleStorage",\n yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n )\n\n return {\n "local_artifact_storage": 
ConfigurableClassData(\n "dagster._core.storage.root",\n "LocalArtifactStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "storage": ConfigurableClassData(\n "dagster._core.storage.sqlite_storage",\n "DagsterSqliteStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "compute_logs": ConfigurableClassData(\n "dagster._core.storage.local_compute_log_manager",\n "LocalComputeLogManager",\n yaml.dump({"base_dir": compute_logs_directory(base_dir)}, default_flow_style=False),\n ),\n "scheduler": ConfigurableClassData(\n "dagster._core.scheduler",\n "DagsterDaemonScheduler",\n yaml.dump({}),\n ),\n "run_coordinator": ConfigurableClassData(\n "dagster._core.run_coordinator", "DefaultRunCoordinator", yaml.dump({})\n ),\n "run_launcher": ConfigurableClassData(\n "dagster",\n "DefaultRunLauncher",\n yaml.dump({}),\n ),\n # For back-compat, the default is actually set in the secrets_loader property above,\n # so that old clients loading new config don't try to load a class that they\n # don't recognize\n "secrets": None,\n # LEGACY DEFAULTS\n "run_storage": default_run_storage_data,\n "event_log_storage": default_event_log_storage_data,\n "schedule_storage": default_schedule_storage_data,\n }\n\n @staticmethod\n def from_dir(\n base_dir: str,\n *,\n config_dir: Optional[str] = None,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n overrides: Optional["DagsterInstanceOverrides"] = None,\n ) -> "InstanceRef":\n if config_dir is None:\n config_dir = base_dir\n\n overrides = check.opt_mapping_param(overrides, "overrides")\n config_value, custom_instance_class = dagster_instance_config(\n config_dir, config_filename=config_filename, overrides=overrides\n )\n\n if custom_instance_class:\n config_keys = set(custom_instance_class.config_schema().keys()) # type: ignore # (undefined method)\n custom_instance_class_config = {\n key: val for key, val in config_value.items() if key in config_keys\n }\n 
custom_instance_class_data = ConfigurableClassData(\n config_value["instance_class"]["module"],\n config_value["instance_class"]["class"],\n yaml.dump(custom_instance_class_config, default_flow_style=False),\n )\n defaults = custom_instance_class.config_defaults(base_dir) # type: ignore # (undefined method)\n else:\n custom_instance_class_data = None\n defaults = InstanceRef.config_defaults(base_dir)\n\n local_artifact_storage_data = configurable_class_data_or_default(\n config_value, "local_artifact_storage", defaults["local_artifact_storage"]\n )\n\n compute_logs_data = configurable_class_data_or_default(\n config_value,\n "compute_logs",\n defaults["compute_logs"],\n )\n\n if (\n config_value.get("run_storage")\n or config_value.get("event_log_storage")\n or config_value.get("schedule_storage")\n ):\n # using legacy config, specifying config for each of the constituent storages, make sure\n # to create a composite storage\n run_storage_data = configurable_class_data_or_default(\n config_value, "run_storage", defaults["run_storage"]\n )\n event_storage_data = configurable_class_data_or_default(\n config_value, "event_log_storage", defaults["event_log_storage"]\n )\n schedule_storage_data = configurable_class_data_or_default(\n config_value, "schedule_storage", defaults["schedule_storage"]\n )\n storage_data = ConfigurableClassData(\n module_name="dagster._core.storage.legacy_storage",\n class_name="CompositeStorage",\n config_yaml=yaml.dump(\n {\n "run_storage": {\n "module_name": run_storage_data.module_name, # type: ignore # (possible none)\n "class_name": run_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": run_storage_data.config_yaml, # type: ignore # (possible none)\n },\n "event_log_storage": {\n "module_name": event_storage_data.module_name, # type: ignore # (possible none)\n "class_name": event_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": event_storage_data.config_yaml, # type: ignore # (possible 
none)\n },\n "schedule_storage": {\n "module_name": schedule_storage_data.module_name, # type: ignore # (possible none)\n "class_name": schedule_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": schedule_storage_data.config_yaml, # type: ignore # (possible none)\n },\n },\n default_flow_style=False,\n ),\n )\n\n else:\n [\n storage_data,\n run_storage_data,\n event_storage_data,\n schedule_storage_data,\n ] = configurable_storage_data(\n config_value.get("storage"), defaults # type: ignore # (possible none)\n )\n\n scheduler_data = configurable_class_data_or_default(\n config_value, "scheduler", defaults["scheduler"]\n )\n\n if config_value.get("run_queue"):\n run_coordinator_data = configurable_class_data(\n {\n "module": "dagster.core.run_coordinator",\n "class": "QueuedRunCoordinator",\n "config": config_value["run_queue"],\n }\n )\n else:\n run_coordinator_data = configurable_class_data_or_default(\n config_value,\n "run_coordinator",\n defaults["run_coordinator"],\n )\n\n run_launcher_data = configurable_class_data_or_default(\n config_value,\n "run_launcher",\n defaults["run_launcher"],\n )\n\n secrets_loader_data = configurable_secrets_loader_data(\n config_value.get("secrets"), defaults["secrets"] # type: ignore # (possible none)\n )\n\n settings_keys = {\n "telemetry",\n "python_logs",\n "run_monitoring",\n "run_retries",\n "code_servers",\n "retention",\n "sensors",\n "schedules",\n "nux",\n "auto_materialize",\n }\n settings = {key: config_value.get(key) for key in settings_keys if config_value.get(key)}\n\n return InstanceRef(\n local_artifact_storage_data=local_artifact_storage_data, # type: ignore # (possible none)\n run_storage_data=run_storage_data,\n event_storage_data=event_storage_data,\n compute_logs_data=compute_logs_data, # type: ignore # (possible none)\n schedule_storage_data=schedule_storage_data,\n scheduler_data=scheduler_data,\n run_coordinator_data=run_coordinator_data,\n run_launcher_data=run_launcher_data,\n 
settings=settings,\n custom_instance_class_data=custom_instance_class_data,\n storage_data=storage_data,\n secrets_loader_data=secrets_loader_data,\n )\n\n @staticmethod\n def from_dict(instance_ref_dict):\n def value_for_ref_item(k, v):\n if v is None:\n return None\n if k == "settings":\n return v\n return ConfigurableClassData(*v)\n\n return InstanceRef(**{k: value_for_ref_item(k, v) for k, v in instance_ref_dict.items()})\n\n @property\n def local_artifact_storage(self) -> "LocalArtifactStorage":\n from dagster._core.storage.root import LocalArtifactStorage\n\n return self.local_artifact_storage_data.rehydrate(as_type=LocalArtifactStorage)\n\n @property\n def storage(self) -> Optional["DagsterStorage"]:\n from dagster._core.storage.base_storage import DagsterStorage\n\n return self.storage_data.rehydrate(as_type=DagsterStorage) if self.storage_data else None\n\n @property\n def run_storage(self) -> Optional["RunStorage"]:\n from dagster._core.storage.runs.base import RunStorage\n\n return (\n self.run_storage_data.rehydrate(as_type=RunStorage) if self.run_storage_data else None\n )\n\n @property\n def event_storage(self) -> Optional["EventLogStorage"]:\n from dagster._core.storage.event_log.base import EventLogStorage\n\n return (\n self.event_storage_data.rehydrate(as_type=EventLogStorage)\n if self.event_storage_data\n else None\n )\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n from dagster._core.storage.schedules.base import ScheduleStorage\n\n return (\n self.schedule_storage_data.rehydrate(as_type=ScheduleStorage)\n if self.schedule_storage_data\n else None\n )\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n from dagster._core.storage.compute_log_manager import ComputeLogManager\n\n return self.compute_logs_data.rehydrate(as_type=ComputeLogManager)\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n from dagster._core.scheduler.scheduler import Scheduler\n\n return 
self.scheduler_data.rehydrate(as_type=Scheduler) if self.scheduler_data else None\n\n @property\n def run_coordinator(self) -> Optional["RunCoordinator"]:\n from dagster._core.run_coordinator.base import RunCoordinator\n\n return (\n self.run_coordinator_data.rehydrate(as_type=RunCoordinator)\n if self.run_coordinator_data\n else None\n )\n\n @property\n def run_launcher(self) -> Optional["RunLauncher"]:\n from dagster._core.launcher.base import RunLauncher\n\n return (\n self.run_launcher_data.rehydrate(as_type=RunLauncher)\n if self.run_launcher_data\n else None\n )\n\n @property\n def secrets_loader(self) -> Optional["SecretsLoader"]:\n from dagster._core.secrets.loader import SecretsLoader\n\n # Defining a default here rather than in stored config to avoid\n # back-compat issues when loading the config on older versions where\n # EnvFileLoader was not defined\n return (\n self.secrets_loader_data.rehydrate(as_type=SecretsLoader)\n if self.secrets_loader_data\n else None\n )\n\n @property\n def custom_instance_class(self) -> Type["DagsterInstance"]:\n return ( # type: ignore # (ambiguous return type)\n class_from_code_pointer(\n self.custom_instance_class_data.module_name,\n self.custom_instance_class_data.class_name,\n )\n if self.custom_instance_class_data\n else None\n )\n\n @property\n def custom_instance_class_config(self) -> Mapping[str, Any]:\n return (\n self.custom_instance_class_data.config_dict if self.custom_instance_class_data else {}\n )\n\n def to_dict(self) -> Mapping[str, Any]:\n return self._asdict()
\n
", "current_page_name": "_modules/dagster/_core/instance/ref", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.instance"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance.ref"}, "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance"}, "instance_for_test": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance_for_test

\nimport os\nimport sys\nimport tempfile\nfrom contextlib import ExitStack, contextmanager\nfrom typing import Any, Iterator, Mapping, Optional\n\nimport yaml\n\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .._utils.env import environ\nfrom .._utils.merger import merge_dicts\nfrom .instance import DagsterInstance\n\n\n
[docs]@contextmanager\ndef instance_for_test(\n overrides: Optional[Mapping[str, Any]] = None,\n set_dagster_home: bool = True,\n temp_dir: Optional[str] = None,\n) -> Iterator[DagsterInstance]:\n """Creates a persistent :py:class:`~dagster.DagsterInstance` available within a context manager.\n\n When a context manager is opened, if no `temp_dir` parameter is set, a new\n temporary directory will be created for the duration of the context\n manager's opening. If the `set_dagster_home` parameter is set to True\n (True by default), the `$DAGSTER_HOME` environment variable will be\n overridden to be this directory (or the directory passed in by `temp_dir`)\n for the duration of the context manager being open.\n\n Args:\n overrides (Optional[Mapping[str, Any]]):\n Config to provide to instance (config format follows that typically found in an `instance.yaml` file).\n set_dagster_home (Optional[bool]):\n If set to True, the `$DAGSTER_HOME` environment variable will be\n overridden to be the directory used by this instance for the\n duration that the context manager is open. Upon the context\n manager closing, the `$DAGSTER_HOME` variable will be re-set to the original value. (Defaults to True).\n temp_dir (Optional[str]):\n The directory to use for storing local artifacts produced by the\n instance. 
If not set, a temporary directory will be created for\n the duration of the context manager being open, and all artifacts\n will be torn down afterward.\n """\n with ExitStack() as stack:\n if not temp_dir:\n temp_dir = stack.enter_context(tempfile.TemporaryDirectory())\n\n # wait for any grpc processes that created runs during test disposal to finish,\n # since they might also be using this instance's tempdir (and to keep each test\n # isolated / avoid race conditions in newer versions of grpcio when servers are\n # shutting down and spinning up at the same time)\n instance_overrides = merge_dicts(\n {\n "telemetry": {"enabled": False},\n "code_servers": {"wait_for_local_processes_on_shutdown": True},\n },\n (overrides if overrides else {}),\n )\n\n if set_dagster_home:\n stack.enter_context(\n environ({"DAGSTER_HOME": temp_dir, "DAGSTER_DISABLE_TELEMETRY": "yes"})\n )\n\n with open(os.path.join(temp_dir, "dagster.yaml"), "w", encoding="utf8") as fd:\n yaml.dump(instance_overrides, fd, default_flow_style=False)\n\n with DagsterInstance.from_config(temp_dir) as instance:\n try:\n yield instance\n except:\n sys.stderr.write(\n "Test raised an exception, attempting to clean up instance:"\n + serializable_error_info_from_exc_info(sys.exc_info()).to_string()\n + "\\n"\n )\n raise\n finally:\n cleanup_test_instance(instance)
\n\n\ndef cleanup_test_instance(instance: DagsterInstance) -> None:\n # To avoid filesystem contention when we close the temporary directory, wait for\n # all runs to reach a terminal state, and close any subprocesses or threads\n # that might be accessing the run history DB.\n\n # Since launcher is lazy loaded, we don't need to do anyting if it's None\n if instance._run_launcher: # noqa: SLF001\n instance._run_launcher.join() # noqa: SLF001\n
", "current_page_name": "_modules/dagster/_core/instance_for_test", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance_for_test"}, "launcher": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.launcher.base

\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.workspace.workspace import IWorkspace\nfrom dagster._serdes import whitelist_for_serdes\n\n\nclass LaunchRunContext(NamedTuple):\n    """Context available within a run launcher's launch_run call."""\n\n    dagster_run: DagsterRun\n    workspace: Optional[IWorkspace]\n\n    @property\n    def job_code_origin(self) -> Optional[JobPythonOrigin]:\n        return self.dagster_run.job_code_origin\n\n\nclass ResumeRunContext(NamedTuple):\n    """Context available within a run launcher's resume_run call."""\n\n    dagster_run: DagsterRun\n    workspace: Optional[IWorkspace]\n    resume_attempt_number: Optional[int] = None\n\n    @property\n    def job_code_origin(self) -> Optional[JobPythonOrigin]:\n        return self.dagster_run.job_code_origin\n\n\n@whitelist_for_serdes\nclass WorkerStatus(Enum):\n    RUNNING = "RUNNING"\n    NOT_FOUND = "NOT_FOUND"\n    FAILED = "FAILED"\n    SUCCESS = "SUCCESS"\n    UNKNOWN = "UNKNOWN"\n\n\nclass CheckRunHealthResult(NamedTuple):\n    """Result of a check_run_worker_health call."""\n\n    status: WorkerStatus\n    msg: Optional[str] = None\n    transient: Optional[bool] = None\n    run_worker_id: Optional[str] = None  # Identifier for a particular run worker\n\n    def __str__(self) -> str:\n        return f"{self.status.value}: '{self.msg}'"\n\n\n
[docs]class RunLauncher(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n @abstractmethod\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run.\n\n This method should begin the execution of the specified run, and may emit engine events.\n Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.STARTING`` state. Typically, this method will\n not be invoked directly, but should be invoked through ``DagsterInstance.launch_run()``.\n\n Args:\n context (LaunchRunContext): information about the launch - every run launcher\n will need the PipelineRun, and some run launchers may need information from the\n IWorkspace from which the run was launched.\n """\n\n @abstractmethod\n def terminate(self, run_id: str) -> bool:\n """Terminates a process.\n\n Returns False is the process was already terminated. Returns true if\n the process was alive and was successfully terminated\n """\n\n def dispose(self) -> None:\n """Do any resource cleanup that should happen when the DagsterInstance is\n cleaning itself up.\n """\n\n def join(self, timeout: int = 30) -> None:\n pass\n\n @property\n def supports_check_run_worker_health(self) -> bool:\n """Whether the run launcher supports check_run_worker_health."""\n return False\n\n def check_run_worker_health(self, run: DagsterRun) -> CheckRunHealthResult:\n raise NotImplementedError(\n "This run launcher does not support run monitoring. Please disable it on your instance."\n )\n\n def get_run_worker_debug_info(self, run: DagsterRun) -> Optional[str]:\n return None\n\n @property\n def supports_resume_run(self) -> bool:\n """Whether the run launcher supports resume_run."""\n return False\n\n def resume_run(self, context: ResumeRunContext) -> None:\n raise NotImplementedError(\n "This run launcher does not support resuming runs. If using "\n "run monitoring, set max_resume_run_attempts to 0."\n )
\n
", "current_page_name": "_modules/dagster/_core/launcher/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.launcher.base"}, "default_run_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.launcher.default_run_launcher

\nimport time\nfrom typing import TYPE_CHECKING, Any, Mapping, Optional, cast\n\nfrom typing_extensions import Self\n\nimport dagster._seven as seven\nfrom dagster import (\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.errors import (\n    DagsterInvariantViolationError,\n    DagsterLaunchFailedError,\n    DagsterUserCodeProcessError,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import GRPC_INFO_TAG\nfrom dagster._serdes import (\n    ConfigurableClass,\n    deserialize_value,\n)\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._utils.merger import merge_dicts\n\nfrom .base import LaunchRunContext, RunLauncher\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance\n    from dagster._grpc.client import DagsterGrpcClient\n\n\n# note: this class is a top level export, so we defer many imports til use for performance\n
[docs]class DefaultRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs against running GRPC servers."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = inst_data\n\n self._run_ids = set()\n\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DefaultRunLauncher(inst_data=inst_data)\n\n @staticmethod\n def launch_run_from_grpc_client(\n instance: "DagsterInstance", run: DagsterRun, grpc_client: "DagsterGrpcClient"\n ):\n # defer for perf\n from dagster._grpc.types import ExecuteExternalJobArgs, StartRunResult\n\n instance.add_run_tags(\n run.run_id,\n {\n GRPC_INFO_TAG: seven.json.dumps(\n merge_dicts(\n {"host": grpc_client.host},\n (\n {"port": grpc_client.port}\n if grpc_client.port\n else {"socket": grpc_client.socket}\n ),\n ({"use_ssl": True} if grpc_client.use_ssl else {}),\n )\n )\n },\n )\n\n res = deserialize_value(\n grpc_client.start_run(\n ExecuteExternalJobArgs(\n job_origin=run.external_job_origin, # type: ignore # (possible none)\n run_id=run.run_id,\n instance_ref=instance.get_ref(),\n )\n ),\n StartRunResult,\n )\n if not res.success:\n raise (\n DagsterLaunchFailedError(\n res.message, serializable_error_info=res.serializable_error_info\n )\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n # defer for perf\n from dagster._core.host_representation.code_location import (\n GrpcServerCodeLocation,\n )\n\n run = context.dagster_run\n\n check.inst_param(run, "run", DagsterRun)\n\n if not context.workspace:\n raise DagsterInvariantViolationError(\n "DefaultRunLauncher requires a workspace to be included in its LaunchRunContext"\n )\n\n external_job_origin = check.not_none(run.external_job_origin)\n 
code_location = context.workspace.get_code_location(\n external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n check.inst(\n code_location,\n GrpcServerCodeLocation,\n "DefaultRunLauncher: Can't launch runs for pipeline not loaded from a GRPC server",\n )\n\n DefaultRunLauncher.launch_run_from_grpc_client(\n self._instance, run, cast(GrpcServerCodeLocation, code_location).client\n )\n\n self._run_ids.add(run.run_id)\n\n def _get_grpc_client_for_termination(self, run_id):\n # defer for perf\n from dagster._grpc.client import DagsterGrpcClient\n\n if not self.has_instance:\n return None\n\n run = self._instance.get_run_by_id(run_id)\n if not run or run.is_finished:\n return None\n\n tags = run.tags\n\n if GRPC_INFO_TAG not in tags:\n return None\n\n grpc_info = seven.json.loads(tags.get(GRPC_INFO_TAG))\n\n return DagsterGrpcClient(\n port=grpc_info.get("port"),\n socket=grpc_info.get("socket"),\n host=grpc_info.get("host"),\n use_ssl=bool(grpc_info.get("use_ssl", False)),\n )\n\n def terminate(self, run_id):\n # defer for perf\n from dagster._grpc.types import CancelExecutionRequest, CancelExecutionResult\n\n check.str_param(run_id, "run_id")\n if not self.has_instance:\n return False\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n client = self._get_grpc_client_for_termination(run_id)\n\n if not client:\n self._instance.report_engine_event(\n message="Unable to get grpc client to send termination request to.",\n dagster_run=run,\n cls=self.__class__,\n )\n return False\n\n res = deserialize_value(\n client.cancel_execution(CancelExecutionRequest(run_id=run_id)), CancelExecutionResult\n )\n\n if res.serializable_error_info:\n raise DagsterUserCodeProcessError.from_error_info(res.serializable_error_info)\n\n return res.success\n\n def join(self, timeout=30):\n # If this hasn't been initialized at all, we can just do a noop\n if not self.has_instance:\n 
return\n\n total_time = 0\n interval = 0.01\n\n while True:\n active_run_ids = [\n run_id\n for run_id in self._run_ids\n if (\n self._instance.get_run_by_id(run_id)\n and not self._instance.get_run_by_id(run_id).is_finished\n )\n ]\n\n if len(active_run_ids) == 0:\n return\n\n if total_time >= timeout:\n raise Exception(f"Timed out waiting for these runs to finish: {active_run_ids!r}")\n\n total_time += interval\n time.sleep(interval)\n interval = interval * 2
\n
", "current_page_name": "_modules/dagster/_core/launcher/default_run_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.launcher.default_run_launcher"}}, "log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.log_manager

\nimport datetime\nimport logging\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Sequence, Union, cast\n\nfrom typing_extensions import Protocol\n\nimport dagster._check as check\nfrom dagster._core.utils import coerce_valid_log_level, make_new_run_id\nfrom dagster._utils.log import get_dagster_logger\n\nif TYPE_CHECKING:\n    from dagster import DagsterInstance\n    from dagster._core.events import DagsterEvent\n    from dagster._core.storage.dagster_run import DagsterRun\n\nDAGSTER_META_KEY = "dagster_meta"\n\n\nclass IDagsterMeta(Protocol):\n    @property\n    def dagster_meta(self) -> "DagsterLoggingMetadata": ...\n\n\n# The type-checker complains here that DagsterLogRecord does not implement the `dagster_meta`\n# property of `IDagsterMeta`. We ignore this error because we don't need to implement this method--\n# `DagsterLogRecord` is a stub class that is never instantiated. We only ever cast\n# `logging.LogRecord` objects to `DagsterLogRecord`, because it gives us typed access to the\n# `dagster_meta` property. 
`dagster_meta` itself is set on these `logging.LogRecord` objects via the\n# `extra` argument to `logging.Logger.log` (see `DagsterLogManager.log_dagster_event`), but\n# `logging.LogRecord` has no way of exposing to the type-checker the attributes that are dynamically\n# defined via `extra`.\nclass DagsterLogRecord(logging.LogRecord, IDagsterMeta):  # type: ignore\n    pass\n\n\nclass DagsterMessageProps(\n    NamedTuple(\n        "_DagsterMessageProps",\n        [\n            ("orig_message", Optional[str]),\n            ("log_message_id", Optional[str]),\n            ("log_timestamp", Optional[str]),\n            ("dagster_event", Optional[Any]),\n        ],\n    )\n):\n    """Internal class used to represent specific attributes about a logged message."""\n\n    def __new__(\n        cls,\n        orig_message: str,\n        log_message_id: Optional[str] = None,\n        log_timestamp: Optional[str] = None,\n        dagster_event: Optional["DagsterEvent"] = None,\n    ):\n        return super().__new__(\n            cls,\n            orig_message=check.str_param(orig_message, "orig_message"),\n            log_message_id=check.opt_str_param(\n                log_message_id, "log_message_id", default=make_new_run_id()\n            ),\n            log_timestamp=check.opt_str_param(\n                log_timestamp,\n                "log_timestamp",\n                default=datetime.datetime.utcnow().isoformat(),\n            ),\n            dagster_event=dagster_event,\n        )\n\n    @property\n    def error_str(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n\n        event_specific_data = self.dagster_event.event_specific_data\n        if not event_specific_data:\n            return None\n\n        error = getattr(event_specific_data, "error", None)\n        if error:\n            return f'\\n\\n{getattr(event_specific_data, "error_display_string", error.to_string())}'\n        return None\n\n    @property\n    def 
pid(self) -> Optional[str]:\n        if self.dagster_event is None or self.dagster_event.pid is None:\n            return None\n        return str(self.dagster_event.pid)\n\n    @property\n    def step_key(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.step_key\n\n    @property\n    def event_type_value(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.event_type_value\n\n\nclass DagsterLoggingMetadata(\n    NamedTuple(\n        "_DagsterLoggingMetadata",\n        [\n            ("run_id", Optional[str]),\n            ("job_name", Optional[str]),\n            ("job_tags", Mapping[str, str]),\n            ("step_key", Optional[str]),\n            ("op_name", Optional[str]),\n            ("resource_name", Optional[str]),\n            ("resource_fn_name", Optional[str]),\n        ],\n    )\n):\n    """Internal class used to represent the context in which a given message was logged (i.e. 
the\n    step, pipeline run, resource, etc.).\n    """\n\n    def __new__(\n        cls,\n        run_id: Optional[str] = None,\n        job_name: Optional[str] = None,\n        job_tags: Optional[Mapping[str, str]] = None,\n        step_key: Optional[str] = None,\n        op_name: Optional[str] = None,\n        resource_name: Optional[str] = None,\n        resource_fn_name: Optional[str] = None,\n    ):\n        return super().__new__(\n            cls,\n            run_id=run_id,\n            job_name=job_name,\n            job_tags=job_tags or {},\n            step_key=step_key,\n            op_name=op_name,\n            resource_name=resource_name,\n            resource_fn_name=resource_fn_name,\n        )\n\n    @property\n    def log_source(self) -> str:\n        if self.resource_name is None:\n            return self.job_name or "system"\n        return f"resource:{self.resource_name}"\n\n    def all_tags(self) -> Mapping[str, str]:\n        # converts all values into strings\n        return {k: str(v) for k, v in self._asdict().items()}\n\n    def event_tags(self) -> Mapping[str, str]:\n        # Exclude pipeline_tags since it can be quite large and can be found on the run\n        return {k: str(v) for k, v in self._asdict().items() if k != "job_tags"}\n\n\ndef construct_log_string(\n    logging_metadata: DagsterLoggingMetadata, message_props: DagsterMessageProps\n) -> str:\n    from dagster._core.events import EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n\n    event_type_str = (\n        EVENT_TYPE_VALUE_TO_DISPLAY_STRING[message_props.event_type_value]\n        if message_props.event_type_value in EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n        else message_props.event_type_value\n    )\n    return " - ".join(\n        filter(\n            None,\n            (\n                logging_metadata.log_source,\n                logging_metadata.run_id,\n                message_props.pid,\n                logging_metadata.step_key,\n                event_type_str,\n         
       message_props.orig_message,\n            ),\n        )\n    ) + (message_props.error_str or "")\n\n\ndef get_dagster_meta_dict(\n    logging_metadata: DagsterLoggingMetadata, dagster_message_props: DagsterMessageProps\n) -> Mapping[str, object]:\n    # combine all dagster meta information into a single dictionary\n    meta_dict = {\n        **logging_metadata._asdict(),\n        **dagster_message_props._asdict(),\n    }\n    # step-level events can be logged from a pipeline context. for these cases, pull the step\n    # key from the underlying DagsterEvent\n    if meta_dict["step_key"] is None:\n        meta_dict["step_key"] = dagster_message_props.step_key\n\n    return meta_dict\n\n\nclass DagsterLogHandler(logging.Handler):\n    """Internal class used to turn regular logs into Dagster logs by adding Dagster-specific\n    metadata (such as pipeline_name or step_key), as well as reformatting the underlying message.\n\n    Note: The `loggers` argument will be populated with the set of @loggers supplied to the current\n    pipeline run. 
These essentially work as handlers (they do not create their own log messages,\n    they simply re-log messages that are created from context.log.x() calls), which is why they are\n    referenced from within this handler class.\n    """\n\n    def __init__(\n        self,\n        logging_metadata: DagsterLoggingMetadata,\n        loggers: Sequence[logging.Logger],\n        handlers: Sequence[logging.Handler],\n    ):\n        self._logging_metadata = logging_metadata\n        self._loggers = loggers\n        self._handlers = handlers\n        self._should_capture = True\n        super().__init__()\n\n    @property\n    def logging_metadata(self) -> DagsterLoggingMetadata:\n        return self._logging_metadata\n\n    def with_tags(self, **new_tags: str) -> "DagsterLogHandler":\n        return DagsterLogHandler(\n            logging_metadata=self.logging_metadata._replace(**new_tags),\n            loggers=self._loggers,\n            handlers=self._handlers,\n        )\n\n    def _extract_extra(self, record: logging.LogRecord) -> Mapping[str, Any]:\n        """In the logging.Logger log() implementation, the elements of the `extra` dictionary\n        argument are smashed into the __dict__ of the underlying logging.LogRecord.\n        This function figures out what the original `extra` values of the log call were by\n        comparing the set of attributes in the received record to those of a default record.\n        """\n        ref_attrs = list(logging.makeLogRecord({}).__dict__.keys()) + [\n            "message",\n            "asctime",\n        ]\n        return {k: v for k, v in record.__dict__.items() if k not in ref_attrs}\n\n    def _convert_record(self, record: logging.LogRecord) -> DagsterLogRecord:\n        # we store the originating DagsterEvent in the DAGSTER_META_KEY field, if applicable\n        dagster_meta = getattr(record, DAGSTER_META_KEY, None)\n\n        # generate some properties for this specific record\n        dagster_message_props = 
DagsterMessageProps(\n            orig_message=record.getMessage(), dagster_event=dagster_meta\n        )\n\n        # set the dagster meta info for the record\n        setattr(\n            record,\n            DAGSTER_META_KEY,\n            get_dagster_meta_dict(self._logging_metadata, dagster_message_props),\n        )\n\n        # update the message to be formatted like other dagster logs\n        record.msg = construct_log_string(self._logging_metadata, dagster_message_props)\n        record.args = ()\n\n        # DagsterLogRecord is a LogRecord with a `dagster_meta` field\n        return cast(DagsterLogRecord, record)\n\n    def filter(self, record: logging.LogRecord) -> bool:\n        """If you list multiple levels of a python logging hierarchy as managed loggers, and do not\n        set the propagate attribute to False, this will result in that record getting logged\n        multiple times, as the DagsterLogHandler will be invoked at each level of the hierarchy as\n        the message is propagated. 
This filter prevents this from happening.\n        """\n        return self._should_capture and not isinstance(\n            getattr(record, DAGSTER_META_KEY, None), dict\n        )\n\n    def emit(self, record: logging.LogRecord) -> None:\n        """For any received record, add Dagster metadata, and have handlers handle it."""\n        try:\n            # to prevent the potential for infinite loops in which a handler produces log messages\n            # which are then captured and then handled by that same handler (etc.), do not capture\n            # any log messages while one is currently being emitted\n            self._should_capture = False\n            dagster_record = self._convert_record(record)\n            # built-in handlers\n            for handler in self._handlers:\n                if dagster_record.levelno >= handler.level:\n                    handler.handle(dagster_record)\n            # user-defined @loggers\n            for logger in self._loggers:\n                logger.log(\n                    dagster_record.levelno,\n                    dagster_record.msg,\n                    exc_info=dagster_record.exc_info,\n                    extra=self._extract_extra(record),\n                )\n        finally:\n            self._should_capture = True\n\n\n
[docs]class DagsterLogManager(logging.Logger):\n """Centralized dispatch for logging from user code.\n\n Handles the construction of uniform structured log messages and passes them through to the\n underlying loggers/handlers.\n\n An instance of the log manager is made available to ops as ``context.log``. Users should not\n initialize instances of the log manager directly. To configure custom loggers, set the\n ``logger_defs`` argument in an `@job` decorator or when calling the `to_job()` method on a\n :py:class:`GraphDefinition`.\n\n The log manager inherits standard convenience methods like those exposed by the Python standard\n library :py:mod:`python:logging` module (i.e., within the body of an op,\n ``context.log.{debug, info, warning, warn, error, critical, fatal}``).\n\n The underlying integer API can also be called directly using, e.g.\n ``context.log.log(5, msg)``, and the log manager will delegate to the ``log`` method\n defined on each of the loggers it manages.\n\n User-defined custom log levels are not supported, and calls to, e.g.,\n ``context.log.trace`` or ``context.log.notice`` will result in hard exceptions **at runtime**.\n """\n\n def __init__(\n self,\n dagster_handler: DagsterLogHandler,\n level: int = logging.NOTSET,\n managed_loggers: Optional[Sequence[logging.Logger]] = None,\n ):\n super().__init__(name="dagster", level=coerce_valid_log_level(level))\n self._managed_loggers = check.opt_sequence_param(\n managed_loggers, "managed_loggers", of_type=logging.Logger\n )\n self._dagster_handler = dagster_handler\n self.addHandler(dagster_handler)\n\n @classmethod\n def create(\n cls,\n loggers: Sequence[logging.Logger],\n handlers: Optional[Sequence[logging.Handler]] = None,\n instance: Optional["DagsterInstance"] = None,\n dagster_run: Optional["DagsterRun"] = None,\n ) -> "DagsterLogManager":\n """Create a DagsterLogManager with a set of subservient loggers."""\n handlers = check.opt_sequence_param(handlers, "handlers", 
of_type=logging.Handler)\n\n managed_loggers = [get_dagster_logger()]\n python_log_level = logging.NOTSET\n\n if instance:\n handlers = [*handlers, *instance.get_handlers()]\n managed_loggers += [\n logging.getLogger(lname) if lname != "root" else logging.getLogger()\n for lname in instance.managed_python_loggers\n ]\n if instance.python_log_level is not None:\n python_log_level = coerce_valid_log_level(instance.python_log_level)\n\n # set all loggers to the declared logging level\n for logger in managed_loggers:\n logger.setLevel(python_log_level)\n\n if dagster_run:\n logging_metadata = DagsterLoggingMetadata(\n run_id=dagster_run.run_id,\n job_name=dagster_run.job_name,\n job_tags=dagster_run.tags,\n )\n else:\n logging_metadata = DagsterLoggingMetadata()\n\n return cls(\n dagster_handler=DagsterLogHandler(\n logging_metadata=logging_metadata,\n loggers=loggers,\n handlers=handlers,\n ),\n level=python_log_level,\n managed_loggers=managed_loggers,\n )\n\n @property\n def logging_metadata(self) -> DagsterLoggingMetadata:\n return self._dagster_handler.logging_metadata\n\n def begin_python_log_capture(self) -> None:\n for logger in self._managed_loggers:\n logger.addHandler(self._dagster_handler)\n\n def end_python_log_capture(self) -> None:\n for logger in self._managed_loggers:\n logger.removeHandler(self._dagster_handler)\n\n def log_dagster_event(\n self, level: Union[str, int], msg: str, dagster_event: "DagsterEvent"\n ) -> None:\n """Log a DagsterEvent at the given level. 
Attributes about the context it was logged in\n (such as the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): message describing the event\n dagster_event (DagsterEvent): DagsterEvent that will be logged\n """\n self.log(level=level, msg=msg, extra={DAGSTER_META_KEY: dagster_event})\n\n def log(self, level: Union[str, int], msg: object, *args: Any, **kwargs: Any) -> None:\n """Log a message at the given level. Attributes about the context it was logged in (such as\n the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): the message to be logged\n *args: the logged message will be msg % args\n """\n level = coerce_valid_log_level(level)\n # log DagsterEvents regardless of level\n if self.isEnabledFor(level) or ("extra" in kwargs and DAGSTER_META_KEY in kwargs["extra"]):\n self._log(level, msg, args, **kwargs)\n\n def with_tags(self, **new_tags: str) -> "DagsterLogManager":\n """Add new tags in "new_tags" to the set of tags attached to this log manager instance, and\n return a new DagsterLogManager with the merged set of tags.\n\n Args:\n new_tags (Dict[str,str]): Dictionary of tags\n\n Returns:\n DagsterLogManager: a new DagsterLogManager namedtuple with updated tags for the same\n run ID and loggers.\n """\n return DagsterLogManager(\n dagster_handler=self._dagster_handler.with_tags(**new_tags),\n managed_loggers=self._managed_loggers,\n level=self.level,\n )
\n
", "current_page_name": "_modules/dagster/_core/log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.log_manager"}, "pipes": {"client": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.pipes.client

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Iterator, List, Optional, Sequence\n\nfrom dagster_pipes import (\n    DagsterPipesError,\n    PipesContextData,\n    PipesExtras,\n    PipesParams,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.result import MaterializeResult\nfrom dagster._core.execution.context.compute import OpExecutionContext\n\nfrom .context import PipesExecutionResult\n\nif TYPE_CHECKING:\n    from .context import PipesMessageHandler\n\n\n
[docs]@experimental\nclass PipesClient(ABC):\n """Pipes client base class.\n\n Pipes clients for specific external environments should subclass this.\n """\n\n
[docs] @public\n @abstractmethod\n def run(\n self,\n *,\n context: OpExecutionContext,\n extras: Optional[PipesExtras] = None,\n **kwargs,\n ) -> "PipesClientCompletedInvocation":\n """Synchronously execute an external process with the pipes protocol. Derived\n clients must have `context` and `extras` arguments, but also can add arbitrary\n arguments that are appropriate for their own implementation.\n\n Args:\n context (OpExecutionContext): The context from the executing op/asset.\n extras (Optional[PipesExtras]): Arbitrary data to pass to the external environment.\n\n Returns:\n PipesClientCompletedInvocation: Wrapper containing results reported by the external\n process.\n """
\n\n\n@experimental\nclass PipesClientCompletedInvocation:\n def __init__(self, results: Sequence["PipesExecutionResult"]):\n self._results = results\n\n def get_results(self) -> Sequence["PipesExecutionResult"]:\n """Get the stream of results as a Sequence of a completed pipes\n client invocation. For each "report" call in the external process,\n one result object will be in the list.\n\n Returns: Sequence[PipesExecutionResult]\n """\n return tuple(self._results)\n\n def get_materialize_result(self) -> MaterializeResult:\n """Get a single materialize result for a pipes invocation. This coalesces\n the materialization result and any separately reported asset check results from\n the external process.\n\n This does not work on invocations that materialize multiple assets and will fail\n in that case. For multiple assets use `get_results` instead to get the result stream.\n\n Returns: MaterializeResult\n """\n return materialize_result_from_pipes_results(self.get_results())\n\n def get_asset_check_result(self) -> AssetCheckResult:\n """Get a single asset check result for a pipes invocation.\n\n This does not work on invocations that have anything except a single asset check result.\n Use `get_results` instead to get the result stream in those cases.\n\n Returns: AssetCheckResult\n """\n return _check_result_from_pipes_results(self.get_results())\n\n\n
[docs]@experimental\nclass PipesContextInjector(ABC):\n @abstractmethod\n @contextmanager\n def inject_context(self, context_data: "PipesContextData") -> Iterator[PipesParams]:\n """A `@contextmanager` that injects context data into the external process.\n\n This method should write the context data to a location accessible to the external\n process. It should yield parameters that the external process can use to locate and load the\n context data.\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A JSON-serializable dict of parameters to be used used by the external\n process to locate and load the injected context data.\n """\n\n @abstractmethod\n def no_messages_debug_text(self) -> str:\n """A message to be displayed when no messages are received from the external process to aid with debugging.\n\n Example: "Attempted to inject context using a magic portal. Expected PipesMagicPortalContextLoader to be\n explicitly passed to open_dagster_pipes in the external process."\n """
\n\n\n
[docs]@experimental\nclass PipesMessageReader(ABC):\n @abstractmethod\n @contextmanager\n def read_messages(self, handler: "PipesMessageHandler") -> Iterator[PipesParams]:\n """A `@contextmanager` that reads messages reported by an external process.\n\n This method should start a thread to continuously read messages from some location\n accessible to the external process. It should yield parameters that the external process\n can use to direct its message output.\n\n Args:\n handler (PipesMessageHandler): The message handler to use to process messages read from\n the external process.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to determine\n where to write messages.\n """\n\n @abstractmethod\n def no_messages_debug_text(self) -> str:\n """A message to be displayed when no messages are received from the external process to aid with\n debugging.\n\n Example: "Attempted to read messages using a magic portal. Expected PipesMagicPortalMessageWriter\n to be explicitly passed to open_dagster_pipes in the external process."\n """
\n\n\ndef materialize_result_from_pipes_results(\n all_results: Sequence[PipesExecutionResult],\n) -> MaterializeResult:\n mat_results: List[MaterializeResult] = [\n mat_result for mat_result in all_results if isinstance(mat_result, MaterializeResult)\n ]\n check_results: List[AssetCheckResult] = [\n check_result for check_result in all_results if isinstance(check_result, AssetCheckResult)\n ]\n\n check.invariant(len(mat_results) > 0, "No materialization results received. Internal error?")\n if len(mat_results) > 1:\n raise DagsterPipesError(\n "Multiple materialize results returned with asset keys"\n f" {sorted([check.not_none(mr.asset_key).to_user_string() for mr in mat_results])}."\n " If you are materializing multiple assets in a pipes invocation, use"\n " get_results() instead.",\n )\n mat_result = next(iter(mat_results))\n for check_result in check_results:\n if check_result.asset_key:\n check.invariant(\n mat_result.asset_key == check_result.asset_key,\n "Check result specified an asset key that is not part of the returned"\n " materialization. 
If this was deliberate, use get_results() instead.",\n )\n\n if check_results:\n return mat_result._replace(\n check_results=[*(mat_result.check_results or []), *check_results]\n )\n else:\n return mat_result\n\n\ndef _check_result_from_pipes_results(\n all_results: Sequence[PipesExecutionResult],\n) -> AssetCheckResult:\n mat_results: List[MaterializeResult] = [\n mat_result for mat_result in all_results if isinstance(mat_result, MaterializeResult)\n ]\n check_results: List[AssetCheckResult] = [\n check_result for check_result in all_results if isinstance(check_result, AssetCheckResult)\n ]\n\n # return the single asset check result if thats what we got\n if len(mat_results) == 0 and len(check_results) == 1:\n return next(iter(check_results))\n\n # otherwise error\n raise DagsterPipesError(\n f"Did not find singular AssetCheckResult, got {len(mat_results)} MaterializeResults and"\n f" {len(check_results)} AssetCheckResults. Correct the reported results or use"\n " get_results() instead.",\n )\n
", "current_page_name": "_modules/dagster/_core/pipes/client", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.pipes.client"}, "context": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.pipes.context

\nfrom contextlib import contextmanager\nfrom dataclasses import dataclass\nfrom queue import Queue\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, Mapping, Optional, Set, Union\n\nfrom dagster_pipes import (\n    DAGSTER_PIPES_CONTEXT_ENV_VAR,\n    DAGSTER_PIPES_MESSAGES_ENV_VAR,\n    PIPES_METADATA_TYPE_INFER,\n    PipesContextData,\n    PipesDataProvenance,\n    PipesExtras,\n    PipesMessage,\n    PipesMetadataType,\n    PipesMetadataValue,\n    PipesParams,\n    PipesTimeWindow,\n    encode_env_var,\n)\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSeverity\nfrom dagster._core.definitions.data_version import DataProvenance, DataVersion\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.metadata import MetadataValue, normalize_metadata_value\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.result import MaterializeResult\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import DagsterPipesExecutionError\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dagster._core.execution.context.invocation import BoundOpExecutionContext\n\nif TYPE_CHECKING:\n    from dagster._core.pipes.client import PipesMessageReader\n\nPipesExecutionResult: TypeAlias = Union[MaterializeResult, AssetCheckResult]\n\n\n
[docs]@experimental\nclass PipesMessageHandler:\n """Class to process :py:obj:`PipesMessage` objects received from a pipes process.\n\n Args:\n context (OpExecutionContext): The context for the executing op/asset.\n """\n\n def __init__(self, context: OpExecutionContext) -> None:\n self._context = context\n # Queue is thread-safe\n self._result_queue: Queue[PipesExecutionResult] = Queue()\n # Only read by the main thread after all messages are handled, so no need for a lock\n self._unmaterialized_assets: Set[AssetKey] = set(context.selected_asset_keys)\n self._received_any_msg = False\n self._received_closed_msg = False\n\n @contextmanager\n def handle_messages(self, message_reader: "PipesMessageReader") -> Iterator[PipesParams]:\n with message_reader.read_messages(self) as params:\n yield params\n for key in self._unmaterialized_assets:\n self._result_queue.put(MaterializeResult(asset_key=key))\n\n def clear_result_queue(self) -> Iterator[PipesExecutionResult]:\n while not self._result_queue.empty():\n yield self._result_queue.get()\n\n @property\n def received_any_message(self) -> bool:\n return self._received_any_msg\n\n @property\n def received_closed_message(self) -> bool:\n return self._received_closed_msg\n\n def _resolve_metadata(\n self, metadata: Mapping[str, PipesMetadataValue]\n ) -> Mapping[str, MetadataValue]:\n return {\n k: self._resolve_metadata_value(v["raw_value"], v["type"]) for k, v in metadata.items()\n }\n\n def _resolve_metadata_value(\n self, value: Any, metadata_type: PipesMetadataType\n ) -> MetadataValue:\n if metadata_type == PIPES_METADATA_TYPE_INFER:\n return normalize_metadata_value(value)\n elif metadata_type == "text":\n return MetadataValue.text(value)\n elif metadata_type == "url":\n return MetadataValue.url(value)\n elif metadata_type == "path":\n return MetadataValue.path(value)\n elif metadata_type == "notebook":\n return MetadataValue.notebook(value)\n elif metadata_type == "json":\n return MetadataValue.json(value)\n elif 
metadata_type == "md":\n return MetadataValue.md(value)\n elif metadata_type == "float":\n return MetadataValue.float(value)\n elif metadata_type == "int":\n return MetadataValue.int(value)\n elif metadata_type == "bool":\n return MetadataValue.bool(value)\n elif metadata_type == "dagster_run":\n return MetadataValue.dagster_run(value)\n elif metadata_type == "asset":\n return MetadataValue.asset(AssetKey.from_user_string(value))\n elif metadata_type == "table":\n return MetadataValue.table(value)\n elif metadata_type == "null":\n return MetadataValue.null()\n else:\n check.failed(f"Unexpected metadata type {metadata_type}")\n\n # Type ignores because we currently validate in individual handlers\n def handle_message(self, message: PipesMessage) -> None:\n if self._received_closed_msg:\n self._context.log.warn(f"[pipes] unexpected message received after closed: `{message}`")\n\n if not self._received_any_msg:\n self._received_any_msg = True\n self._context.log.info("[pipes] external process successfully opened dagster pipes.")\n\n if message["method"] == "opened":\n pass\n elif message["method"] == "closed":\n self._handle_closed()\n elif message["method"] == "report_asset_materialization":\n self._handle_report_asset_materialization(**message["params"]) # type: ignore\n elif message["method"] == "report_asset_check":\n self._handle_report_asset_check(**message["params"]) # type: ignore\n elif message["method"] == "log":\n self._handle_log(**message["params"]) # type: ignore\n else:\n raise DagsterPipesExecutionError(f"Unknown message method: {message['method']}")\n\n def _handle_closed(self) -> None:\n self._received_closed_msg = True\n\n def _handle_report_asset_materialization(\n self,\n asset_key: str,\n metadata: Optional[Mapping[str, PipesMetadataValue]],\n data_version: Optional[str],\n ) -> None:\n check.str_param(asset_key, "asset_key")\n check.opt_str_param(data_version, "data_version")\n metadata = check.opt_mapping_param(metadata, "metadata", 
key_type=str)\n resolved_asset_key = AssetKey.from_user_string(asset_key)\n resolved_metadata = self._resolve_metadata(metadata)\n resolved_data_version = None if data_version is None else DataVersion(data_version)\n result = MaterializeResult(\n asset_key=resolved_asset_key,\n metadata=resolved_metadata,\n data_version=resolved_data_version,\n )\n self._result_queue.put(result)\n self._unmaterialized_assets.remove(resolved_asset_key)\n\n def _handle_report_asset_check(\n self,\n asset_key: str,\n check_name: str,\n passed: bool,\n severity: str,\n metadata: Mapping[str, PipesMetadataValue],\n ) -> None:\n check.str_param(asset_key, "asset_key")\n check.str_param(check_name, "check_name")\n check.bool_param(passed, "passed")\n check.literal_param(severity, "severity", [x.value for x in AssetCheckSeverity])\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n resolved_asset_key = AssetKey.from_user_string(asset_key)\n resolved_metadata = self._resolve_metadata(metadata)\n resolved_severity = AssetCheckSeverity(severity)\n result = AssetCheckResult(\n asset_key=resolved_asset_key,\n check_name=check_name,\n passed=passed,\n severity=resolved_severity,\n metadata=resolved_metadata,\n )\n self._result_queue.put(result)\n\n def _handle_log(self, message: str, level: str = "info") -> None:\n check.str_param(message, "message")\n self._context.log.log(level, message)
\n\n\n
[docs]@experimental\n@dataclass\nclass PipesSession:\n """Object representing a pipes session.\n\n A pipes session is defined by a pair of :py:class:`PipesContextInjector` and\n :py:class:`PipesMessageReader` objects. At the opening of the session, the context injector\n writes context data to an externally accessible location, and the message reader starts\n monitoring an externally accessible location. These locations are encoded in parameters stored\n on a `PipesSession` object.\n\n During the session, an external process should be started and the parameters injected into its\n environment. The typical way to do this is to call :py:meth:`PipesSession.get_bootstrap_env_vars`\n and pass the result as environment variables.\n\n During execution, results (e.g. asset materializations) are reported by the external process and\n buffered on the `PipesSession` object. The buffer can periodically be cleared and yielded to\n Dagster machinery by calling `yield from PipesSession.get_results()`.\n\n When the external process exits, the session can be closed. Closing consists of handling any\n unprocessed messages written by the external process and cleaning up any resources used for\n context injection and message reading.\n\n Args:\n context_data (PipesContextData): The context for the executing op/asset.\n message_handler (PipesMessageHandler): The message handler to use for processing messages\n context_injector_params (PipesParams): Parameters yielded by the context injector,\n indicating the location from which the external process should load context data.\n message_reader_params (PipesParams): Parameters yielded by the message reader, indicating\n the location to which the external process should write messages.\n """\n\n context_data: PipesContextData\n message_handler: PipesMessageHandler\n context_injector_params: PipesParams\n message_reader_params: PipesParams\n\n
[docs] @public\n def get_bootstrap_env_vars(self) -> Dict[str, str]:\n """Encode context injector and message reader params as environment variables.\n\n Passing environment variables is the typical way to expose the pipes I/O parameters\n to a pipes process.\n\n Returns:\n Mapping[str, str]: Environment variables to pass to the external process. The values are\n serialized as json, compressed with gzip, and then base-64-encoded.\n """\n return {\n param_name: encode_env_var(param_value)\n for param_name, param_value in self.get_bootstrap_params().items()\n }
\n\n
[docs] @public\n def get_bootstrap_params(self) -> Dict[str, Any]:\n """Get the params necessary to bootstrap a launched pipes process. These parameters are typically\n are as environment variable. See `get_bootstrap_env_vars`. It is the context injector's\n responsibility to decide how to pass these parameters to the external environment.\n\n Returns:\n Mapping[str, str]: Parameters to pass to the external process and their corresponding\n values that must be passed by the context injector.\n """\n return {\n DAGSTER_PIPES_CONTEXT_ENV_VAR: self.context_injector_params,\n DAGSTER_PIPES_MESSAGES_ENV_VAR: self.message_reader_params,\n }
\n\n
[docs] @public\n def get_results(self) -> Iterator[PipesExecutionResult]:\n """Iterator over buffered :py:class:`PipesExecutionResult` objects received from the\n external process.\n\n When this is called it clears the results buffer.\n\n Yields:\n ExtResult: Result reported by external process.\n """\n yield from self.message_handler.clear_result_queue()
\n\n\ndef build_external_execution_context_data(\n context: OpExecutionContext,\n extras: Optional[PipesExtras],\n) -> "PipesContextData":\n asset_keys = (\n [_convert_asset_key(key) for key in sorted(context.selected_asset_keys)]\n if context.has_assets_def\n else None\n )\n code_version_by_asset_key = (\n {\n _convert_asset_key(key): context.assets_def.code_versions_by_key[key]\n for key in context.selected_asset_keys\n }\n if context.has_assets_def\n else None\n )\n provenance_by_asset_key = (\n {\n _convert_asset_key(key): _convert_data_provenance(context.get_asset_provenance(key))\n for key in context.selected_asset_keys\n }\n if context.has_assets_def\n else None\n )\n partition_key = context.partition_key if context.has_partition_key else None\n partition_key_range = context.partition_key_range if context.has_partition_key else None\n partition_time_window = (\n context.partition_time_window\n if context.has_partition_key\n and has_one_dimension_time_window_partitioning(\n context.get_step_execution_context().partitions_def\n )\n else None\n )\n return PipesContextData(\n asset_keys=asset_keys,\n code_version_by_asset_key=code_version_by_asset_key,\n provenance_by_asset_key=provenance_by_asset_key,\n partition_key=partition_key,\n partition_key_range=(\n _convert_partition_key_range(partition_key_range) if partition_key_range else None\n ),\n partition_time_window=(\n _convert_time_window(partition_time_window) if partition_time_window else None\n ),\n run_id=context.run_id,\n job_name=None if isinstance(context, BoundOpExecutionContext) else context.job_name,\n retry_number=0 if isinstance(context, BoundOpExecutionContext) else context.retry_number,\n extras=extras or {},\n )\n\n\ndef _convert_asset_key(asset_key: AssetKey) -> str:\n return asset_key.to_user_string()\n\n\ndef _convert_data_provenance(\n provenance: Optional[DataProvenance],\n) -> Optional["PipesDataProvenance"]:\n return (\n None\n if provenance is None\n else PipesDataProvenance(\n 
code_version=provenance.code_version,\n input_data_versions={\n _convert_asset_key(k): v.value for k, v in provenance.input_data_versions.items()\n },\n is_user_provided=provenance.is_user_provided,\n )\n )\n\n\ndef _convert_time_window(\n time_window: TimeWindow,\n) -> "PipesTimeWindow":\n return PipesTimeWindow(\n start=time_window.start.isoformat(),\n end=time_window.end.isoformat(),\n )\n\n\ndef _convert_partition_key_range(\n partition_key_range: PartitionKeyRange,\n) -> "PipesTimeWindow":\n return PipesTimeWindow(\n start=partition_key_range.start,\n end=partition_key_range.end,\n )\n
", "current_page_name": "_modules/dagster/_core/pipes/context", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.pipes.context"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.pipes.utils

\nimport datetime\nimport json\nimport os\nimport sys\nimport tempfile\nimport time\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom threading import Event, Thread\nfrom typing import Iterator, Optional, TextIO\n\nfrom dagster_pipes import (\n    PIPES_PROTOCOL_VERSION_FIELD,\n    PipesContextData,\n    PipesDefaultContextLoader,\n    PipesDefaultMessageWriter,\n    PipesExtras,\n    PipesParams,\n)\n\nfrom dagster import (\n    OpExecutionContext,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom dagster._core.pipes.client import (\n    PipesContextInjector,\n    PipesMessageReader,\n)\nfrom dagster._core.pipes.context import (\n    PipesMessageHandler,\n    PipesSession,\n    build_external_execution_context_data,\n)\nfrom dagster._utils import tail_file\n\n_CONTEXT_INJECTOR_FILENAME = "context"\n_MESSAGE_READER_FILENAME = "messages"\n\n\n
[docs]@experimental\nclass PipesFileContextInjector(PipesContextInjector):\n """Context injector that injects context data into the external process by writing it to a\n specified file.\n\n Args:\n path (str): The path of a file to which to write context data. The file will be deleted on\n close of the pipes session.\n """\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @contextmanager\n def inject_context(self, context_data: "PipesContextData") -> Iterator[PipesParams]:\n """Inject context to external environment by writing it to a file as JSON and exposing the\n path to the file.\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to locate and\n load the injected context data.\n """\n with open(self._path, "w") as input_stream:\n json.dump(context_data, input_stream)\n try:\n yield {PipesDefaultContextLoader.FILE_PATH_KEY: self._path}\n finally:\n if os.path.exists(self._path):\n os.remove(self._path)\n\n def no_messages_debug_text(self) -> str:\n return f"Attempted to inject context via file {self._path}"
\n\n\n
[docs]@experimental\nclass PipesTempFileContextInjector(PipesContextInjector):\n """Context injector that injects context data into the external process by writing it to an\n automatically-generated temporary file.\n """\n\n @contextmanager\n def inject_context(self, context: "PipesContextData") -> Iterator[PipesParams]:\n """Inject context to external environment by writing it to an automatically-generated\n temporary file as JSON and exposing the path to the file.\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to locate and\n load the injected context data.\n """\n with tempfile.TemporaryDirectory() as tempdir:\n with PipesFileContextInjector(\n os.path.join(tempdir, _CONTEXT_INJECTOR_FILENAME)\n ).inject_context(context) as params:\n yield params\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to inject context via a temporary file."
\n\n\n
[docs]class PipesEnvContextInjector(PipesContextInjector):\n """Context injector that injects context data into the external process by injecting it directly into the external process environment."""\n\n @contextmanager\n def inject_context(\n self,\n context_data: "PipesContextData",\n ) -> Iterator[PipesParams]:\n """Inject context to external environment by embedding directly in the parameters that will\n be passed to the external process (typically as environment variables).\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to locate and\n load the injected context data.\n """\n yield {PipesDefaultContextLoader.DIRECT_KEY: context_data}\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to inject context directly, typically as an environment variable."
\n\n\n
[docs]@experimental\nclass PipesFileMessageReader(PipesMessageReader):\n """Message reader that reads messages by tailing a specified file.\n\n Args:\n path (str): The path of the file to which messages will be written. The file will be deleted\n on close of the pipes session.\n """\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @contextmanager\n def read_messages(\n self,\n handler: "PipesMessageHandler",\n ) -> Iterator[PipesParams]:\n """Set up a thread to read streaming messages from the external process by tailing the\n target file.\n\n Args:\n handler (PipesMessageHandler): object to process incoming messages\n\n Yields:\n PipesParams: A dict of parameters that specifies where a pipes process should write\n pipes protocol messages.\n """\n is_task_complete = Event()\n thread = None\n try:\n open(self._path, "w").close() # create file\n thread = Thread(\n target=self._reader_thread, args=(handler, is_task_complete), daemon=True\n )\n thread.start()\n yield {PipesDefaultMessageWriter.FILE_PATH_KEY: self._path}\n finally:\n is_task_complete.set()\n if os.path.exists(self._path):\n os.remove(self._path)\n if thread:\n thread.join()\n\n def _reader_thread(self, handler: "PipesMessageHandler", is_resource_complete: Event) -> None:\n for line in tail_file(self._path, lambda: is_resource_complete.is_set()):\n message = json.loads(line)\n handler.handle_message(message)\n\n def no_messages_debug_text(self) -> str:\n return f"Attempted to read messages from file {self._path}."
\n\n\n
[docs]@experimental\nclass PipesTempFileMessageReader(PipesMessageReader):\n """Message reader that reads messages by tailing an automatically-generated temporary file."""\n\n @contextmanager\n def read_messages(\n self,\n handler: "PipesMessageHandler",\n ) -> Iterator[PipesParams]:\n """Set up a thread to read streaming messages from the external process by an\n automatically-generated temporary file.\n\n Args:\n handler (PipesMessageHandler): object to process incoming messages\n\n Yields:\n PipesParams: A dict of parameters that specifies where a pipes process should write\n pipes protocol messages.\n """\n with tempfile.TemporaryDirectory() as tempdir:\n with PipesFileMessageReader(\n os.path.join(tempdir, _MESSAGE_READER_FILENAME)\n ).read_messages(handler) as params:\n yield params\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to read messages from a local temporary file."
\n\n\n# Number of seconds to wait after an external process has completed for stdio logs to become\n# available. If this is exceeded, proceed with exiting without picking up logs.\nWAIT_FOR_STDIO_LOGS_TIMEOUT = 60\n\n\n
[docs]@experimental\nclass PipesBlobStoreMessageReader(PipesMessageReader):\n """Message reader that reads a sequence of message chunks written by an external process into a\n blob store such as S3, Azure blob storage, or GCS.\n\n The reader maintains a counter, starting at 1, that is synchronized with a message writer in\n some pipes process. The reader starts a thread that periodically attempts to read a chunk\n indexed by the counter at some location expected to be written by the pipes process. The chunk\n should be a file with each line corresponding to a JSON-encoded pipes message. When a chunk is\n successfully read, the messages are processed and the counter is incremented. The\n :py:class:`PipesBlobStoreMessageWriter` on the other end is expected to similarly increment a\n counter (starting from 1) on successful write, keeping counters on the read and write end in\n sync.\n\n If `stdout_reader` or `stderr_reader` are passed, this reader will also start them when\n `read_messages` is called. 
If they are not passed, then the reader performs no stdout/stderr\n forwarding.\n\n Args:\n interval (float): interval in seconds between attempts to download a chunk\n stdout_reader (Optional[PipesBlobStoreStdioReader]): A reader for reading stdout logs.\n stderr_reader (Optional[PipesBlobStoreStdioReader]): A reader for reading stderr logs.\n """\n\n interval: float\n counter: int\n stdout_reader: "PipesBlobStoreStdioReader"\n stderr_reader: "PipesBlobStoreStdioReader"\n\n def __init__(\n self,\n interval: float = 10,\n stdout_reader: Optional["PipesBlobStoreStdioReader"] = None,\n stderr_reader: Optional["PipesBlobStoreStdioReader"] = None,\n ):\n self.interval = interval\n self.counter = 1\n self.stdout_reader = (\n check.opt_inst_param(stdout_reader, "stdout_reader", PipesBlobStoreStdioReader)\n or PipesNoOpStdioReader()\n )\n self.stderr_reader = (\n check.opt_inst_param(stderr_reader, "stderr_reader", PipesBlobStoreStdioReader)\n or PipesNoOpStdioReader()\n )\n\n @contextmanager\n def read_messages(\n self,\n handler: "PipesMessageHandler",\n ) -> Iterator[PipesParams]:\n """Set up a thread to read streaming messages by periodically reading message chunks from a\n target location.\n\n Args:\n handler (PipesMessageHandler): object to process incoming messages\n\n Yields:\n PipesParams: A dict of parameters that specifies where a pipes process should write\n pipes protocol message chunks.\n """\n with self.get_params() as params:\n is_task_complete = Event()\n messages_thread = None\n try:\n messages_thread = Thread(\n target=self._messages_thread, args=(handler, params, is_task_complete)\n )\n messages_thread.start()\n self.stdout_reader.start(params, is_task_complete)\n self.stderr_reader.start(params, is_task_complete)\n yield params\n finally:\n self.wait_for_stdio_logs(params)\n is_task_complete.set()\n if messages_thread:\n messages_thread.join()\n self.stdout_reader.stop()\n self.stderr_reader.stop()\n\n # In cases where we are forwarding logs, in some 
cases the logs might not be written out until\n # after the run completes. We wait for them to exist.\n def wait_for_stdio_logs(self, params):\n start_or_last_download = datetime.datetime.now()\n while (\n datetime.datetime.now() - start_or_last_download\n ).seconds <= WAIT_FOR_STDIO_LOGS_TIMEOUT and (\n (self.stdout_reader and not self.stdout_reader.is_ready(params))\n or (self.stderr_reader and not self.stderr_reader.is_ready(params))\n ):\n time.sleep(5)\n\n @abstractmethod\n @contextmanager\n def get_params(self) -> Iterator[PipesParams]:\n """Yield a set of parameters to be passed to a message writer in a pipes process.\n\n Yields:\n PipesParams: A dict of parameters that specifies where a pipes process should write\n pipes protocol message chunks.\n """\n\n @abstractmethod\n def download_messages_chunk(self, index: int, params: PipesParams) -> Optional[str]: ...\n\n def _messages_thread(\n self,\n handler: "PipesMessageHandler",\n params: PipesParams,\n is_task_complete: Event,\n ) -> None:\n start_or_last_download = datetime.datetime.now()\n while True:\n now = datetime.datetime.now()\n if (now - start_or_last_download).seconds > self.interval or is_task_complete.is_set():\n start_or_last_download = now\n chunk = self.download_messages_chunk(self.counter, params)\n if chunk:\n for line in chunk.split("\\n"):\n message = json.loads(line)\n handler.handle_message(message)\n self.counter += 1\n elif is_task_complete.is_set():\n break\n time.sleep(1)
\n\n\nclass PipesBlobStoreStdioReader(ABC):\n @abstractmethod\n def start(self, params: PipesParams, is_task_complete: Event) -> None: ...\n\n @abstractmethod\n def stop(self) -> None: ...\n\n @abstractmethod\n def is_ready(self, params: PipesParams) -> bool: ...\n\n\n@experimental\nclass PipesChunkedStdioReader(PipesBlobStoreStdioReader):\n """Reader for reading stdout/stderr logs from a blob store such as S3, Azure blob storage, or GCS.\n\n Args:\n interval (float): interval in seconds between attempts to download a chunk.\n target_stream (TextIO): The stream to which to write the logs. Typcially `sys.stdout` or `sys.stderr`.\n """\n\n def __init__(self, *, interval: float = 10, target_stream: TextIO):\n self.interval = interval\n self.target_stream = target_stream\n self.thread: Optional[Thread] = None\n\n @abstractmethod\n def download_log_chunk(self, params: PipesParams) -> Optional[str]: ...\n\n def start(self, params: PipesParams, is_task_complete: Event) -> None:\n self.thread = Thread(target=self._reader_thread, args=(params, is_task_complete))\n self.thread.start()\n\n def stop(self) -> None:\n if self.thread:\n self.thread.join()\n\n def _reader_thread(\n self,\n params: PipesParams,\n is_task_complete: Event,\n ) -> None:\n start_or_last_download = datetime.datetime.now()\n while True:\n now = datetime.datetime.now()\n if (\n (now - start_or_last_download).seconds > self.interval or is_task_complete.is_set()\n ) and self.is_ready(params):\n start_or_last_download = now\n chunk = self.download_log_chunk(params)\n if chunk:\n self.target_stream.write(chunk)\n elif is_task_complete.is_set():\n break\n time.sleep(self.interval)\n\n\nclass PipesNoOpStdioReader(PipesBlobStoreStdioReader):\n """Default implementation for a pipes stdio reader that does nothing."""\n\n def start(self, params: PipesParams, is_task_complete: Event) -> None:\n pass\n\n def stop(self) -> None:\n pass\n\n def is_ready(self, params: PipesParams) -> bool:\n return True\n\n\ndef 
extract_message_or_forward_to_stdout(handler: "PipesMessageHandler", log_line: str):\n # exceptions as control flow, you love to see it\n try:\n message = json.loads(log_line)\n if PIPES_PROTOCOL_VERSION_FIELD in message.keys():\n handler.handle_message(message)\n else:\n sys.stdout.writelines((log_line, "\\n"))\n except Exception:\n # move non-message logs in to stdout for compute log capture\n sys.stdout.writelines((log_line, "\\n"))\n\n\n_FAIL_TO_YIELD_ERROR_MESSAGE = (\n "Did you forget to `yield from pipes_session.get_results()` or `return"\n " <PipesClient>.run(...).get_results`? If using `open_pipes_session`,"\n " `pipes_session.get_results` should be called once after the `open_pipes_session` block has"\n " exited to yield any remaining buffered results via `<PipesSession>.get_results()`."\n " If using `<PipesClient>.run`, you should always return"\n " `<PipesClient>.run(...).get_results()` or `<PipesClient>.run(...).get_materialize_result()`."\n)\n\n\n
[docs]@experimental\n@contextmanager\ndef open_pipes_session(\n context: OpExecutionContext,\n context_injector: PipesContextInjector,\n message_reader: PipesMessageReader,\n extras: Optional[PipesExtras] = None,\n) -> Iterator[PipesSession]:\n """Context manager that opens and closes a pipes session.\n\n This context manager should be used to wrap the launch of an external process using the pipe\n protocol to report results back to Dagster. The yielded :py:class:`PipesSession` should be used\n to (a) obtain the environment variables that need to be provided to the external process; (b)\n access results streamed back from the external process.\n\n This method is an alternative to :py:class:`PipesClient` subclasses for users who want more\n control over how pipes processes are launched. When using `open_pipes_session`, it is the user's\n responsibility to inject the message reader and context injector parameters available on the\n yielded `PipesSession` and pass them to the appropriate API when launching the external process.\n Typically these parameters should be set as environment variables.\n\n\n Args:\n context (OpExecutionContext): The context for the current op/asset execution.\n context_injector (PipesContextInjector): The context injector to use to inject context into the external process.\n message_reader (PipesMessageReader): The message reader to use to read messages from the external process.\n extras (Optional[PipesExtras]): Optional extras to pass to the external process via the injected context.\n\n Yields:\n PipesSession: Interface for interacting with the external process.\n\n .. 
code-block:: python\n\n import subprocess\n from dagster import open_pipes_session\n\n extras = {"foo": "bar"}\n\n @asset\n def ext_asset(context: OpExecutionContext):\n with open_pipes_session(\n context=context,\n extras={"foo": "bar"},\n context_injector=ExtTempFileContextInjector(),\n message_reader=ExtTempFileMessageReader(),\n ) as pipes_session:\n subprocess.Popen(\n ["/bin/python", "/path/to/script.py"],\n env={**pipes_session.get_bootstrap_env_vars()}\n )\n while process.poll() is None:\n yield from pipes_session.get_results()\n\n yield from pipes_session.get_results()\n """\n context.set_requires_typed_event_stream(error_message=_FAIL_TO_YIELD_ERROR_MESSAGE)\n context_data = build_external_execution_context_data(context, extras)\n message_handler = PipesMessageHandler(context)\n try:\n with context_injector.inject_context(\n context_data\n ) as ci_params, message_handler.handle_messages(message_reader) as mr_params:\n yield PipesSession(\n context_data=context_data,\n message_handler=message_handler,\n context_injector_params=ci_params,\n message_reader_params=mr_params,\n )\n finally:\n if not message_handler.received_any_message:\n context.log.warn(\n "[pipes] did not receive any messages from external process. Check stdout / stderr"\n " logs from the external process if"\n f" possible.\\n{context_injector.__class__.__name__}:"\n f" {context_injector.no_messages_debug_text()}\\n{message_reader.__class__.__name__}:"\n f" {message_reader.no_messages_debug_text()}\\n"\n )\n elif not message_handler.received_closed_message:\n context.log.warn(\n "[pipes] did not receive closed message from external process. Buffered messages"\n " may have been discarded without being delivered. Use `open_dagster_pipes` as a"\n " context manager (a with block) to ensure that cleanup is successfully completed."\n " If that is not possible, manually call `PipesContext.close()` before process"\n " exit."\n )
\n
", "current_page_name": "_modules/dagster/_core/pipes/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.pipes.utils"}}, "run_coordinator": {"default_run_coordinator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.run_coordinator.default_run_coordinator

\nimport logging\nfrom typing import Mapping, Optional\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\n
[docs]class DefaultRunCoordinator(RunCoordinator, ConfigurableClass):\n """Immediately send runs to the run launcher."""\n\n def __init__(self, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._logger = logging.getLogger("dagster.run_coordinator.default_run_coordinator")\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: Mapping[str, object]\n ) -> Self:\n return cls(inst_data=inst_data, **config_value)\n\n def submit_run(self, context: SubmitRunContext) -> DagsterRun:\n dagster_run = context.dagster_run\n\n if dagster_run.status == DagsterRunStatus.NOT_STARTED:\n self._instance.launch_run(dagster_run.run_id, context.workspace)\n else:\n self._logger.warning(\n f"submit_run called for run {dagster_run.run_id} with status "\n f"{dagster_run.status.value}, skipping launch."\n )\n\n run = self._instance.get_run_by_id(dagster_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {dagster_run.run_id}")\n return run\n\n def cancel_run(self, run_id: str) -> bool:\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/_core/run_coordinator/default_run_coordinator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.run_coordinator.default_run_coordinator"}, "queued_run_coordinator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.run_coordinator.queued_run_coordinator

\nimport logging\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    IntSource,\n    String,\n    _check as check,\n)\nfrom dagster._builtins import Bool\nfrom dagster._config import Array, Field, Noneable, ScalarUnion, Shape\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.instance import T_DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\nclass RunQueueConfig(\n    NamedTuple(\n        "_RunQueueConfig",\n        [\n            ("max_concurrent_runs", int),\n            ("tag_concurrency_limits", Sequence[Mapping[str, Any]]),\n            ("max_user_code_failure_retries", int),\n            ("user_code_failure_retry_delay", int),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        max_concurrent_runs: int,\n        tag_concurrency_limits: Optional[Sequence[Mapping[str, Any]]],\n        max_user_code_failure_retries: int = 0,\n        user_code_failure_retry_delay: int = 60,\n    ):\n        return super(RunQueueConfig, cls).__new__(\n            cls,\n            check.int_param(max_concurrent_runs, "max_concurrent_runs"),\n            check.opt_sequence_param(tag_concurrency_limits, "tag_concurrency_limits"),\n            check.int_param(max_user_code_failure_retries, "max_user_code_failure_retries"),\n            check.int_param(user_code_failure_retry_delay, "user_code_failure_retry_delay"),\n        )\n\n\n
[docs]class QueuedRunCoordinator(RunCoordinator[T_DagsterInstance], ConfigurableClass):\n """Enqueues runs via the run storage, to be deqeueued by the Dagster Daemon process. Requires\n the Dagster Daemon process to be alive in order for runs to be launched.\n """\n\n def __init__(\n self,\n max_concurrent_runs: Optional[int] = None,\n tag_concurrency_limits: Optional[Sequence[Mapping[str, Any]]] = None,\n dequeue_interval_seconds: Optional[int] = None,\n dequeue_use_threads: Optional[bool] = None,\n dequeue_num_workers: Optional[int] = None,\n max_user_code_failure_retries: Optional[int] = None,\n user_code_failure_retry_delay: Optional[int] = None,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data: Optional[ConfigurableClassData] = check.opt_inst_param(\n inst_data, "inst_data", ConfigurableClassData\n )\n self._max_concurrent_runs: int = check.opt_int_param(\n max_concurrent_runs, "max_concurrent_runs", 10\n )\n check.invariant(\n self._max_concurrent_runs >= -1,\n "Negative values other than -1 (which disables the limit) for max_concurrent_runs"\n " are disallowed.",\n )\n self._tag_concurrency_limits: Sequence[Mapping[str, Any]] = check.opt_list_param(\n tag_concurrency_limits,\n "tag_concurrency_limits",\n )\n self._dequeue_interval_seconds: int = check.opt_int_param(\n dequeue_interval_seconds, "dequeue_interval_seconds", 5\n )\n self._dequeue_use_threads: bool = check.opt_bool_param(\n dequeue_use_threads, "dequeue_use_threads", False\n )\n self._dequeue_num_workers: Optional[int] = check.opt_int_param(\n dequeue_num_workers, "dequeue_num_workers"\n )\n self._max_user_code_failure_retries: int = check.opt_int_param(\n max_user_code_failure_retries, "max_user_code_failure_retries", 0\n )\n self._user_code_failure_retry_delay: int = check.opt_int_param(\n user_code_failure_retry_delay, "user_code_failure_retry_delay", 60\n )\n self._logger = logging.getLogger("dagster.run_coordinator.queued_run_coordinator")\n super().__init__()\n\n 
@property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n def get_run_queue_config(self) -> RunQueueConfig:\n return RunQueueConfig(\n max_concurrent_runs=self._max_concurrent_runs,\n tag_concurrency_limits=self._tag_concurrency_limits,\n max_user_code_failure_retries=self._max_user_code_failure_retries,\n user_code_failure_retry_delay=self._user_code_failure_retry_delay,\n )\n\n @property\n def dequeue_interval_seconds(self) -> int:\n return self._dequeue_interval_seconds\n\n @property\n def dequeue_use_threads(self) -> bool:\n return self._dequeue_use_threads\n\n @property\n def dequeue_num_workers(self) -> Optional[int]:\n return self._dequeue_num_workers\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {\n "max_concurrent_runs": Field(\n config=IntSource,\n is_required=False,\n description=(\n "The maximum number of runs that are allowed to be in progress at once."\n " Defaults to 10. Set to -1 to disable the limit. Set to 0 to stop any runs"\n " from launching. Any other negative values are disallowed."\n ),\n ),\n "tag_concurrency_limits": Field(\n config=Noneable(\n Array(\n Shape(\n {\n "key": String,\n "value": Field(\n ScalarUnion(\n scalar_type=String,\n non_scalar_schema=Shape({"applyLimitPerUniqueValue": Bool}),\n ),\n is_required=False,\n ),\n "limit": Field(int),\n }\n )\n )\n ),\n is_required=False,\n description=(\n "A set of limits that are applied to runs with particular tags. If a value is"\n " set, the limit is applied to only that key-value pair. If no value is set,"\n " the limit is applied across all values of that key. 
If the value is set to a"\n " dict with `applyLimitPerUniqueValue: true`, the limit will apply to the"\n " number of unique values for that key."\n ),\n ),\n "dequeue_interval_seconds": Field(\n config=IntSource,\n is_required=False,\n description=(\n "The interval in seconds at which the Dagster Daemon "\n "should periodically check the run queue for new runs to launch."\n ),\n ),\n "dequeue_use_threads": Field(\n config=bool,\n is_required=False,\n description=(\n "Whether or not to use threads for concurrency when launching dequeued runs."\n ),\n ),\n "dequeue_num_workers": Field(\n config=IntSource,\n is_required=False,\n description=(\n "If dequeue_use_threads is true, limit the number of concurrent worker threads."\n ),\n ),\n "max_user_code_failure_retries": Field(\n config=IntSource,\n is_required=False,\n default_value=0,\n description=(\n "If there is an error reaching a Dagster gRPC server while dequeuing the run,"\n " how many times to retry the dequeue before failing it. The only run launcher"\n " that requires the gRPC server to be running is the DefaultRunLauncher, so"\n " setting this will have no effect unless that run launcher is being used."\n ),\n ),\n "user_code_failure_retry_delay": Field(\n config=IntSource,\n is_required=False,\n default_value=60,\n description=(\n "If there is an error reaching a Dagster gRPC server while dequeuing the run,"\n " how long to wait before retrying any runs from that same code location. 
The"\n " only run launcher that requires the gRPC server to be running is the"\n " DefaultRunLauncher, so setting this will have no effect unless that run"\n " launcher is being used."\n ),\n ),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return cls(\n inst_data=inst_data,\n max_concurrent_runs=config_value.get("max_concurrent_runs"),\n tag_concurrency_limits=config_value.get("tag_concurrency_limits"),\n dequeue_interval_seconds=config_value.get("dequeue_interval_seconds"),\n dequeue_use_threads=config_value.get("dequeue_use_threads"),\n dequeue_num_workers=config_value.get("dequeue_num_workers"),\n max_user_code_failure_retries=config_value.get("max_user_code_failure_retries"),\n user_code_failure_retry_delay=config_value.get("user_code_failure_retry_delay"),\n )\n\n def submit_run(self, context: SubmitRunContext) -> DagsterRun:\n dagster_run = context.dagster_run\n\n if dagster_run.status == DagsterRunStatus.NOT_STARTED:\n enqueued_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_ENQUEUED.value,\n job_name=dagster_run.job_name,\n )\n self._instance.report_dagster_event(enqueued_event, run_id=dagster_run.run_id)\n else:\n # the run was already submitted, this is a no-op\n self._logger.warning(\n f"submit_run called for run {dagster_run.run_id} with status "\n f"{dagster_run.status.value}, skipping enqueue."\n )\n\n run = self._instance.get_run_by_id(dagster_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {dagster_run.run_id}")\n return run\n\n def cancel_run(self, run_id: str) -> bool:\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n # NOTE: possible race condition if the dequeuer acts on this run at the same time\n # https://github.com/dagster-io/dagster/issues/3323\n if run.status == DagsterRunStatus.QUEUED:\n self._instance.report_run_canceling(\n run,\n message="Canceling run from the queue.",\n )\n 
self._instance.report_run_canceled(run)\n return True\n else:\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/_core/run_coordinator/queued_run_coordinator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.run_coordinator.queued_run_coordinator"}}, "scheduler": {"scheduler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.scheduler.scheduler

\nimport abc\nimport os\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._config import Field, IntSource\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.errors import DagsterError\nfrom dagster._core.host_representation import ExternalSchedule\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.scheduler.instigation import (\n    InstigatorState,\n    InstigatorStatus,\n    ScheduleInstigatorData,\n)\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._seven import get_current_datetime_in_utc\nfrom dagster._utils import mkdir_p\n\n\nclass DagsterSchedulerError(DagsterError):\n    """Base class for all Dagster Scheduler errors."""\n\n\nclass DagsterScheduleDoesNotExist(DagsterSchedulerError):\n    """Errors raised when fetching a schedule."""\n\n\nclass SchedulerDebugInfo(\n    NamedTuple(\n        "SchedulerDebugInfo",\n        [\n            ("errors", Sequence[str]),\n            ("scheduler_config_info", str),\n            ("scheduler_info", str),\n            ("schedule_storage", Sequence[str]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        errors: Sequence[str],\n        scheduler_config_info: str,\n        scheduler_info: str,\n        schedule_storage: Sequence[str],\n    ):\n        return super(SchedulerDebugInfo, cls).__new__(\n            cls,\n            errors=check.sequence_param(errors, "errors", of_type=str),\n            scheduler_config_info=check.str_param(scheduler_config_info, "scheduler_config_info"),\n            scheduler_info=check.str_param(scheduler_info, "scheduler_info"),\n            schedule_storage=check.sequence_param(\n                schedule_storage, "schedule_storage", of_type=str\n            ),\n        )\n\n\n
[docs]class Scheduler(abc.ABC):\n """Abstract base class for a scheduler. This component is responsible for interfacing with\n an external system such as cron to ensure scheduled repeated execution according.\n """\n\n def start_schedule(\n self, instance: DagsterInstance, external_schedule: ExternalSchedule\n ) -> InstigatorState:\n """Updates the status of the given schedule to `InstigatorStatus.RUNNING` in schedule storage,.\n\n This should not be overridden by subclasses.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start\n\n """\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(\n external_schedule.get_external_origin_id(), external_schedule.selector_id\n )\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n new_instigator_data = ScheduleInstigatorData(\n external_schedule.cron_schedule,\n get_current_datetime_in_utc().timestamp(),\n )\n\n if not stored_state:\n started_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.RUNNING,\n new_instigator_data,\n )\n instance.add_instigator_state(started_state)\n else:\n started_state = stored_state.with_status(InstigatorStatus.RUNNING).with_data(\n new_instigator_data\n )\n instance.update_instigator_state(started_state)\n return started_state\n\n def stop_schedule(\n self,\n instance: DagsterInstance,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional[ExternalSchedule],\n ) -> InstigatorState:\n """Updates the status of the given schedule to `InstigatorStatus.STOPPED` in schedule storage,.\n\n This should not be overridden by subclasses.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to stop running.\n """\n 
check.str_param(schedule_origin_id, "schedule_origin_id")\n check.opt_inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(schedule_origin_id, schedule_selector_id)\n\n if not external_schedule:\n computed_state = stored_state\n else:\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n\n if computed_state and not computed_state.is_running:\n return computed_state\n\n if not stored_state:\n assert external_schedule\n stopped_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.STOPPED,\n ScheduleInstigatorData(\n external_schedule.cron_schedule,\n ),\n )\n instance.add_instigator_state(stopped_state)\n else:\n stopped_state = stored_state.with_status(InstigatorStatus.STOPPED).with_data(\n ScheduleInstigatorData(\n cron_schedule=computed_state.instigator_data.cron_schedule, # type: ignore\n )\n )\n instance.update_instigator_state(stopped_state)\n\n return stopped_state\n\n @abc.abstractmethod\n def debug_info(self) -> str:\n """Returns debug information about the scheduler."""\n\n @abc.abstractmethod\n def get_logs_path(self, instance: DagsterInstance, schedule_origin_id: str) -> str:\n """Get path to store logs for schedule.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to retrieve the log path for\n """
\n\n\nDEFAULT_MAX_CATCHUP_RUNS = 5\n\n\n
[docs]class DagsterDaemonScheduler(Scheduler, ConfigurableClass):\n """Default scheduler implementation that submits runs from the `dagster-daemon`\n long-lived process. Periodically checks each running schedule for execution times that don't\n have runs yet and launches them.\n """\n\n def __init__(\n self,\n max_catchup_runs: int = DEFAULT_MAX_CATCHUP_RUNS,\n max_tick_retries: int = 0,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self.max_catchup_runs = check.opt_int_param(\n max_catchup_runs, "max_catchup_runs", DEFAULT_MAX_CATCHUP_RUNS\n )\n self.max_tick_retries = check.opt_int_param(max_tick_retries, "max_tick_retries", 0)\n self._inst_data = inst_data\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "max_catchup_runs": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_MAX_CATCHUP_RUNS,\n description="""For partitioned schedules, controls the maximum number of past\n partitions for each schedule that will be considered when looking for missing\n runs . Generally this parameter will only come into play if the scheduler\n falls behind or launches after experiencing downtime. 
This parameter will not be checked for\n schedules without partition sets (for example, schedules created using the @schedule\n decorator) - only the most recent execution time will be considered for those schedules.\n\n Note that no matter what this value is, the scheduler will never launch a run from a time\n before the schedule was turned on (even if the start_date on the schedule is earlier) - if\n you want to launch runs for earlier partitions, launch a backfill.\n """,\n ),\n "max_tick_retries": Field(\n IntSource,\n default_value=0,\n is_required=False,\n description=(\n "For each schedule tick that raises an error, how many times to retry that tick"\n ),\n ),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DagsterDaemonScheduler(inst_data=inst_data, **config_value)\n\n def debug_info(self) -> str:\n return ""\n\n def wipe(self, instance: DagsterInstance) -> None:\n pass\n\n def _get_or_create_logs_directory(\n self, instance: DagsterInstance, schedule_origin_id: str\n ) -> str:\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)\n if not os.path.isdir(logs_directory):\n mkdir_p(logs_directory)\n\n return logs_directory\n\n def get_logs_path(self, instance: DagsterInstance, schedule_origin_id: str) -> str:\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n return os.path.join(logs_directory, "scheduler.log")
\n
", "current_page_name": "_modules/dagster/_core/scheduler/scheduler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.scheduler.scheduler"}}, "storage": {"asset_value_loader": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.asset_value_loader

\nfrom contextlib import ExitStack\nfrom typing import Any, Dict, Mapping, Optional, Type, cast\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.job_definition import (\n    default_job_io_manager_with_fs_io_manager_schema,\n)\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom dagster._core.execution.build_resources import build_resources, get_mapped_resource_config\nfrom dagster._core.execution.context.input import build_input_context\nfrom dagster._core.execution.context.output import build_output_context\nfrom dagster._core.execution.resources_init import get_transitive_required_resource_keys\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.config import is_dagster_home_set\nfrom dagster._core.types.dagster_type import resolve_dagster_type\nfrom dagster._utils.merger import merge_dicts\n\nfrom .io_manager import IOManager\n\n\n
[docs]class AssetValueLoader:\n """Caches resource definitions that are used to load asset values across multiple load\n invocations.\n\n Should not be instantiated directly. Instead, use\n :py:meth:`~dagster.RepositoryDefinition.get_asset_value_loader`.\n """\n\n def __init__(\n self,\n assets_defs_by_key: Mapping[AssetKey, AssetsDefinition],\n source_assets_by_key: Mapping[AssetKey, SourceAsset],\n instance: Optional[DagsterInstance] = None,\n ):\n self._assets_defs_by_key = assets_defs_by_key\n self._source_assets_by_key = source_assets_by_key\n self._resource_instance_cache: Dict[str, object] = {}\n self._exit_stack: ExitStack = ExitStack().__enter__()\n if not instance and is_dagster_home_set():\n self._instance = self._exit_stack.enter_context(DagsterInstance.get())\n else:\n self._instance = instance\n\n def _ensure_resource_instances_in_cache(\n self,\n resource_defs: Mapping[str, ResourceDefinition],\n resource_config: Optional[Mapping[str, Any]] = None,\n ):\n for built_resource_key, built_resource in (\n self._exit_stack.enter_context(\n build_resources(\n resources={\n resource_key: self._resource_instance_cache.get(resource_key, resource_def)\n for resource_key, resource_def in resource_defs.items()\n },\n instance=self._instance,\n resource_config=resource_config,\n )\n )\n ._asdict()\n .items()\n ):\n self._resource_instance_cache[built_resource_key] = built_resource\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type[object]] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n ) -> object:\n """Loads the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n resource_config (Optional[Any]): A dictionary of resource configurations to be passed\n to the :py:class:`IOManager`.\n\n Returns:\n The contents of an asset as a Python object.\n """\n asset_key = AssetKey.from_coercible(asset_key)\n resource_config = resource_config or {}\n output_metadata = {}\n\n if asset_key in self._assets_defs_by_key:\n assets_def = self._assets_defs_by_key[asset_key]\n\n resource_defs = merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: default_job_io_manager_with_fs_io_manager_schema},\n assets_def.resource_defs,\n )\n io_manager_key = assets_def.get_io_manager_key_for_asset_key(asset_key)\n io_manager_def = resource_defs[io_manager_key]\n name = assets_def.get_output_name_for_asset_key(asset_key)\n output_metadata = assets_def.metadata_by_key[asset_key]\n op_def = assets_def.get_op_def_for_asset_key(asset_key)\n asset_partitions_def = assets_def.partitions_def\n elif asset_key in self._source_assets_by_key:\n source_asset = self._source_assets_by_key[asset_key]\n\n resource_defs = merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: 
default_job_io_manager_with_fs_io_manager_schema},\n source_asset.resource_defs,\n )\n io_manager_key = source_asset.get_io_manager_key()\n io_manager_def = resource_defs[io_manager_key]\n name = asset_key.path[-1]\n output_metadata = source_asset.raw_metadata\n op_def = None\n asset_partitions_def = source_asset.partitions_def\n else:\n check.failed(f"Asset key {asset_key} not found")\n\n required_resource_keys = get_transitive_required_resource_keys(\n io_manager_def.required_resource_keys, resource_defs\n ) | {io_manager_key}\n\n self._ensure_resource_instances_in_cache(\n {k: v for k, v in resource_defs.items() if k in required_resource_keys},\n resource_config=resource_config,\n )\n io_manager = cast(IOManager, self._resource_instance_cache[io_manager_key])\n\n io_config = resource_config.get(io_manager_key)\n io_resource_config = {io_manager_key: io_config} if io_config else {}\n\n io_manager_config = get_mapped_resource_config(\n {io_manager_key: io_manager_def}, io_resource_config\n )\n\n input_context = build_input_context(\n name=None,\n asset_key=asset_key,\n dagster_type=resolve_dagster_type(python_type),\n upstream_output=build_output_context(\n name=name,\n metadata=output_metadata,\n asset_key=asset_key,\n op_def=op_def,\n resource_config=resource_config,\n ),\n resources=self._resource_instance_cache,\n resource_config=io_manager_config[io_manager_key].config,\n partition_key=partition_key,\n asset_partition_key_range=(\n PartitionKeyRange(partition_key, partition_key)\n if partition_key is not None\n else None\n ),\n asset_partitions_def=asset_partitions_def,\n instance=self._instance,\n metadata=metadata,\n )\n\n return io_manager.load_input(input_context)
\n\n def __enter__(self):\n return self\n\n def __exit__(self, *exc):\n self._exit_stack.close()
\n
", "current_page_name": "_modules/dagster/_core/storage/asset_value_loader", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.asset_value_loader"}, "base_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.base_storage

\nfrom abc import ABC, abstractmethod\n\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\n\nfrom .event_log.base import EventLogStorage\nfrom .runs.base import RunStorage\nfrom .schedules.base import ScheduleStorage\n\n\n
[docs]class DagsterStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for Dagster persistent storage, for reading and writing data for runs,\n events, and schedule/sensor state.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-daemon`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @property\n @abstractmethod\n def event_log_storage(self) -> EventLogStorage[T_DagsterInstance]:\n raise NotImplementedError()\n\n @property\n @abstractmethod\n def run_storage(self) -> RunStorage[T_DagsterInstance]:\n raise NotImplementedError()\n\n @property\n @abstractmethod\n def schedule_storage(self) -> ScheduleStorage[T_DagsterInstance]:\n raise NotImplementedError()
\n
", "current_page_name": "_modules/dagster/_core/storage/base_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.base_storage"}, "captured_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.captured_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import IO, Callable, Generator, Iterator, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Final, Self\n\nimport dagster._check as check\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\n\nMAX_BYTES_CHUNK_READ: Final = 4194304  # 4 MB\n\n\nclass CapturedLogContext(\n    NamedTuple(\n        "_CapturedLogContext",\n        [\n            ("log_key", Sequence[str]),\n            ("external_url", Optional[str]),\n            ("external_stdout_url", Optional[str]),\n            ("external_stderr_url", Optional[str]),\n        ],\n    )\n):\n    """Object representing the context in which logs are captured.  Can be used by external logging\n    sidecar implementations to point the Dagster UI to an external url to view compute logs instead of a\n    Dagster-managed location.\n    """\n\n    def __new__(\n        cls,\n        log_key: Sequence[str],\n        external_stdout_url: Optional[str] = None,\n        external_stderr_url: Optional[str] = None,\n        external_url: Optional[str] = None,\n    ):\n        if external_url and (external_stdout_url or external_stderr_url):\n            check.failed(\n                "Cannot specify both `external_url` and one of"\n                " `external_stdout_url`/`external_stderr_url`"\n            )\n\n        return super(CapturedLogContext, cls).__new__(\n            cls,\n            log_key,\n            external_stdout_url=external_stdout_url,\n            external_stderr_url=external_stderr_url,\n            external_url=external_url,\n        )\n\n\nclass CapturedLogData(\n    NamedTuple(\n        "_CapturedLogData",\n        [\n            ("log_key", Sequence[str]),\n            ("stdout", Optional[bytes]),\n            ("stderr", Optional[bytes]),\n            ("cursor", Optional[str]),\n        ],\n    )\n):\n    """Object representing captured log data, either a partial chunk of the log data 
or the full\n    capture.  Contains the raw bytes and optionally the cursor offset for the partial chunk.\n    """\n\n    def __new__(\n        cls,\n        log_key: Sequence[str],\n        stdout: Optional[bytes] = None,\n        stderr: Optional[bytes] = None,\n        cursor: Optional[str] = None,\n    ):\n        return super(CapturedLogData, cls).__new__(cls, log_key, stdout, stderr, cursor)\n\n\nclass CapturedLogMetadata(\n    NamedTuple(\n        "_CapturedLogMetadata",\n        [\n            ("stdout_location", Optional[str]),\n            ("stderr_location", Optional[str]),\n            ("stdout_download_url", Optional[str]),\n            ("stderr_download_url", Optional[str]),\n        ],\n    )\n):\n    """Object representing metadata info for the captured log data, containing a display string for\n    the location of the log data and a URL for direct download of the captured log data.\n    """\n\n    def __new__(\n        cls,\n        stdout_location: Optional[str] = None,\n        stderr_location: Optional[str] = None,\n        stdout_download_url: Optional[str] = None,\n        stderr_download_url: Optional[str] = None,\n    ):\n        return super(CapturedLogMetadata, cls).__new__(\n            cls,\n            stdout_location=stdout_location,\n            stderr_location=stderr_location,\n            stdout_download_url=stdout_download_url,\n            stderr_download_url=stderr_download_url,\n        )\n\n\nclass CapturedLogSubscription:\n    def __init__(\n        self, manager: "CapturedLogManager", log_key: Sequence[str], cursor: Optional[str]\n    ):\n        self._manager = manager\n        self._log_key = log_key\n        self._cursor = cursor\n        self._observer: Optional[Callable[[CapturedLogData], None]] = None\n        self.is_complete = False\n\n    def __call__(self, observer: Optional[Callable[[CapturedLogData], None]]) -> Self:\n        self._observer = observer\n        self.fetch()\n        if 
self._manager.is_capture_complete(self._log_key):\n            self.complete()\n        return self\n\n    @property\n    def log_key(self) -> Sequence[str]:\n        return self._log_key\n\n    def dispose(self) -> None:\n        self._observer = None\n        self._manager.unsubscribe(self)\n\n    def fetch(self) -> None:\n        if not self._observer:\n            return\n\n        should_fetch = True\n        while should_fetch:\n            log_data = self._manager.get_log_data(\n                self._log_key,\n                self._cursor,\n                max_bytes=MAX_BYTES_CHUNK_READ,\n            )\n            if not self._cursor or log_data.cursor != self._cursor:\n                self._observer(log_data)\n                self._cursor = log_data.cursor\n            should_fetch = _has_max_data(log_data.stdout) or _has_max_data(log_data.stderr)\n\n    def complete(self) -> None:\n        self.is_complete = True\n\n\ndef _has_max_data(chunk: Optional[bytes]) -> bool:\n    # function is used as predicate but does not actually return a boolean\n    return chunk and len(chunk) >= MAX_BYTES_CHUNK_READ  # type: ignore\n\n\n
[docs]class CapturedLogManager(ABC):\n """Abstract base class for capturing the unstructured logs (stdout/stderr) in the current\n process, stored / retrieved with a provided log_key.\n """\n\n @abstractmethod\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n """Context manager for capturing the stdout/stderr within the current process, and persisting\n it under the given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n """\n\n @abstractmethod\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Iterator[Optional[IO[bytes]]]:\n """Context manager for providing an IO stream that enables the caller to write to a log stream\n managed by the captured log manager, to be read later using the given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n """\n\n @abstractmethod\n def is_capture_complete(self, log_key: Sequence[str]) -> bool:\n """Flag indicating when the log capture for a given log key has completed.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def get_log_data(\n self,\n log_key: Sequence[str],\n cursor: Optional[str] = None,\n max_bytes: Optional[int] = None,\n ) -> CapturedLogData:\n """Returns a chunk of the captured stdout logs for a given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n cursor (Optional[str]): A cursor representing the position of the log chunk to fetch\n max_bytes (Optional[int]): A limit on the size of the log chunk to fetch\n\n Returns:\n CapturedLogData\n """\n\n @abstractmethod\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n """Returns the metadata of the captured logs for a given log key, including\n displayable information on where the logs are persisted.\n\n Args:\n log_key (List[String]): 
The log key identifying the captured logs\n\n Returns:\n CapturedLogMetadata\n """\n\n @abstractmethod\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ) -> None:\n """Deletes the captured logs for a given log key.\n\n Args:\n log_key(Optional[List[String]]): The log key of the logs to delete\n prefix(Optional[List[String]]): The prefix of the log keys to delete\n """\n\n @abstractmethod\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n """Registers an observable object for log data.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n cursor (Optional[String]): The string cursor marking the position within the log stream\n Returns:\n ComputeLogSubscription\n """\n\n @abstractmethod\n def unsubscribe(self, subscription: CapturedLogSubscription) -> None:\n """Deregisters an observable object from receiving log updates.\n\n Args:\n subscription (CapturedLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def build_log_key_for_run(self, run_id: str, step_key: str) -> Sequence[str]:\n """Legacy adapter to translate run_id/key to captured log manager-based log_key."""\n return [run_id, "compute_logs", step_key]
\n
", "current_page_name": "_modules/dagster/_core/storage/captured_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.captured_log_manager"}, "compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.compute_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom enum import Enum\nfrom typing import Callable, Iterator, NamedTuple, Optional\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nMAX_BYTES_FILE_READ = 33554432  # 32 MB\nMAX_BYTES_CHUNK_READ = 4194304  # 4 MB\n\n\nclass ComputeIOType(Enum):\n    STDOUT = "stdout"\n    STDERR = "stderr"\n\n\nclass ComputeLogFileData(\n    NamedTuple(\n        "ComputeLogFileData",\n        [\n            ("path", str),\n            ("data", Optional[str]),\n            ("cursor", int),\n            ("size", int),\n            ("download_url", Optional[str]),\n        ],\n    )\n):\n    """Representation of a chunk of compute execution log data."""\n\n    def __new__(\n        cls, path: str, data: Optional[str], cursor: int, size: int, download_url: Optional[str]\n    ):\n        return super(ComputeLogFileData, cls).__new__(\n            cls,\n            path=check.str_param(path, "path"),\n            data=check.opt_str_param(data, "data"),\n            cursor=check.int_param(cursor, "cursor"),\n            size=check.int_param(size, "size"),\n            download_url=check.opt_str_param(download_url, "download_url"),\n        )\n\n\n
[docs]class ComputeLogManager(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\n steps of pipeline solids.\n """\n\n @contextmanager\n def watch(self, dagster_run: DagsterRun, step_key: Optional[str] = None) -> Iterator[None]:\n """Watch the stdout/stderr for a given execution for a given run_id / step_key and persist it.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n\n if not self.enabled(dagster_run, step_key):\n yield\n return\n\n self.on_watch_start(dagster_run, step_key)\n with self._watch_logs(dagster_run, step_key):\n yield\n self.on_watch_finish(dagster_run, step_key)\n\n @contextmanager\n @abstractmethod\n def _watch_logs(\n self, dagster_run: DagsterRun, step_key: Optional[str] = None\n ) -> Iterator[None]:\n """Method to watch the stdout/stderr logs for a given run_id / step_key. Kept separate from\n blessed `watch` method, which triggers all the start/finish hooks that are necessary to\n implement the different remote implementations.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Get the local path of the logfile for a given execution step. This determines the\n location on the local filesystem to which stdout/stderr will be rerouted.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either ComputeIOType.STDOUT or\n ComputeIOType.STDERR\n\n Returns:\n str\n """\n ...\n\n @abstractmethod\n def is_watch_completed(self, run_id: str, key: str) -> bool:\n """Flag indicating when computation for a given execution step has completed.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def on_watch_start(self, dagster_run: DagsterRun, step_key: Optional[str]) -> None:\n """Hook called when starting to watch compute logs.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def on_watch_finish(self, dagster_run: DagsterRun, step_key: Optional[str]) -> None:\n """Hook called when computation for a given execution step is finished.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def download_url(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Get a URL where the logs can be downloaded.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n\n Returns:\n String\n """\n\n @abstractmethod\n def read_logs_file(\n self,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int = 0,\n max_bytes: int = MAX_BYTES_FILE_READ,\n ) -> ComputeLogFileData:\n """Get compute log data for a given compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n max_bytes (Optional[Int]): Maximum number of bytes to be read and returned\n\n Returns:\n ComputeLogFileData\n """\n\n def enabled(self, _dagster_run: DagsterRun, _step_key: Optional[str]) -> bool:\n """Hook for disabling compute log capture.\n\n Args:\n _step_key (Optional[String]): The step_key for a compute step\n\n Returns:\n Boolean\n """\n return True\n\n @abstractmethod\n def on_subscribe(self, subscription: "ComputeLogSubscription") -> None:\n """Hook for managing streaming subscriptions for log data from `dagster-webserver`.\n\n Args:\n subscription (ComputeLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def on_unsubscribe(self, subscription: "ComputeLogSubscription") -> None:\n pass\n\n def observable(\n self, run_id: str, key: str, io_type: ComputeIOType, cursor: Optional[str] = None\n ) -> "ComputeLogSubscription":\n """Return a ComputeLogSubscription which streams back log data from the execution logs for a given\n compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n\n Returns:\n Observable\n """\n check.str_param(run_id, "run_id")\n check.str_param(key, "key")\n check.inst_param(io_type, "io_type", ComputeIOType)\n check.opt_str_param(cursor, "cursor")\n\n if cursor:\n cursor = int(cursor) # type: ignore # (var reassigned diff type)\n else:\n cursor = 0 # type: ignore # (var reassigned diff type)\n\n subscription = ComputeLogSubscription(self, run_id, key, io_type, cursor) # type: ignore # (var reassigned diff type)\n self.on_subscribe(subscription)\n return subscription\n\n def dispose(self):\n pass
\n\n\nclass ComputeLogSubscription:\n """Observable object that generates ComputeLogFileData objects as compute step execution logs\n are written.\n """\n\n def __init__(\n self,\n manager: ComputeLogManager,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int,\n ):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.io_type = io_type\n self.cursor = cursor\n self.observer: Optional[Callable[[ComputeLogFileData], None]] = None\n self.is_complete = False\n\n def __call__(self, observer: Callable[[ComputeLogFileData], None]) -> Self:\n self.observer = observer\n self.fetch()\n if self.manager.is_watch_completed(self.run_id, self.key):\n self.complete()\n return self\n\n def dispose(self) -> None:\n # called when the connection gets closed, allowing the observer to get GC'ed\n self.observer = None\n self.manager.on_unsubscribe(self)\n\n def fetch(self) -> None:\n if not self.observer:\n return\n\n should_fetch = True\n while should_fetch:\n update = self.manager.read_logs_file(\n self.run_id,\n self.key,\n self.io_type,\n self.cursor,\n max_bytes=MAX_BYTES_CHUNK_READ,\n )\n if not self.cursor or update.cursor != self.cursor:\n self.observer(update)\n self.cursor = update.cursor\n should_fetch = update.data and len(update.data.encode("utf-8")) >= MAX_BYTES_CHUNK_READ\n\n def complete(self) -> None:\n self.is_complete = True\n if not self.observer:\n return\n
", "current_page_name": "_modules/dagster/_core/storage/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.compute_log_manager"}, "dagster_run": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.dagster_run

\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.tags import PARENT_RUN_ID_TAG, ROOT_RUN_ID_TAG\nfrom dagster._core.utils import make_new_run_id\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\n\nfrom .tags import (\n    BACKFILL_ID_TAG,\n    REPOSITORY_LABEL_TAG,\n    RESUME_RETRY_TAG,\n    SCHEDULE_NAME_TAG,\n    SENSOR_NAME_TAG,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.host_representation.external import ExternalSchedule, ExternalSensor\n    from dagster._core.host_representation.origin import ExternalJobOrigin\n\n\n
[docs]@whitelist_for_serdes(storage_name="PipelineRunStatus")\nclass DagsterRunStatus(Enum):\n """The status of run execution."""\n\n # Runs waiting to be launched by the Dagster Daemon.\n QUEUED = "QUEUED"\n\n # Runs that have been launched, but execution has not yet started."""\n NOT_STARTED = "NOT_STARTED"\n\n # Runs that are managed outside of the Dagster control plane.\n MANAGED = "MANAGED"\n\n # Runs that have been launched, but execution has not yet started.\n STARTING = "STARTING"\n\n # Runs that have been launched and execution has started.\n STARTED = "STARTED"\n\n # Runs that have successfully completed.\n SUCCESS = "SUCCESS"\n\n # Runs that have failed to complete.\n FAILURE = "FAILURE"\n\n # Runs that are in-progress and pending to be canceled.\n CANCELING = "CANCELING"\n\n # Runs that have been canceled before completion.\n CANCELED = "CANCELED"
\n\n\n# These statuses that indicate a run may be using compute resources\nIN_PROGRESS_RUN_STATUSES = [\n DagsterRunStatus.STARTING,\n DagsterRunStatus.STARTED,\n DagsterRunStatus.CANCELING,\n]\n\n# This serves as an explicit list of run statuses that indicate that the run is not using compute\n# resources. This and the enum above should cover all run statuses.\nNON_IN_PROGRESS_RUN_STATUSES = [\n DagsterRunStatus.QUEUED,\n DagsterRunStatus.NOT_STARTED,\n DagsterRunStatus.SUCCESS,\n DagsterRunStatus.FAILURE,\n DagsterRunStatus.MANAGED,\n DagsterRunStatus.CANCELED,\n]\n\nFINISHED_STATUSES = [\n DagsterRunStatus.SUCCESS,\n DagsterRunStatus.FAILURE,\n DagsterRunStatus.CANCELED,\n]\n\n# Run statuses for runs that can be safely canceled.\n# Does not include the other unfinished statuses for the following reasons:\n# STARTING: Control has been ceded to the run worker, which will eventually move the run to a STARTED.\n# NOT_STARTED: Mostly replaced with STARTING. Runs are only here in the the brief window between\n# creating the run and launching or enqueueing it.\nCANCELABLE_RUN_STATUSES = [DagsterRunStatus.STARTED, DagsterRunStatus.QUEUED]\n\n\n@whitelist_for_serdes(storage_name="PipelineRunStatsSnapshot")\nclass DagsterRunStatsSnapshot(\n NamedTuple(\n "_DagsterRunStatsSnapshot",\n [\n ("run_id", str),\n ("steps_succeeded", int),\n ("steps_failed", int),\n ("materializations", int),\n ("expectations", int),\n ("enqueued_time", Optional[float]),\n ("launch_time", Optional[float]),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n steps_succeeded: int,\n steps_failed: int,\n materializations: int,\n expectations: int,\n enqueued_time: Optional[float],\n launch_time: Optional[float],\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(DagsterRunStatsSnapshot, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n steps_succeeded=check.int_param(steps_succeeded, 
"steps_succeeded"),\n steps_failed=check.int_param(steps_failed, "steps_failed"),\n materializations=check.int_param(materializations, "materializations"),\n expectations=check.int_param(expectations, "expectations"),\n enqueued_time=check.opt_float_param(enqueued_time, "enqueued_time"),\n launch_time=check.opt_float_param(launch_time, "launch_time"),\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\nclass DagsterRunSerializer(NamedTupleSerializer["DagsterRun"]):\n # serdes log\n # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve\n # * added pipeline_snapshot_id\n # * renamed previous_run_id -> parent_run_id, added root_run_id\n # * added execution_plan_snapshot_id\n # * removed selector\n # * added solid_subset\n # * renamed solid_subset -> solid_selection, added solids_to_execute\n # * renamed environment_dict -> run_config\n # * added asset_selection\n # * added has_repository_load_data\n def before_unpack(self, context, unpacked_dict: Dict[str, Any]) -> Dict[str, Any]:\n # back compat for environment dict => run_config\n if "environment_dict" in unpacked_dict:\n check.invariant(\n unpacked_dict.get("run_config") is None,\n "Cannot set both run_config and environment_dict. 
Use run_config parameter.",\n )\n unpacked_dict["run_config"] = unpacked_dict["environment_dict"]\n del unpacked_dict["environment_dict"]\n\n # back compat for previous_run_id => parent_run_id, root_run_id\n if "previous_run_id" in unpacked_dict and not (\n "parent_run_id" in unpacked_dict and "root_run_id" in unpacked_dict\n ):\n unpacked_dict["parent_run_id"] = unpacked_dict["previous_run_id"]\n unpacked_dict["root_run_id"] = unpacked_dict["previous_run_id"]\n del unpacked_dict["previous_run_id"]\n\n # back compat for selector => pipeline_name, solids_to_execute\n if "selector" in unpacked_dict:\n selector = unpacked_dict["selector"]\n\n if not isinstance(selector, ExecutionSelector):\n check.failed(f"unexpected entry for 'select', {selector}")\n selector_name = selector.name\n selector_subset = selector.solid_subset\n\n job_name = unpacked_dict.get("pipeline_name")\n check.invariant(\n job_name is None or selector_name == job_name,\n f"Conflicting pipeline name {job_name} in arguments to PipelineRun: "\n f"selector was passed with pipeline {selector_name}",\n )\n if job_name is None:\n unpacked_dict["pipeline_name"] = selector_name\n\n solids_to_execute = unpacked_dict.get("solids_to_execute")\n check.invariant(\n solids_to_execute is None\n or (selector_subset and set(selector_subset) == solids_to_execute),\n f"Conflicting solids_to_execute {solids_to_execute} in arguments to"\n f" PipelineRun: selector was passed with subset {selector_subset}",\n )\n # for old runs that only have selector but no solids_to_execute\n if solids_to_execute is None:\n solids_to_execute = frozenset(selector_subset) if selector_subset else None\n\n # back compat for solid_subset => solids_to_execute\n if "solid_subset" in unpacked_dict:\n unpacked_dict["solids_to_execute"] = unpacked_dict["solid_subset"]\n del unpacked_dict["solid_subset"]\n\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n serializer=DagsterRunSerializer,\n # DagsterRun is serialized as PipelineRun so that it can be read by older (pre 0.13.x) version\n # of Dagster, but is read back in as a DagsterRun.\n storage_name="PipelineRun",\n old_fields={"mode": None},\n storage_field_names={\n "job_name": "pipeline_name",\n "job_snapshot_id": "pipeline_snapshot_id",\n "external_job_origin": "external_pipeline_origin",\n "job_code_origin": "pipeline_code_origin",\n "op_selection": "solid_selection",\n "resolved_op_selection": "solids_to_execute",\n },\n)\nclass DagsterRun(\n NamedTuple(\n "_DagsterRun",\n [\n ("job_name", PublicAttr[str]),\n ("run_id", str),\n ("run_config", Mapping[str, object]),\n ("asset_selection", Optional[AbstractSet[AssetKey]]),\n ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n ("op_selection", Optional[Sequence[str]]),\n ("resolved_op_selection", Optional[AbstractSet[str]]),\n ("step_keys_to_execute", Optional[Sequence[str]]),\n ("status", DagsterRunStatus),\n ("tags", Mapping[str, str]),\n ("root_run_id", Optional[str]),\n ("parent_run_id", Optional[str]),\n ("job_snapshot_id", Optional[str]),\n ("execution_plan_snapshot_id", Optional[str]),\n ("external_job_origin", Optional["ExternalJobOrigin"]),\n ("job_code_origin", Optional[JobPythonOrigin]),\n ("has_repository_load_data", bool),\n ],\n )\n):\n """Serializable internal representation of a dagster run, as stored in a\n :py:class:`~dagster._core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n job_name: str,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n op_selection: Optional[Sequence[str]] = None,\n resolved_op_selection: Optional[AbstractSet[str]] = None,\n step_keys_to_execute: Optional[Sequence[str]] = None,\n status: Optional[DagsterRunStatus] = None,\n tags: Optional[Mapping[str, 
str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n job_snapshot_id: Optional[str] = None,\n execution_plan_snapshot_id: Optional[str] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n has_repository_load_data: Optional[bool] = None,\n ):\n check.invariant(\n (root_run_id is not None and parent_run_id is not None)\n or (root_run_id is None and parent_run_id is None),\n "Must set both root_run_id and parent_run_id when creating a PipelineRun that "\n "belongs to a run group",\n )\n # a set which contains the names of the ops to execute\n resolved_op_selection = check.opt_nullable_set_param(\n resolved_op_selection, "resolved_op_selection", of_type=str\n )\n # a list of op queries provided by the user\n # possible to be None when resolved_op_selection is set by the user directly\n op_selection = check.opt_nullable_sequence_param(op_selection, "op_selection", of_type=str)\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n asset_selection = check.opt_nullable_set_param(\n asset_selection, "asset_selection", of_type=AssetKey\n )\n asset_check_selection = check.opt_nullable_set_param(\n asset_check_selection, "asset_check_selection", of_type=AssetCheckKey\n )\n\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n if status == DagsterRunStatus.QUEUED:\n check.inst_param(\n external_job_origin,\n "external_job_origin",\n ExternalJobOrigin,\n "external_job_origin is required for queued runs",\n )\n\n if run_id is None:\n run_id = make_new_run_id()\n\n return super(DagsterRun, cls).__new__(\n cls,\n job_name=check.str_param(job_name, "job_name"),\n run_id=check.str_param(run_id, "run_id"),\n run_config=check.opt_mapping_param(run_config, "run_config", key_type=str),\n 
op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=check.opt_inst_param(\n status, "status", DagsterRunStatus, DagsterRunStatus.NOT_STARTED\n ),\n tags=check.opt_mapping_param(tags, "tags", key_type=str, value_type=str),\n root_run_id=check.opt_str_param(root_run_id, "root_run_id"),\n parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"),\n job_snapshot_id=check.opt_str_param(job_snapshot_id, "job_snapshot_id"),\n execution_plan_snapshot_id=check.opt_str_param(\n execution_plan_snapshot_id, "execution_plan_snapshot_id"\n ),\n external_job_origin=check.opt_inst_param(\n external_job_origin, "external_job_origin", ExternalJobOrigin\n ),\n job_code_origin=check.opt_inst_param(\n job_code_origin, "job_code_origin", JobPythonOrigin\n ),\n has_repository_load_data=check.opt_bool_param(\n has_repository_load_data, "has_repository_load_data", default=False\n ),\n )\n\n def with_status(self, status: DagsterRunStatus) -> Self:\n if status == DagsterRunStatus.QUEUED:\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n check.inst(\n self.external_job_origin,\n ExternalJobOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n return self._replace(status=status)\n\n def with_job_origin(self, origin: "ExternalJobOrigin") -> Self:\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n check.inst_param(origin, "origin", ExternalJobOrigin)\n return self._replace(external_job_origin=origin)\n\n def with_tags(self, tags: Mapping[str, str]) -> Self:\n return self._replace(tags=tags)\n\n def get_root_run_id(self) -> Optional[str]:\n return self.tags.get(ROOT_RUN_ID_TAG)\n\n def get_parent_run_id(self) -> Optional[str]:\n return 
self.tags.get(PARENT_RUN_ID_TAG)\n\n def tags_for_storage(self) -> Mapping[str, str]:\n repository_tags = {}\n if self.external_job_origin:\n # tag the run with a label containing the repository name / location name, to allow for\n # per-repository filtering of runs from the Dagster UI.\n repository_tags[REPOSITORY_LABEL_TAG] = (\n self.external_job_origin.external_repository_origin.get_label()\n )\n\n if not self.tags:\n return repository_tags\n\n return {**repository_tags, **self.tags}\n\n @public\n @property\n def is_finished(self) -> bool:\n """bool: If this run has completely finished execution."""\n return self.status in FINISHED_STATUSES\n\n @public\n @property\n def is_success(self) -> bool:\n """bool: If this run has successfully finished executing."""\n return self.status == DagsterRunStatus.SUCCESS\n\n @public\n @property\n def is_failure(self) -> bool:\n """bool: If this run has failed."""\n return self.status == DagsterRunStatus.FAILURE\n\n @public\n @property\n def is_failure_or_canceled(self) -> bool:\n """bool: If this run has either failed or was canceled."""\n return self.status == DagsterRunStatus.FAILURE or self.status == DagsterRunStatus.CANCELED\n\n @public\n @property\n def is_resume_retry(self) -> bool:\n """bool: If this run was created from retrying another run from the point of failure."""\n return self.tags.get(RESUME_RETRY_TAG) == "true"\n\n @property\n def previous_run_id(self) -> Optional[str]:\n # Compat\n return self.parent_run_id\n\n @staticmethod\n def tags_for_schedule(schedule) -> Mapping[str, str]:\n return {SCHEDULE_NAME_TAG: schedule.name}\n\n @staticmethod\n def tags_for_sensor(sensor) -> Mapping[str, str]:\n return {SENSOR_NAME_TAG: sensor.name}\n\n @staticmethod\n def tags_for_backfill_id(backfill_id: str) -> Mapping[str, str]:\n return {BACKFILL_ID_TAG: backfill_id}
\n\n\nclass RunsFilterSerializer(NamedTupleSerializer["RunsFilter"]):\n def before_unpack(\n self,\n context,\n unpacked_dict: Dict[str, Any],\n ) -> Dict[str, Any]:\n # We store empty run ids as [] but only accept None\n if "run_ids" in unpacked_dict and unpacked_dict["run_ids"] == []:\n unpacked_dict["run_ids"] = None\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n serializer=RunsFilterSerializer,\n old_storage_names={"PipelineRunsFilter"},\n storage_field_names={"job_name": "pipeline_name"},\n)\nclass RunsFilter(\n NamedTuple(\n "_RunsFilter",\n [\n ("run_ids", Sequence[str]),\n ("job_name", Optional[str]),\n ("statuses", Sequence[DagsterRunStatus]),\n ("tags", Mapping[str, Union[str, Sequence[str]]]),\n ("snapshot_id", Optional[str]),\n ("updated_after", Optional[datetime]),\n ("updated_before", Optional[datetime]),\n ("created_after", Optional[datetime]),\n ("created_before", Optional[datetime]),\n ],\n )\n):\n """Defines a filter across job runs, for use when querying storage directly.\n\n Each field of the RunsFilter represents a logical AND with each other. For\n example, if you specify job_name and tags, then you will receive only runs\n with the specified job_name AND the specified tags. If left blank, then\n all values will be permitted for that field.\n\n Args:\n run_ids (Optional[List[str]]): A list of job run_id values.\n job_name (Optional[str]):\n Name of the job to query for. If blank, all job_names will be accepted.\n statuses (Optional[List[DagsterRunStatus]]):\n A list of run statuses to filter by. If blank, all run statuses will be allowed.\n tags (Optional[Dict[str, Union[str, List[str]]]]):\n A dictionary of run tags to query by. All tags specified here must be present for a given run to pass the filter.\n snapshot_id (Optional[str]): The ID of the job snapshot to query for. 
Intended for internal use.\n updated_after (Optional[DateTime]): Filter by runs that were last updated before this datetime.\n created_before (Optional[DateTime]): Filter by runs that were created before this datetime.\n\n """\n\n def __new__(\n cls,\n run_ids: Optional[Sequence[str]] = None,\n job_name: Optional[str] = None,\n statuses: Optional[Sequence[DagsterRunStatus]] = None,\n tags: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,\n snapshot_id: Optional[str] = None,\n updated_after: Optional[datetime] = None,\n updated_before: Optional[datetime] = None,\n created_after: Optional[datetime] = None,\n created_before: Optional[datetime] = None,\n ):\n check.invariant(run_ids != [], "When filtering on run ids, a non-empty list must be used.")\n\n return super(RunsFilter, cls).__new__(\n cls,\n run_ids=check.opt_sequence_param(run_ids, "run_ids", of_type=str),\n job_name=check.opt_str_param(job_name, "job_name"),\n statuses=check.opt_sequence_param(statuses, "statuses", of_type=DagsterRunStatus),\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n snapshot_id=check.opt_str_param(snapshot_id, "snapshot_id"),\n updated_after=check.opt_inst_param(updated_after, "updated_after", datetime),\n updated_before=check.opt_inst_param(updated_before, "updated_before", datetime),\n created_after=check.opt_inst_param(created_after, "created_after", datetime),\n created_before=check.opt_inst_param(created_before, "created_before", datetime),\n )\n\n @staticmethod\n def for_schedule(schedule: "ExternalSchedule") -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_schedule(schedule))\n\n @staticmethod\n def for_sensor(sensor: "ExternalSensor") -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_sensor(sensor))\n\n @staticmethod\n def for_backfill(backfill_id: str) -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_backfill_id(backfill_id))
\n\n\nclass JobBucket(NamedTuple):\n job_names: List[str]\n bucket_limit: Optional[int]\n\n\nclass TagBucket(NamedTuple):\n tag_key: str\n tag_values: List[str]\n bucket_limit: Optional[int]\n\n\n
[docs]class RunRecord(\n NamedTuple(\n "_RunRecord",\n [\n ("storage_id", int),\n ("dagster_run", DagsterRun),\n ("create_timestamp", datetime),\n ("update_timestamp", datetime),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n """Internal representation of a run record, as stored in a\n :py:class:`~dagster._core.storage.runs.RunStorage`.\n\n Users should not invoke this class directly.\n """\n\n def __new__(\n cls,\n storage_id: int,\n dagster_run: DagsterRun,\n create_timestamp: datetime,\n update_timestamp: datetime,\n start_time: Optional[float] = None,\n end_time: Optional[float] = None,\n ):\n return super(RunRecord, cls).__new__(\n cls,\n storage_id=check.int_param(storage_id, "storage_id"),\n dagster_run=check.inst_param(dagster_run, "dagster_run", DagsterRun),\n create_timestamp=check.inst_param(create_timestamp, "create_timestamp", datetime),\n update_timestamp=check.inst_param(update_timestamp, "update_timestamp", datetime),\n # start_time and end_time fields will be populated once the run has started and ended, respectively, but will be None beforehand.\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )
\n\n\n@whitelist_for_serdes\nclass RunPartitionData(\n NamedTuple(\n "_RunPartitionData",\n [\n ("run_id", str),\n ("partition", str),\n ("status", DagsterRunStatus),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n partition: str,\n status: DagsterRunStatus,\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(RunPartitionData, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n partition=check.str_param(partition, "partition"),\n status=check.inst_param(status, "status", DagsterRunStatus),\n start_time=check.opt_inst(start_time, float),\n end_time=check.opt_inst(end_time, float),\n )\n\n\n###################################################################################################\n# GRAVEYARD\n#\n# -|-\n# |\n# _-'~~~~~`-_\n# .' '.\n# | R I P |\n# | |\n# | Execution |\n# | Selector |\n# | |\n# | |\n###################################################################################################\n\n\n@whitelist_for_serdes\nclass ExecutionSelector(\n NamedTuple("_ExecutionSelector", [("name", str), ("solid_subset", Optional[Sequence[str]])])\n):\n """Kept here to maintain loading of PipelineRuns from when it was still alive."""\n\n def __new__(cls, name: str, solid_subset: Optional[Sequence[str]] = None):\n return super(ExecutionSelector, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n solid_subset=(\n None\n if solid_subset is None\n else check.sequence_param(solid_subset, "solid_subset", of_type=str)\n ),\n )\n
", "current_page_name": "_modules/dagster/_core/storage/dagster_run", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.dagster_run"}, "event_log": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.base

\nimport base64\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._core.assets import AssetDetails\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.event_api import EventHandlerFn, EventLogRecord, EventRecordsFilter\nfrom dagster._core.events import DagsterEventType\nfrom dagster._core.execution.stats import (\n    RunStepKeyStatsSnapshot,\n    build_run_stats_from_events,\n    build_run_step_stats_from_events,\n)\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.storage.asset_check_execution_record import AssetCheckExecutionRecord\nfrom dagster._core.storage.dagster_run import DagsterRunStatsSnapshot\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._seven import json\nfrom dagster._utils import PrintFn\nfrom dagster._utils.concurrency import ConcurrencyClaimStatus, ConcurrencyKeyInfo\n\nif TYPE_CHECKING:\n    from dagster._core.events.log import EventLogEntry\n    from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n\nclass EventLogConnection(NamedTuple):\n    records: Sequence[EventLogRecord]\n    cursor: str\n    has_more: bool\n\n\nclass EventLogCursorType(Enum):\n    OFFSET = "OFFSET"\n    STORAGE_ID = "STORAGE_ID"\n\n\nclass EventLogCursor(NamedTuple):\n    """Representation of an event record cursor, keeping track of the log query state."""\n\n    cursor_type: EventLogCursorType\n    value: int\n\n    def is_offset_cursor(self) -> bool:\n        return self.cursor_type == EventLogCursorType.OFFSET\n\n    def is_id_cursor(self) -> bool:\n        return self.cursor_type == EventLogCursorType.STORAGE_ID\n\n    def offset(self) -> int:\n        
check.invariant(self.cursor_type == EventLogCursorType.OFFSET)\n        return max(0, int(self.value))\n\n    def storage_id(self) -> int:\n        check.invariant(self.cursor_type == EventLogCursorType.STORAGE_ID)\n        return int(self.value)\n\n    def __str__(self) -> str:\n        return self.to_string()\n\n    def to_string(self) -> str:\n        raw = json.dumps({"type": self.cursor_type.value, "value": self.value})\n        return base64.b64encode(bytes(raw, encoding="utf-8")).decode("utf-8")\n\n    @staticmethod\n    def parse(cursor_str: str) -> "EventLogCursor":\n        raw = json.loads(base64.b64decode(cursor_str).decode("utf-8"))\n        return EventLogCursor(EventLogCursorType(raw["type"]), raw["value"])\n\n    @staticmethod\n    def from_offset(offset: int) -> "EventLogCursor":\n        return EventLogCursor(EventLogCursorType.OFFSET, offset)\n\n    @staticmethod\n    def from_storage_id(storage_id: int) -> "EventLogCursor":\n        return EventLogCursor(EventLogCursorType.STORAGE_ID, storage_id)\n\n\nclass AssetEntry(\n    NamedTuple(\n        "_AssetEntry",\n        [\n            ("asset_key", AssetKey),\n            ("last_materialization_record", Optional[EventLogRecord]),\n            ("last_run_id", Optional[str]),\n            ("asset_details", Optional[AssetDetails]),\n            ("cached_status", Optional["AssetStatusCacheValue"]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        asset_key: AssetKey,\n        last_materialization_record: Optional[EventLogRecord] = None,\n        last_run_id: Optional[str] = None,\n        asset_details: Optional[AssetDetails] = None,\n        cached_status: Optional["AssetStatusCacheValue"] = None,\n    ):\n        from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n        return super(AssetEntry, cls).__new__(\n            cls,\n            asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n            
last_materialization_record=check.opt_inst_param(\n                last_materialization_record, "last_materialization_record", EventLogRecord\n            ),\n            last_run_id=check.opt_str_param(last_run_id, "last_run_id"),\n            asset_details=check.opt_inst_param(asset_details, "asset_details", AssetDetails),\n            cached_status=check.opt_inst_param(\n                cached_status, "cached_status", AssetStatusCacheValue\n            ),\n        )\n\n    @property\n    def last_materialization(self) -> Optional["EventLogEntry"]:\n        if self.last_materialization_record is None:\n            return None\n        return self.last_materialization_record.event_log_entry\n\n    @property\n    def last_materialization_storage_id(self) -> Optional[int]:\n        if self.last_materialization_record is None:\n            return None\n        return self.last_materialization_record.storage_id\n\n\n
[docs]class AssetRecord(NamedTuple):\n """Internal representation of an asset record, as stored in a :py:class:`~dagster._core.storage.event_log.EventLogStorage`.\n\n Users should not invoke this class directly.\n """\n\n storage_id: int\n asset_entry: AssetEntry
\n\n\n
[docs]class EventLogStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for storing structured event logs from pipeline runs.\n\n Note that event log storages using SQL databases as backing stores should implement\n :py:class:`~dagster._core.storage.event_log.SqlEventLogStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n def get_logs_for_run(\n self,\n run_id: str,\n cursor: Optional[Union[str, int]] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> Sequence["EventLogEntry"]:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[Union[str, int]]): Cursor value to track paginated queries. 
Legacy\n support for integer offset cursors.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n if isinstance(cursor, int):\n cursor = EventLogCursor.from_offset(cursor + 1).to_string()\n records = self.get_records_for_run(\n run_id, cursor, of_type, limit, ascending=ascending\n ).records\n return [record.event_log_entry for record in records]\n\n @abstractmethod\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> EventLogConnection:\n """Get all of the event log records corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[str]): Cursor value to track paginated queries.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n\n def get_stats_for_run(self, run_id: str) -> DagsterRunStatsSnapshot:\n """Get a summary of events that have ocurred in a run."""\n return build_run_stats_from_events(run_id, self.get_logs_for_run(run_id))\n\n def get_step_stats_for_run(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence[RunStepKeyStatsSnapshot]:\n """Get per-step stats for a pipeline run."""\n logs = self.get_logs_for_run(run_id)\n if step_keys:\n logs = [\n event\n for event in logs\n if event.is_dagster_event and event.get_dagster_event().step_key in step_keys\n ]\n\n return build_run_step_stats_from_events(run_id, logs)\n\n @abstractmethod\n def store_event(self, event: "EventLogEntry") -> None:\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n\n @abstractmethod\n def delete_events(self, run_id: str) -> None:\n """Remove events for a given run id."""\n\n 
@abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def reindex_events(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the event_log tables."""\n\n @abstractmethod\n def reindex_assets(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the asset tables."""\n\n @abstractmethod\n def wipe(self) -> None:\n """Clear the log storage."""\n\n @abstractmethod\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n """Call this method to start watching."""\n\n @abstractmethod\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n """Call this method to stop watching."""\n\n @property\n @abstractmethod\n def is_persistent(self) -> bool:\n """bool: Whether the storage is persistent."""\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n @abstractmethod\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence[EventLogRecord]:\n pass\n\n def supports_event_consumer_queries(self) -> bool:\n return False\n\n def get_logs_for_all_runs_by_log_id(\n self,\n after_cursor: int = -1,\n dagster_event_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Mapping[int, "EventLogEntry"]:\n """Get event records across all runs. 
Only supported for non sharded sql storage."""\n raise NotImplementedError()\n\n def get_maximum_record_id(self) -> Optional[int]:\n """Get the current greatest record id in the event log. Only supported for non sharded sql storage."""\n raise NotImplementedError()\n\n @abstractmethod\n def can_cache_asset_status_data(self) -> bool:\n pass\n\n @abstractmethod\n def wipe_asset_cached_status(self, asset_key: AssetKey) -> None:\n pass\n\n @abstractmethod\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence[AssetRecord]:\n pass\n\n @abstractmethod\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n pass\n\n @abstractmethod\n def all_asset_keys(self) -> Sequence[AssetKey]:\n pass\n\n @abstractmethod\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n pass\n\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n # base implementation of get_asset_keys, using the existing `all_asset_keys` and doing the\n # filtering in-memory\n asset_keys = sorted(self.all_asset_keys(), key=str)\n if prefix:\n asset_keys = [\n asset_key for asset_key in asset_keys if asset_key.path[: len(prefix)] == prefix\n ]\n if cursor:\n cursor_asset = AssetKey.from_db_string(cursor)\n if cursor_asset and cursor_asset in asset_keys:\n idx = asset_keys.index(cursor_asset)\n asset_keys = asset_keys[idx + 1 :]\n if limit:\n asset_keys = asset_keys[:limit]\n return asset_keys\n\n @abstractmethod\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n pass\n\n def supports_add_asset_event_tags(self) -> bool:\n return False\n\n def add_asset_event_tags(\n self,\n event_id: int,\n event_timestamp: float,\n asset_key: AssetKey,\n new_tags: Mapping[str, str],\n ) -> None:\n raise NotImplementedError()\n\n 
@abstractmethod\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n pass\n\n @abstractmethod\n def wipe_asset(self, asset_key: AssetKey) -> None:\n """Remove asset index history from event log for given asset_key."""\n\n @abstractmethod\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n pass\n\n @abstractmethod\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey], after_cursor: Optional[int] = None\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n pass\n\n @abstractmethod\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: DagsterEventType\n ) -> Mapping[str, int]:\n pass\n\n @abstractmethod\n def get_latest_tags_by_partition(\n self,\n asset_key: AssetKey,\n event_type: DagsterEventType,\n tag_keys: Sequence[str],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Mapping[str, Mapping[str, str]]:\n pass\n\n @abstractmethod\n def get_latest_asset_partition_materialization_attempts_without_materializations(\n self, asset_key: AssetKey\n ) -> Mapping[str, Tuple[str, int]]:\n pass\n\n @abstractmethod\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the list of partition keys for a dynamic partitions definition."""\n raise NotImplementedError()\n\n @abstractmethod\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n """Check if a dynamic partition exists."""\n raise NotImplementedError()\n\n @abstractmethod\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n """Add a partition for the specified dynamic partitions definition."""\n raise 
NotImplementedError()\n\n @abstractmethod\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n """Delete a partition for the specified dynamic partitions definition."""\n raise NotImplementedError()\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n @property\n def is_run_sharded(self) -> bool:\n """Indicates that the EventLogStoarge is sharded."""\n return False\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n """Indicates that the EventLogStorage supports global concurrency limits."""\n return False\n\n @abstractmethod\n def set_concurrency_slots(self, concurrency_key: str, num: int) -> None:\n """Allocate concurrency slots for the given concurrency key."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_keys(self) -> Set[str]:\n """Get the set of concurrency limited keys."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_info(self, concurrency_key: str) -> ConcurrencyKeyInfo:\n """Get concurrency info for key."""\n raise NotImplementedError()\n\n @abstractmethod\n def claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str, priority: Optional[int] = None\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slots for step."""\n raise NotImplementedError()\n\n @abstractmethod\n def check_concurrency_claim(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slots for step."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_run_ids(self) -> Set[str]:\n """Get a list of run_ids that are occupying or waiting for a concurrency key slot."""\n raise NotImplementedError()\n\n @abstractmethod\n def free_concurrency_slots_for_run(self, run_id: str) -> None:\n """Frees concurrency slots for a given run."""\n raise NotImplementedError()\n\n @abstractmethod\n def free_concurrency_slot_for_step(self, run_id: str, step_key: str) -> 
None:\n """Frees concurrency slots for a given run/step."""\n raise NotImplementedError()\n\n @property\n def supports_asset_checks(self):\n return True\n\n @abstractmethod\n def get_asset_check_execution_history(\n self,\n check_key: AssetCheckKey,\n limit: int,\n cursor: Optional[int] = None,\n ) -> Sequence[AssetCheckExecutionRecord]:\n """Get executions for one asset check, sorted by recency."""\n pass\n\n @abstractmethod\n def get_latest_asset_check_execution_by_key(\n self, check_keys: Sequence[AssetCheckKey]\n ) -> Mapping[AssetCheckKey, AssetCheckExecutionRecord]:\n """Get the latest executions for a list of asset checks."""\n pass
\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.base"}, "sql_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sql_event_log

\nimport logging\nfrom abc import abstractmethod\nfrom collections import OrderedDict, defaultdict\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    ContextManager,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._core.assets import AssetDetails\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster._core.errors import (\n    DagsterEventLogInvalidForRun,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.event_api import RunShardedEventsCursor\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS, MARKER_EVENTS, DagsterEventType\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.execution.stats import RunStepKeyStatsSnapshot, build_run_step_stats_from_events\nfrom dagster._core.storage.asset_check_execution_record import (\n    AssetCheckExecutionRecord,\n    AssetCheckExecutionRecordStatus,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery, SqlAlchemyRow\nfrom dagster._core.storage.sqlalchemy_compat import (\n    db_case,\n    db_fetch_mappings,\n    db_select,\n    db_subquery,\n)\nfrom dagster._serdes import (\n    deserialize_value,\n    serialize_value,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._utils import (\n    PrintFn,\n    datetime_as_float,\n    utc_datetime_from_naive,\n    utc_datetime_from_timestamp,\n)\nfrom 
dagster._utils.concurrency import (\n    ConcurrencyClaimStatus,\n    ConcurrencyKeyInfo,\n    ConcurrencySlotStatus,\n)\n\nfrom ..dagster_run import DagsterRunStatsSnapshot\nfrom .base import (\n    AssetEntry,\n    AssetRecord,\n    EventLogConnection,\n    EventLogCursor,\n    EventLogRecord,\n    EventLogStorage,\n    EventRecordsFilter,\n)\nfrom .migration import ASSET_DATA_MIGRATIONS, ASSET_KEY_INDEX_COLS, EVENT_LOG_DATA_MIGRATIONS\nfrom .schema import (\n    AssetCheckExecutionsTable,\n    AssetEventTagsTable,\n    AssetKeyTable,\n    ConcurrencySlotsTable,\n    DynamicPartitionsTable,\n    PendingStepsTable,\n    SecondaryIndexMigrationTable,\n    SqlEventLogStorageTable,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\nMAX_CONCURRENCY_SLOTS = 1000\nMIN_ASSET_ROWS = 25\n\n# We are using third-party library objects for DB connections-- at this time, these libraries are\n# untyped. When/if we upgrade to typed variants, the `Any` here can be replaced or the alias as a\n# whole can be dropped.\nSqlDbConnection: TypeAlias = Any\n\n\n
[docs]class SqlEventLogStorage(EventLogStorage):\n """Base class for SQL backed event log storages.\n\n Distinguishes between run-based connections and index connections in order to support run-level\n sharding, while maintaining the ability to do cross-run queries\n """\n\n @abstractmethod\n def run_connection(self, run_id: Optional[str]) -> ContextManager[Connection]:\n """Context manager yielding a connection to access the event logs for a specific run.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def index_connection(self) -> ContextManager[Connection]:\n """Context manager yielding a connection to access cross-run indexed tables."""\n\n @abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def has_table(self, table_name: str) -> bool:\n """This method checks if a table exists in the database."""\n\n def prepare_insert_event(self, event):\n """Helper method for preparing the event log SQL insertion statement. Abstracted away to\n have a single place for the logical table representation of the event, while having a way\n for SQL backends to implement different execution implementations for `store_event`. 
See\n the `dagster-postgres` implementation which overrides the generic SQL implementation of\n `store_event`.\n """\n dagster_event_type = None\n asset_key_str = None\n partition = None\n step_key = event.step_key\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n step_key = event.dagster_event.step_key\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n if event.dagster_event.partition:\n partition = event.dagster_event.partition\n\n # https://stackoverflow.com/a/54386260/324449\n return SqlEventLogStorageTable.insert().values(\n run_id=event.run_id,\n event=serialize_value(event),\n dagster_event_type=dagster_event_type,\n # Postgres requires a datetime that is in UTC but has no timezone info set\n # in order to be stored correctly\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=step_key,\n asset_key=asset_key_str,\n partition=partition,\n )\n\n def has_asset_key_col(self, column_name: str) -> bool:\n with self.index_connection() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(AssetKeyTable.name)]\n return column_name in column_names\n\n def has_asset_key_index_cols(self) -> bool:\n return self.has_asset_key_col("last_materialization_timestamp")\n\n def store_asset_event(self, event: EventLogEntry, event_id: int):\n check.inst_param(event, "event", EventLogEntry)\n\n if not (event.dagster_event and event.dagster_event.asset_key):\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming 
`last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n values = self._get_asset_entry_values(event, event_id, self.has_asset_key_index_cols())\n insert_statement = AssetKeyTable.insert().values(\n asset_key=event.dagster_event.asset_key.to_string(), **values\n )\n update_statement = (\n AssetKeyTable.update()\n .values(**values)\n .where(\n AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),\n )\n )\n\n with self.index_connection() as conn:\n try:\n conn.execute(insert_statement)\n except db_exc.IntegrityError:\n conn.execute(update_statement)\n\n def _get_asset_entry_values(\n self, event: EventLogEntry, event_id: int, has_asset_key_index_cols: bool\n ) -> Dict[str, Any]:\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in the Dagster UI.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n entry_values: Dict[str, Any] = {}\n dagster_event = check.not_none(event.dagster_event)\n if dagster_event.is_step_materialization:\n entry_values.update(\n {\n "last_materialization": serialize_value(\n EventLogRecord(\n storage_id=event_id,\n event_log_entry=event,\n )\n ),\n "last_run_id": event.run_id,\n }\n )\n if has_asset_key_index_cols:\n 
entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif dagster_event.is_asset_materialization_planned:\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n entry_values.update({"last_run_id": event.run_id})\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif dagster_event.is_asset_observation:\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n\n return entry_values\n\n def supports_add_asset_event_tags(self) -> bool:\n return self.has_table(AssetEventTagsTable.name)\n\n def add_asset_event_tags(\n self,\n event_id: int,\n event_timestamp: float,\n asset_key: AssetKey,\n new_tags: Mapping[str, str],\n ) -> None:\n check.int_param(event_id, "event_id")\n check.float_param(event_timestamp, "event_timestamp")\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.mapping_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n if not self.supports_add_asset_event_tags():\n raise DagsterInvalidInvocationError(\n "In order to add asset event tags, you must run `dagster instance migrate` to "\n "create the AssetEventTags table."\n )\n\n current_tags_list = self.get_event_tags_for_asset(asset_key, filter_event_id=event_id)\n\n asset_key_str = asset_key.to_string()\n\n if len(current_tags_list) == 0:\n current_tags: Mapping[str, str] = {}\n else:\n current_tags = current_tags_list[0]\n\n with self.index_connection() as conn:\n current_tags_set = 
set(current_tags.keys())\n new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n AssetEventTagsTable.update()\n .where(\n db.and_(\n AssetEventTagsTable.c.event_id == event_id,\n AssetEventTagsTable.c.asset_key == asset_key_str,\n AssetEventTagsTable.c.key == tag,\n )\n )\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n AssetEventTagsTable.insert(),\n [\n dict(\n event_id=event_id,\n asset_key=asset_key_str,\n key=tag,\n value=new_tags[tag],\n # Postgres requires a datetime that is in UTC but has no timezone info\n # set in order to be stored correctly\n event_timestamp=datetime.utcfromtimestamp(event_timestamp),\n )\n for tag in added_tags\n ],\n )\n\n def store_asset_event_tags(self, event: EventLogEntry, event_id: int) -> None:\n check.inst_param(event, "event", EventLogEntry)\n check.int_param(event_id, "event_id")\n\n if event.dagster_event and event.dagster_event.asset_key:\n if event.dagster_event.is_step_materialization:\n tags = event.dagster_event.step_materialization_data.materialization.tags\n elif event.dagster_event.is_asset_observation:\n tags = event.dagster_event.asset_observation_data.asset_observation.tags\n else:\n tags = None\n\n if not tags or not self.has_table(AssetEventTagsTable.name):\n # If tags table does not exist, silently exit. 
This is to support OSS\n # users who have not yet run the migration to create the table.\n # On read, we will throw an error if the table does not exist.\n return\n\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n\n with self.index_connection() as conn:\n conn.execute(\n AssetEventTagsTable.insert(),\n [\n dict(\n event_id=event_id,\n asset_key=asset_key_str,\n key=key,\n value=value,\n # Postgres requires a datetime that is in UTC but has no timezone info\n # set in order to be stored correctly\n event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n )\n for key, value in tags.items()\n ],\n )\n\n def store_event(self, event: EventLogEntry) -> None:\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n event_id = None\n\n with self.run_connection(run_id) as conn:\n result = conn.execute(insert_event_statement)\n event_id = result.inserted_primary_key[0]\n\n if (\n event.is_dagster_event\n and event.dagster_event_type in ASSET_EVENTS\n and event.dagster_event.asset_key # type: ignore\n ):\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, event_id)\n\n def get_records_for_run(\n self,\n run_id,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> EventLogConnection:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n 
cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,\n i.e., if cursor is -1, all logs will be returned. (default: -1)\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): the maximum number of events to fetch\n """\n check.str_param(run_id, "run_id")\n check.opt_str_param(cursor, "cursor")\n\n check.invariant(not of_type or isinstance(of_type, (DagsterEventType, frozenset, set)))\n\n dagster_event_types = (\n {of_type}\n if isinstance(of_type, DagsterEventType)\n else check.opt_set_param(of_type, "dagster_event_type", of_type=DagsterEventType)\n )\n\n query = (\n db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .order_by(\n SqlEventLogStorageTable.c.id.asc()\n if ascending\n else SqlEventLogStorageTable.c.id.desc()\n )\n )\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n # adjust 0 based index cursor to SQL offset\n if cursor is not None:\n cursor_obj = EventLogCursor.parse(cursor)\n if cursor_obj.is_offset_cursor():\n query = query.offset(cursor_obj.offset())\n elif cursor_obj.is_id_cursor():\n if ascending:\n query = query.where(SqlEventLogStorageTable.c.id > cursor_obj.storage_id())\n else:\n query = query.where(SqlEventLogStorageTable.c.id < cursor_obj.storage_id())\n\n if limit:\n query = query.limit(limit)\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n last_record_id = None\n try:\n records = []\n for (\n record_id,\n json_str,\n ) in results:\n records.append(\n EventLogRecord(\n storage_id=record_id,\n event_log_entry=deserialize_value(json_str, EventLogEntry),\n )\n )\n last_record_id = record_id\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) 
from err\n\n if last_record_id is not None:\n next_cursor = EventLogCursor.from_storage_id(last_record_id).to_string()\n elif cursor:\n # record fetch returned no new logs, return the same cursor\n next_cursor = cursor\n else:\n # rely on the fact that all storage ids will be positive integers\n next_cursor = EventLogCursor.from_storage_id(-1).to_string()\n\n return EventLogConnection(\n records=records,\n cursor=next_cursor,\n has_more=bool(limit and len(results) == limit),\n )\n\n def get_stats_for_run(self, run_id: str) -> DagsterRunStatsSnapshot:\n check.str_param(run_id, "run_id")\n\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n db.func.count().label("n_events_of_type"),\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("last_event_timestamp"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.run_id == run_id,\n SqlEventLogStorageTable.c.dagster_event_type != None, # noqa: E711\n )\n )\n .group_by("dagster_event_type")\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n try:\n counts = {}\n times = {}\n for result in results:\n (dagster_event_type, n_events_of_type, last_event_timestamp) = result\n check.invariant(dagster_event_type is not None)\n counts[dagster_event_type] = n_events_of_type\n times[dagster_event_type] = last_event_timestamp\n\n enqueued_time = times.get(DagsterEventType.PIPELINE_ENQUEUED.value, None)\n launch_time = times.get(DagsterEventType.PIPELINE_STARTING.value, None)\n start_time = times.get(DagsterEventType.PIPELINE_START.value, None)\n end_time = times.get(\n DagsterEventType.PIPELINE_SUCCESS.value,\n times.get(\n DagsterEventType.PIPELINE_FAILURE.value,\n times.get(DagsterEventType.PIPELINE_CANCELED.value, None),\n ),\n )\n\n return DagsterRunStatsSnapshot(\n run_id=run_id,\n steps_succeeded=counts.get(DagsterEventType.STEP_SUCCESS.value, 0),\n steps_failed=counts.get(DagsterEventType.STEP_FAILURE.value, 0),\n 
materializations=counts.get(DagsterEventType.ASSET_MATERIALIZATION.value, 0),\n expectations=counts.get(DagsterEventType.STEP_EXPECTATION_RESULT.value, 0),\n enqueued_time=datetime_as_float(enqueued_time) if enqueued_time else None,\n launch_time=datetime_as_float(launch_time) if launch_time else None,\n start_time=datetime_as_float(start_time) if start_time else None,\n end_time=datetime_as_float(end_time) if end_time else None,\n )\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def get_step_stats_for_run(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence[RunStepKeyStatsSnapshot]:\n check.str_param(run_id, "run_id")\n check.opt_list_param(step_keys, "step_keys", of_type=str)\n\n # Originally, this was two different queries:\n # 1) one query which aggregated top-level step stats by grouping by event type / step_key in\n # a single query, using pure SQL (e.g. start_time, end_time, status, attempt counts).\n # 2) one query which fetched all the raw events for a specific event type and then inspected\n # the deserialized event object to aggregate stats derived from sequences of events.\n # (e.g. marker events, materializations, expectations resuls, attempts timing, etc.)\n #\n # For simplicity, we now just do the second type of query and derive the stats in Python\n # from the raw events. This has the benefit of being easier to read and also the benefit of\n # being able to share code with the in-memory event log storage implementation. 
We may\n # choose to revisit this in the future, especially if we are able to do JSON-column queries\n # in SQL as a way of bypassing the serdes layer in all cases.\n raw_event_query = (\n db_select([SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.step_key != None) # noqa: E711\n .where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [\n DagsterEventType.STEP_START.value,\n DagsterEventType.STEP_SUCCESS.value,\n DagsterEventType.STEP_SKIPPED.value,\n DagsterEventType.STEP_FAILURE.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.ASSET_MATERIALIZATION.value,\n DagsterEventType.STEP_EXPECTATION_RESULT.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.STEP_UP_FOR_RETRY.value,\n ]\n + [marker_event.value for marker_event in MARKER_EVENTS]\n )\n )\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n if step_keys:\n raw_event_query = raw_event_query.where(\n SqlEventLogStorageTable.c.step_key.in_(step_keys)\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(raw_event_query).fetchall()\n\n try:\n records = [deserialize_value(json_str, EventLogEntry) for (json_str,) in results]\n return build_run_step_stats_from_events(run_id, records)\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def _apply_migration(self, migration_name, migration_fn, print_fn, force):\n if self.has_secondary_index(migration_name):\n if not force:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n return\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.enable_secondary_index(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def reindex_events(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any 
data migrations across the event_log table."""\n for migration_name, migration_fn in EVENT_LOG_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def reindex_assets(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the asset_keys table."""\n for migration_name, migration_fn in ASSET_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def wipe(self) -> None:\n """Clears the event log storage."""\n # Should be overridden by SqliteEventLogStorage and other storages that shard based on\n # run_id\n\n # https://stackoverflow.com/a/54386260/324449\n with self.run_connection(run_id=None) as conn:\n conn.execute(SqlEventLogStorageTable.delete())\n conn.execute(AssetKeyTable.delete())\n\n if self.has_table("asset_event_tags"):\n conn.execute(AssetEventTagsTable.delete())\n\n if self.has_table("dynamic_partitions"):\n conn.execute(DynamicPartitionsTable.delete())\n\n if self.has_table("concurrency_slots"):\n conn.execute(ConcurrencySlotsTable.delete())\n\n if self.has_table("pending_steps"):\n conn.execute(PendingStepsTable.delete())\n\n if self.has_table("asset_check_executions"):\n conn.execute(AssetCheckExecutionsTable.delete())\n\n self._wipe_index()\n\n def _wipe_index(self):\n with self.index_connection() as conn:\n conn.execute(SqlEventLogStorageTable.delete())\n conn.execute(AssetKeyTable.delete())\n\n if self.has_table("asset_event_tags"):\n conn.execute(AssetEventTagsTable.delete())\n\n if self.has_table("dynamic_partitions"):\n conn.execute(DynamicPartitionsTable.delete())\n\n if self.has_table("concurrency_slots"):\n conn.execute(ConcurrencySlotsTable.delete())\n\n if self.has_table("pending_steps"):\n conn.execute(PendingStepsTable.delete())\n\n if self.has_table("asset_check_executions"):\n conn.execute(AssetCheckExecutionsTable.delete())\n\n def delete_events(self, run_id: str) -> None:\n 
with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n self.free_concurrency_slots_for_run(run_id)\n\n def delete_events_for_run(self, conn: Connection, run_id: str) -> None:\n check.str_param(run_id, "run_id")\n conn.execute(\n SqlEventLogStorageTable.delete().where(SqlEventLogStorageTable.c.run_id == run_id)\n )\n\n @property\n def is_persistent(self) -> bool:\n return True\n\n def update_event_log_record(self, record_id: int, event: EventLogEntry) -> None:\n """Utility method for migration scripts to update SQL representation of event records."""\n check.int_param(record_id, "record_id")\n check.inst_param(event, "event", EventLogEntry)\n dagster_event_type = None\n asset_key_str = None\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value # type: ignore\n if event.dagster_event.asset_key: # type: ignore\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey) # type: ignore\n asset_key_str = event.dagster_event.asset_key.to_string() # type: ignore\n\n with self.run_connection(run_id=event.run_id) as conn:\n conn.execute(\n SqlEventLogStorageTable.update()\n .where(SqlEventLogStorageTable.c.id == record_id)\n .values(\n event=serialize_value(event),\n dagster_event_type=dagster_event_type,\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=event.step_key,\n asset_key=asset_key_str,\n )\n )\n\n def get_event_log_table_data(self, run_id: str, record_id: int) -> Optional[SqlAlchemyRow]:\n """Utility method to test representation of the record in the SQL table. 
Returns all of\n the columns stored in the event log storage (as opposed to the deserialized `EventLogEntry`).\n This allows checking that certain fields are extracted to support performant lookups (e.g.\n extracting `step_key` for fast filtering).\n """\n with self.run_connection(run_id=run_id) as conn:\n query = (\n db_select([SqlEventLogStorageTable])\n .where(SqlEventLogStorageTable.c.id == record_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n return conn.execute(query).fetchone()\n\n def has_secondary_index(self, name: str) -> bool:\n """This method uses a checkpoint migration table to see if summary data has been constructed\n in a secondary index table. Can be used to checkpoint event_log data migrations.\n """\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def enable_secondary_index(self, name: str) -> None:\n """This method marks an event_log data migration as complete, to indicate that a summary\n data migration is complete.\n """\n query = SecondaryIndexMigrationTable.insert().values(\n name=name,\n migration_completed=datetime.now(),\n )\n with self.index_connection() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == name)\n .values(migration_completed=datetime.now())\n )\n\n def _apply_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n event_records_filter: EventRecordsFilter,\n asset_details: Optional[AssetDetails] = None,\n apply_cursor_filters: bool = True,\n ) -> SqlAlchemyQuery:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type == event_records_filter.event_type.value\n )\n\n if event_records_filter.asset_key:\n query = query.where(\n 
SqlEventLogStorageTable.c.asset_key == event_records_filter.asset_key.to_string(),\n )\n\n if event_records_filter.asset_partitions:\n query = query.where(\n SqlEventLogStorageTable.c.partition.in_(event_records_filter.asset_partitions)\n )\n\n if asset_details and asset_details.last_wipe_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if apply_cursor_filters:\n # allow the run-sharded sqlite implementation to disable this cursor filtering so that\n # it can implement its own custom cursor logic, as cursor ids are not unique across run\n # shards\n if event_records_filter.before_cursor is not None:\n before_cursor_id = (\n event_records_filter.before_cursor.id\n if isinstance(event_records_filter.before_cursor, RunShardedEventsCursor)\n else event_records_filter.before_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor_id)\n\n if event_records_filter.after_cursor is not None:\n after_cursor_id = (\n event_records_filter.after_cursor.id\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else event_records_filter.after_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor_id)\n\n if event_records_filter.before_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n < datetime.utcfromtimestamp(event_records_filter.before_timestamp)\n )\n\n if event_records_filter.after_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(event_records_filter.after_timestamp)\n )\n\n if event_records_filter.storage_ids:\n query = query.where(SqlEventLogStorageTable.c.id.in_(event_records_filter.storage_ids))\n\n if event_records_filter.tags and self.has_table(AssetEventTagsTable.name):\n # If we don't have the tags table, we'll filter the results after the query\n check.invariant(\n isinstance(event_records_filter.asset_key, AssetKey),\n "Asset key 
must be set in event records filter to filter by tags.",\n )\n if self.supports_intersect:\n intersections = [\n db_select([AssetEventTagsTable.c.event_id]).where(\n db.and_(\n AssetEventTagsTable.c.asset_key\n == event_records_filter.asset_key.to_string(), # type: ignore # (bad sig?)\n AssetEventTagsTable.c.key == key,\n (\n AssetEventTagsTable.c.value == value\n if isinstance(value, str)\n else AssetEventTagsTable.c.value.in_(value)\n ),\n )\n )\n for key, value in event_records_filter.tags.items()\n ]\n query = query.where(SqlEventLogStorageTable.c.id.in_(db.intersect(*intersections)))\n\n return query\n\n def _apply_tags_table_joins(\n self,\n table: db.Table,\n tags: Mapping[str, Union[str, Sequence[str]]],\n asset_key: Optional[AssetKey],\n ) -> db.Table:\n event_id_col = table.c.id if table == SqlEventLogStorageTable else table.c.event_id\n i = 0\n for key, value in tags.items():\n i += 1\n tags_table = db_subquery(\n db_select([AssetEventTagsTable]), f"asset_event_tags_subquery_{i}"\n )\n table = table.join(\n tags_table,\n db.and_(\n event_id_col == tags_table.c.event_id,\n not asset_key or tags_table.c.asset_key == asset_key.to_string(),\n tags_table.c.key == key,\n (\n tags_table.c.value == value\n if isinstance(value, str)\n else tags_table.c.value.in_(value)\n ),\n ),\n )\n return table\n\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence[EventLogRecord]:\n """Returns a list of (record_id, record)."""\n check.inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n if event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if (\n event_records_filter.tags\n and not self.supports_intersect\n and self.has_table(AssetEventTagsTable.name)\n ):\n table = 
self._apply_tags_table_joins(\n SqlEventLogStorageTable, event_records_filter.tags, event_records_filter.asset_key\n )\n else:\n table = SqlEventLogStorageTable\n\n query = db_select(\n [SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event]\n ).select_from(table)\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n )\n if limit:\n query = query.limit(limit)\n\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.id.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.id.desc())\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n event_records = []\n for row_id, json_str in results:\n try:\n event_record = deserialize_value(json_str, NamedTuple)\n if not isinstance(event_record, EventLogEntry):\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n continue\n\n if event_records_filter.tags and not self.has_table(AssetEventTagsTable.name):\n # If we can't filter tags via the tags table, filter the returned records\n if limit is not None:\n raise DagsterInvalidInvocationError(\n "Cannot filter events on tags with a limit, without the asset event "\n "tags table. 
To fix, run `dagster instance migrate`."\n )\n\n event_record_tags = event_record.tags\n if not event_record_tags or any(\n event_record_tags.get(k) != v for k, v in event_records_filter.tags.items()\n ):\n continue\n\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n return event_records\n\n def supports_event_consumer_queries(self) -> bool:\n return True\n\n @property\n def supports_intersect(self) -> bool:\n return True\n\n def get_logs_for_all_runs_by_log_id(\n self,\n after_cursor: int = -1,\n dagster_event_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Mapping[int, EventLogEntry]:\n check.int_param(after_cursor, "after_cursor")\n check.invariant(\n after_cursor >= -1,\n f"Don't know what to do with negative cursor {after_cursor}",\n )\n dagster_event_types = (\n {dagster_event_type}\n if isinstance(dagster_event_type, DagsterEventType)\n else check.opt_set_param(\n dagster_event_type, "dagster_event_type", of_type=DagsterEventType\n )\n )\n\n query = (\n db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.id > after_cursor)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n if limit:\n query = query.limit(limit)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n events = {}\n record_id = None\n try:\n for (\n record_id,\n json_str,\n ) in results:\n events[record_id] = deserialize_value(json_str, EventLogEntry)\n except (seven.JSONDecodeError, DeserializationError):\n logging.warning("Could not parse event record id `%s`.", record_id)\n\n return events\n\n def 
get_maximum_record_id(self) -> Optional[int]:\n with self.index_connection() as conn:\n result = conn.execute(db_select([db.func.max(SqlEventLogStorageTable.c.id)])).fetchone()\n return result[0] # type: ignore\n\n def _construct_asset_record_from_row(\n self,\n row,\n last_materialization_record: Optional[EventLogRecord],\n can_cache_asset_status_data: bool,\n ) -> AssetRecord:\n from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if asset_key:\n return AssetRecord(\n storage_id=row["id"],\n asset_entry=AssetEntry(\n asset_key=asset_key,\n last_materialization_record=last_materialization_record,\n last_run_id=row["last_run_id"],\n asset_details=AssetDetails.from_db_string(row["asset_details"]),\n cached_status=(\n AssetStatusCacheValue.from_db_string(row["cached_status_data"])\n if can_cache_asset_status_data\n else None\n ),\n ),\n )\n else:\n check.failed("Row did not contain asset key.")\n\n def _get_latest_materialization_records(\n self, raw_asset_rows\n ) -> Mapping[AssetKey, Optional[EventLogRecord]]:\n # Given a list of raw asset rows, returns a mapping of asset key to latest asset materialization\n # event log entry. 
Fetches backcompat EventLogEntry records when the last_materialization\n # in the raw asset row is an AssetMaterialization.\n to_backcompat_fetch = set()\n results: Dict[AssetKey, Optional[EventLogRecord]] = {}\n for row in raw_asset_rows:\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if not asset_key:\n continue\n event_or_materialization = (\n deserialize_value(row["last_materialization"], NamedTuple)\n if row["last_materialization"]\n else None\n )\n if isinstance(event_or_materialization, EventLogRecord):\n results[asset_key] = event_or_materialization\n else:\n to_backcompat_fetch.add(asset_key)\n\n latest_event_subquery = db_subquery(\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.id).label("id"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in to_backcompat_fetch]\n ),\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key),\n "latest_event_subquery",\n )\n backcompat_query = db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.id,\n SqlEventLogStorageTable.c.event,\n ]\n ).select_from(\n latest_event_subquery.join(\n SqlEventLogStorageTable,\n db.and_(\n SqlEventLogStorageTable.c.asset_key == latest_event_subquery.c.asset_key,\n SqlEventLogStorageTable.c.id == latest_event_subquery.c.id,\n ),\n )\n )\n with self.index_connection() as conn:\n event_rows = db_fetch_mappings(conn, backcompat_query)\n\n for row in event_rows:\n asset_key = AssetKey.from_db_string(cast(Optional[str], row["asset_key"]))\n if asset_key:\n results[asset_key] = EventLogRecord(\n storage_id=cast(int, row["id"]),\n event_log_entry=deserialize_value(cast(str, row["event"]), EventLogEntry),\n )\n return results\n\n def can_cache_asset_status_data(self) -> bool:\n return self.has_asset_key_col("cached_status_data")\n\n def 
wipe_asset_cached_status(self, asset_key: AssetKey) -> None:\n if self.can_cache_asset_status_data():\n check.inst_param(asset_key, "asset_key", AssetKey)\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .values(dict(cached_status_data=None))\n .where(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence[AssetRecord]:\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n latest_materialization_records = self._get_latest_materialization_records(rows)\n can_cache_asset_status_data = self.can_cache_asset_status_data()\n\n asset_records: List[AssetRecord] = []\n for row in rows:\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if asset_key:\n asset_records.append(\n self._construct_asset_record_from_row(\n row,\n latest_materialization_records.get(asset_key),\n can_cache_asset_status_data,\n )\n )\n\n return asset_records\n\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n check.inst_param(asset_key, "asset_key", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=[asset_key])\n return bool(rows)\n\n def all_asset_keys(self):\n rows = self._fetch_asset_rows()\n asset_keys = [\n AssetKey.from_db_string(row["asset_key"])\n for row in sorted(rows, key=lambda x: x["asset_key"])\n ]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n rows = self._fetch_asset_rows(prefix=prefix, limit=limit, cursor=cursor)\n asset_keys = [\n AssetKey.from_db_string(row["asset_key"])\n for row in sorted(rows, key=lambda x: x["asset_key"])\n ]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n 
check.iterable_param(asset_keys, "asset_keys", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n return {\n asset_key: event_log_record.event_log_entry if event_log_record is not None else None\n for asset_key, event_log_record in self._get_latest_materialization_records(\n rows\n ).items()\n }\n\n def _fetch_asset_rows(\n self,\n asset_keys=None,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[SqlAlchemyRow]:\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments.\n #\n # Differs from _fetch_raw_asset_rows, in that it loops through to make sure enough rows are\n # returned to satisfy the limit.\n #\n # returns a list of rows where each row is a tuple of serialized asset_key, materialization,\n # and asset_details\n should_query = True\n current_cursor = cursor\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # if we have migrated, we can limit using SQL\n fetch_limit = limit\n else:\n # if we haven't migrated, overfetch in case the first N results are wiped\n fetch_limit = max(limit, MIN_ASSET_ROWS) if limit else None\n result = []\n\n while should_query:\n rows, has_more, current_cursor = self._fetch_raw_asset_rows(\n asset_keys=asset_keys, prefix=prefix, limit=fetch_limit, cursor=current_cursor\n )\n result.extend(rows)\n should_query = bool(has_more) and bool(limit) and len(result) < cast(int, limit)\n\n is_partial_query = asset_keys is not None or bool(prefix) or bool(limit) or bool(cursor)\n if not is_partial_query and self._can_mark_assets_as_migrated(rows): # type: ignore\n self.enable_secondary_index(ASSET_KEY_INDEX_COLS)\n\n return result[:limit] if limit else result\n\n def _fetch_raw_asset_rows(\n self,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor=None,\n ) -> 
Tuple[Iterable[SqlAlchemyRow], bool, Optional[str]]:\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments. Does not guarantee that the number of\n # rows returned will match the limit specified. This helper function is used to fetch a\n # chunk of asset key rows, which may or may not be wiped.\n #\n # Returns a tuple of (rows, has_more, cursor), where each row is a tuple of serialized\n # asset_key, materialization, and asset_details\n # TODO update comment\n\n columns = [\n AssetKeyTable.c.id,\n AssetKeyTable.c.asset_key,\n AssetKeyTable.c.last_materialization,\n AssetKeyTable.c.last_run_id,\n AssetKeyTable.c.asset_details,\n ]\n if self.can_cache_asset_status_data():\n columns.extend([AssetKeyTable.c.cached_status_data])\n\n is_partial_query = asset_keys is not None or bool(prefix) or bool(limit) or bool(cursor)\n if self.has_asset_key_index_cols() and not is_partial_query:\n # if the schema has been migrated, fetch the last_materialization_timestamp to see if\n # we can lazily migrate the data table\n columns.append(AssetKeyTable.c.last_materialization_timestamp)\n columns.append(AssetKeyTable.c.wipe_timestamp)\n\n query = db_select(columns).order_by(AssetKeyTable.c.asset_key.asc())\n query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n query = query.where(\n db.or_(\n AssetKeyTable.c.wipe_timestamp.is_(None),\n AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,\n )\n )\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n return rows, False, None\n\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n wiped_timestamps_by_asset_key: Dict[AssetKey, float] = {}\n row_by_asset_key: Dict[AssetKey, SqlAlchemyRow] = OrderedDict()\n\n for row in rows:\n asset_key = AssetKey.from_db_string(cast(str, 
row["asset_key"]))\n if not asset_key:\n continue\n asset_details = AssetDetails.from_db_string(row["asset_details"])\n if not asset_details or not asset_details.last_wipe_timestamp:\n row_by_asset_key[asset_key] = row\n continue\n materialization_or_event_or_record = (\n deserialize_value(cast(str, row["last_materialization"]), NamedTuple)\n if row["last_materialization"]\n else None\n )\n if isinstance(materialization_or_event_or_record, (EventLogRecord, EventLogEntry)):\n if isinstance(materialization_or_event_or_record, EventLogRecord):\n event_timestamp = materialization_or_event_or_record.event_log_entry.timestamp\n else:\n event_timestamp = materialization_or_event_or_record.timestamp\n\n if asset_details.last_wipe_timestamp > event_timestamp:\n # this asset has not been materialized since being wiped, skip\n continue\n else:\n # add the key\n row_by_asset_key[asset_key] = row\n else:\n row_by_asset_key[asset_key] = row\n wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp\n\n if wiped_timestamps_by_asset_key:\n materialization_times = self._fetch_backcompat_materialization_times(\n wiped_timestamps_by_asset_key.keys() # type: ignore\n )\n for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():\n materialization_time = materialization_times.get(asset_key)\n if not materialization_time or utc_datetime_from_naive(\n materialization_time\n ) < utc_datetime_from_timestamp(wiped_timestamp):\n # remove rows that have not been materialized since being wiped\n row_by_asset_key.pop(asset_key)\n\n has_more = limit and len(rows) == limit\n new_cursor = rows[-1]["id"] if rows else None\n\n return row_by_asset_key.values(), has_more, new_cursor # type: ignore\n\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n if self.can_cache_asset_status_data():\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .where(\n 
AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n .values(cached_status_data=serialize_value(cache_values))\n )\n\n def _fetch_backcompat_materialization_times(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, datetime]:\n # fetches the latest materialization timestamp for the given asset_keys. Uses the (slower)\n # raw event log table.\n backcompat_query = (\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("timestamp"),\n ]\n )\n .where(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key)\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())\n )\n with self.index_connection() as conn:\n backcompat_rows = db_fetch_mappings(conn, backcompat_query)\n return {AssetKey.from_db_string(row["asset_key"]): row["timestamp"] for row in backcompat_rows} # type: ignore\n\n def _can_mark_assets_as_migrated(self, rows):\n if not self.has_asset_key_index_cols():\n return False\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # we have already migrated\n return False\n\n for row in rows:\n if not _get_from_row(row, "last_materialization_timestamp"):\n return False\n\n if _get_from_row(row, "asset_details") and not _get_from_row(row, "wipe_timestamp"):\n return False\n\n return True\n\n def _apply_asset_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n prefix=None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> SqlAlchemyQuery:\n if asset_keys is not None:\n query = query.where(\n AssetKeyTable.c.asset_key.in_([asset_key.to_string() for asset_key in asset_keys])\n )\n\n if prefix:\n prefix_str = seven.dumps(prefix)[:-1]\n query = query.where(AssetKeyTable.c.asset_key.startswith(prefix_str))\n\n if cursor:\n query = query.where(AssetKeyTable.c.asset_key > cursor)\n\n if limit:\n query = 
query.limit(limit)\n return query\n\n def _get_assets_details(\n self, asset_keys: Sequence[AssetKey]\n ) -> Sequence[Optional[AssetDetails]]:\n check.sequence_param(asset_keys, "asset_key", AssetKey)\n rows = None\n with self.index_connection() as conn:\n rows = db_fetch_mappings(\n conn,\n db_select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details]).where(\n AssetKeyTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n ),\n )\n\n asset_key_to_details = {\n cast(str, row["asset_key"]): (\n deserialize_value(cast(str, row["asset_details"]), AssetDetails)\n if row["asset_details"]\n else None\n )\n for row in rows\n }\n\n # returns a list of the corresponding asset_details to provided asset_keys\n return [\n asset_key_to_details.get(asset_key.to_string(), None) for asset_key in asset_keys\n ]\n\n def _add_assets_wipe_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n assets_details: Sequence[Optional[AssetDetails]],\n asset_keys: Sequence[AssetKey],\n ) -> SqlAlchemyQuery:\n check.invariant(\n len(assets_details) == len(asset_keys),\n "asset_details and asset_keys must be the same length",\n )\n for i in range(len(assets_details)):\n asset_key, asset_details = asset_keys[i], assets_details[i]\n if asset_details and asset_details.last_wipe_timestamp:\n asset_key_in_row = SqlEventLogStorageTable.c.asset_key == asset_key.to_string()\n # If asset key is in row, keep the row if the timestamp > wipe timestamp, else remove the row.\n # If asset key is not in row, keep the row.\n query = query.where(\n db.or_(\n db.and_(\n asset_key_in_row,\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp),\n ),\n db.not_(asset_key_in_row),\n )\n )\n\n return query\n\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n """Fetches asset event tags for the given 
asset key.\n\n If filter_tags is provided, searches for events containing all of the filter tags. Then,\n returns all tags for those events. This enables searching for multipartitioned asset\n partition tags with a fixed dimension value, e.g. all of the tags for events where\n "country" == "US".\n\n If filter_event_id is provided, fetches only tags applied to the given event.\n\n Returns a list of dicts, where each dict is a mapping of tag key to tag value for a\n single event.\n """\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n filter_tags = check.opt_mapping_param(\n filter_tags, "filter_tags", key_type=str, value_type=str\n )\n filter_event_id = check.opt_int_param(filter_event_id, "filter_event_id")\n\n if not self.has_table(AssetEventTagsTable.name):\n raise DagsterInvalidInvocationError(\n "In order to search for asset event tags, you must run "\n "`dagster instance migrate` to create the AssetEventTags table."\n )\n\n asset_details = self._get_assets_details([asset_key])[0]\n if not filter_tags:\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).where(AssetEventTagsTable.c.asset_key == asset_key.to_string())\n if asset_details and asset_details.last_wipe_timestamp:\n tags_query = tags_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n elif self.supports_intersect:\n\n def get_tag_filter_query(tag_key, tag_value):\n filter_query = db_select([AssetEventTagsTable.c.event_id]).where(\n db.and_(\n AssetEventTagsTable.c.asset_key == asset_key.to_string(),\n AssetEventTagsTable.c.key == tag_key,\n AssetEventTagsTable.c.value == tag_value,\n )\n )\n if asset_details and asset_details.last_wipe_timestamp:\n filter_query = filter_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n return filter_query\n\n intersections = [\n 
get_tag_filter_query(tag_key, tag_value)\n for tag_key, tag_value in filter_tags.items()\n ]\n\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).where(\n db.and_(\n AssetEventTagsTable.c.event_id.in_(db.intersect(*intersections)),\n )\n )\n else:\n table = self._apply_tags_table_joins(AssetEventTagsTable, filter_tags, asset_key)\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).select_from(table)\n\n if asset_details and asset_details.last_wipe_timestamp:\n tags_query = tags_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if filter_event_id is not None:\n tags_query = tags_query.where(AssetEventTagsTable.c.event_id == filter_event_id)\n\n with self.index_connection() as conn:\n results = conn.execute(tags_query).fetchall()\n\n tags_by_event_id: Dict[int, Dict[str, str]] = defaultdict(dict)\n for row in results:\n key, value, event_id = row\n tags_by_event_id[event_id][key] = value\n\n return list(tags_by_event_id.values())\n\n def _asset_materialization_from_json_column(\n self, json_str: str\n ) -> Optional[AssetMaterialization]:\n if not json_str:\n return None\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. 
For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n event_or_materialization = deserialize_value(json_str, NamedTuple)\n if isinstance(event_or_materialization, AssetMaterialization):\n return event_or_materialization\n\n if (\n not isinstance(event_or_materialization, EventLogEntry)\n or not event_or_materialization.is_dagster_event\n or not event_or_materialization.dagster_event.asset_key # type: ignore\n ):\n return None\n\n return event_or_materialization.dagster_event.step_materialization_data.materialization # type: ignore\n\n def _get_asset_key_values_on_wipe(self) -> Mapping[str, Any]:\n wipe_timestamp = pendulum.now("UTC").timestamp()\n values = {\n "asset_details": serialize_value(AssetDetails(last_wipe_timestamp=wipe_timestamp)),\n "last_run_id": None,\n }\n if self.has_asset_key_index_cols():\n values.update(\n dict(\n wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp),\n )\n )\n if self.can_cache_asset_status_data():\n values.update(dict(cached_status_data=None))\n return values\n\n def wipe_asset(self, asset_key: AssetKey) -> None:\n check.inst_param(asset_key, "asset_key", AssetKey)\n wiped_values = self._get_asset_key_values_on_wipe()\n\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .values(**wiped_values)\n .where(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.partition,\n db.func.max(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.partition)\n )\n\n 
assets_details = self._get_assets_details([asset_key])\n query = self._add_assets_wipe_filter_to_query(query, assets_details, [asset_key])\n\n if after_cursor:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n if before_cursor:\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return set([cast(str, row[0]) for row in results])\n\n def get_materialization_count_by_partition(\n self,\n asset_keys: Sequence[AssetKey],\n after_cursor: Optional[int] = None,\n before_cursor: Optional[int] = None,\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n check.sequence_param(asset_keys, "asset_keys", AssetKey)\n\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.partition,\n db.func.count(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.partition)\n )\n\n assets_details = self._get_assets_details(asset_keys)\n query = self._add_assets_wipe_filter_to_query(query, assets_details, asset_keys)\n\n if after_cursor:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n materialization_count_by_partition: Dict[AssetKey, Dict[str, int]] = {\n asset_key: {} for asset_key in asset_keys\n }\n for row in results:\n asset_key = AssetKey.from_db_string(cast(Optional[str], row[0]))\n if asset_key:\n materialization_count_by_partition[asset_key][cast(str, row[1])] = cast(int, row[2])\n\n return materialization_count_by_partition\n\n def _latest_event_ids_by_partition_subquery(\n 
self,\n asset_key: AssetKey,\n event_types: Sequence[DagsterEventType],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ):\n """Subquery for locating the latest event ids by partition for a given asset key and set\n of event types.\n """\n query = db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n SqlEventLogStorageTable.c.partition,\n db.func.max(SqlEventLogStorageTable.c.id).label("id"),\n ]\n ).where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [event_type.value for event_type in event_types]\n ),\n )\n )\n if asset_partitions is not None:\n query = query.where(SqlEventLogStorageTable.c.partition.in_(asset_partitions))\n if before_cursor is not None:\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor)\n if after_cursor is not None:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n\n latest_event_ids_subquery = query.group_by(\n SqlEventLogStorageTable.c.dagster_event_type, SqlEventLogStorageTable.c.partition\n )\n\n assets_details = self._get_assets_details([asset_key])\n return db_subquery(\n self._add_assets_wipe_filter_to_query(\n latest_event_ids_subquery, assets_details, [asset_key]\n ),\n "latest_event_ids_by_partition_subquery",\n )\n\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: DagsterEventType\n ) -> Mapping[str, int]:\n """Fetch the latest materialzation storage id for each partition for a given asset key.\n\n Returns a mapping of partition to storage id.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n latest_event_ids_by_partition_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key, [event_type]\n )\n latest_event_ids_by_partition = db_select(\n [\n latest_event_ids_by_partition_subquery.c.partition,\n 
latest_event_ids_by_partition_subquery.c.id,\n ]\n )\n\n with self.index_connection() as conn:\n rows = conn.execute(latest_event_ids_by_partition).fetchall()\n\n latest_materialization_storage_id_by_partition: Dict[str, int] = {}\n for row in rows:\n latest_materialization_storage_id_by_partition[cast(str, row[0])] = cast(int, row[1])\n return latest_materialization_storage_id_by_partition\n\n def get_latest_tags_by_partition(\n self,\n asset_key: AssetKey,\n event_type: DagsterEventType,\n tag_keys: Sequence[str],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Mapping[str, Mapping[str, str]]:\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.inst_param(event_type, "event_type", DagsterEventType)\n check.sequence_param(tag_keys, "tag_keys", of_type=str)\n check.opt_nullable_sequence_param(asset_partitions, "asset_partitions", of_type=str)\n check.opt_int_param(before_cursor, "before_cursor")\n check.opt_int_param(after_cursor, "after_cursor")\n\n latest_event_ids_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key=asset_key,\n event_types=[event_type],\n asset_partitions=asset_partitions,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n )\n\n latest_tags_by_partition_query = (\n db_select(\n [\n latest_event_ids_subquery.c.partition,\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n ]\n )\n .select_from(\n latest_event_ids_subquery.join(\n AssetEventTagsTable,\n AssetEventTagsTable.c.event_id == latest_event_ids_subquery.c.id,\n )\n )\n .where(AssetEventTagsTable.c.key.in_(tag_keys))\n )\n\n latest_tags_by_partition: Dict[str, Dict[str, str]] = defaultdict(dict)\n with self.index_connection() as conn:\n rows = conn.execute(latest_tags_by_partition_query).fetchall()\n\n for row in rows:\n latest_tags_by_partition[cast(str, row[0])][cast(str, row[1])] = cast(str, row[2])\n\n # convert defaultdict to dict\n return 
dict(latest_tags_by_partition)\n\n def get_latest_asset_partition_materialization_attempts_without_materializations(\n self, asset_key: AssetKey\n ) -> Mapping[str, Tuple[str, int]]:\n """Fetch the latest materialzation and materialization planned events for each partition of the given asset.\n Return the partitions that have a materialization planned event but no matching (same run) materialization event.\n These materializations could be in progress, or they could have failed. A separate query checking the run status\n is required to know.\n\n Returns a mapping of partition to [run id, event id].\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n latest_event_ids_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key,\n [\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n ],\n )\n\n latest_events_subquery = db_subquery(\n db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n SqlEventLogStorageTable.c.partition,\n SqlEventLogStorageTable.c.run_id,\n SqlEventLogStorageTable.c.id,\n ]\n ).select_from(\n latest_event_ids_subquery.join(\n SqlEventLogStorageTable,\n SqlEventLogStorageTable.c.id == latest_event_ids_subquery.c.id,\n ),\n ),\n "latest_events_subquery",\n )\n\n materialization_planned_events = db_select(\n [\n latest_events_subquery.c.dagster_event_type,\n latest_events_subquery.c.partition,\n latest_events_subquery.c.run_id,\n latest_events_subquery.c.id,\n ]\n ).where(\n latest_events_subquery.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value\n )\n\n materialization_events = db_select(\n [\n latest_events_subquery.c.dagster_event_type,\n latest_events_subquery.c.partition,\n latest_events_subquery.c.run_id,\n ]\n ).where(\n latest_events_subquery.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value\n )\n\n with self.index_connection() as conn:\n materialization_planned_rows = db_fetch_mappings(conn, 
materialization_planned_events)\n materialization_rows = db_fetch_mappings(conn, materialization_events)\n\n materialization_planned_rows_by_partition = {\n cast(str, row["partition"]): (cast(str, row["run_id"]), cast(int, row["id"]))\n for row in materialization_planned_rows\n }\n for row in materialization_rows:\n if (\n row["partition"] in materialization_planned_rows_by_partition\n and materialization_planned_rows_by_partition[cast(str, row["partition"])][0]\n == row["run_id"]\n ):\n materialization_planned_rows_by_partition.pop(cast(str, row["partition"]))\n\n return materialization_planned_rows_by_partition\n\n def _check_partitions_table(self) -> None:\n # Guards against cases where the user is not running the latest migration for\n # partitions storage. Should be updated when the partitions storage schema changes.\n if not self.has_table("dynamic_partitions"):\n raise DagsterInvalidInvocationError(\n "Using dynamic partitions definitions requires the dynamic partitions table, which"\n " currently does not exist. 
Add this table by running `dagster"\n " instance migrate`."\n )\n\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the list of partition keys for a partition definition."""\n self._check_partitions_table()\n columns = [\n DynamicPartitionsTable.c.partitions_def_name,\n DynamicPartitionsTable.c.partition,\n ]\n query = (\n db_select(columns)\n .where(DynamicPartitionsTable.c.partitions_def_name == partitions_def_name)\n .order_by(DynamicPartitionsTable.c.id)\n )\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n return [cast(str, row[1]) for row in rows]\n\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n self._check_partitions_table()\n query = (\n db_select([DynamicPartitionsTable.c.partition])\n .where(\n db.and_(\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n DynamicPartitionsTable.c.partition == partition_key,\n )\n )\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n self._check_partitions_table()\n with self.index_connection() as conn:\n existing_rows = conn.execute(\n db_select([DynamicPartitionsTable.c.partition]).where(\n db.and_(\n DynamicPartitionsTable.c.partition.in_(partition_keys),\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n )\n )\n ).fetchall()\n existing_keys = set([row[0] for row in existing_rows])\n new_keys = [\n partition_key\n for partition_key in partition_keys\n if partition_key not in existing_keys\n ]\n\n if new_keys:\n conn.execute(\n DynamicPartitionsTable.insert(),\n [\n dict(partitions_def_name=partitions_def_name, partition=partition_key)\n for partition_key in new_keys\n ],\n )\n\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n 
self._check_partitions_table()\n with self.index_connection() as conn:\n conn.execute(\n DynamicPartitionsTable.delete().where(\n db.and_(\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n DynamicPartitionsTable.c.partition == partition_key,\n )\n )\n )\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return self.has_table(ConcurrencySlotsTable.name)\n\n def set_concurrency_slots(self, concurrency_key: str, num: int) -> None:\n """Allocate a set of concurrency slots.\n\n Args:\n concurrency_key (str): The key to allocate the slots for.\n num (int): The number of slots to allocate.\n """\n if num > MAX_CONCURRENCY_SLOTS:\n raise DagsterInvalidInvocationError(\n f"Cannot have more than {MAX_CONCURRENCY_SLOTS} slots per concurrency key."\n )\n if num < 0:\n raise DagsterInvalidInvocationError("Cannot have a negative number of slots.")\n\n keys_to_assign = None\n with self.index_connection() as conn:\n count_row = conn.execute(\n db_select([db.func.count()])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n ).fetchone()\n existing = cast(int, count_row[0]) if count_row else 0\n\n if existing > num:\n # need to delete some slots, favoring ones where the slot is unallocated\n rows = conn.execute(\n db_select([ConcurrencySlotsTable.c.id])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n .order_by(\n db_case([(ConcurrencySlotsTable.c.run_id.is_(None), 1)], else_=0).desc(),\n ConcurrencySlotsTable.c.id.desc(),\n )\n .limit(existing - num)\n ).fetchall()\n\n if rows:\n # mark rows as deleted\n conn.execute(\n ConcurrencySlotsTable.update()\n .values(deleted=True)\n .where(ConcurrencySlotsTable.c.id.in_([row[0] for row in rows]))\n )\n\n # actually delete 
rows that are marked as deleted and are not claimed... the rest\n # will be deleted when the slots are released by the free_concurrency_slots\n conn.execute(\n ConcurrencySlotsTable.delete().where(\n db.and_(\n ConcurrencySlotsTable.c.deleted == True, # noqa: E712\n ConcurrencySlotsTable.c.run_id == None, # noqa: E711\n )\n )\n )\n elif num > existing:\n # need to add some slots\n rows = [\n {\n "concurrency_key": concurrency_key,\n "run_id": None,\n "step_key": None,\n "deleted": False,\n }\n for _ in range(existing, num)\n ]\n conn.execute(ConcurrencySlotsTable.insert().values(rows))\n keys_to_assign = [concurrency_key for _ in range(existing, num)]\n\n if keys_to_assign:\n # we've added some slots... if there are any pending steps, we can assign them now or\n # they will be unutilized until free_concurrency_slots is called\n self.assign_pending_steps(keys_to_assign)\n\n def has_unassigned_slots(self, concurrency_key: str) -> bool:\n with self.index_connection() as conn:\n pending_row = conn.execute(\n db_select([db.func.count()])\n .select_from(PendingStepsTable)\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == concurrency_key,\n PendingStepsTable.c.assigned_timestamp != None, # noqa: E711\n )\n )\n ).fetchone()\n slots = conn.execute(\n db_select([db.func.count()])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n ).fetchone()\n pending_count = cast(int, pending_row[0]) if pending_row else 0\n slots_count = cast(int, slots[0]) if slots else 0\n return slots_count > pending_count\n\n def check_concurrency_claim(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencyClaimStatus:\n with self.index_connection() as conn:\n pending_row = conn.execute(\n db_select(\n [\n PendingStepsTable.c.assigned_timestamp,\n PendingStepsTable.c.priority,\n PendingStepsTable.c.create_timestamp,\n ]\n ).where(\n 
db.and_(\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n PendingStepsTable.c.concurrency_key == concurrency_key,\n )\n )\n ).fetchone()\n\n if not pending_row:\n # no pending step pending_row exists, the slot is blocked and the enqueued timestamp is None\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=ConcurrencySlotStatus.BLOCKED,\n priority=None,\n assigned_timestamp=None,\n enqueued_timestamp=None,\n )\n\n priority = cast(int, pending_row[1]) if pending_row[1] else None\n assigned_timestamp = cast(datetime, pending_row[0]) if pending_row[0] else None\n create_timestamp = cast(datetime, pending_row[2]) if pending_row[2] else None\n if assigned_timestamp is None:\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=ConcurrencySlotStatus.BLOCKED,\n priority=priority,\n assigned_timestamp=None,\n enqueued_timestamp=create_timestamp,\n )\n\n # pending step is assigned, check to see if it's been claimed\n slot_row = conn.execute(\n db_select([db.func.count()]).where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.run_id == run_id,\n ConcurrencySlotsTable.c.step_key == step_key,\n )\n )\n ).fetchone()\n\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=(\n ConcurrencySlotStatus.CLAIMED\n if slot_row and slot_row[0]\n else ConcurrencySlotStatus.BLOCKED\n ),\n priority=priority,\n assigned_timestamp=assigned_timestamp,\n enqueued_timestamp=create_timestamp,\n )\n\n def can_claim_from_pending(self, concurrency_key: str, run_id: str, step_key: str):\n with self.index_connection() as conn:\n row = conn.execute(\n db_select([PendingStepsTable.c.assigned_timestamp]).where(\n db.and_(\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n PendingStepsTable.c.concurrency_key == concurrency_key,\n )\n )\n ).fetchone()\n return row and row[0] is not None\n\n def 
has_pending_step(self, concurrency_key: str, run_id: str, step_key: str):\n with self.index_connection() as conn:\n row = conn.execute(\n db_select([db.func.count()])\n .select_from(PendingStepsTable)\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == concurrency_key,\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n )\n )\n ).fetchone()\n return row and cast(int, row[0]) > 0\n\n def assign_pending_steps(self, concurrency_keys: Sequence[str]):\n if not concurrency_keys:\n return\n\n with self.index_connection() as conn:\n for key in concurrency_keys:\n row = conn.execute(\n db_select([PendingStepsTable.c.id])\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == key,\n PendingStepsTable.c.assigned_timestamp == None, # noqa: E711\n )\n )\n .order_by(\n PendingStepsTable.c.priority.desc(),\n PendingStepsTable.c.create_timestamp.asc(),\n )\n .limit(1)\n ).fetchone()\n if row:\n conn.execute(\n PendingStepsTable.update()\n .where(PendingStepsTable.c.id == row[0])\n .values(assigned_timestamp=db.func.now())\n )\n\n def add_pending_step(\n self,\n concurrency_key: str,\n run_id: str,\n step_key: str,\n priority: Optional[int] = None,\n should_assign: bool = False,\n ):\n with self.index_connection() as conn:\n try:\n conn.execute(\n PendingStepsTable.insert().values(\n [\n dict(\n run_id=run_id,\n step_key=step_key,\n concurrency_key=concurrency_key,\n priority=priority or 0,\n assigned_timestamp=db.func.now() if should_assign else None,\n )\n ]\n )\n )\n except db_exc.IntegrityError:\n # do nothing\n pass\n\n def _remove_pending_steps(self, run_id: str, step_key: Optional[str] = None):\n query = PendingStepsTable.delete().where(PendingStepsTable.c.run_id == run_id)\n if step_key:\n query = query.where(PendingStepsTable.c.step_key == step_key)\n with self.index_connection() as conn:\n conn.execute(query)\n\n def claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str, priority: Optional[int] 
= None\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slot for step.\n\n Args:\n concurrency_keys (str): The concurrency key to claim.\n run_id (str): The run id to claim for.\n step_key (str): The step key to claim for.\n """\n # first, register the step by adding to pending queue\n if not self.has_pending_step(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n ):\n has_unassigned_slots = self.has_unassigned_slots(concurrency_key)\n self.add_pending_step(\n concurrency_key=concurrency_key,\n run_id=run_id,\n step_key=step_key,\n priority=priority,\n should_assign=has_unassigned_slots,\n )\n\n # if the step is not assigned (i.e. has not been popped from queue), block the claim\n claim_status = self.check_concurrency_claim(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n )\n if claim_status.is_claimed or not claim_status.is_assigned:\n return claim_status\n\n # attempt to claim a concurrency slot... this should generally work because we only assign\n # based on the number of unclaimed slots, but this should act as a safeguard, using the slot\n # rows as a semaphore\n slot_status = self._claim_concurrency_slot(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n )\n return claim_status.with_slot_status(slot_status)\n\n def _claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencySlotStatus:\n """Claim a concurrency slot for the step. 
Helper method that is called for steps that are\n popped off the priority queue.\n\n Args:\n concurrency_key (str): The concurrency key to claim.\n run_id (str): The run id to claim a slot for.\n step_key (str): The step key to claim a slot for.\n """\n with self.index_connection() as conn:\n result = conn.execute(\n db_select([ConcurrencySlotsTable.c.id])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.step_key == None, # noqa: E711\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n .with_for_update(skip_locked=True)\n .limit(1)\n ).fetchone()\n if not result or not result[0]:\n return ConcurrencySlotStatus.BLOCKED\n if not conn.execute(\n ConcurrencySlotsTable.update()\n .values(run_id=run_id, step_key=step_key)\n .where(ConcurrencySlotsTable.c.id == result[0])\n ).rowcount:\n return ConcurrencySlotStatus.BLOCKED\n\n return ConcurrencySlotStatus.CLAIMED\n\n def get_concurrency_keys(self) -> Set[str]:\n """Get the set of concurrency limited keys."""\n with self.index_connection() as conn:\n rows = conn.execute(\n db_select([ConcurrencySlotsTable.c.concurrency_key])\n .select_from(ConcurrencySlotsTable)\n .where(ConcurrencySlotsTable.c.deleted == False) # noqa: E712\n .distinct()\n ).fetchall()\n return {cast(str, row[0]) for row in rows}\n\n def get_concurrency_info(self, concurrency_key: str) -> ConcurrencyKeyInfo:\n """Get the list of concurrency slots for a given concurrency key.\n\n Args:\n concurrency_key (str): The concurrency key to get the slots for.\n\n Returns:\n List[Tuple[str, int]]: A list of tuples of run_id and the number of slots it is\n occupying for the given concurrency key.\n """\n with self.index_connection() as conn:\n slot_query = (\n db_select(\n [\n ConcurrencySlotsTable.c.run_id,\n ConcurrencySlotsTable.c.deleted,\n db.func.count().label("count"),\n ]\n )\n .select_from(ConcurrencySlotsTable)\n 
.where(ConcurrencySlotsTable.c.concurrency_key == concurrency_key)\n .group_by(ConcurrencySlotsTable.c.run_id, ConcurrencySlotsTable.c.deleted)\n )\n slot_rows = db_fetch_mappings(conn, slot_query)\n pending_query = (\n db_select(\n [\n PendingStepsTable.c.run_id,\n db_case(\n [(PendingStepsTable.c.assigned_timestamp.is_(None), False)],\n else_=True,\n ).label("is_assigned"),\n db.func.count().label("count"),\n ]\n )\n .select_from(PendingStepsTable)\n .where(PendingStepsTable.c.concurrency_key == concurrency_key)\n .group_by(PendingStepsTable.c.run_id, "is_assigned")\n )\n pending_rows = db_fetch_mappings(conn, pending_query)\n\n return ConcurrencyKeyInfo(\n concurrency_key=concurrency_key,\n slot_count=sum(\n [\n cast(int, slot_row["count"])\n for slot_row in slot_rows\n if not slot_row["deleted"]\n ]\n ),\n active_slot_count=sum(\n [cast(int, slot_row["count"]) for slot_row in slot_rows if slot_row["run_id"]]\n ),\n active_run_ids={\n cast(str, slot_row["run_id"]) for slot_row in slot_rows if slot_row["run_id"]\n },\n pending_step_count=sum(\n [cast(int, row["count"]) for row in pending_rows if not row["is_assigned"]]\n ),\n pending_run_ids={\n cast(str, row["run_id"]) for row in pending_rows if not row["is_assigned"]\n },\n assigned_step_count=sum(\n [cast(int, row["count"]) for row in pending_rows if row["is_assigned"]]\n ),\n assigned_run_ids={\n cast(str, row["run_id"]) for row in pending_rows if row["is_assigned"]\n },\n )\n\n def get_concurrency_run_ids(self) -> Set[str]:\n with self.index_connection() as conn:\n rows = conn.execute(db_select([PendingStepsTable.c.run_id]).distinct()).fetchall()\n return set([cast(str, row[0]) for row in rows])\n\n def free_concurrency_slots_for_run(self, run_id: str) -> None:\n freed_concurrency_keys = self._free_concurrency_slots(run_id=run_id)\n self._remove_pending_steps(run_id=run_id)\n if freed_concurrency_keys:\n # assign any pending steps that can now claim a slot\n 
self.assign_pending_steps(freed_concurrency_keys)\n\n def free_concurrency_slot_for_step(self, run_id: str, step_key: str) -> None:\n freed_concurrency_keys = self._free_concurrency_slots(run_id=run_id, step_key=step_key)\n self._remove_pending_steps(run_id=run_id, step_key=step_key)\n if freed_concurrency_keys:\n # assign any pending steps that can now claim a slot\n self.assign_pending_steps(freed_concurrency_keys)\n\n def _free_concurrency_slots(self, run_id: str, step_key: Optional[str] = None) -> Sequence[str]:\n """Frees concurrency slots for a given run/step.\n\n Args:\n run_id (str): The run id to free the slots for.\n step_key (Optional[str]): The step key to free the slots for. If not provided, all the\n slots for all the steps of the run will be freed.\n """\n with self.index_connection() as conn:\n # first delete any rows that apply and are marked as deleted. This happens when the\n # configured number of slots has been reduced, and some of the pruned slots included\n # ones that were already allocated to the run/step\n delete_query = ConcurrencySlotsTable.delete().where(\n db.and_(\n ConcurrencySlotsTable.c.run_id == run_id,\n ConcurrencySlotsTable.c.deleted == True, # noqa: E712\n )\n )\n if step_key:\n delete_query = delete_query.where(ConcurrencySlotsTable.c.step_key == step_key)\n conn.execute(delete_query)\n\n # next, fetch the slots to free up, while grabbing the concurrency keys so that we can\n # allocate any pending steps from the queue for the freed slots, if necessary\n select_query = (\n db_select([ConcurrencySlotsTable.c.id, ConcurrencySlotsTable.c.concurrency_key])\n .select_from(ConcurrencySlotsTable)\n .where(ConcurrencySlotsTable.c.run_id == run_id)\n .with_for_update(skip_locked=True)\n )\n if step_key:\n select_query = select_query.where(ConcurrencySlotsTable.c.step_key == step_key)\n rows = conn.execute(select_query).fetchall()\n if not rows:\n return []\n\n # now, actually free the slots\n conn.execute(\n 
ConcurrencySlotsTable.update()\n .values(run_id=None, step_key=None)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.id.in_([row[0] for row in rows]),\n )\n )\n )\n\n # return the concurrency keys for the freed slots\n return [cast(str, row[1]) for row in rows]\n\n def store_asset_check_event(self, event: EventLogEntry, event_id: Optional[int]) -> None:\n check.inst_param(event, "event", EventLogEntry)\n check.opt_int_param(event_id, "event_id")\n\n check.invariant(\n self.supports_asset_checks,\n "Asset checks require a database schema migration. Run `dagster instance migrate`.",\n )\n\n if event.dagster_event_type == DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED:\n self._store_asset_check_evaluation_planned(event, event_id)\n if event.dagster_event_type == DagsterEventType.ASSET_CHECK_EVALUATION:\n if event.run_id == "" or event.run_id is None:\n self._store_runless_asset_check_evaluation(event, event_id)\n else:\n self._update_asset_check_evaluation(event, event_id)\n\n def _store_asset_check_evaluation_planned(\n self, event: EventLogEntry, event_id: Optional[int]\n ) -> None:\n planned = cast(\n AssetCheckEvaluationPlanned, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n conn.execute(\n AssetCheckExecutionsTable.insert().values(\n asset_key=planned.asset_key.to_string(),\n check_name=planned.check_name,\n run_id=event.run_id,\n execution_status=AssetCheckExecutionRecordStatus.PLANNED.value,\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n )\n )\n\n def _store_runless_asset_check_evaluation(\n self, event: EventLogEntry, event_id: Optional[int]\n ) -> None:\n evaluation = cast(\n AssetCheckEvaluation, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n conn.execute(\n AssetCheckExecutionsTable.insert().values(\n asset_key=evaluation.asset_key.to_string(),\n 
check_name=evaluation.check_name,\n run_id=event.run_id,\n execution_status=(\n AssetCheckExecutionRecordStatus.SUCCEEDED.value\n if evaluation.passed\n else AssetCheckExecutionRecordStatus.FAILED.value\n ),\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n evaluation_event_storage_id=event_id,\n materialization_event_storage_id=(\n evaluation.target_materialization_data.storage_id\n if evaluation.target_materialization_data\n else None\n ),\n )\n )\n\n def _update_asset_check_evaluation(self, event: EventLogEntry, event_id: Optional[int]) -> None:\n evaluation = cast(\n AssetCheckEvaluation, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n rows_updated = conn.execute(\n AssetCheckExecutionsTable.update()\n .where(\n # (asset_key, check_name, run_id) uniquely identifies the row created for the planned event\n db.and_(\n AssetCheckExecutionsTable.c.asset_key == evaluation.asset_key.to_string(),\n AssetCheckExecutionsTable.c.check_name == evaluation.check_name,\n AssetCheckExecutionsTable.c.run_id == event.run_id,\n )\n )\n .values(\n execution_status=(\n AssetCheckExecutionRecordStatus.SUCCEEDED.value\n if evaluation.passed\n else AssetCheckExecutionRecordStatus.FAILED.value\n ),\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n evaluation_event_storage_id=event_id,\n materialization_event_storage_id=(\n evaluation.target_materialization_data.storage_id\n if evaluation.target_materialization_data\n else None\n ),\n )\n ).rowcount\n if rows_updated != 1:\n raise DagsterInvariantViolationError(\n "Expected to update one row for asset check evaluation, but updated"\n f" {rows_updated}."\n )\n\n def get_asset_check_execution_history(\n self,\n check_key: AssetCheckKey,\n limit: int,\n cursor: Optional[int] = None,\n ) -> Sequence[AssetCheckExecutionRecord]:\n 
check.inst_param(check_key, "key", AssetCheckKey)\n check.int_param(limit, "limit")\n check.opt_int_param(cursor, "cursor")\n\n query = (\n db_select(\n [\n AssetCheckExecutionsTable.c.id,\n AssetCheckExecutionsTable.c.run_id,\n AssetCheckExecutionsTable.c.execution_status,\n AssetCheckExecutionsTable.c.evaluation_event,\n AssetCheckExecutionsTable.c.create_timestamp,\n ]\n )\n .where(\n db.and_(\n AssetCheckExecutionsTable.c.asset_key == check_key.asset_key.to_string(),\n AssetCheckExecutionsTable.c.check_name == check_key.name,\n )\n )\n .order_by(AssetCheckExecutionsTable.c.id.desc())\n ).limit(limit)\n\n if cursor:\n query = query.where(AssetCheckExecutionsTable.c.id < cursor)\n\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n return [AssetCheckExecutionRecord.from_db_row(row) for row in rows]\n\n def get_latest_asset_check_execution_by_key(\n self, check_keys: Sequence[AssetCheckKey]\n ) -> Mapping[AssetCheckKey, AssetCheckExecutionRecord]:\n if not check_keys:\n return {}\n\n latest_ids_subquery = db_subquery(\n db_select(\n [\n db.func.max(AssetCheckExecutionsTable.c.id).label("id"),\n ]\n )\n .where(\n db.and_(\n AssetCheckExecutionsTable.c.asset_key.in_(\n [key.asset_key.to_string() for key in check_keys]\n ),\n AssetCheckExecutionsTable.c.check_name.in_([key.name for key in check_keys]),\n )\n )\n .group_by(\n AssetCheckExecutionsTable.c.asset_key,\n AssetCheckExecutionsTable.c.check_name,\n )\n )\n\n query = db_select(\n [\n AssetCheckExecutionsTable.c.id,\n AssetCheckExecutionsTable.c.asset_key,\n AssetCheckExecutionsTable.c.check_name,\n AssetCheckExecutionsTable.c.run_id,\n AssetCheckExecutionsTable.c.execution_status,\n AssetCheckExecutionsTable.c.evaluation_event,\n AssetCheckExecutionsTable.c.create_timestamp,\n ]\n ).select_from(\n AssetCheckExecutionsTable.join(\n latest_ids_subquery,\n db.and_(\n AssetCheckExecutionsTable.c.id == latest_ids_subquery.c.id,\n ),\n )\n )\n\n with self.index_connection() as conn:\n 
rows = db_fetch_mappings(conn, query)\n\n return {\n AssetCheckKey(\n asset_key=check.not_none(AssetKey.from_db_string(cast(str, row["asset_key"]))),\n name=cast(str, row["check_name"]),\n ): AssetCheckExecutionRecord.from_db_row(row)\n for row in rows\n }\n\n @property\n def supports_asset_checks(self):\n return self.has_table(AssetCheckExecutionsTable.name)
\n\n\ndef _get_from_row(row: SqlAlchemyRow, column: str) -> object:\n """Utility function for extracting a column from a sqlalchemy row proxy, since '_asdict' is not\n supported in sqlalchemy 1.3.\n """\n if column not in row.keys():\n return None\n return row[column]\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sql_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sql_event_log"}, "sqlite": {"consolidated_sqlite_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sqlite.consolidated_sqlite_event_log

\nimport logging\nimport os\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import Any, Mapping, Optional\n\nimport sqlalchemy as db\nfrom sqlalchemy.pool import NullPool\nfrom typing_extensions import Self\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nfrom dagster._config import StringSource\nfrom dagster._core.storage.dagster_run import DagsterRunStatus\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata\nfrom ..sql_event_log import SqlDbConnection, SqlEventLogStorage\n\nSQLITE_EVENT_LOG_FILENAME = "event_log"\n\n\n
[docs]class ConsolidatedSqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed consolidated event log storage intended for test cases only.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\n the following to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster._core.storage.event_log\n class: ConsolidatedSqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the database.\n """\n\n def __init__(self, base_dir, inst_data: Optional[ConfigurableClassData] = None):\n self._base_dir = check.str_param(base_dir, "base_dir")\n self._conn_string = create_db_conn_string(base_dir, SQLITE_EVENT_LOG_FILENAME)\n self._secondary_index_cache = {}\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._watchers = defaultdict(dict)\n self._obs = None\n\n if not os.path.exists(self.get_db_path()):\n self._init_db()\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return ConsolidatedSqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def _init_db(self):\n mkdir_p(self._base_dir)\n engine = create_engine(self._conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = 
check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n if should_mark_indexes:\n # mark all secondary indexes\n self.reindex_events()\n self.reindex_assets()\n\n @contextmanager\n def _connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n def run_connection(self, run_id: Optional[str]) -> SqlDbConnection:\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n return bool(engine.dialect.has_table(engine.connect(), table_name))\n\n def get_db_path(self):\n return os.path.join(self._base_dir, f"{SQLITE_EVENT_LOG_FILENAME}.db")\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n ConsolidatedSqliteEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(ConsolidatedSqliteEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, cursor, callback):\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n self._obs.schedule(\n ConsolidatedSqliteEventLogStorageWatchdog(self), self._base_dir, True\n )\n\n self._watchers[run_id][callback] = cursor\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return False\n\n def on_modified(self):\n keys = [\n (run_id, callback)\n for 
run_id, callback_dict in self._watchers.items()\n for callback, _ in callback_dict.items()\n ]\n for run_id, callback in keys:\n cursor = self._watchers[run_id][callback]\n\n # fetch events\n connection = self.get_records_for_run(run_id, cursor)\n\n # update cursor\n if connection.cursor:\n self._watchers[run_id][callback] = connection.cursor\n\n for record in connection.records:\n status = None\n try:\n status = callback(\n record.event_log_entry,\n str(EventLogCursor.from_storage_id(record.storage_id)),\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", run_id)\n\n if (\n status == DagsterRunStatus.SUCCESS\n or status == DagsterRunStatus.FAILURE\n or status == DagsterRunStatus.CANCELED\n ):\n self.end_watch(run_id, callback)\n\n def end_watch(self, run_id, handler):\n if run_id in self._watchers and handler in self._watchers[run_id]:\n del self._watchers[run_id][handler]\n\n def dispose(self):\n if self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)
\n\n\nclass ConsolidatedSqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", ConsolidatedSqliteEventLogStorage\n )\n self._log_path = event_log_storage.get_db_path()\n super(ConsolidatedSqliteEventLogStorageWatchdog, self).__init__(\n patterns=[self._log_path], **kwargs\n )\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._event_log_storage.on_modified()\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sqlite/consolidated_sqlite_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sqlite.consolidated_sqlite_event_log"}, "sqlite_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sqlite.sqlite_event_log

\nimport contextlib\nimport glob\nimport logging\nimport os\nimport re\nimport sqlite3\nimport threading\nimport time\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Any, ContextManager, Iterable, Iterator, Optional, Sequence\n\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection, Engine\nfrom sqlalchemy.pool import NullPool\nfrom tqdm import tqdm\nfrom watchdog.events import FileSystemEvent, PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._config import StringSource\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS, EVENT_TYPE_TO_PIPELINE_RUN_STATUS\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.dagster_run import DagsterRunStatus, RunsFilter\nfrom dagster._core.storage.event_log.base import EventLogCursor, EventLogRecord, EventRecordsFilter\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlalchemy_compat import db_select\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import (\n    ConfigurableClass,\n    ConfigurableClassData,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata, SqlEventLogStorageTable\nfrom ..sql_event_log import RunShardedEventsCursor, SqlEventLogStorage\n\nif TYPE_CHECKING:\n    from 
dagster._core.storage.sqlite_storage import SqliteStorageConfig\nINDEX_SHARD_NAME = "index"\n\n\n
[docs]class SqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file insqliteve\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default event log storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for event log storage, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n event_log_storage:\n module: dagster._core.storage.event_log\n class: SqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the databases. To\n improve concurrent performance, event logs are stored in a separate SQLite database for each\n run.\n """\n\n def __init__(self, base_dir: str, inst_data: Optional[ConfigurableClassData] = None):\n """Note that idempotent initialization of the SQLite database is done on a per-run_id\n basis in the body of connect, since each run is stored in a separate database.\n """\n self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))\n mkdir_p(self._base_dir)\n\n self._obs = None\n\n self._watchers = defaultdict(dict)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n # Used to ensure that each run ID attempts to initialize its DB the first time it connects,\n # ensuring that the database will be created if it doesn't exist\n self._initialized_dbs = set()\n\n # Ensure that multiple threads (like the event log watcher) interact safely with each other\n self._db_lock = threading.Lock()\n\n if not os.path.exists(self.path_for_shard(INDEX_SHARD_NAME)):\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = 
create_engine(conn_string, poolclass=NullPool)\n self._initdb(engine)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def upgrade(self) -> None:\n all_run_ids = self.get_all_run_ids()\n print(f"Updating event log storage for {len(all_run_ids)} runs on disk...") # noqa: T201\n alembic_config = get_alembic_config(__file__)\n if all_run_ids:\n for run_id in tqdm(all_run_ids):\n with self.run_connection(run_id) as conn:\n run_alembic_upgrade(alembic_config, conn, run_id)\n\n print("Updating event log storage for index db on disk...") # noqa: T201\n with self.index_connection() as conn:\n run_alembic_upgrade(alembic_config, conn, "index")\n\n self._initialized_dbs = set()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: "SqliteStorageConfig"\n ) -> "SqliteEventLogStorage":\n return SqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def get_all_run_ids(self) -> Sequence[str]:\n all_filenames = glob.glob(os.path.join(self._base_dir, "*.db"))\n return [\n os.path.splitext(os.path.basename(filename))[0]\n for filename in all_filenames\n if os.path.splitext(os.path.basename(filename))[0] != INDEX_SHARD_NAME\n ]\n\n def has_table(self, table_name: str) -> bool:\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = create_engine(conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n return bool(engine.dialect.has_table(conn, table_name))\n\n def path_for_shard(self, run_id: str) -> str:\n return os.path.join(self._base_dir, f"{run_id}.db")\n\n def conn_string_for_shard(self, shard_name: str) -> str:\n check.str_param(shard_name, "shard_name")\n return create_db_conn_string(self._base_dir, shard_name)\n\n def _initdb(self, engine: Engine) -> None:\n 
alembic_config = get_alembic_config(__file__)\n\n retry_limit = 10\n\n while True:\n try:\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n\n break\n except (db_exc.DatabaseError, sqlite3.DatabaseError, sqlite3.OperationalError) as exc:\n # This is SQLite-specific handling for concurrency issues that can arise when\n # multiple processes (e.g. the dagster-webserver process and user code process) contend with\n # each other to init the db. When we hit the following errors, we know that another\n # process is on the case and we should retry.\n err_msg = str(exc)\n\n if not (\n re.search(r"table [A-Za-z_]* already exists", err_msg)\n or "database is locked" in err_msg\n or "UNIQUE constraint failed: alembic_version.version_num" in err_msg\n ):\n raise\n\n if retry_limit == 0:\n raise\n else:\n logging.info(\n "SqliteEventLogStorage._initdb: Encountered apparent concurrent init, "\n "retrying (%s retries left). 
Exception: %s",\n retry_limit,\n err_msg,\n )\n time.sleep(0.2)\n retry_limit -= 1\n\n @contextmanager\n def _connect(self, shard: str) -> Iterator[Connection]:\n with self._db_lock:\n check.str_param(shard, "shard")\n\n conn_string = self.conn_string_for_shard(shard)\n engine = create_engine(conn_string, poolclass=NullPool)\n\n if shard not in self._initialized_dbs:\n self._initdb(engine)\n self._initialized_dbs.add(shard)\n\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n engine.dispose()\n\n def run_connection(self, run_id: Optional[str] = None) -> Any:\n return self._connect(run_id) # type: ignore # bad sig\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect(INDEX_SHARD_NAME)\n\n def store_event(self, event: EventLogEntry) -> None:\n """Overridden method to replicate asset events in a central assets.db sqlite shard, enabling\n cross-run asset queries.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.run_connection(run_id) as conn:\n conn.execute(insert_event_statement)\n\n if event.is_dagster_event and event.dagster_event.asset_key: # type: ignore\n check.invariant(\n event.dagster_event_type in ASSET_EVENTS,\n "Can only store asset materializations, materialization_planned, and"\n " observations in index database",\n )\n\n event_id = None\n\n # mirror the event in the cross-run index database\n with self.index_connection() as conn:\n result = conn.execute(insert_event_statement)\n event_id = result.inserted_primary_key[0]\n\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, 
None)\n\n if event.is_dagster_event and event.dagster_event_type in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n # should mirror run status change events in the index shard\n with self.index_connection() as conn:\n result = conn.execute(insert_event_statement)\n\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n """Overridden method to enable cross-run event queries in sqlite.\n\n The record id in sqlite does not auto increment cross runs, so instead of fetching events\n after record id, we only fetch events whose runs updated after update_timestamp.\n """\n check.opt_inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n is_asset_query = event_records_filter and event_records_filter.event_type in ASSET_EVENTS\n if is_asset_query:\n # asset materializations, observations and materialization planned events get mirrored\n # into the index shard, so no custom run shard-aware cursor logic needed\n return super(SqliteEventLogStorage, self).get_event_records(\n event_records_filter=event_records_filter, limit=limit, ascending=ascending\n )\n\n query = db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n if event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if event_records_filter.after_cursor is not None and not isinstance(\n event_records_filter.after_cursor, RunShardedEventsCursor\n ):\n raise Exception("""\n Called `get_event_records` on a run-sharded event log storage with a cursor that\n is not run-aware. Add a RunShardedEventsCursor to your query filter\n or switch your instance configuration to use a non-run-sharded event log storage\n (e.g. 
PostgresEventLogStorage, ConsolidatedSqliteEventLogStorage)\n """)\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n apply_cursor_filters=False, # run-sharded cursor filters don't really make sense\n )\n if limit:\n query = query.limit(limit)\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())\n\n # workaround for the run-shard sqlite to enable cross-run queries: get a list of run_ids\n # whose events may qualify the query, and then open run_connection per run_id at a time.\n run_updated_after = (\n event_records_filter.after_cursor.run_updated_after\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else None\n )\n run_records = self._instance.get_run_records(\n filters=RunsFilter(updated_after=run_updated_after),\n order_by="update_timestamp",\n ascending=ascending,\n )\n\n event_records = []\n for run_record in run_records:\n run_id = run_record.dagster_run.run_id\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n for row_id, json_str in results:\n try:\n event_record = deserialize_value(json_str, EventLogEntry)\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n if limit and len(event_records) >= limit:\n break\n except DeserializationError:\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n if limit and len(event_records) >= limit:\n break\n\n return event_records[:limit]\n\n def supports_event_consumer_queries(self) -> bool:\n return False\n\n def delete_events(self, run_id: str) -> None:\n with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n\n # delete the mirrored event in the 
cross-run index database\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n\n def wipe(self) -> None:\n # should delete all the run-sharded db files and drop the contents of the index\n for filename in (\n glob.glob(os.path.join(self._base_dir, "*.db"))\n + glob.glob(os.path.join(self._base_dir, "*.db-wal"))\n + glob.glob(os.path.join(self._base_dir, "*.db-shm"))\n ):\n if (\n not filename.endswith(f"{INDEX_SHARD_NAME}.db")\n and not filename.endswith(f"{INDEX_SHARD_NAME}.db-wal")\n and not filename.endswith(f"{INDEX_SHARD_NAME}.db-shm")\n ):\n with contextlib.suppress(FileNotFoundError):\n os.unlink(filename)\n\n self._initialized_dbs = set()\n self._wipe_index()\n\n def _delete_mirrored_events_for_asset_key(self, asset_key: AssetKey) -> None:\n with self.index_connection() as conn:\n conn.execute(\n SqlEventLogStorageTable.delete().where(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def wipe_asset(self, asset_key: AssetKey) -> None:\n # default implementation will update the event_logs in the sharded dbs, and the asset_key\n # table in the asset shard, but will not remove the mirrored event_log events in the asset\n # shard\n super(SqliteEventLogStorage, self).wipe_asset(asset_key)\n self._delete_mirrored_events_for_asset_key(asset_key)\n\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n\n watchdog = SqliteEventLogStorageWatchdog(self, run_id, callback, cursor)\n self._watchers[run_id][callback] = (\n watchdog,\n self._obs.schedule(watchdog, self._base_dir, True),\n )\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n if handler in self._watchers[run_id]:\n event_handler, watch = self._watchers[run_id][handler]\n self._obs.remove_handler_for_watch(event_handler, watch) # type: ignore # (possible none)\n del self._watchers[run_id][handler]\n\n def dispose(self) -> None:\n if 
self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.index_connection() as conn:\n return check_alembic_revision(alembic_config, conn)\n\n @property\n def is_run_sharded(self) -> bool:\n return True\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return False
\n\n\nclass SqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(\n self,\n event_log_storage: SqliteEventLogStorage,\n run_id: str,\n callback: EventHandlerFn,\n cursor: Optional[str],\n **kwargs: Any,\n ):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", SqliteEventLogStorage\n )\n self._run_id = check.str_param(run_id, "run_id")\n self._cb = check.callable_param(callback, "callback")\n self._log_path = event_log_storage.path_for_shard(run_id)\n self._cursor = cursor\n super(SqliteEventLogStorageWatchdog, self).__init__(patterns=[self._log_path], **kwargs)\n\n def _process_log(self) -> None:\n connection = self._event_log_storage.get_records_for_run(self._run_id, self._cursor)\n if connection.cursor:\n self._cursor = connection.cursor\n for record in connection.records:\n status = None\n try:\n status = self._cb(\n record.event_log_entry, str(EventLogCursor.from_storage_id(record.storage_id))\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", self._run_id)\n\n if (\n status == DagsterRunStatus.SUCCESS\n or status == DagsterRunStatus.FAILURE\n or status == DagsterRunStatus.CANCELED\n ):\n self._event_log_storage.end_watch(self._run_id, self._cb)\n\n def on_modified(self, event: FileSystemEvent) -> None:\n check.invariant(event.src_path == self._log_path)\n self._process_log()\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sqlite/sqlite_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sqlite.sqlite_event_log"}}}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.file_manager

\nimport io\nimport os\nimport shutil\nimport uuid\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import BinaryIO, ContextManager, Iterator, Optional, TextIO, Union\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._config import Field, StringSource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource, resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._utils import mkdir_p\n\nfrom .temp_file_manager import TempfileManager\n\nIOStream: TypeAlias = Union[TextIO, BinaryIO]\n\n\n
[docs]class FileHandle(ABC):\n """A reference to a file as manipulated by a FileManager.\n\n Subclasses may handle files that are resident on the local file system, in an object store, or\n in any arbitrary place where a file can be stored.\n\n This exists to handle the very common case where you wish to write a computation that reads,\n transforms, and writes files, but where you also want the same code to work in local development\n as well as on a cluster where the files will be stored in a globally available object store\n such as S3.\n """\n\n @public\n @property\n @abstractmethod\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n raise NotImplementedError()
\n\n\n
[docs]class LocalFileHandle(FileHandle):\n """A reference to a file on a local filesystem."""\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @public\n @property\n def path(self) -> str:\n """The file's path."""\n return self._path\n\n @public\n @property\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n return self._path
\n\n\n
[docs]class FileManager(ABC):\n """Base class for all file managers in dagster.\n\n The file manager is an interface that can be implemented by resources to provide abstract\n access to a file system such as local disk, S3, or other cloud storage.\n\n For examples of usage, see the documentation of the concrete file manager implementations.\n """\n\n
[docs] @public\n @abstractmethod\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n """Copy a file represented by a file handle to a temp file.\n\n In an implementation built around an object store such as S3, this method would be expected\n to download the file from S3 to local filesystem in a location assigned by the standard\n library's :py:mod:`python:tempfile` module.\n\n Temp files returned by this method are *not* guaranteed to be reusable across solid\n boundaries. For files that must be available across solid boundaries, use the\n :py:meth:`~dagster._core.storage.file_manager.FileManager.read`,\n :py:meth:`~dagster._core.storage.file_manager.FileManager.read_data`,\n :py:meth:`~dagster._core.storage.file_manager.FileManager.write`, and\n :py:meth:`~dagster._core.storage.file_manager.FileManager.write_data` methods.\n\n Args:\n file_handle (FileHandle): The handle to the file to make available as a local temp file.\n\n Returns:\n str: Path to the local temp file.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def delete_local_temp(self) -> None:\n """Delete all local temporary files created by previous calls to\n :py:meth:`~dagster._core.storage.file_manager.FileManager.copy_handle_to_local_temp`.\n\n Should typically only be called by framework implementors.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def read(self, file_handle: FileHandle, mode: str = "rb") -> ContextManager[IOStream]:\n """Return a file-like stream for the file handle.\n\n This may incur an expensive network call for file managers backed by object stores\n such as S3.\n\n Args:\n file_handle (FileHandle): The file handle to make available as a stream.\n mode (str): The mode in which to open the file. Default: ``"rb"``.\n\n Returns:\n Union[TextIO, BinaryIO]: A file-like stream.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def read_data(self, file_handle: FileHandle) -> bytes:\n """Return the bytes for a given file handle. This may incur an expensive network\n call for file managers backed by object stores such as s3.\n\n Args:\n file_handle (FileHandle): The file handle for which to return bytes.\n\n Returns:\n bytes: Bytes for a given file handle.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def write(self, file_obj: IOStream, mode: str = "wb", ext: Optional[str] = None) -> FileHandle:\n """Write the bytes contained within the given file object into the file manager.\n\n Args:\n file_obj (Union[TextIO, StringIO]): A file-like object.\n mode (Optional[str]): The mode in which to write the file into the file manager.\n Default: ``"wb"``.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def write_data(self, data: bytes, ext: Optional[str] = None) -> FileHandle:\n """Write raw bytes into the file manager.\n\n Args:\n data (bytes): The bytes to write into the file manager.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef local_file_manager(init_context: InitResourceContext) -> "LocalFileManager":\n """FileManager that provides abstract access to a local filesystem.\n\n By default, files will be stored in `<local_artifact_storage>/storage/file_manager` where\n `<local_artifact_storage>` can be configured the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n\n Examples:\n .. code-block:: python\n\n import tempfile\n\n from dagster import job, local_file_manager, op\n\n\n @op(required_resource_keys={"file_manager"})\n def write_files(context):\n fh_1 = context.resources.file_manager.write_data(b"foo")\n\n with tempfile.NamedTemporaryFile("w+") as fd:\n fd.write("bar")\n fd.seek(0)\n fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n return (fh_1, fh_2)\n\n\n @op(required_resource_keys={"file_manager"})\n def read_files(context, file_handles):\n fh_1, fh_2 = file_handles\n assert context.resources.file_manager.read_data(fh_2) == b"bar"\n fd = context.resources.file_manager.read(fh_2, mode="r")\n assert fd.read() == "foo"\n fd.close()\n\n\n @job(resource_defs={"file_manager": local_file_manager})\n def files_pipeline():\n read_files(write_files())\n\n Or to specify the file directory:\n\n .. code-block:: python\n\n @job(\n resource_defs={\n "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n }\n )\n def files_pipeline():\n read_files(write_files())\n """\n return LocalFileManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "file_manager") # type: ignore # (possible none)\n )\n )
\n\n\ndef check_file_like_obj(obj: object) -> None:\n check.invariant(obj and hasattr(obj, "read") and hasattr(obj, "write"))\n\n\nclass LocalFileManager(FileManager):\n def __init__(self, base_dir: str):\n self.base_dir = base_dir\n self._base_dir_ensured = False\n self._temp_file_manager = TempfileManager()\n\n @staticmethod\n def for_instance(instance: DagsterInstance, run_id: str) -> "LocalFileManager":\n check.inst_param(instance, "instance", DagsterInstance)\n return LocalFileManager(instance.file_manager_directory(run_id))\n\n def ensure_base_dir_exists(self) -> None:\n if self._base_dir_ensured:\n return\n\n mkdir_p(self.base_dir)\n\n self._base_dir_ensured = True\n\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n check.inst_param(file_handle, "file_handle", FileHandle)\n with self.read(file_handle, "rb") as handle_obj: # type: ignore # (??)\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_file_obj.write(handle_obj.read())\n temp_name = temp_file_obj.name\n temp_file_obj.close()\n return temp_name\n\n @contextmanager\n def read(self, file_handle: LocalFileHandle, mode: str = "rb") -> Iterator[IOStream]:\n check.inst_param(file_handle, "file_handle", LocalFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n encoding = None if mode == "rb" else "utf8"\n with open(file_handle.path, mode, encoding=encoding) as file_obj:\n yield file_obj # type: ignore # (??)\n\n def read_data(self, file_handle: LocalFileHandle) -> bytes:\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read() # type: ignore # (??)\n\n def write_data(self, data: bytes, ext: Optional[str] = None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(\n self, file_obj: IOStream, mode: str = "wb", ext: Optional[str] = None\n ) -> LocalFileHandle:\n check_file_like_obj(file_obj)\n check.opt_str_param(ext, "ext")\n\n 
self.ensure_base_dir_exists()\n\n dest_file_path = os.path.join(\n self.base_dir, str(uuid.uuid4()) + (("." + ext) if ext is not None else "")\n )\n\n encoding = None if "b" in mode else "utf8"\n with open(dest_file_path, mode, encoding=encoding) as dest_file_obj:\n shutil.copyfileobj(file_obj, dest_file_obj) # type: ignore # (??)\n return LocalFileHandle(dest_file_path)\n\n def delete_local_temp(self) -> None:\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster/_core/storage/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.file_manager"}, "fs_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.fs_io_manager

\nimport os\nimport pickle\nfrom typing import TYPE_CHECKING, Any, Optional\n\nfrom pydantic import Field\n\nimport dagster._check as check\nfrom dagster import (\n    DagsterInvariantViolationError,\n    Field as DagsterField,\n)\nfrom dagster._annotations import experimental\nfrom dagster._config import StringSource\nfrom dagster._config.pythonic_config import ConfigurableIOManagerFactory\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL, mkdir_p\n\nif TYPE_CHECKING:\n    from typing_extensions import Literal\n    from upath import UPath\n\n\n
[docs]class FilesystemIOManager(ConfigurableIOManagerFactory["PickledObjectFilesystemIOManager"]):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n The base directory that the pickle files live inside is determined by:\n\n * The IO manager's "base_dir" configuration value, if specified. Otherwise...\n * A "storage/" directory underneath the value for "local_artifact_storage" in your dagster.yaml\n file, if specified. Otherwise...\n * A "storage/" directory underneath the directory that the DAGSTER_HOME environment variable\n points to, if that environment variable is specified. Otherwise...\n * A temporary directory.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n\n 1. Attach an IO manager to a set of assets using the reserved resource key ``"io_manager"``.\n\n .. code-block:: python\n\n from dagster import Definitions, asset, FilesystemIOManager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n },\n )\n\n\n 2. Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. 
code-block:: python\n\n from dagster import FilesystemIOManager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n }\n )\n def job():\n op_b(op_a())\n\n\n 3. Specify IO manager on :py:class:`Out`, which allows you to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import FilesystemIOManager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": FilesystemIOManager()})\n def job():\n op_b(op_a())\n\n """\n\n base_dir: Optional[str] = Field(default=None, description="Base directory for storing files.")\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_io_manager(self, context: InitResourceContext) -> "PickledObjectFilesystemIOManager":\n base_dir = self.base_dir or check.not_none(context.instance).storage_directory()\n return PickledObjectFilesystemIOManager(base_dir=base_dir)
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=FilesystemIOManager.to_config_schema(),\n description="Built-in filesystem IO manager that stores and retrieves values using pickling.",\n)\ndef fs_io_manager(init_context: InitResourceContext) -> "PickledObjectFilesystemIOManager":\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n The base directory that the pickle files live inside is determined by:\n\n * The IO manager's "base_dir" configuration value, if specified. Otherwise...\n * A "storage/" directory underneath the value for "local_artifact_storage" in your dagster.yaml\n file, if specified. Otherwise...\n * A "storage/" directory underneath the directory that the DAGSTER_HOME environment variable\n points to, if that environment variable is specified. Otherwise...\n * A temporary directory.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n\n 1. Attach an IO manager to a set of assets using the reserved resource key ``"io_manager"``.\n\n .. code-block:: python\n\n from dagster import Definitions, asset, fs_io_manager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n },\n )\n\n\n 2. 
Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n }\n )\n def job():\n op_b(op_a())\n\n\n 3. Specify IO manager on :py:class:`Out`, which allows you to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": fs_io_manager})\n def job():\n op_b(op_a())\n\n """\n return FilesystemIOManager.from_resource_context(init_context)
\n\n\nclass PickledObjectFilesystemIOManager(UPathIOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n Is compatible with local and remote filesystems via `universal-pathlib` and `fsspec`.\n Learn more about how to use remote filesystems here: https://github.com/fsspec/universal_pathlib.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n **kwargs: additional keyword arguments for `universal_pathlib.UPath`.\n """\n\n extension: str = "" # TODO: maybe change this to .pickle? Leaving blank for compatibility.\n\n def __init__(self, base_dir=None, **kwargs):\n from upath import UPath\n\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n\n super().__init__(base_path=UPath(base_dir, **kwargs))\n\n def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):\n try:\n with path.open("wb") as file:\n pickle.dump(obj, file, PICKLE_PROTOCOL)\n except (AttributeError, RecursionError, ImportError, pickle.PicklingError) as e:\n executor = context.step_context.job_def.executor_def\n\n if isinstance(e, RecursionError):\n # if obj can't be pickled because of RecursionError then __str__() will also\n # throw a RecursionError\n obj_repr = f"{obj.__class__} exceeds recursion limit and"\n else:\n obj_repr = obj.__str__()\n\n raise DagsterInvariantViolationError(\n f"Object {obj_repr} is not picklable. You are currently using the "\n f"fs_io_manager and the {executor.name}. You will need to use a different "\n "io manager to continue using this output. 
For example, you can use the "\n "mem_io_manager with the in_process_executor.\\n"\n "For more information on io managers, visit "\n "https://docs.dagster.io/concepts/io-management/io-managers \\n"\n "For more information on executors, vist "\n "https://docs.dagster.io/deployment/executors#overview"\n ) from e\n\n def load_from_path(self, context: InputContext, path: "UPath") -> Any:\n with path.open("rb") as file:\n return pickle.load(file)\n\n\nclass CustomPathPickledObjectFilesystemIOManager(IOManager):\n """Built-in filesystem IO managerthat stores and retrieves values using pickling and\n allow users to specify file path for outputs.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n """\n\n def __init__(self, base_dir: Optional[str] = None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode: Literal["wb"] = "wb"\n self.read_mode: Literal["rb"] = "rb"\n\n def _get_path(self, path: str) -> str:\n return os.path.join(self.base_dir, path) # type: ignore # (possible none)\n\n def handle_output(self, context: OutputContext, obj: object):\n """Pickle the data and store the object to a custom file path.\n\n This method emits an AssetMaterialization event so the assets will be tracked by the\n Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n metadata = context.metadata\n path = check.str_param(metadata.get("path"), "metadata.path") # type: ignore # (possible none)\n\n filepath = self._get_path(path)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n context.log.debug(f"Writing file at: {filepath}")\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n return AssetMaterialization(\n asset_key=AssetKey([context.job_name, context.step_key, context.name]),\n metadata={"path": MetadataValue.path(os.path.abspath(filepath))},\n )\n\n def load_input(self, context: 
InputContext) -> object:\n """Unpickle the file from a given file path and Load it to a data object."""\n check.inst_param(context, "context", InputContext)\n metadata = context.upstream_output.metadata # type: ignore # (possible none)\n path = check.str_param(metadata.get("path"), "metadata.path") # type: ignore # (possible none)\n filepath = self._get_path(path)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema={"base_dir": DagsterField(StringSource, is_required=True)})\n@experimental\ndef custom_path_fs_io_manager(\n init_context: InitResourceContext,\n) -> CustomPathPickledObjectFilesystemIOManager:\n """Built-in IO manager that allows users to custom output file path per output definition.\n\n It requires users to specify a base directory where all the step output will be stored in. It\n serializes and deserializes output values (assets) using pickling and stores the pickled object\n in the user-provided file paths.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import custom_path_fs_io_manager, job, op\n\n @op(out=Out(metadata={"path": "path/to/sample_output"}))\n def sample_data(df):\n return df[:5]\n\n my_custom_path_fs_io_manager = custom_path_fs_io_manager.configured(\n {"base_dir": "path/to/basedir"}\n )\n\n @job(resource_defs={"io_manager": my_custom_path_fs_io_manager})\n def my_job():\n sample_data()\n\n """\n return CustomPathPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get("base_dir")\n )\n
", "current_page_name": "_modules/dagster/_core/storage/fs_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.fs_io_manager"}, "input_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.input_manager

\nfrom abc import ABC, abstractmethod\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Callable, Optional, Union, cast, overload\n\nfrom typing_extensions import TypeAlias, TypeGuard\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import has_at_least_one_parameter\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition, ResourceFunction\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.input import InputContext\n\nInputLoadFn: TypeAlias = Union[\n    Callable[["InputContext"], object],\n    Callable[[], object],\n]\n\n\n
[docs]class InputManager(ABC):\n """Base interface for classes that are responsible for loading solid inputs."""\n\n @abstractmethod\n def load_input(self, context: "InputContext") -> object:\n """The user-defined read method that loads an input to a solid.\n\n Args:\n context (InputContext): The input context.\n\n Returns:\n Any: The data object.\n """
\n\n\nclass IInputManagerDefinition:\n @property\n @abstractmethod\n def input_config_schema(self) -> IDefinitionConfigSchema:\n """The schema for per-input configuration for inputs that are managed by this\n input manager.\n """\n\n\n
[docs]class InputManagerDefinition(ResourceDefinition, IInputManagerDefinition):\n """Definition of an input manager resource.\n\n Input managers load op inputs.\n\n An InputManagerDefinition is a :py:class:`ResourceDefinition` whose resource_fn returns an\n :py:class:`InputManager`.\n\n The easiest way to create an InputManagerDefinition is with the\n :py:func:`@input_manager <input_manager>` decorator.\n """\n\n def __init__(\n self,\n resource_fn: ResourceFunction,\n config_schema: Optional[CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n super(InputManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self) -> IDefinitionConfigSchema:\n return self._input_config_schema\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "InputManagerDefinition":\n return InputManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n )
\n\n\n@overload\ndef input_manager(\n config_schema: InputLoadFn,\n) -> InputManagerDefinition: ...\n\n\n@overload\ndef input_manager(\n config_schema: Optional[CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Callable[[InputLoadFn], InputManagerDefinition]: ...\n\n\n
[docs]def input_manager(\n config_schema: Union[InputLoadFn, Optional[CoercableToConfigSchema]] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[InputManagerDefinition, Callable[[InputLoadFn], InputManagerDefinition]]:\n """Define an input manager.\n\n Input managers load op inputs, either from upstream outputs or by providing default values.\n\n The decorated function should accept a :py:class:`InputContext` and resource config, and return\n a loaded object that will be passed into one of the inputs of an op.\n\n The decorator produces an :py:class:`InputManagerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource-level config. If not\n set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the resource.\n input_config_schema (Optional[ConfigSchema]): A schema for the input-level config. Each\n input that uses this input manager can be configured separately using this config.\n If not set, Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the input\n manager.\n version (Optional[str]): (Experimental) the version of the input manager definition.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import input_manager, op, job, In\n\n @input_manager\n def csv_loader(_):\n return read_csv("some/path")\n\n @op(ins={"input1": In(input_manager_key="csv_loader_key")})\n def my_op(_, input1):\n do_stuff(input1)\n\n @job(resource_defs={"csv_loader_key": csv_loader})\n def my_job():\n my_op()\n\n @input_manager(config_schema={"base_dir": str})\n def csv_loader(context):\n return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n @input_manager(input_config_schema={"path": str})\n def csv_loader(context):\n return read_csv(context.config["path"])\n """\n if _is_input_load_fn(config_schema):\n return _InputManagerDecoratorCallable()(config_schema)\n\n def _wrap(load_fn: InputLoadFn) -> InputManagerDefinition:\n return _InputManagerDecoratorCallable(\n config_schema=cast(CoercableToConfigSchema, config_schema),\n description=description,\n version=version,\n input_config_schema=input_config_schema,\n required_resource_keys=required_resource_keys,\n )(load_fn)\n\n return _wrap
\n\n\ndef _is_input_load_fn(obj: Union[InputLoadFn, CoercableToConfigSchema]) -> TypeGuard[InputLoadFn]:\n return callable(obj) and not is_callable_valid_config_arg(obj)\n\n\nclass InputManagerWrapper(InputManager):\n def __init__(self, load_fn: InputLoadFn):\n self._load_fn = load_fn\n\n def load_input(self, context: "InputContext") -> object:\n # the @input_manager decorated function (self._load_fn) may return a direct value that\n # should be used or an instance of an InputManager. So we call self._load_fn and see if the\n # result is an InputManager. If so we call it's load_input method\n intermediate = (\n # type-ignore because function being used as attribute\n self._load_fn(context)\n if has_at_least_one_parameter(self._load_fn)\n else self._load_fn() # type: ignore # (strict type guard)\n )\n\n if isinstance(intermediate, InputManager):\n return intermediate.load_input(context)\n return intermediate\n\n\nclass _InputManagerDecoratorCallable:\n def __init__(\n self,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n version: Optional[str] = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n ):\n self.config_schema = config_schema\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.input_config_schema = input_config_schema\n self.required_resource_keys = required_resource_keys\n\n def __call__(self, load_fn: InputLoadFn) -> InputManagerDefinition:\n check.callable_param(load_fn, "load_fn")\n\n def _resource_fn(_):\n return InputManagerWrapper(load_fn)\n\n input_manager_def = InputManagerDefinition(\n resource_fn=_resource_fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n input_config_schema=self.input_config_schema,\n required_resource_keys=self.required_resource_keys,\n )\n\n # `update_wrapper` typing cannot currently 
handle a Union of Callables correctly\n update_wrapper(input_manager_def, wrapped=load_fn) # type: ignore\n\n return input_manager_def\n
", "current_page_name": "_modules/dagster/_core/storage/input_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.input_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.io_manager

\nfrom abc import abstractmethod\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Optional, Set, Union, cast, overload\n\nfrom typing_extensions import TypeAlias, TypeGuard\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.storage.input_manager import IInputManagerDefinition, InputManager\nfrom dagster._core.storage.output_manager import IOutputManagerDefinition, OutputManager\n\nfrom ..decorator_utils import get_function_params\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.init import InitResourceContext\n    from dagster._core.execution.context.input import InputContext\n    from dagster._core.execution.context.output import OutputContext\n\nIOManagerFunctionWithContext = Callable[["InitResourceContext"], "IOManager"]\nIOManagerFunction: TypeAlias = Union[\n    IOManagerFunctionWithContext,\n    Callable[[], "IOManager"],\n]\n\n\ndef is_io_manager_context_provided(\n    fn: IOManagerFunction,\n) -> TypeGuard[IOManagerFunctionWithContext]:\n    return len(get_function_params(fn)) >= 1\n\n\n
[docs]class IOManagerDefinition(ResourceDefinition, IInputManagerDefinition, IOutputManagerDefinition):\n """Definition of an IO manager resource.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n An IOManagerDefinition is a :py:class:`ResourceDefinition` whose `resource_fn` returns an\n :py:class:`IOManager`.\n\n The easiest way to create an IOManagerDefnition is with the :py:func:`@io_manager <io_manager>`\n decorator.\n """\n\n def __init__(\n self,\n resource_fn: IOManagerFunction,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n input_config_schema: CoercableToConfigSchema = None,\n output_config_schema: CoercableToConfigSchema = None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n # Unlike other configurable objects, whose config schemas default to Any,\n # output_config_schema defaults to None. 
This the because IOManager input / output config\n # shares config namespace with dagster type loaders.\n self._output_config_schema = (\n convert_user_facing_definition_config_schema(output_config_schema)\n if output_config_schema is not None\n else None\n )\n super(IOManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self) -> IDefinitionConfigSchema:\n return self._input_config_schema\n\n @property\n def output_config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self._output_config_schema\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "IOManagerDefinition":\n io_def = IOManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n output_config_schema=self.output_config_schema,\n )\n\n io_def._dagster_maintained = self._is_dagster_maintained() # noqa: SLF001\n\n return io_def\n\n
[docs] @public\n @staticmethod\n def hardcoded_io_manager(\n value: "IOManager", description: Optional[str] = None\n ) -> "IOManagerDefinition":\n """A helper function that creates an ``IOManagerDefinition`` with a hardcoded IOManager.\n\n Args:\n value (IOManager): A hardcoded IO Manager which helps mock the definition.\n description ([Optional[str]]): The description of the IO Manager. Defaults to None.\n\n Returns:\n [IOManagerDefinition]: A hardcoded resource.\n """\n check.inst_param(value, "value", IOManager)\n return IOManagerDefinition(resource_fn=lambda _init_context: value, description=description)
\n\n\n
[docs]class IOManager(InputManager, OutputManager):\n """Base class for user-provided IO managers.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n Extend this class to handle how objects are loaded and stored. Users should implement\n ``handle_output`` to store an object and ``load_input`` to retrieve an object.\n """\n\n
[docs] @public\n @abstractmethod\n def load_input(self, context: "InputContext") -> Any:\n """User-defined method that loads an input to an op.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n\n Returns:\n Any: The data object.\n """
\n\n
[docs] @public\n @abstractmethod\n def handle_output(self, context: "OutputContext", obj: Any) -> None:\n """User-defined method that stores an output of an op.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n obj (Any): The object, returned by the op, to be stored.\n """
\n\n\n@overload\ndef io_manager(config_schema: IOManagerFunction) -> IOManagerDefinition: ...\n\n\n@overload\ndef io_manager(\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n) -> Callable[[IOManagerFunction], IOManagerDefinition]: ...\n\n\n
[docs]def io_manager(\n config_schema: Union[IOManagerFunction, CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n) -> Union[IOManagerDefinition, Callable[[IOManagerFunction], IOManagerDefinition],]:\n """Define an IO manager.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an\n :py:class:`IOManager`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource config. Configuration\n data available in `init_context.resource_config`. If not set, Dagster will accept any\n config provided.\n description(Optional[str]): A human-readable description of the resource.\n output_config_schema (Optional[ConfigSchema]): The schema for per-output config. If not set,\n no per-output configuration will be allowed.\n input_config_schema (Optional[ConfigSchema]): The schema for per-input config. If not set,\n Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the object\n manager.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n\n **Examples:**\n\n .. 
code-block:: python\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n write_csv("some/path")\n\n def load_input(self, context):\n return read_csv("some/path")\n\n @io_manager\n def my_io_manager(init_context):\n return MyIOManager()\n\n @op(out=Out(io_manager_key="my_io_manager_key"))\n def my_op(_):\n return do_stuff()\n\n @job(resource_defs={"my_io_manager_key": my_io_manager})\n def my_job():\n my_op()\n\n """\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n config_schema = cast(IOManagerFunction, config_schema)\n return _IOManagerDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: IOManagerFunction) -> IOManagerDefinition:\n return _IOManagerDecoratorCallable(\n config_schema=cast(Optional[UserConfigSchema], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n output_config_schema=output_config_schema,\n input_config_schema=input_config_schema,\n )(resource_fn)\n\n return _wrap
\n\n\ndef dagster_maintained_io_manager(io_manager_def: IOManagerDefinition) -> IOManagerDefinition:\n io_manager_def._dagster_maintained = True # noqa: SLF001\n return io_manager_def\n\n\nclass _IOManagerDecoratorCallable:\n def __init__(\n self,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n ):\n # type validation happens in IOManagerDefinition\n self.config_schema = config_schema\n self.description = description\n self.required_resource_keys = required_resource_keys\n self.version = version\n self.output_config_schema = output_config_schema\n self.input_config_schema = input_config_schema\n\n def __call__(self, fn: IOManagerFunction) -> IOManagerDefinition:\n check.callable_param(fn, "fn")\n\n io_manager_def = IOManagerDefinition(\n resource_fn=fn,\n config_schema=self.config_schema,\n description=self.description,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n output_config_schema=self.output_config_schema,\n input_config_schema=self.input_config_schema,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(io_manager_def, wrapped=fn) # type: ignore\n\n return io_manager_def\n
", "current_page_name": "_modules/dagster/_core/storage/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.io_manager"}, "local_compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.local_compute_log_manager

\nimport hashlib\nimport os\nimport shutil\nimport sys\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import IO, TYPE_CHECKING, Generator, Iterator, Mapping, Optional, Sequence, Tuple\n\nfrom typing_extensions import Final\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers.polling import PollingObserver\n\nfrom dagster import (\n    Field,\n    Float,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.execution.compute_logs import mirror_stream_to_file\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._seven import json\nfrom dagster._utils import ensure_dir, ensure_file, touch_file\n\nfrom .captured_log_manager import (\n    CapturedLogContext,\n    CapturedLogData,\n    CapturedLogManager,\n    CapturedLogMetadata,\n    CapturedLogSubscription,\n)\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n    ComputeLogSubscription,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.storage.cloud_storage_compute_log_manager import LogSubscription\n\nDEFAULT_WATCHDOG_POLLING_TIMEOUT: Final = 2.5\n\nIO_TYPE_EXTENSION: Final[Mapping[ComputeIOType, str]] = {\n    ComputeIOType.STDOUT: "out",\n    ComputeIOType.STDERR: "err",\n}\n\nMAX_FILENAME_LENGTH: Final = 255\n\n\n
[docs]class LocalComputeLogManager(CapturedLogManager, ComputeLogManager, ConfigurableClass):\n """Stores copies of stdout & stderr for each compute step locally on disk."""\n\n def __init__(\n self,\n base_dir: str,\n polling_timeout: Optional[float] = None,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._base_dir = base_dir\n self._polling_timeout = check.opt_float_param(\n polling_timeout, "polling_timeout", DEFAULT_WATCHDOG_POLLING_TIMEOUT\n )\n self._subscription_manager = LocalComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @property\n def polling_timeout(self) -> float:\n return self._polling_timeout\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {\n "base_dir": StringSource,\n "polling_timeout": Field(Float, is_required=False),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value\n ) -> "LocalComputeLogManager":\n return LocalComputeLogManager(inst_data=inst_data, **config_value)\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n outpath = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT])\n errpath = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR])\n with mirror_stream_to_file(sys.stdout, outpath), mirror_stream_to_file(sys.stderr, errpath):\n yield CapturedLogContext(log_key)\n\n # leave artifact on filesystem so that we know the capture is completed\n touch_file(self.complete_artifact_path(log_key))\n\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Iterator[Optional[IO]]:\n path = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n with open(path, "+a", encoding="utf-8") as 
f:\n yield f\n\n def is_capture_complete(self, log_key: Sequence[str]) -> bool:\n return os.path.exists(self.complete_artifact_path(log_key))\n\n def get_log_data(\n self, log_key: Sequence[str], cursor: Optional[str] = None, max_bytes: Optional[int] = None\n ) -> CapturedLogData:\n stdout_cursor, stderr_cursor = self.parse_cursor(cursor)\n stdout, stdout_offset = self._read_bytes(\n log_key, ComputeIOType.STDOUT, offset=stdout_cursor, max_bytes=max_bytes\n )\n stderr, stderr_offset = self._read_bytes(\n log_key, ComputeIOType.STDERR, offset=stderr_cursor, max_bytes=max_bytes\n )\n return CapturedLogData(\n log_key=log_key,\n stdout=stdout,\n stderr=stderr,\n cursor=self.build_cursor(stdout_offset, stderr_offset),\n )\n\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n return CapturedLogMetadata(\n stdout_location=self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]\n ),\n stderr_location=self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]\n ),\n stdout_download_url=self.get_captured_log_download_url(log_key, ComputeIOType.STDOUT),\n stderr_download_url=self.get_captured_log_download_url(log_key, ComputeIOType.STDERR),\n )\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n if log_key:\n paths = [\n self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]),\n self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]),\n self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT], partial=True\n ),\n self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR], partial=True\n ),\n self.get_captured_local_path(log_key, "complete"),\n ]\n for path in paths:\n if os.path.exists(path) and os.path.isfile(path):\n os.remove(path)\n elif prefix:\n dir_to_delete = os.path.join(self._base_dir, *prefix)\n if os.path.exists(dir_to_delete) and 
os.path.isdir(dir_to_delete):\n # recursively delete all files in dir\n shutil.rmtree(dir_to_delete)\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n def _read_bytes(\n self,\n log_key: Sequence[str],\n io_type: ComputeIOType,\n offset: Optional[int] = 0,\n max_bytes: Optional[int] = None,\n ):\n path = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n return self.read_path(path, offset or 0, max_bytes)\n\n def parse_cursor(self, cursor: Optional[str] = None) -> Tuple[int, int]:\n # Translates a string cursor into a set of byte offsets for stdout, stderr\n if not cursor:\n return 0, 0\n\n parts = cursor.split(":")\n if not parts or len(parts) != 2:\n return 0, 0\n\n stdout, stderr = [int(_) for _ in parts]\n return stdout, stderr\n\n def build_cursor(self, stdout_offset: int, stderr_offset: int) -> str:\n return f"{stdout_offset}:{stderr_offset}"\n\n def complete_artifact_path(self, log_key):\n return self.get_captured_local_path(log_key, "complete")\n\n def read_path(\n self,\n path: str,\n offset: int = 0,\n max_bytes: Optional[int] = None,\n ):\n if not os.path.exists(path) or not os.path.isfile(path):\n return None, offset\n\n with open(path, "rb") as f:\n f.seek(offset, os.SEEK_SET)\n if max_bytes is None:\n data = f.read()\n else:\n data = f.read(max_bytes)\n new_offset = f.tell()\n return data, new_offset\n\n def get_captured_log_download_url(self, log_key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n url = "/logs"\n for part in log_key:\n url = f"{url}/{part}"\n\n return f"{url}/{IO_TYPE_EXTENSION[io_type]}"\n\n def get_captured_local_path(self, log_key: Sequence[str], extension: str, partial=False):\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n if len(filename) > MAX_FILENAME_LENGTH:\n filename = "{}.{}".format(hashlib.md5(filebase.encode("utf-8")).hexdigest(), extension)\n return 
os.path.join(self._base_dir, *namespace, filename)\n\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n subscription = CapturedLogSubscription(self, log_key, cursor)\n self.on_subscribe(subscription)\n return subscription\n\n def unsubscribe(self, subscription):\n self.on_unsubscribe(subscription)\n\n ###############################################\n #\n # Methods for the ComputeLogManager interface\n #\n ###############################################\n @contextmanager\n def _watch_logs(\n self, dagster_run: DagsterRun, step_key: Optional[str] = None\n ) -> Iterator[None]:\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n\n log_key = self.build_log_key_for_run(dagster_run.run_id, step_key or dagster_run.job_name)\n with self.capture_logs(log_key):\n yield\n\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Legacy adapter from compute log manager to more generic captured log manager API."""\n check.inst_param(io_type, "io_type", ComputeIOType)\n log_key = self.build_log_key_for_run(run_id, key)\n return self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n\n def read_logs_file(\n self,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int = 0,\n max_bytes: int = MAX_BYTES_FILE_READ,\n ) -> ComputeLogFileData:\n path = self.get_local_path(run_id, key, io_type)\n\n if not os.path.exists(path) or not os.path.isfile(path):\n return ComputeLogFileData(path=path, data=None, cursor=0, size=0, download_url=None)\n\n # See: https://docs.python.org/2/library/stdtypes.html#file.tell for Windows behavior\n with open(path, "rb") as f:\n f.seek(cursor, os.SEEK_SET)\n data = f.read(max_bytes)\n cursor = f.tell()\n stats = os.fstat(f.fileno())\n\n # local download path\n download_url = self.download_url(run_id, key, io_type)\n return ComputeLogFileData(\n path=path,\n data=data.decode("utf-8"),\n 
cursor=cursor,\n size=stats.st_size,\n download_url=download_url,\n )\n\n def get_key(self, dagster_run: DagsterRun, step_key: Optional[str]):\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n return step_key or dagster_run.job_name\n\n def is_watch_completed(self, run_id: str, key: str) -> bool:\n log_key = self.build_log_key_for_run(run_id, key)\n return self.is_capture_complete(log_key)\n\n def on_watch_start(self, dagster_run: DagsterRun, step_key: Optional[str]):\n pass\n\n def on_watch_finish(self, dagster_run: DagsterRun, step_key: Optional[str] = None):\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n log_key = self.build_log_key_for_run(dagster_run.run_id, step_key or dagster_run.job_name)\n touchpath = self.complete_artifact_path(log_key)\n touch_file(touchpath)\n\n def download_url(self, run_id: str, key: str, io_type: ComputeIOType):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return f"/download/{run_id}/{key}/{io_type.value}"\n\n def on_subscribe(self, subscription: "LogSubscription") -> None:\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription: "LogSubscription") -> None:\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self) -> None:\n self._subscription_manager.dispose()
\n\n\nclass LocalComputeLogSubscriptionManager:\n def __init__(self, manager):\n self._manager = manager\n self._subscriptions = defaultdict(list)\n self._watchers = {}\n self._observer = None\n\n def add_subscription(self, subscription: "LogSubscription") -> None:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if self.is_complete(subscription):\n subscription.fetch()\n subscription.complete()\n else:\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n self._subscriptions[watch_key].append(subscription)\n self.watch(subscription)\n\n def is_complete(self, subscription: "LogSubscription") -> bool:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if isinstance(subscription, ComputeLogSubscription):\n return self._manager.is_watch_completed(subscription.run_id, subscription.key)\n return self._manager.is_capture_complete(subscription.log_key)\n\n def remove_subscription(self, subscription: "LogSubscription") -> None:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n if subscription in self._subscriptions[watch_key]:\n self._subscriptions[watch_key].remove(subscription)\n subscription.complete()\n\n def _log_key(self, subscription: "LogSubscription") -> Sequence[str]:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if isinstance(subscription, ComputeLogSubscription):\n return self._manager.build_log_key_for_run(subscription.run_id, subscription.key)\n return subscription.log_key\n\n def _watch_key(self, log_key: Sequence[str]) -> str:\n return json.dumps(log_key)\n\n def remove_all_subscriptions(self, log_key: Sequence[str]) -> None:\n watch_key = self._watch_key(log_key)\n for subscription in 
self._subscriptions.pop(watch_key, []):\n subscription.complete()\n\n def watch(self, subscription: "LogSubscription") -> None:\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n if watch_key in self._watchers:\n return\n\n update_paths = [\n self._manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]),\n self._manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]),\n self._manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT], partial=True\n ),\n self._manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR], partial=True\n ),\n ]\n complete_paths = [self._manager.complete_artifact_path(log_key)]\n directory = os.path.dirname(\n self._manager.get_captured_local_path(log_key, ComputeIOType.STDERR),\n )\n\n if not self._observer:\n self._observer = PollingObserver(self._manager.polling_timeout)\n self._observer.start()\n\n ensure_dir(directory)\n\n self._watchers[watch_key] = self._observer.schedule(\n LocalComputeLogFilesystemEventHandler(self, log_key, update_paths, complete_paths),\n str(directory),\n )\n\n def notify_subscriptions(self, log_key: Sequence[str]) -> None:\n watch_key = self._watch_key(log_key)\n for subscription in self._subscriptions[watch_key]:\n subscription.fetch()\n\n def unwatch(self, log_key: Sequence[str], handler) -> None:\n watch_key = self._watch_key(log_key)\n if watch_key in self._watchers:\n self._observer.remove_handler_for_watch(handler, self._watchers[watch_key]) # type: ignore\n del self._watchers[watch_key]\n\n def dispose(self) -> None:\n if self._observer:\n self._observer.stop()\n self._observer.join(15)\n\n\nclass LocalComputeLogFilesystemEventHandler(PatternMatchingEventHandler):\n def __init__(self, manager, log_key, update_paths, complete_paths):\n self.manager = manager\n self.log_key = log_key\n self.update_paths = update_paths\n self.complete_paths = complete_paths\n patterns = 
update_paths + complete_paths\n super(LocalComputeLogFilesystemEventHandler, self).__init__(patterns=patterns)\n\n def on_created(self, event):\n if event.src_path in self.complete_paths:\n self.manager.remove_all_subscriptions(self.log_key)\n self.manager.unwatch(self.log_key, self)\n\n def on_modified(self, event):\n if event.src_path in self.update_paths:\n self.manager.notify_subscriptions(self.log_key)\n
", "current_page_name": "_modules/dagster/_core/storage/local_compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.local_compute_log_manager"}, "mem_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.mem_io_manager

\nfrom typing import Dict, Tuple\n\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\n\n\n
[docs]class InMemoryIOManager(IOManager):\n """I/O manager that stores and retrieves values in memory. After execution is complete, the values will\n be garbage-collected. Note that this means that each run will not have access to values from previous runs.\n """\n\n def __init__(self):\n self.values: Dict[Tuple[object, ...], object] = {}\n\n def handle_output(self, context: OutputContext, obj: object):\n keys = tuple(context.get_identifier())\n self.values[keys] = obj\n\n def load_input(self, context: InputContext) -> object:\n keys = tuple(context.get_identifier())\n return self.values[keys]
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(description="Built-in IO manager that stores and retrieves values in memory.")\ndef mem_io_manager(_) -> InMemoryIOManager:\n """Built-in IO manager that stores and retrieves values in memory."""\n return InMemoryIOManager()
\n
", "current_page_name": "_modules/dagster/_core/storage/mem_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.mem_io_manager"}, "memoizable_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.memoizable_io_manager

\nimport os\nimport pickle\nfrom abc import abstractmethod\nfrom typing import Union\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._config import Field, StringSource\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\nfrom dagster._utils import PICKLE_PROTOCOL, mkdir_p\n\n\n
[docs]class MemoizableIOManager(IOManager):\n """Base class for IO manager enabled to work with memoized execution. Users should implement\n the ``load_input`` and ``handle_output`` methods described in the ``IOManager`` API, and the\n ``has_output`` method, which returns a boolean representing whether a data object can be found.\n """\n\n
[docs] @public\n @abstractmethod\n def has_output(self, context: OutputContext) -> bool:\n """The user-defined method that returns whether data exists given the metadata.\n\n Args:\n context (OutputContext): The context of the step performing this check.\n\n Returns:\n bool: True if there is data present that matches the provided context. False otherwise.\n """
\n\n\nclass VersionedPickledObjectFilesystemIOManager(MemoizableIOManager):\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n output_context: OutputContext\n\n if isinstance(context, OutputContext):\n output_context = context\n else:\n if context.upstream_output is None:\n raise DagsterInvariantViolationError(\n "Missing value of InputContext.upstream_output. Cannot compute the input path."\n )\n\n output_context = context.upstream_output\n\n # automatically construct filepath\n step_key = check.str_param(output_context.step_key, "context.step_key")\n output_name = check.str_param(output_context.name, "context.name")\n version = check.str_param(output_context.version, "context.version")\n\n return os.path.join(self.base_dir, step_key, output_name, version)\n\n def handle_output(self, context, obj):\n """Pickle the data with the associated version, and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n filepath = self._get_path(context)\n\n context.log.debug(f"Writing file at: {filepath}")\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n def load_input(self, context):\n """Unpickle the file and Load it to a data object."""\n filepath = self._get_path(context)\n\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n def has_output(self, context):\n """Returns true if data object exists with the associated version, False otherwise."""\n filepath = self._get_path(context)\n\n context.log.debug(f"Checking for file at: {filepath}")\n\n return os.path.exists(filepath) and not 
os.path.isdir(filepath)\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\n@experimental\ndef versioned_filesystem_io_manager(init_context):\n """Filesystem IO manager that utilizes versioning of stored objects.\n\n It requires users to specify a base directory where all the step outputs will be stored in. It\n serializes and deserializes output values (assets) using pickling and automatically constructs\n the filepaths for the assets using the provided directory, and the version for a provided step\n output.\n """\n return VersionedPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "versioned_outputs")\n )\n )\n
", "current_page_name": "_modules/dagster/_core/storage/memoizable_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.memoizable_io_manager"}, "noop_compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.noop_compute_log_manager

\nfrom contextlib import contextmanager\nfrom typing import IO, Any, Generator, Mapping, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.storage.captured_log_manager import (\n    CapturedLogContext,\n    CapturedLogData,\n    CapturedLogManager,\n    CapturedLogMetadata,\n    CapturedLogSubscription,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n)\n\n\n
[docs]class NoOpComputeLogManager(CapturedLogManager, ComputeLogManager, ConfigurableClass):\n """When enabled for a Dagster instance, stdout and stderr will not be available for any step."""\n\n def __init__(self, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return NoOpComputeLogManager(inst_data=inst_data, **config_value)\n\n def enabled(self, _dagster_run, _step_key):\n return False\n\n def _watch_logs(self, dagster_run, step_key=None):\n pass\n\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n raise NotImplementedError()\n\n def is_watch_completed(self, run_id, key):\n return True\n\n def on_watch_start(self, dagster_run, step_key):\n pass\n\n def on_watch_finish(self, dagster_run, step_key):\n pass\n\n def download_url(self, run_id, key, io_type):\n return None\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n return ComputeLogFileData(\n path=f"{key}.{io_type}", data=None, cursor=0, size=0, download_url=None\n )\n\n def on_subscribe(self, subscription):\n pass\n\n def on_unsubscribe(self, subscription):\n pass\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n yield CapturedLogContext(log_key=log_key)\n\n def is_capture_complete(self, log_key: Sequence[str]):\n return True\n\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Generator[Optional[IO], None, None]:\n yield None\n\n def get_log_data(\n self,\n log_key: Sequence[str],\n cursor: Optional[str] = None,\n max_bytes: Optional[int] = None,\n ) -> CapturedLogData:\n return 
CapturedLogData(log_key=log_key)\n\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n return CapturedLogMetadata()\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n pass\n\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n return CapturedLogSubscription(self, log_key, cursor)\n\n def unsubscribe(self, subscription: CapturedLogSubscription):\n pass
\n
", "current_page_name": "_modules/dagster/_core/storage/noop_compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.noop_compute_log_manager"}, "root": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.root

\nimport os\nfrom tempfile import TemporaryDirectory\nfrom typing import Optional\n\nfrom typing_extensions import TypedDict\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\n\nclass LocalArtifactStorageConfig(TypedDict):\n    base_dir: str\n\n\n
[docs]class LocalArtifactStorage(ConfigurableClass):\n def __init__(self, base_dir: str, inst_data: Optional[ConfigurableClassData] = None):\n self._base_dir = base_dir\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @property\n def base_dir(self) -> str:\n return self._base_dir\n\n def file_manager_dir(self, run_id: str) -> str:\n check.str_param(run_id, "run_id")\n return os.path.join(self.base_dir, "storage", run_id, "files")\n\n @property\n def storage_dir(self) -> str:\n return os.path.join(self.base_dir, "storage")\n\n @property\n def schedules_dir(self) -> str:\n return os.path.join(self.base_dir, "schedules")\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: LocalArtifactStorageConfig\n ) -> "LocalArtifactStorage":\n return LocalArtifactStorage(inst_data=inst_data, **config_value)\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n def dispose(self):\n pass
\n\n\nclass TemporaryLocalArtifactStorage(LocalArtifactStorage):\n """Used by ephemeral DagsterInstances, defers directory creation til\n access since many uses of ephemeral instance do not require artifact directory.\n """\n\n def __init__(self):\n self._temp_dir = None\n\n @property\n def base_dir(self):\n if self._temp_dir is None:\n self._temp_dir = TemporaryDirectory()\n return self._temp_dir.name\n\n def dispose(self):\n if self._temp_dir:\n self._temp_dir.cleanup()\n
", "current_page_name": "_modules/dagster/_core/storage/root", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.root"}, "runs": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.base

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Mapping, Optional, Sequence, Set, Tuple, Union\n\nfrom typing_extensions import TypedDict\n\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\nfrom dagster._core.storage.dagster_run import (\n    DagsterRun,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._utils import PrintFn\n\nfrom ..daemon_cursor import DaemonCursorStorage\n\nif TYPE_CHECKING:\n    from dagster._core.host_representation.origin import ExternalJobOrigin\n\n\nclass RunGroupInfo(TypedDict):\n    count: int\n    runs: Sequence[DagsterRun]\n\n\n
[docs]class RunStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance], DaemonCursorStorage):\n """Abstract base class for storing pipeline run history.\n\n Note that run storages using SQL databases as backing stores should implement\n :py:class:`~dagster._core.storage.runs.SqlRunStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n """Add a run to storage.\n\n If a run already exists with the same ID, raise DagsterRunAlreadyExists\n If the run's snapshot ID does not exist raise DagsterSnapshotDoesNotExist\n\n Args:\n dagster_run (DagsterRun): The run to add.\n """\n\n @abstractmethod\n def handle_run_event(self, run_id: str, event: DagsterEvent) -> None:\n """Update run storage in accordance to a pipeline run related DagsterEvent.\n\n Args:\n run_id (str)\n event (DagsterEvent)\n """\n\n @abstractmethod\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n """Return all the runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. 
Defaults to infinite.\n\n Returns:\n List[PipelineRun]\n """\n\n @abstractmethod\n def get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n """Return all the run IDs for runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n Sequence[str]\n """\n\n @abstractmethod\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n """Return the number of runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.PipelineRunFilter` by which to filter\n runs\n\n Returns:\n int: The number of runs that match the given filters.\n """\n\n @abstractmethod\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Sequence[DagsterRun]]]:\n """Get the run group to which a given run belongs.\n\n Args:\n run_id (str): If the corresponding run is the descendant of some root run (i.e., there\n is a root_run_id on the :py:class:`PipelineRun`), that root run and all of its\n descendants are returned; otherwise, the group will consist only of the given run\n (a run that does not descend from any root is its own root).\n\n Returns:\n Optional[Tuple[string, List[PipelineRun]]]: If there is a corresponding run group, tuple\n whose first element is the root_run_id and whose second element is a list of all the\n descendent runs. 
Otherwise `None`.\n """\n\n @abstractmethod\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n\n @abstractmethod\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n """Get a list of tag keys and the values that have been associated with them.\n\n Args:\n tag_keys (Optional[Sequence[str]]): tag keys to filter by.\n\n Returns:\n List[Tuple[str, Set[str]]]\n """\n\n @abstractmethod\n def get_run_tag_keys(self) -> Sequence[str]:\n """Get a list of tag keys.\n\n Returns:\n List[str]\n """\n\n @abstractmethod\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n """Add additional tags for a pipeline run.\n\n Args:\n run_id (str)\n new_tags (Dict[string, string])\n """\n\n @abstractmethod\n def has_run(self, run_id: str) -> bool:\n """Check if the storage contains a run.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n bool\n """\n\n def add_snapshot(\n self,\n snapshot: Union[JobSnapshot, ExecutionPlanSnapshot],\n snapshot_id: Optional[str] = None,\n ) -> None:\n """Add a snapshot to the storage.\n\n Args:\n snapshot (Union[PipelineSnapshot, 
ExecutionPlanSnapshot])\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n """\n if isinstance(snapshot, JobSnapshot):\n self.add_job_snapshot(snapshot, snapshot_id)\n else:\n self.add_execution_plan_snapshot(snapshot, snapshot_id)\n\n def has_snapshot(self, snapshot_id: str):\n return self.has_job_snapshot(snapshot_id) or self.has_execution_plan_snapshot(snapshot_id)\n\n @abstractmethod\n def has_job_snapshot(self, job_snapshot_id: str) -> bool:\n """Check to see if storage contains a pipeline snapshot.\n\n Args:\n pipeline_snapshot_id (str): The id of the run.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_job_snapshot(self, job_snapshot: JobSnapshot, snapshot_id: Optional[str] = None) -> str:\n """Add a pipeline snapshot to the run store.\n\n Pipeline snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n job_snapshot (PipelineSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The job_snapshot_id\n """\n\n @abstractmethod\n def get_job_snapshot(self, job_snapshot_id: str) -> JobSnapshot:\n """Fetch a snapshot by ID.\n\n Args:\n job_snapshot_id (str)\n\n Returns:\n PipelineSnapshot\n """\n\n @abstractmethod\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n """Check to see if storage contains an execution plan snapshot.\n\n Args:\n execution_plan_snapshot_id (str): The id of the execution plan.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n """Add an execution plan snapshot to the run store.\n\n Execution plan snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n execution_plan_snapshot (ExecutionPlanSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The execution_plan_snapshot_id\n """\n\n @abstractmethod\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n """Fetch a snapshot by ID.\n\n Args:\n execution_plan_snapshot_id (str)\n\n Returns:\n ExecutionPlanSnapshot\n """\n\n @abstractmethod\n def wipe(self) -> None:\n """Clears the run storage."""\n\n @abstractmethod\n def delete_run(self, run_id: str) -> None:\n """Remove a run from storage."""\n\n @property\n def supports_bucket_queries(self) -> bool:\n return False\n\n @abstractmethod\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any required data migrations."""\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any optional data migrations for optimized reads."""\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n # Daemon Heartbeat Storage\n #\n # Holds heartbeats from the Dagster Daemon so that other system components can alert when it's not\n # alive.\n # This is temporarily placed along with run storage to avoid adding a new instance concept. 
It\n # should be split out once all metadata storages are configured together.\n\n @abstractmethod\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n """Called on a regular interval by the daemon."""\n\n @abstractmethod\n def get_daemon_heartbeats(self) -> Mapping[str, DaemonHeartbeat]:\n """Latest heartbeats of all daemon types."""\n\n @abstractmethod\n def wipe_daemon_heartbeats(self) -> None:\n """Wipe all daemon heartbeats."""\n\n # Backfill storage\n @abstractmethod\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[PartitionBackfill]:\n """Get a list of partition backfills."""\n\n @abstractmethod\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n """Get the partition backfill of the given backfill id."""\n\n @abstractmethod\n def add_backfill(self, partition_backfill: PartitionBackfill):\n """Add partition backfill to run storage."""\n\n @abstractmethod\n def update_backfill(self, partition_backfill: PartitionBackfill):\n """Update a partition backfill in run storage."""\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n @abstractmethod\n def replace_job_origin(self, run: "DagsterRun", job_origin: "ExternalJobOrigin") -> None: ...
\n
", "current_page_name": "_modules/dagster/_core/storage/runs/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.base"}, "sql_run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.sql_run_storage

\nimport logging\nimport uuid\nimport zlib\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Callable,\n    ContextManager,\n    Dict,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\n\nimport dagster._check as check\nfrom dagster._core.errors import (\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunNotFoundError,\n    DagsterSnapshotDoesNotExist,\n)\nfrom dagster._core.events import EVENT_TYPE_TO_PIPELINE_RUN_STATUS, DagsterEvent, DagsterEventType\nfrom dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster._core.host_representation.origin import ExternalJobOrigin\nfrom dagster._core.snap import (\n    ExecutionPlanSnapshot,\n    JobSnapshot,\n    create_execution_plan_snapshot_id,\n    create_job_snapshot_id,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery\nfrom dagster._core.storage.sqlalchemy_compat import (\n    db_fetch_mappings,\n    db_scalar_subquery,\n    db_select,\n    db_subquery,\n)\nfrom dagster._core.storage.tags import (\n    PARTITION_NAME_TAG,\n    PARTITION_SET_TAG,\n    REPOSITORY_LABEL_TAG,\n    ROOT_RUN_ID_TAG,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import (\n    deserialize_value,\n    serialize_value,\n)\nfrom dagster._seven import JSONDecodeError\nfrom dagster._utils import PrintFn, utc_datetime_from_timestamp\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..dagster_run import (\n    DagsterRun,\n    DagsterRunStatus,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom .base import RunStorage\nfrom .migration import (\n    OPTIONAL_DATA_MIGRATIONS,\n    
REQUIRED_DATA_MIGRATIONS,\n    RUN_PARTITIONS,\n    MigrationFn,\n)\nfrom .schema import (\n    BulkActionsTable,\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    KeyValueStoreTable,\n    RunsTable,\n    RunTagsTable,\n    SecondaryIndexMigrationTable,\n    SnapshotsTable,\n)\n\n\nclass SnapshotType(Enum):\n    PIPELINE = "PIPELINE"\n    EXECUTION_PLAN = "EXECUTION_PLAN"\n\n\n
[docs]class SqlRunStorage(RunStorage):\n """Base class for SQL based run storages."""\n\n @abstractmethod\n def connect(self) -> ContextManager[Connection]:\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n @abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema or data migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def fetchall(self, query: SqlAlchemyQuery) -> Sequence[Any]:\n with self.connect() as conn:\n return db_fetch_mappings(conn, query)\n\n def fetchone(self, query: SqlAlchemyQuery) -> Optional[Any]:\n with self.connect() as conn:\n if db.__version__.startswith("2."):\n return conn.execute(query).mappings().first()\n else:\n return conn.execute(query).fetchone()\n\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n if dagster_run.job_snapshot_id and not self.has_job_snapshot(dagster_run.job_snapshot_id):\n raise DagsterSnapshotDoesNotExist(\n f"Snapshot {dagster_run.job_snapshot_id} does not exist in run storage"\n )\n\n has_tags = dagster_run.tags and len(dagster_run.tags) > 0\n partition = dagster_run.tags.get(PARTITION_NAME_TAG) if has_tags else None\n partition_set = dagster_run.tags.get(PARTITION_SET_TAG) if has_tags else None\n\n runs_insert = RunsTable.insert().values(\n run_id=dagster_run.run_id,\n pipeline_name=dagster_run.job_name,\n status=dagster_run.status.value,\n run_body=serialize_value(dagster_run),\n snapshot_id=dagster_run.job_snapshot_id,\n partition=partition,\n partition_set=partition_set,\n )\n with self.connect() as conn:\n try:\n conn.execute(runs_insert)\n except db_exc.IntegrityError as exc:\n raise DagsterRunAlreadyExists from exc\n\n tags_to_insert = dagster_run.tags_for_storage()\n if tags_to_insert:\n conn.execute(\n RunTagsTable.insert(),\n [\n dict(run_id=dagster_run.run_id, key=k, value=v)\n for k, v in tags_to_insert.items()\n ],\n )\n\n return 
dagster_run\n\n def handle_run_event(self, run_id: str, event: DagsterEvent) -> None:\n check.str_param(run_id, "run_id")\n check.inst_param(event, "event", DagsterEvent)\n\n if event.event_type not in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n return\n\n run = self._get_run_by_id(run_id)\n if not run:\n # TODO log?\n return\n\n new_job_status = EVENT_TYPE_TO_PIPELINE_RUN_STATUS[event.event_type]\n\n run_stats_cols_in_index = self.has_run_stats_index_cols()\n\n kwargs = {}\n\n # consider changing the `handle_run_event` signature to get timestamp off of the\n # EventLogEntry instead of the DagsterEvent, for consistency\n now = pendulum.now("UTC")\n\n if run_stats_cols_in_index and event.event_type == DagsterEventType.PIPELINE_START:\n kwargs["start_time"] = now.timestamp()\n\n if run_stats_cols_in_index and event.event_type in {\n DagsterEventType.PIPELINE_CANCELED,\n DagsterEventType.PIPELINE_FAILURE,\n DagsterEventType.PIPELINE_SUCCESS,\n }:\n kwargs["end_time"] = now.timestamp()\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_value(run.with_status(new_job_status)),\n status=new_job_status.value,\n update_timestamp=now,\n **kwargs,\n )\n )\n\n def _row_to_run(self, row: Dict) -> DagsterRun:\n run = deserialize_value(row["run_body"], DagsterRun)\n status = DagsterRunStatus(row["status"])\n # NOTE: the status column is more trustworthy than the status in the run body, since concurrent\n # writes (e.g. 
handle_run_event and add_tags) can cause the status in the body to be out of\n # overriden with an old value.\n return run.with_status(status)\n\n def _rows_to_runs(self, rows: Iterable[Dict]) -> Sequence[DagsterRun]:\n return list(map(self._row_to_run, rows))\n\n def _add_cursor_limit_to_query(\n self,\n query: SqlAlchemyQuery,\n cursor: Optional[str],\n limit: Optional[int],\n order_by: Optional[str],\n ascending: Optional[bool],\n ) -> SqlAlchemyQuery:\n """Helper function to deal with cursor/limit pagination args."""\n if cursor:\n cursor_query = db_select([RunsTable.c.id]).where(RunsTable.c.run_id == cursor)\n query = query.where(RunsTable.c.id < db_scalar_subquery(cursor_query))\n\n if limit:\n query = query.limit(limit)\n\n sorting_column = getattr(RunsTable.c, order_by) if order_by else RunsTable.c.id\n direction = db.asc if ascending else db.desc\n query = query.order_by(direction(sorting_column))\n\n return query\n\n @property\n def supports_intersect(self) -> bool:\n return True\n\n def _add_filters_to_query(self, query: SqlAlchemyQuery, filters: RunsFilter) -> SqlAlchemyQuery:\n check.inst_param(filters, "filters", RunsFilter)\n\n if filters.run_ids:\n query = query.where(RunsTable.c.run_id.in_(filters.run_ids))\n\n if filters.job_name:\n query = query.where(RunsTable.c.pipeline_name == filters.job_name)\n\n if filters.statuses:\n query = query.where(\n RunsTable.c.status.in_([status.value for status in filters.statuses])\n )\n\n if filters.snapshot_id:\n query = query.where(RunsTable.c.snapshot_id == filters.snapshot_id)\n\n if filters.updated_after:\n query = query.where(RunsTable.c.update_timestamp > filters.updated_after)\n\n if filters.updated_before:\n query = query.where(RunsTable.c.update_timestamp < filters.updated_before)\n\n if filters.created_after:\n query = query.where(RunsTable.c.create_timestamp > filters.created_after)\n\n if filters.created_before:\n query = query.where(RunsTable.c.create_timestamp < filters.created_before)\n\n return 
query\n\n def _runs_query(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n columns: Optional[Sequence[str]] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> SqlAlchemyQuery:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_str_param(cursor, "cursor")\n check.opt_int_param(limit, "limit")\n check.opt_sequence_param(columns, "columns")\n check.opt_str_param(order_by, "order_by")\n check.opt_bool_param(ascending, "ascending")\n\n if columns is None:\n columns = ["run_body", "status"]\n\n if filters.tags:\n table = self._apply_tags_table_joins(RunsTable, filters.tags)\n else:\n table = RunsTable\n\n base_query = db_select([getattr(RunsTable.c, column) for column in columns]).select_from(\n table\n )\n base_query = self._add_filters_to_query(base_query, filters)\n return self._add_cursor_limit_to_query(base_query, cursor, limit, order_by, ascending)\n\n def _apply_tags_table_joins(\n self,\n table: db.Table,\n tags: Mapping[str, Union[str, Sequence[str]]],\n ) -> db.Table:\n multi_join = len(tags) > 1\n i = 0\n for key, value in tags.items():\n i += 1\n tags_table = (\n db_subquery(db_select([RunTagsTable]), f"run_tags_subquery_{i}")\n if multi_join\n else RunTagsTable\n )\n table = table.join(\n tags_table,\n db.and_(\n RunsTable.c.run_id == tags_table.c.run_id,\n tags_table.c.key == key,\n (\n tags_table.c.value == value\n if isinstance(value, str)\n else tags_table.c.value.in_(value)\n ),\n ),\n )\n return table\n\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n query = self._runs_query(filters, cursor, limit, bucket_by=bucket_by)\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n def 
get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n query = self._runs_query(filters=filters, cursor=cursor, limit=limit, columns=["run_id"])\n rows = self.fetchall(query)\n return [row["run_id"] for row in rows]\n\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n subquery = db_subquery(self._runs_query(filters=filters))\n query = db_select([db.func.count().label("count")]).select_from(subquery)\n row = self.fetchone(query)\n count = row["count"] if row else 0\n return count\n\n def _get_run_by_id(self, run_id: str) -> Optional[DagsterRun]:\n check.str_param(run_id, "run_id")\n\n query = db_select([RunsTable.c.run_body, RunsTable.c.status]).where(\n RunsTable.c.run_id == run_id\n )\n rows = self.fetchall(query)\n return self._row_to_run(rows[0]) if rows else None\n\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_int_param(limit, "limit")\n\n columns = ["id", "run_body", "status", "create_timestamp", "update_timestamp"]\n\n if self.has_run_stats_index_cols():\n columns += ["start_time", "end_time"]\n # only fetch columns we use to build RunRecord\n query = self._runs_query(\n filters=filters,\n limit=limit,\n columns=columns,\n order_by=order_by,\n ascending=ascending,\n cursor=cursor,\n bucket_by=bucket_by,\n )\n\n rows = self.fetchall(query)\n return [\n RunRecord(\n storage_id=check.int_param(row["id"], "id"),\n dagster_run=self._row_to_run(row),\n create_timestamp=check.inst(row["create_timestamp"], datetime),\n update_timestamp=check.inst(row["update_timestamp"], datetime),\n start_time=(\n 
check.opt_inst(row["start_time"], float) if "start_time" in row else None\n ),\n end_time=check.opt_inst(row["end_time"], float) if "end_time" in row else None,\n )\n for row in rows\n ]\n\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n result = defaultdict(set)\n query = (\n db_select([RunTagsTable.c.key, RunTagsTable.c.value])\n .distinct()\n .order_by(RunTagsTable.c.key, RunTagsTable.c.value)\n )\n if tag_keys:\n query = query.where(RunTagsTable.c.key.in_(tag_keys))\n if value_prefix:\n query = query.where(RunTagsTable.c.value.startswith(value_prefix))\n if limit:\n query = query.limit(limit)\n rows = self.fetchall(query)\n for r in rows:\n result[r["key"]].add(r["value"])\n return sorted(list([(k, v) for k, v in result.items()]), key=lambda x: x[0])\n\n def get_run_tag_keys(self) -> Sequence[str]:\n query = db_select([RunTagsTable.c.key]).distinct().order_by(RunTagsTable.c.key)\n rows = self.fetchall(query)\n return sorted([r["key"] for r in rows])\n\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n check.str_param(run_id, "run_id")\n check.mapping_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n run = self._get_run_by_id(run_id)\n if not run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n current_tags = run.tags if run.tags else {}\n\n all_tags = merge_dicts(current_tags, new_tags)\n partition = all_tags.get(PARTITION_NAME_TAG)\n partition_set = all_tags.get(PARTITION_SET_TAG)\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_value(run.with_tags(merge_dicts(current_tags, new_tags))),\n partition=partition,\n partition_set=partition_set,\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n current_tags_set = set(current_tags.keys())\n 
new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n RunTagsTable.update()\n .where(db.and_(RunTagsTable.c.run_id == run_id, RunTagsTable.c.key == tag))\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n RunTagsTable.insert(),\n [dict(run_id=run_id, key=tag, value=new_tags[tag]) for tag in added_tags],\n )\n\n def get_run_group(self, run_id: str) -> Tuple[str, Sequence[DagsterRun]]:\n check.str_param(run_id, "run_id")\n dagster_run = self._get_run_by_id(run_id)\n if not dagster_run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n\n # find root_run\n root_run_id = dagster_run.root_run_id if dagster_run.root_run_id else dagster_run.run_id\n root_run = self._get_run_by_id(root_run_id)\n if not root_run:\n raise DagsterRunNotFoundError(\n f"Run id {root_run_id} set as root run id for run {run_id} was not found in"\n " instance.",\n invalid_run_id=root_run_id,\n )\n\n # root_run_id to run_id 1:1 mapping\n # https://github.com/dagster-io/dagster/issues/2495\n # Note: we currently use tags to persist the run group info\n root_to_run = db_subquery(\n db_select(\n [RunTagsTable.c.value.label("root_run_id"), RunTagsTable.c.run_id.label("run_id")]\n ).where(\n db.and_(RunTagsTable.c.key == ROOT_RUN_ID_TAG, RunTagsTable.c.value == root_run_id)\n ),\n "root_to_run",\n )\n # get run group\n run_group_query = db_select([RunsTable.c.run_body, RunsTable.c.status]).select_from(\n root_to_run.join(\n RunsTable,\n root_to_run.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n\n res = self.fetchall(run_group_query)\n run_group = self._rows_to_runs(res)\n\n return (root_run_id, [root_run, *run_group])\n\n def has_run(self, run_id: str) -> bool:\n check.str_param(run_id, "run_id")\n return bool(self._get_run_by_id(run_id))\n\n def delete_run(self, run_id: str) -> 
None:\n check.str_param(run_id, "run_id")\n query = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(query)\n\n def has_job_snapshot(self, job_snapshot_id: str) -> bool:\n check.str_param(job_snapshot_id, "job_snapshot_id")\n return self._has_snapshot_id(job_snapshot_id)\n\n def add_job_snapshot(self, job_snapshot: JobSnapshot, snapshot_id: Optional[str] = None) -> str:\n check.inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_job_snapshot_id(job_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=job_snapshot,\n snapshot_type=SnapshotType.PIPELINE,\n )\n\n def get_job_snapshot(self, job_snapshot_id: str) -> JobSnapshot:\n check.str_param(job_snapshot_id, "job_snapshot_id")\n return self._get_snapshot(job_snapshot_id) # type: ignore # (allowed to return None?)\n\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return bool(self.get_execution_plan_snapshot(execution_plan_snapshot_id))\n\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=execution_plan_snapshot,\n snapshot_type=SnapshotType.EXECUTION_PLAN,\n )\n\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return self._get_snapshot(execution_plan_snapshot_id) # type: ignore # (allowed to return 
None?)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n check.str_param(snapshot_id, "snapshot_id")\n check.not_none_param(snapshot_obj, "snapshot_obj")\n check.inst_param(snapshot_type, "snapshot_type", SnapshotType)\n\n with self.connect() as conn:\n snapshot_insert = SnapshotsTable.insert().values(\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(serialize_value(snapshot_obj).encode("utf-8")),\n snapshot_type=snapshot_type.value,\n )\n try:\n conn.execute(snapshot_insert)\n except db_exc.IntegrityError:\n # on_conflict_do_nothing equivalent\n pass\n\n return snapshot_id\n\n def get_run_storage_id(self) -> str:\n query = db_select([InstanceInfo.c.run_storage_id])\n row = self.fetchone(query)\n if not row:\n run_storage_id = str(uuid.uuid4())\n with self.connect() as conn:\n conn.execute(InstanceInfo.insert().values(run_storage_id=run_storage_id))\n return run_storage_id\n else:\n return row["run_storage_id"]\n\n def _has_snapshot_id(self, snapshot_id: str) -> bool:\n query = db_select([SnapshotsTable.c.snapshot_id]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return bool(row)\n\n def _get_snapshot(self, snapshot_id: str) -> Optional[JobSnapshot]:\n query = db_select([SnapshotsTable.c.snapshot_body]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return defensively_unpack_execution_plan_snapshot_query(logging, [row["snapshot_body"]]) if row else None # type: ignore\n\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n if self.has_built_index(RUN_PARTITIONS) and self.has_run_stats_index_cols():\n query = self._runs_query(\n filters=runs_filter,\n columns=["run_id", "status", "start_time", "end_time", "partition"],\n )\n rows = self.fetchall(query)\n\n # dedup by partition\n _partition_data_by_partition = {}\n for row in rows:\n if not row["partition"] or row["partition"] in 
_partition_data_by_partition:\n continue\n\n _partition_data_by_partition[row["partition"]] = RunPartitionData(\n run_id=row["run_id"],\n partition=row["partition"],\n status=DagsterRunStatus[row["status"]],\n start_time=row["start_time"],\n end_time=row["end_time"],\n )\n\n return list(_partition_data_by_partition.values())\n else:\n query = self._runs_query(filters=runs_filter)\n rows = self.fetchall(query)\n _partition_data_by_partition = {}\n for row in rows:\n run = self._row_to_run(row)\n partition = run.tags.get(PARTITION_NAME_TAG)\n if not partition or partition in _partition_data_by_partition:\n continue\n\n _partition_data_by_partition[partition] = RunPartitionData(\n run_id=run.run_id,\n partition=partition,\n status=run.status,\n start_time=None,\n end_time=None,\n )\n\n return list(_partition_data_by_partition.values())\n\n def _get_partition_runs(\n self, partition_set_name: str, partition_name: str\n ) -> Sequence[DagsterRun]:\n # utility method to help test reads off of the partition column\n if not self.has_built_index(RUN_PARTITIONS):\n # query by tags\n return self.get_runs(\n filters=RunsFilter(\n tags={\n PARTITION_SET_TAG: partition_set_name,\n PARTITION_NAME_TAG: partition_name,\n }\n )\n )\n else:\n query = (\n self._runs_query()\n .where(RunsTable.c.partition == partition_name)\n .where(RunsTable.c.partition_set == partition_set_name)\n )\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n # Tracking data migrations over secondary indexes\n\n def _execute_data_migrations(\n self,\n migrations: Mapping[str, Callable[[], MigrationFn]],\n print_fn: Optional[PrintFn] = None,\n force_rebuild_all: bool = False,\n ) -> None:\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n 
migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(REQUIRED_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(OPTIONAL_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def has_built_index(self, migration_name: str) -> bool:\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n results = self.fetchall(query)\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str) -> None:\n query = SecondaryIndexMigrationTable.insert().values(\n name=migration_name,\n migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n # Checking for migrations\n\n def has_run_stats_index_cols(self) -> bool:\n with self.connect() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(RunsTable.name)]\n return "start_time" in column_names and "end_time" in column_names\n\n def has_bulk_actions_selector_cols(self) -> bool:\n with self.connect() as conn:\n column_names = [\n x.get("name") for x in db.inspect(conn).get_columns(BulkActionsTable.name)\n ]\n return "selector_id" in column_names\n\n # Daemon heartbeats\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n # insert, or update if already present\n try:\n conn.execute(\n 
DaemonHeartbeatsTable.insert().values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n except db_exc.IntegrityError:\n conn.execute(\n DaemonHeartbeatsTable.update()\n .where(DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n\n def get_daemon_heartbeats(self) -> Mapping[str, DaemonHeartbeat]:\n rows = self.fetchall(db_select([DaemonHeartbeatsTable.c.body]))\n heartbeats = []\n for row in rows:\n heartbeats.append(deserialize_value(row["body"], DaemonHeartbeat))\n return {heartbeat.daemon_type: heartbeat for heartbeat in heartbeats}\n\n def wipe(self) -> None:\n """Clears the run storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(RunsTable.delete())\n conn.execute(RunTagsTable.delete())\n conn.execute(SnapshotsTable.delete())\n conn.execute(DaemonHeartbeatsTable.delete())\n conn.execute(BulkActionsTable.delete())\n\n def wipe_daemon_heartbeats(self) -> None:\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(DaemonHeartbeatsTable.delete())\n\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[PartitionBackfill]:\n check.opt_inst_param(status, "status", BulkActionStatus)\n query = db_select([BulkActionsTable.c.body])\n if status:\n query = query.where(BulkActionsTable.c.status == status.value)\n if cursor:\n cursor_query = db_select([BulkActionsTable.c.id]).where(\n BulkActionsTable.c.key == cursor\n )\n query = query.where(BulkActionsTable.c.id < cursor_query)\n if limit:\n query = query.limit(limit)\n query = 
query.order_by(BulkActionsTable.c.id.desc())\n rows = self.fetchall(query)\n return [deserialize_value(row["body"], PartitionBackfill) for row in rows]\n\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n check.str_param(backfill_id, "backfill_id")\n query = db_select([BulkActionsTable.c.body]).where(BulkActionsTable.c.key == backfill_id)\n row = self.fetchone(query)\n return deserialize_value(row["body"], PartitionBackfill) if row else None\n\n def add_backfill(self, partition_backfill: PartitionBackfill) -> None:\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n values: Dict[str, Any] = dict(\n key=partition_backfill.backfill_id,\n status=partition_backfill.status.value,\n timestamp=utc_datetime_from_timestamp(partition_backfill.backfill_timestamp),\n body=serialize_value(cast(NamedTuple, partition_backfill)),\n )\n\n if self.has_bulk_actions_selector_cols():\n values["selector_id"] = partition_backfill.selector_id\n values["action_type"] = partition_backfill.bulk_action_type.value\n\n with self.connect() as conn:\n conn.execute(BulkActionsTable.insert().values(**values))\n\n def update_backfill(self, partition_backfill: PartitionBackfill) -> None:\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n backfill_id = partition_backfill.backfill_id\n if not self.get_backfill(backfill_id):\n raise DagsterInvariantViolationError(\n f"Backfill {backfill_id} is not present in storage"\n )\n with self.connect() as conn:\n conn.execute(\n BulkActionsTable.update()\n .where(BulkActionsTable.c.key == backfill_id)\n .values(\n status=partition_backfill.status.value,\n body=serialize_value(partition_backfill),\n )\n )\n\n def get_cursor_values(self, keys: Set[str]) -> Mapping[str, str]:\n check.set_param(keys, "keys", of_type=str)\n\n rows = self.fetchall(\n db_select([KeyValueStoreTable.c.key, KeyValueStoreTable.c.value]).where(\n KeyValueStoreTable.c.key.in_(keys)\n ),\n )\n return 
{row["key"]: row["value"] for row in rows}\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n db_values = [{"key": k, "value": v} for k, v in pairs.items()]\n\n with self.connect() as conn:\n try:\n conn.execute(KeyValueStoreTable.insert().values(db_values))\n except db_exc.IntegrityError:\n conn.execute(\n KeyValueStoreTable.update()\n .where(KeyValueStoreTable.c.key.in_(pairs.keys()))\n .values(value=db.sql.case(pairs, value=KeyValueStoreTable.c.key))\n )\n\n # Migrating run history\n def replace_job_origin(self, run: DagsterRun, job_origin: ExternalJobOrigin) -> None:\n new_label = job_origin.external_repository_origin.get_label()\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run.run_id)\n .values(\n run_body=serialize_value(run.with_job_origin(job_origin)),\n )\n )\n conn.execute(\n RunTagsTable.update()\n .where(RunTagsTable.c.run_id == run.run_id)\n .where(RunTagsTable.c.key == REPOSITORY_LABEL_TAG)\n .values(value=new_label)\n )
\n\n\nGET_PIPELINE_SNAPSHOT_QUERY_ID = "get-pipeline-snapshot"\n\n\ndef defensively_unpack_execution_plan_snapshot_query(\n logger: logging.Logger, row: Sequence[Any]\n) -> Optional[Union[ExecutionPlanSnapshot, JobSnapshot]]:\n # minimal checking here because sqlalchemy returns a different type based on what version of\n # SqlAlchemy you are using\n\n def _warn(msg: str) -> None:\n logger.warning(f"get-pipeline-snapshot: {msg}")\n\n if not isinstance(row[0], bytes):\n _warn("First entry in row is not a binary type.")\n return None\n\n try:\n uncompressed_bytes = zlib.decompress(row[0])\n except zlib.error:\n _warn("Could not decompress bytes stored in snapshot table.")\n return None\n\n try:\n decoded_str = uncompressed_bytes.decode("utf-8")\n except UnicodeDecodeError:\n _warn("Could not unicode decode decompressed bytes stored in snapshot table.")\n return None\n\n try:\n return deserialize_value(decoded_str, (ExecutionPlanSnapshot, JobSnapshot))\n except JSONDecodeError:\n _warn("Could not parse json in snapshot table.")\n return None\n
", "current_page_name": "_modules/dagster/_core/storage/runs/sql_run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.sql_run_storage"}, "sqlite": {"sqlite_run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.sqlite.sqlite_run_storage

\nimport os\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Iterator, Optional\nfrom urllib.parse import urljoin, urlparse\n\nimport sqlalchemy as db\nfrom sqlalchemy.engine import Connection\nfrom sqlalchemy.pool import NullPool\nfrom typing_extensions import Self\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_downgrade,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import InstanceInfo, RunsTable, RunStorageSqlMetadata, RunTagsTable\nfrom ..sql_run_storage import SqlRunStorage\n\nif TYPE_CHECKING:\n    from dagster._core.storage.sqlite_storage import SqliteStorageConfig\nMINIMUM_SQLITE_BUCKET_VERSION = [3, 25, 0]\n\n\n
[docs]class SqliteRunStorage(SqlRunStorage, ConfigurableClass):\n """SQLite-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default run storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster._core.storage.runs\n class: SqliteRunStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the run storage where on disk to store the database.\n """\n\n def __init__(self, conn_string: str, inst_data: Optional[ConfigurableClassData] = None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: "SqliteStorageConfig"\n ) -> "SqliteRunStorage":\n return SqliteRunStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(cls, base_dir: str, inst_data: Optional[ConfigurableClassData] = None) -> Self:\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "runs")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = 
check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n RunStorageSqlMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n table_names = db.inspect(engine).get_table_names()\n if "instance_info" not in table_names:\n InstanceInfo.create(engine)\n\n run_storage = cls(conn_string, inst_data)\n\n if should_mark_indexes:\n run_storage.migrate()\n run_storage.optimize()\n\n return run_storage\n\n @contextmanager\n def connect(self) -> Iterator[Connection]:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n def _alembic_upgrade(self, rev: str = "head") -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn, rev=rev)\n\n def _alembic_downgrade(self, rev: str = "head") -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_downgrade(alembic_config, conn, rev=rev)\n\n def upgrade(self) -> None:\n self._check_for_version_066_migration_and_perform()\n self._alembic_upgrade()\n\n # In version 0.6.6, we changed the layout of the of the sqllite dbs on disk\n # to move from the root of DAGSTER_HOME/runs.db to DAGSTER_HOME/history/runs.bd\n # This function checks for that condition and does the move\n def _check_for_version_066_migration_and_perform(self) -> None:\n old_conn_string = "sqlite://" + urljoin(urlparse(self._conn_string).path, "../runs.db")\n path_to_old_db = urlparse(old_conn_string).path\n # sqlite URLs look like `sqlite:///foo/bar/baz on Unix/Mac` but on Windows they look like\n # `sqlite:///D:/foo/bar/baz` (or `sqlite:///D:\\foo\\bar\\baz`)\n if os.name == "nt":\n path_to_old_db = path_to_old_db.lstrip("/")\n if os.path.exists(path_to_old_db):\n old_storage = SqliteRunStorage(old_conn_string)\n old_runs = 
old_storage.get_runs()\n for run in old_runs:\n self.add_run(run)\n os.unlink(path_to_old_db)\n\n def delete_run(self, run_id: str) -> None:\n """Override the default sql delete run implementation until we can get full\n support on cascading deletes.\n """\n check.str_param(run_id, "run_id")\n remove_tags = db.delete(RunTagsTable).where(RunTagsTable.c.run_id == run_id)\n remove_run = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(remove_tags)\n conn.execute(remove_run)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster/_core/storage/runs/sqlite/sqlite_run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.sqlite.sqlite_run_storage"}}}, "schedules": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.base

\nimport abc\nfrom typing import Mapping, Optional, Sequence, Set\n\nfrom dagster import AssetKey\nfrom dagster._core.definitions.auto_materialize_rule import AutoMaterializeAssetEvaluation\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.scheduler.instigation import (\n    AutoMaterializeAssetEvaluationRecord,\n    InstigatorState,\n    InstigatorStatus,\n    InstigatorTick,\n    TickData,\n    TickStatus,\n)\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._utils import PrintFn\n\n\n
[docs]class ScheduleStorage(abc.ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract class for managing persistance of scheduler artifacts."""\n\n @abc.abstractmethod\n def wipe(self) -> None:\n """Delete all schedules from storage."""\n\n @abc.abstractmethod\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n instigator_statuses: Optional[Set[InstigatorStatus]] = None,\n ) -> Sequence[InstigatorState]:\n """Return all InstigationStates present in storage.\n\n Args:\n repository_origin_id (Optional[str]): The ExternalRepository target id to scope results to\n repository_selector_id (Optional[str]): The repository selector id to scope results to\n instigator_type (Optional[InstigatorType]): The InstigatorType to scope results to\n instigator_statuses (Optional[Set[InstigatorStatus]]): The InstigatorStatuses to scope results to\n """\n\n @abc.abstractmethod\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n """Return the instigator state for the given id.\n\n Args:\n origin_id (str): The unique instigator identifier\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def add_instigator_state(self, state: InstigatorState) -> InstigatorState:\n """Add an instigator state to storage.\n\n Args:\n state (InstigatorState): The state to add\n """\n\n @abc.abstractmethod\n def update_instigator_state(self, state: InstigatorState) -> InstigatorState:\n """Update an instigator state in storage.\n\n Args:\n state (InstigatorState): The state to update\n """\n\n @abc.abstractmethod\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n """Delete a state in storage.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n """\n\n @property\n def 
supports_batch_queries(self) -> bool:\n return False\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Sequence[InstigatorTick]]:\n raise NotImplementedError()\n\n @abc.abstractmethod\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Sequence[InstigatorTick]:\n """Get the ticks for a given instigator.\n\n Args:\n origin_id (str): The id of the instigator target\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def create_tick(self, tick_data: TickData) -> InstigatorTick:\n """Add a tick to storage.\n\n Args:\n tick_data (TickData): The tick to add\n """\n\n @abc.abstractmethod\n def update_tick(self, tick: InstigatorTick) -> InstigatorTick:\n """Update a tick already in storage.\n\n Args:\n tick (InstigatorTick): The tick to update\n """\n\n @abc.abstractmethod\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence[TickStatus]] = None,\n ) -> None:\n """Wipe ticks for an instigator for a certain status and timestamp.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n before (datetime): All ticks before this datetime will get purged\n tick_statuses (Optional[List[TickStatus]]): The tick statuses to wipe\n """\n\n @property\n def supports_auto_materialize_asset_evaluations(self) -> bool:\n return True\n\n @abc.abstractmethod\n def add_auto_materialize_asset_evaluations(\n self,\n evaluation_id: int,\n asset_evaluations: Sequence[AutoMaterializeAssetEvaluation],\n ) -> None:\n """Add asset policy evaluations to storage."""\n\n @abc.abstractmethod\n def get_auto_materialize_asset_evaluations(\n self, asset_key: 
AssetKey, limit: int, cursor: Optional[int] = None\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n """Get the policy evaluations for a given asset.\n\n Args:\n asset_key (AssetKey): The asset key to query\n limit (Optional[int]): The maximum number of evaluations to return\n cursor (Optional[int]): The cursor to paginate from\n """\n\n @abc.abstractmethod\n def get_auto_materialize_evaluations_for_evaluation_id(\n self, evaluation_id: int\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n """Get all policy evaluations for a given evaluation ID.\n\n Args:\n evaluation_id (int): The evaluation ID to query.\n """\n\n @abc.abstractmethod\n def purge_asset_evaluations(self, before: float) -> None:\n """Wipe evaluations before a certain timestamp.\n\n Args:\n before (datetime): All evaluations before this datetime will get purged\n """\n\n @abc.abstractmethod\n def upgrade(self) -> None:\n """Perform any needed migrations."""\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any required data migrations."""\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any optional data migrations for optimized reads."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.base"}, "sql_schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.sql_schedule_storage

\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import (\n    Any,\n    Callable,\n    ContextManager,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Type,\n    TypeVar,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\n\nimport dagster._check as check\nfrom dagster._core.definitions.auto_materialize_rule import AutoMaterializeAssetEvaluation\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.scheduler.instigation import (\n    AutoMaterializeAssetEvaluationRecord,\n    InstigatorState,\n    InstigatorStatus,\n    InstigatorTick,\n    TickData,\n    TickStatus,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery, SqlAlchemyRow\nfrom dagster._core.storage.sqlalchemy_compat import db_fetch_mappings, db_select, db_subquery\nfrom dagster._serdes import serialize_value\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._utils import PrintFn, utc_datetime_from_timestamp\n\nfrom .base import ScheduleStorage\nfrom .migration import (\n    OPTIONAL_SCHEDULE_DATA_MIGRATIONS,\n    REQUIRED_SCHEDULE_DATA_MIGRATIONS,\n    SCHEDULE_JOBS_SELECTOR_ID,\n    SCHEDULE_TICKS_SELECTOR_ID,\n)\nfrom .schema import (\n    AssetDaemonAssetEvaluationsTable,\n    InstigatorsTable,\n    JobTable,\n    JobTickTable,\n    SecondaryIndexMigrationTable,\n)\n\nT_NamedTuple = TypeVar("T_NamedTuple", bound=NamedTuple)\n\n\n
[docs]class SqlScheduleStorage(ScheduleStorage):\n """Base class for SQL backed schedule storage."""\n\n @abstractmethod\n def connect(self) -> ContextManager[Connection]:\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n def execute(self, query: SqlAlchemyQuery) -> Sequence[SqlAlchemyRow]:\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def _deserialize_rows(\n self, rows: Sequence[SqlAlchemyRow], as_type: Type[T_NamedTuple]\n ) -> Sequence[T_NamedTuple]:\n return list(map(lambda r: deserialize_value(r[0], as_type), rows))\n\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n instigator_statuses: Optional[Set[InstigatorStatus]] = None,\n ) -> Sequence[InstigatorState]:\n check.opt_inst_param(instigator_type, "instigator_type", InstigatorType)\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = db_select([InstigatorsTable.c.instigator_body]).select_from(InstigatorsTable)\n if repository_selector_id:\n query = query.where(\n InstigatorsTable.c.repository_selector_id == repository_selector_id\n )\n if instigator_type:\n query = query.where(InstigatorsTable.c.instigator_type == instigator_type.value)\n if instigator_statuses:\n query = query.where(\n InstigatorsTable.c.status.in_([status.value for status in instigator_statuses])\n )\n\n else:\n query = db_select([JobTable.c.job_body]).select_from(JobTable)\n if repository_origin_id:\n query = query.where(JobTable.c.repository_origin_id == repository_origin_id)\n if instigator_type:\n query = query.where(JobTable.c.job_type == instigator_type.value)\n if instigator_statuses:\n query = query.where(\n JobTable.c.status.in_([status.value for status in instigator_statuses])\n )\n\n rows = self.execute(query)\n return 
self._deserialize_rows(rows, InstigatorState)\n\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = (\n db_select([InstigatorsTable.c.instigator_body])\n .select_from(InstigatorsTable)\n .where(InstigatorsTable.c.selector_id == selector_id)\n )\n else:\n query = (\n db_select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.job_origin_id == origin_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1], InstigatorState)[0] if len(rows) else None\n\n def _has_instigator_state_by_selector(self, selector_id: str) -> bool:\n check.str_param(selector_id, "selector_id")\n\n query = (\n db_select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None # type: ignore\n\n def _add_or_update_instigators_table(self, conn: Connection, state: InstigatorState) -> None:\n selector_id = state.selector_id\n try:\n conn.execute(\n InstigatorsTable.insert().values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n )\n except db_exc.IntegrityError:\n conn.execute(\n InstigatorsTable.update()\n .where(InstigatorsTable.c.selector_id == selector_id)\n .values(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def add_instigator_state(self, state: InstigatorState) -> InstigatorState:\n check.inst_param(state, "state", InstigatorState)\n with self.connect() as conn:\n try:\n conn.execute(\n 
JobTable.insert().values(\n job_origin_id=state.instigator_origin_id,\n repository_origin_id=state.repository_origin_id,\n status=state.status.value,\n job_type=state.instigator_type.value,\n job_body=serialize_value(state),\n )\n )\n except db_exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is already present in storage"\n ) from exc\n\n # try writing to the instigators table\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def update_instigator_state(self, state: InstigatorState) -> InstigatorState:\n check.inst_param(state, "state", InstigatorState)\n if not self.get_instigator_state(state.instigator_origin_id, state.selector_id):\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is not present in storage"\n )\n\n values = {\n "status": state.status.value,\n "job_body": serialize_value(state),\n "update_timestamp": pendulum.now("UTC"),\n }\n if self.has_instigators_table():\n values["selector_id"] = state.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTable.update()\n .where(JobTable.c.job_origin_id == state.instigator_origin_id)\n .values(**values)\n )\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if not self.get_instigator_state(origin_id, selector_id):\n raise DagsterInvariantViolationError(\n f"InstigatorState {origin_id} is not present in storage"\n )\n\n with self.connect() as conn:\n conn.execute(JobTable.delete().where(JobTable.c.job_origin_id == origin_id))\n\n if self._has_instigators_table(conn):\n if not self._jobs_has_selector_state(conn, selector_id):\n conn.execute(\n InstigatorsTable.delete().where(\n 
InstigatorsTable.c.selector_id == selector_id\n )\n )\n\n def _jobs_has_selector_state(self, conn: Connection, selector_id: str) -> bool:\n query = (\n db_select([db.func.count()])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n result = conn.execute(query)\n row = result.fetchone()\n result.close()\n return row[0] > 0 # type: ignore # (possible none)\n\n def _add_filter_limit(\n self,\n query: SqlAlchemyQuery,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses=None,\n ) -> SqlAlchemyQuery:\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n if before:\n query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))\n if after:\n query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))\n if limit:\n query = query.limit(limit)\n if statuses:\n query = query.where(JobTickTable.c.status.in_([status.value for status in statuses]))\n return query\n\n @property\n def supports_batch_queries(self) -> bool:\n return self.has_instigators_table() and self.has_built_index(SCHEDULE_TICKS_SELECTOR_ID)\n\n def has_instigators_table(self) -> bool:\n with self.connect() as conn:\n return self._has_instigators_table(conn)\n\n def _has_instigators_table(self, conn: Connection) -> bool:\n table_names = db.inspect(conn).get_table_names()\n return "instigators" in table_names\n\n def _has_asset_daemon_asset_evaluations_table(self, conn: Connection) -> bool:\n table_names = db.inspect(conn).get_table_names()\n return "asset_daemon_asset_evaluations" in table_names\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Sequence[InstigatorTick]]:\n check.sequence_param(selector_ids, "selector_ids", of_type=str)\n 
check.opt_int_param(limit, "limit")\n check.opt_sequence_param(statuses, "statuses", of_type=TickStatus)\n\n bucket_rank_column = (\n db.func.rank()\n .over(\n order_by=db.desc(JobTickTable.c.timestamp),\n partition_by=JobTickTable.c.selector_id,\n )\n .label("rank")\n )\n subquery = db_subquery(\n db_select(\n [\n JobTickTable.c.id,\n JobTickTable.c.selector_id,\n JobTickTable.c.tick_body,\n bucket_rank_column,\n ]\n )\n .select_from(JobTickTable)\n .where(JobTickTable.c.selector_id.in_(selector_ids))\n )\n if statuses:\n subquery = subquery.where(\n JobTickTable.c.status.in_([status.value for status in statuses])\n )\n\n query = (\n db_select([subquery.c.id, subquery.c.selector_id, subquery.c.tick_body])\n .order_by(subquery.c.rank.asc())\n .where(subquery.c.rank <= limit)\n )\n\n rows = self.execute(query)\n results = defaultdict(list)\n for row in rows:\n tick_id = row[0]\n selector_id = row[1]\n tick_data = deserialize_value(row[2], TickData)\n results[selector_id].append(InstigatorTick(tick_id, tick_data))\n return results\n\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Sequence[InstigatorTick]:\n check.str_param(origin_id, "origin_id")\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n base_query = (\n db_select([JobTickTable.c.id, JobTickTable.c.tick_body])\n .select_from(JobTickTable)\n .order_by(JobTickTable.c.timestamp.desc())\n )\n if self.has_instigators_table():\n query = base_query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id.is_(None),\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = base_query.where(JobTickTable.c.job_origin_id == origin_id)\n\n query = 
self._add_filter_limit(\n query, before=before, after=after, limit=limit, statuses=statuses\n )\n\n rows = self.execute(query)\n return list(map(lambda r: InstigatorTick(r[0], deserialize_value(r[1], TickData)), rows))\n\n def create_tick(self, tick_data: TickData) -> InstigatorTick:\n check.inst_param(tick_data, "tick_data", TickData)\n\n values = {\n "job_origin_id": tick_data.instigator_origin_id,\n "status": tick_data.status.value,\n "type": tick_data.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick_data.timestamp),\n "tick_body": serialize_value(tick_data),\n }\n if self.has_instigators_table() and tick_data.selector_id:\n values["selector_id"] = tick_data.selector_id\n\n with self.connect() as conn:\n try:\n tick_insert = JobTickTable.insert().values(**values)\n result = conn.execute(tick_insert)\n tick_id = result.inserted_primary_key[0]\n return InstigatorTick(tick_id, tick_data)\n except db_exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"Unable to insert InstigatorTick for job {tick_data.instigator_name} in"\n " storage"\n ) from exc\n\n def update_tick(self, tick: InstigatorTick) -> InstigatorTick:\n check.inst_param(tick, "tick", InstigatorTick)\n\n values = {\n "status": tick.status.value,\n "type": tick.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick.timestamp),\n "tick_body": serialize_value(tick.tick_data),\n }\n if self.has_instigators_table() and tick.selector_id:\n values["selector_id"] = tick.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTickTable.update().where(JobTickTable.c.id == tick.tick_id).values(**values)\n )\n\n return tick\n\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence[TickStatus]] = None,\n ) -> None:\n check.str_param(origin_id, "origin_id")\n check.float_param(before, "before")\n check.opt_list_param(tick_statuses, "tick_statuses", of_type=TickStatus)\n\n utc_before = 
utc_datetime_from_timestamp(before)\n\n query = JobTickTable.delete().where(JobTickTable.c.timestamp < utc_before)\n if tick_statuses:\n query = query.where(\n JobTickTable.c.status.in_([tick_status.value for tick_status in tick_statuses])\n )\n\n if self.has_instigators_table():\n query = query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id.is_(None),\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = query.where(JobTickTable.c.job_origin_id == origin_id)\n\n with self.connect() as conn:\n conn.execute(query)\n\n @property\n def supports_auto_materialize_asset_evaluations(self) -> bool:\n with self.connect() as conn:\n return self._has_asset_daemon_asset_evaluations_table(conn)\n\n def add_auto_materialize_asset_evaluations(\n self,\n evaluation_id: int,\n asset_evaluations: Sequence[AutoMaterializeAssetEvaluation],\n ):\n if not asset_evaluations:\n return\n\n with self.connect() as conn:\n bulk_insert = AssetDaemonAssetEvaluationsTable.insert().values(\n [\n {\n "evaluation_id": evaluation_id,\n "asset_key": evaluation.asset_key.to_string(),\n "asset_evaluation_body": serialize_value(evaluation),\n "num_requested": evaluation.num_requested,\n "num_skipped": evaluation.num_skipped,\n "num_discarded": evaluation.num_discarded,\n }\n for evaluation in asset_evaluations\n ]\n )\n conn.execute(bulk_insert)\n\n def get_auto_materialize_asset_evaluations(\n self, asset_key: AssetKey, limit: int, cursor: Optional[int] = None\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n with self.connect() as conn:\n query = (\n db_select(\n [\n AssetDaemonAssetEvaluationsTable.c.id,\n AssetDaemonAssetEvaluationsTable.c.asset_evaluation_body,\n AssetDaemonAssetEvaluationsTable.c.evaluation_id,\n AssetDaemonAssetEvaluationsTable.c.create_timestamp,\n AssetDaemonAssetEvaluationsTable.c.asset_key,\n ]\n )\n .where(AssetDaemonAssetEvaluationsTable.c.asset_key == asset_key.to_string())\n 
.order_by(AssetDaemonAssetEvaluationsTable.c.evaluation_id.desc())\n ).limit(limit)\n\n if cursor:\n query = query.where(AssetDaemonAssetEvaluationsTable.c.evaluation_id < cursor)\n\n rows = db_fetch_mappings(conn, query)\n return [AutoMaterializeAssetEvaluationRecord.from_db_row(row) for row in rows]\n\n def get_auto_materialize_evaluations_for_evaluation_id(\n self, evaluation_id: int\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n with self.connect() as conn:\n query = db_select(\n [\n AssetDaemonAssetEvaluationsTable.c.id,\n AssetDaemonAssetEvaluationsTable.c.asset_evaluation_body,\n AssetDaemonAssetEvaluationsTable.c.evaluation_id,\n AssetDaemonAssetEvaluationsTable.c.create_timestamp,\n AssetDaemonAssetEvaluationsTable.c.asset_key,\n ]\n ).where(AssetDaemonAssetEvaluationsTable.c.evaluation_id == evaluation_id)\n\n rows = db_fetch_mappings(conn, query)\n return [AutoMaterializeAssetEvaluationRecord.from_db_row(row) for row in rows]\n\n def purge_asset_evaluations(self, before: float):\n check.float_param(before, "before")\n\n utc_before = utc_datetime_from_timestamp(before)\n query = AssetDaemonAssetEvaluationsTable.delete().where(\n AssetDaemonAssetEvaluationsTable.c.create_timestamp < utc_before\n )\n\n with self.connect() as conn:\n conn.execute(query)\n\n def wipe(self) -> None:\n """Clears the schedule storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(JobTable.delete())\n conn.execute(JobTickTable.delete())\n if self._has_instigators_table(conn):\n conn.execute(InstigatorsTable.delete())\n if self._has_asset_daemon_asset_evaluations_table(conn):\n conn.execute(AssetDaemonAssetEvaluationsTable.delete())\n\n # MIGRATIONS\n\n def has_secondary_index_table(self) -> bool:\n with self.connect() as conn:\n return "secondary_indexes" in db.inspect(conn).get_table_names()\n\n def has_built_index(self, migration_name: str) -> bool:\n if not self.has_secondary_index_table():\n return False\n\n query = 
(\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str) -> None:\n query = SecondaryIndexMigrationTable.insert().values(\n name=migration_name,\n migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n def _execute_data_migrations(\n self,\n migrations: Mapping[str, Callable[..., Any]],\n print_fn: Optional[Callable] = None,\n force_rebuild_all: bool = False,\n ) -> None:\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(\n REQUIRED_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(\n OPTIONAL_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/sql_schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.sql_schedule_storage"}, "sqlite": {"sqlite_schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.sqlite.sqlite_schedule_storage

\nfrom contextlib import contextmanager\nfrom typing import Iterator, Optional\n\nimport sqlalchemy as db\nfrom packaging.version import parse\nfrom sqlalchemy.engine import Connection\nfrom sqlalchemy.pool import NullPool\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string, get_sqlite_version\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import ScheduleStorageSqlMetadata\nfrom ..sql_schedule_storage import SqlScheduleStorage\n\nMINIMUM_SQLITE_BATCH_VERSION = "3.25.0"\n\n\n
class SqliteScheduleStorage(SqlScheduleStorage, ConfigurableClass):
    """Local SQLite backed schedule storage."""

    def __init__(self, conn_string: str, inst_data: Optional[ConfigurableClassData] = None):
        """Store the connection string and optional configurable-class data.

        Args:
            conn_string (str): SQLAlchemy connection string for the SQLite database.
            inst_data (Optional[ConfigurableClassData]): Data describing how this
                instance was configured, if it was built from config.
        """
        check.str_param(conn_string, "conn_string")
        self._conn_string = conn_string
        self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)

        super().__init__()

    @property
    def inst_data(self) -> Optional[ConfigurableClassData]:
        """The configurable-class data this storage was constructed with, if any."""
        return self._inst_data

    @classmethod
    def config_type(cls) -> UserConfigSchema:
        """Config schema: a single ``base_dir`` string."""
        return {"base_dir": StringSource}

    @classmethod
    def from_config_value(
        cls, inst_data: Optional[ConfigurableClassData], config_value
    ) -> "SqliteScheduleStorage":
        """Build the storage from a validated config value (forwarded to ``from_local``)."""
        return SqliteScheduleStorage.from_local(inst_data=inst_data, **config_value)

    @classmethod
    def from_local(
        cls, base_dir: str, inst_data: Optional[ConfigurableClassData] = None
    ) -> "SqliteScheduleStorage":
        """Create a storage backed by a ``schedules`` database under ``base_dir``.

        The directory is created if needed. When the database has no alembic
        revision (or no head revision can be determined) the schema is created,
        WAL journaling is enabled, the alembic revision is stamped, and the data
        migrations are run on the new storage.

        Args:
            base_dir (str): Directory that holds the SQLite database file.
            inst_data (Optional[ConfigurableClassData]): Passed through to the constructor.

        Returns:
            SqliteScheduleStorage: The initialized storage.
        """
        check.str_param(base_dir, "base_dir")
        mkdir_p(base_dir)
        conn_string = create_db_conn_string(base_dir, "schedules")
        engine = create_engine(conn_string, poolclass=NullPool)
        alembic_config = get_alembic_config(__file__)

        should_migrate_data = False
        with engine.connect() as connection:
            db_revision, head_revision = check_alembic_revision(alembic_config, connection)
            if not (db_revision and head_revision):
                ScheduleStorageSqlMetadata.create_all(engine)
                connection.execute(db.text("PRAGMA journal_mode=WAL;"))
                stamp_alembic_rev(alembic_config, connection)
                should_migrate_data = True

        schedule_storage = cls(conn_string, inst_data)
        # Data migrations go through the storage's own connections, so they run
        # only after the schema-setup connection above has been closed.
        if should_migrate_data:
            schedule_storage.migrate()
            schedule_storage.optimize()

        return schedule_storage

    @contextmanager
    def connect(self) -> Iterator[Connection]:
        """Yield a connection inside a transaction, using a fresh non-pooled engine."""
        engine = create_engine(self._conn_string, poolclass=NullPool)
        with engine.connect() as conn:
            with conn.begin():
                yield conn

    @property
    def supports_batch_queries(self) -> bool:
        """Whether batch tick queries can be used with this database.

        Requires both the parent class's check and a SQLite library version of at
        least ``MINIMUM_SQLITE_BATCH_VERSION``.
        """
        # Evaluate the parent check exactly once: it is not a cheap attribute
        # read, so repeating it in the final expression only duplicated work.
        if not super().supports_batch_queries:
            return False

        # NOTE(review): the minimum version presumably corresponds to SQLite
        # gaining window-function support -- confirm against the batch query.
        return parse(get_sqlite_version()) >= parse(MINIMUM_SQLITE_BATCH_VERSION)

    def upgrade(self) -> None:
        """Run the alembic upgrade against this storage's database."""
        alembic_config = get_alembic_config(__file__)
        with self.connect() as conn:
            run_alembic_upgrade(alembic_config, conn)

    def alembic_version(self) -> AlembicVersion:
        """Return the result of the alembic revision check for this database."""
        alembic_config = get_alembic_config(__file__)
        with self.connect() as conn:
            return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/sqlite/sqlite_schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.sqlite.sqlite_schedule_storage"}}}, "upath_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.upath_io_manager

\nimport asyncio\nimport inspect\nfrom abc import abstractmethod\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union\n\nfrom fsspec import AbstractFileSystem\nfrom fsspec.implementations.local import LocalFileSystem\n\nfrom dagster import (\n    InputContext,\n    MetadataValue,\n    MultiPartitionKey,\n    OutputContext,\n    _check as check,\n)\nfrom dagster._core.storage.memoizable_io_manager import MemoizableIOManager\n\nif TYPE_CHECKING:\n    from upath import UPath\n\n\n
class UPathIOManager(MemoizableIOManager):
    """Abstract IOManager base class compatible with local and cloud storage via `universal-pathlib` and `fsspec`.

    Features:
     - handles partitioned assets
     - handles loading a single upstream partition
     - handles loading multiple upstream partitions (with respect to :py:class:`PartitionMapping`)
     - supports loading multiple partitions concurrently with async `load_from_path` method
     - the `get_metadata` method can be customized to add additional metadata to the output
     - the `allow_missing_partitions` metadata value can be set to `True` to skip missing partitions
       (the default behavior is to raise an error)

    """

    extension: Optional[str] = None  # override in child class

    def __init__(
        self,
        base_path: Optional["UPath"] = None,
    ):
        from upath import UPath

        # A configured extension is appended verbatim to paths, so it has to
        # carry its own dot (e.g. ".csv").
        assert not self.extension or "." in self.extension
        self._base_path = base_path or UPath(".")

    @abstractmethod
    def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):
        """Child classes should override this method to write the object to the filesystem."""

    @abstractmethod
    def load_from_path(self, context: InputContext, path: "UPath") -> Any:
        """Child classes should override this method to load the object from the filesystem."""

    @property
    def fs(self) -> AbstractFileSystem:
        """Utility function to get the IOManager filesystem.

        Returns:
            AbstractFileSystem: fsspec filesystem.

        Raises:
            ValueError: If the base path is neither a ``UPath`` nor a ``pathlib.Path``.
        """
        from upath import UPath

        if isinstance(self._base_path, UPath):
            return self._base_path.fs
        elif isinstance(self._base_path, Path):
            return LocalFileSystem()
        else:
            raise ValueError(f"Unsupported base_path type: {type(self._base_path)}")

    @property
    def storage_options(self) -> Dict[str, Any]:
        """Utility function to get the fsspec storage_options which are often consumed by various I/O functions.

        Returns:
            Dict[str, Any]: fsspec storage_options.

        Raises:
            ValueError: If the base path is neither a ``UPath`` nor a ``pathlib.Path``.
        """
        from upath import UPath

        if isinstance(self._base_path, UPath):
            # Copy so callers cannot mutate the options held by the base path.
            return self._base_path._kwargs.copy()  # noqa
        elif isinstance(self._base_path, Path):
            return {}
        else:
            raise ValueError(f"Unsupported base_path type: {type(self._base_path)}")

    def get_metadata(
        self,
        context: OutputContext,
        obj: Any,
    ) -> Dict[str, MetadataValue]:
        """Child classes should override this method to add custom metadata to the outputs."""
        return {}

    # Read/write operations on paths can generally be handled by methods on the
    # UPath class, but when the backend requires credentials, this isn't
    # always possible. Override these path_* methods to provide custom
    # implementations for targeting backends that require authentication.

    def unlink(self, path: "UPath") -> None:
        """Remove the file or object at the provided path."""
        path.unlink()

    def path_exists(self, path: "UPath") -> bool:
        """Check if a file or object exists at the provided path."""
        return path.exists()

    def make_directory(self, path: "UPath"):
        """Create a directory at the provided path.

        Override as a no-op if the target backend doesn't use directories.
        """
        path.mkdir(parents=True, exist_ok=True)

    def has_output(self, context: OutputContext) -> bool:
        """Return whether something already exists at the output's (non-partitioned) path."""
        return self.path_exists(self._get_path(context))

    @staticmethod
    def _allows_missing_partitions(context: InputContext) -> bool:
        """Read the `allow_missing_partitions` input metadata flag (defaults to False)."""
        if context.metadata is None:
            return False
        return context.metadata.get("allow_missing_partitions", False)

    def _with_extension(self, path: "UPath") -> "UPath":
        """Append the configured extension (if any) after the path's existing suffix."""
        return path.with_suffix(path.suffix + self.extension) if self.extension else path

    def _get_path_without_extension(self, context: Union[InputContext, OutputContext]) -> "UPath":
        """Join the base path with the asset- or op-output-relative path for the context."""
        if context.has_asset_key:
            context_path = self.get_asset_relative_path(context)
        else:
            # we are dealing with an op output
            context_path = self.get_op_output_relative_path(context)

        return self._base_path.joinpath(context_path)

    def get_asset_relative_path(self, context: Union[InputContext, OutputContext]) -> "UPath":
        """Return the relative path built from the components of the asset key."""
        from upath import UPath

        # we are not using context.get_asset_identifier() because it already includes the partition_key
        return UPath(*context.asset_key.path)

    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> "UPath":
        """Return the relative path built from the context's identifier components."""
        from upath import UPath

        return UPath(*context.get_identifier())

    def get_loading_input_log_message(self, path: "UPath") -> str:
        return f"Loading file from: {path} using {self.__class__.__name__}..."

    def get_writing_output_log_message(self, path: "UPath") -> str:
        return f"Writing file at: {path} using {self.__class__.__name__}..."

    def get_loading_input_partition_log_message(self, path: "UPath", partition_key: str) -> str:
        return f"Loading partition {partition_key} from {path} using {self.__class__.__name__}..."

    def get_missing_partition_log_message(self, partition_key: str) -> str:
        return (
            f"Couldn't load partition {partition_key} and skipped it "
            "because the input metadata includes allow_missing_partitions=True"
        )

    def _get_path(self, context: Union[InputContext, OutputContext]) -> "UPath":
        """Returns the I/O path for a given context.
        Should not be used with partitions (use `_get_paths_for_partitions` instead).
        """
        path = self._get_path_without_extension(context)
        return self._with_extension(path)

    def get_path_for_partition(
        self, context: Union[InputContext, OutputContext], path: "UPath", partition: str
    ) -> "UPath":
        """Override this method if you want to use a different partitioning scheme
        (for example, if the saving function handles partitioning instead).
        The extension will be added later.

        Args:
            context (Union[InputContext, OutputContext]): The context for the I/O operation.
            path (UPath): The path to the file or object.
            partition (str): Formatted partition/multipartition key

        Returns:
            UPath: The path to the file with the partition key appended.
        """
        return path / partition

    def _get_paths_for_partitions(
        self, context: Union[InputContext, OutputContext]
    ) -> Dict[str, "UPath"]:
        """Returns a dict of partition_keys into I/O paths for a given context.

        Raises:
            TypeError: If the context has no asset partitions.
        """
        if not context.has_asset_partitions:
            raise TypeError(
                f"Detected {context.dagster_type.typing_type} input type "
                "but the asset is not partitioned"
            )

        def _formatted_multipartitioned_path(partition_key: MultiPartitionKey) -> str:
            # Order the dimension values by dimension name so the layout is
            # stable, then nest them as directories.
            ordered_dimension_keys = [
                key[1]
                for key in sorted(partition_key.keys_by_dimension.items(), key=lambda x: x[0])
            ]
            return "/".join(ordered_dimension_keys)

        formatted_partition_keys = {
            partition_key: (
                _formatted_multipartitioned_path(partition_key)
                if isinstance(partition_key, MultiPartitionKey)
                else partition_key
            )
            for partition_key in context.asset_partition_keys
        }

        asset_path = self._get_path_without_extension(context)
        return {
            partition_key: self._with_extension(
                self.get_path_for_partition(context, asset_path, partition)
            )
            for partition_key, partition in formatted_partition_keys.items()
        }

    def _get_multipartition_backcompat_paths(
        self, context: Union[InputContext, OutputContext]
    ) -> Mapping[str, "UPath"]:
        """Return fallback paths for multi-partition keys only.

        These paths append the raw partition key to the asset path instead of the
        per-dimension nested form produced by `_get_paths_for_partitions`.

        Raises:
            TypeError: If the context has no asset partitions.
        """
        if not context.has_asset_partitions:
            raise TypeError(
                f"Detected {context.dagster_type.typing_type} input type "
                "but the asset is not partitioned"
            )

        partition_keys = context.asset_partition_keys

        asset_path = self._get_path_without_extension(context)
        return {
            partition_key: self._with_extension(asset_path / partition_key)
            for partition_key in partition_keys
            if isinstance(partition_key, MultiPartitionKey)
        }

    def _load_single_input(
        self, path: "UPath", context: InputContext, backcompat_path: Optional["UPath"] = None
    ) -> Any:
        """Load one object from `path`, falling back to `backcompat_path` when given.

        A coroutine returned by `load_from_path` is run to completion. If neither
        location exists, the FileNotFoundError for the primary path is raised.
        """
        context.log.debug(self.get_loading_input_log_message(path))
        try:
            obj = self.load_from_path(context=context, path=path)
            if asyncio.iscoroutine(obj):
                obj = asyncio.run(obj)
        except FileNotFoundError as e:
            if backcompat_path is not None:
                try:
                    obj = self.load_from_path(context=context, path=backcompat_path)
                    if asyncio.iscoroutine(obj):
                        obj = asyncio.run(obj)

                    context.log.debug(
                        f"File not found at {path}. Loaded instead from backcompat path:"
                        f" {backcompat_path}"
                    )
                except FileNotFoundError:
                    raise e
            else:
                raise e

        context.add_input_metadata({"path": MetadataValue.path(str(path))})
        return obj

    def _load_partition_from_path(
        self,
        context: InputContext,
        partition_key: str,
        path: "UPath",
        backcompat_path: Optional["UPath"] = None,
    ) -> Any:
        """1. Try to load the partition from the normal path.
        2. If it was not found, try to load it from the backcompat path.
        3. If allow_missing_partitions metadata is True, skip the partition if it was not found in any of the paths.
        Otherwise, raise an error.

        Args:
            context (InputContext): IOManager Input context
            partition_key (str): the partition key corresponding to the partition being loaded
            path (UPath): The path to the partition.
            backcompat_path (Optional[UPath]): The path to the partition in the backcompat location.

        Returns:
            Any: The object loaded from the partition, or None if the partition is
            missing and missing partitions are allowed.

        Raises:
            FileNotFoundError: If the partition was not found and missing partitions
                are not allowed (the error for the primary path is raised).
        """
        # NOTE(review): when `load_from_path` is a coroutine function, the calls
        # below only create a coroutine, so FileNotFoundError cannot surface here;
        # the async caller inspects the awaited results itself.
        try:
            context.log.debug(self.get_loading_input_partition_log_message(path, partition_key))
            return self.load_from_path(context=context, path=path)
        except FileNotFoundError as e:
            if backcompat_path is not None:
                try:
                    # Read the fallback location itself; retrying `path` here
                    # could only fail the same way again.
                    obj = self.load_from_path(context=context, path=backcompat_path)
                except FileNotFoundError:
                    pass  # missing in both places: handled below
                else:
                    context.log.debug(
                        f"File not found at {path}. Loaded instead from backcompat path:"
                        f" {backcompat_path}"
                    )
                    return obj

            if self._allows_missing_partitions(context):
                context.log.warning(self.get_missing_partition_log_message(partition_key))
                return None
            raise e

    def _load_multiple_inputs(self, context: InputContext) -> Dict[str, Any]:
        """Load every partition of the input and return them keyed by partition key.

        Partitions that were skipped (or loaded as None) are left out of the result.

        Raises:
            RuntimeError: In the async case, if any partition failed to load.
        """
        # load multiple partitions
        paths = self._get_paths_for_partitions(context)  # paths for normal partitions
        backcompat_paths = self._get_multipartition_backcompat_paths(
            context
        )  # paths for multipartitions

        context.log.debug(f"Loading {len(paths)} partitions...")

        if not inspect.iscoroutinefunction(self.load_from_path):
            objs = {}
            for partition_key in context.asset_partition_keys:
                obj = self._load_partition_from_path(
                    context,
                    partition_key,
                    paths[partition_key],
                    backcompat_paths.get(partition_key),
                )
                if obj is not None:  # in case some partitions were skipped
                    objs[partition_key] = obj
            return objs

        # load_from_path returns a coroutine, so we need to await the results

        async def collect() -> Dict[str, Any]:
            loop = asyncio.get_running_loop()

            tasks = [
                loop.create_task(
                    self._load_partition_from_path(
                        context,
                        partition_key,
                        paths[partition_key],
                        backcompat_paths.get(partition_key),
                    )
                )
                for partition_key in context.asset_partition_keys
            ]

            results = await asyncio.gather(*tasks, return_exceptions=True)

            # need to handle missing partitions here because exceptions don't get propagated from async calls
            allow_missing_partitions = self._allows_missing_partitions(context)

            # Key the loaded objects by partition so that skipped partitions
            # cannot shift later objects onto the wrong partition key.
            loaded: Dict[str, Any] = {}
            found_errors = False
            for partition_key, result in zip(context.asset_partition_keys, results):
                if isinstance(result, FileNotFoundError):
                    if allow_missing_partitions:
                        context.log.warning(
                            self.get_missing_partition_log_message(partition_key)
                        )
                    else:
                        context.log.error(str(result))
                        found_errors = True
                elif isinstance(result, Exception):
                    context.log.error(str(result))
                    found_errors = True
                else:
                    loaded[partition_key] = result

            if found_errors:
                raise RuntimeError(
                    f"{len(paths) - len(loaded)} partitions could not be loaded"
                )

            return loaded

        awaited_objects = asyncio.get_event_loop().run_until_complete(collect())

        return {
            partition_key: awaited_object
            for partition_key, awaited_object in awaited_objects.items()
            if awaited_object is not None
        }

    def load_input(self, context: InputContext) -> Union[Any, Dict[str, Any]]:
        """Load the input for `context`.

        Returns a single object for op outputs, non-partitioned assets and
        single-partition inputs; None when there are no partition keys; and a dict
        keyed by partition key when several partitions are requested.
        """
        # If no asset key, we are dealing with an op output which is always non-partitioned
        if not context.has_asset_key or not context.has_asset_partitions:
            path = self._get_path(context)
            return self._load_single_input(path, context)
        else:
            asset_partition_keys = context.asset_partition_keys
            if len(asset_partition_keys) == 0:
                return None
            elif len(asset_partition_keys) == 1:
                paths = self._get_paths_for_partitions(context)
                check.invariant(len(paths) == 1, f"Expected 1 path, but got {len(paths)}")
                path = next(iter(paths.values()))
                backcompat_paths = self._get_multipartition_backcompat_paths(context)
                backcompat_path = (
                    None if not backcompat_paths else next(iter(backcompat_paths.values()))
                )

                return self._load_single_input(path, context, backcompat_path)
            else:  # we are dealing with multiple partitions of an asset
                type_annotation = context.dagster_type.typing_type
                if type_annotation != Any and not is_dict_type(type_annotation):
                    check.failed(
                        "Loading an input that corresponds to multiple partitions, but the"
                        " type annotation on the op input is not a dict, Dict, Mapping, or"
                        f" Any: is '{type_annotation}'."
                    )

                return self._load_multiple_inputs(context)

    def handle_output(self, context: OutputContext, obj: Any):
        """Write `obj` to the path for `context` and attach path and custom metadata.

        Outputs typed as None are not written; they must actually be None.
        """
        if context.dagster_type.typing_type == type(None):
            check.invariant(
                obj is None,
                "Output had Nothing type or 'None' annotation, but handle_output received"
                f" value that was not None and was of type {type(obj)}.",
            )
            return None

        if context.has_asset_partitions:
            paths = self._get_paths_for_partitions(context)

            check.invariant(
                len(paths) == 1,
                f"The current IO manager {type(self)} does not support persisting an output"
                " associated with multiple partitions. This error is likely occurring because a"
                " backfill was launched using the 'single run' option. Instead, launch the"
                " backfill with the 'multiple runs' option.",
            )

            path = next(iter(paths.values()))
        else:
            path = self._get_path(context)
        self.make_directory(path.parent)
        context.log.debug(self.get_writing_output_log_message(path))
        self.dump_to_path(context=context, obj=obj, path=path)

        metadata = {"path": MetadataValue.path(str(path))}
        custom_metadata = self.get_metadata(context=context, obj=obj)
        metadata.update(custom_metadata)  # type: ignore

        context.add_output_metadata(metadata)
\n\n\ndef is_dict_type(type_obj) -> bool:\n if type_obj == dict:\n return True\n\n if hasattr(type_obj, "__origin__") and type_obj.__origin__ in (dict, Dict, Mapping):\n return True\n\n return False\n
", "current_page_name": "_modules/dagster/_core/storage/upath_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.upath_io_manager"}}, "types": {"config_schema": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.config_schema

\nimport hashlib\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Iterator, Optional, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param\nfrom dagster._config import ConfigType\nfrom dagster._core.decorator_utils import get_function_params, validate_expected_params\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..definitions.resource_requirement import (\n    ResourceRequirement,\n    TypeLoaderResourceRequirement,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.system import (\n        DagsterTypeLoaderContext,\n    )\n\n\n
[docs]class DagsterTypeLoader(ABC):\n """Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\n to.\n\n The recommended way to define a type loader is with the\n :py:func:`@dagster_type_loader <dagster_type_loader>` decorator.\n """\n\n @property\n @abstractmethod\n def schema_type(self) -> ConfigType:\n pass\n\n @property\n def loader_version(self) -> Optional[str]:\n return None\n\n def compute_loaded_input_version(self, _config_value: object) -> Optional[str]:\n return None\n\n def construct_from_config_value(\n self, _context: "DagsterTypeLoaderContext", config_value: object\n ) -> object:\n """How to create a runtime value from config data."""\n return config_value\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset()\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n type_display_name = cast(str, outer_context)\n for resource_key in sorted(list(self.required_resource_keys())):\n yield TypeLoaderResourceRequirement(\n key=resource_key, type_display_name=type_display_name\n )
\n\n\n@experimental_param(param="loader_version")\n@experimental_param(param="external_version_fn")\nclass DagsterTypeLoaderFromDecorator(DagsterTypeLoader):\n def __init__(\n self,\n config_type,\n func,\n required_resource_keys,\n loader_version=None,\n external_version_fn=None,\n ):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._loader_version = check.opt_str_param(loader_version, "loader_version")\n self._external_version_fn = check.opt_callable_param(\n external_version_fn, "external_version_fn"\n )\n\n @property\n def schema_type(self) -> ConfigType:\n return self._config_type\n\n @property\n def loader_version(self) -> Optional[str]:\n return self._loader_version\n\n def compute_loaded_input_version(self, config_value: object) -> Optional[str]:\n """Compute the type-loaded input from a given config_value.\n\n Args:\n config_value (object): Config value to be ingested by the external version\n loading function.\n\n Returns:\n Optional[str]: Hash of concatenated loader version and external input version if both\n are provided, else None.\n """\n version = ""\n if self.loader_version:\n version += str(self.loader_version)\n if self._external_version_fn:\n ext_version = self._external_version_fn(config_value)\n version += str(ext_version)\n\n if version == "":\n return None # Sentinel value for no version provided.\n else:\n return hashlib.sha1(version.encode("utf-8")).hexdigest()\n\n def construct_from_config_value(\n self, context: "DagsterTypeLoaderContext", config_value: object\n ):\n return self._func(context, config_value)\n\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n\ndef _create_type_loader_for_decorator(\n config_type: ConfigType,\n func,\n required_resource_keys: Optional[AbstractSet[str]],\n 
loader_version: Optional[str] = None,\n external_version_fn: Optional[Callable[[object], str]] = None,\n):\n return DagsterTypeLoaderFromDecorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n\nDagsterTypeLoaderFn: TypeAlias = Callable[["DagsterTypeLoaderContext", Any], Any]\n\n\n
[docs]def dagster_type_loader(\n config_schema: object,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n loader_version: Optional[str] = None,\n external_version_fn: Optional[Callable[[object], str]] = None,\n) -> Callable[[DagsterTypeLoaderFn], DagsterTypeLoaderFromDecorator]:\n """Create an dagster type loader that maps config data to a runtime value.\n\n The decorated function should take the execution context and parsed config value and return the\n appropriate runtime value.\n\n Args:\n config_schema (ConfigSchema): The schema for the config that's passed to the decorated\n function.\n loader_version (str): (Experimental) The version of the decorated compute function. Two\n loading functions should have the same version if and only if they deterministically\n produce the same outputs when provided the same inputs.\n external_version_fn (Callable): (Experimental) A function that takes in the same parameters as the loader\n function (config_value) and returns a representation of the version of the external\n asset (str). Two external assets with identical versions are treated as identical to one\n another.\n\n Examples:\n .. code-block:: python\n\n @dagster_type_loader(Permissive())\n def load_dict(_context, value):\n return value\n """\n from dagster._config import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n assert isinstance(\n config_type, ConfigType\n ), f"{config_schema} could not be resolved to config type"\n EXPECTED_POSITIONALS = ["context", "*"]\n\n def wrapper(func: DagsterTypeLoaderFn) -> DagsterTypeLoaderFromDecorator:\n params = get_function_params(func)\n missing_positional = validate_expected_params(params, EXPECTED_POSITIONALS)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@dagster_type_loader '{func.__name__}' decorated function does not have required"\n f" positional parameter '{missing_positional}'. 
@dagster_type_loader decorated"\n " functions should only have keyword arguments that match input names and a first"\n " positional parameter named 'context'."\n )\n\n return _create_type_loader_for_decorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/_core/types/config_schema", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.config_schema"}, "dagster_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.dagster_type

\nimport typing as t\nfrom abc import abstractmethod\nfrom enum import Enum as PythonEnum\nfrom functools import partial\nfrom typing import (\n    AbstractSet as TypingAbstractSet,\n    AnyStr,\n    Iterator as TypingIterator,\n    Mapping,\n    Optional as TypingOptional,\n    Sequence,\n    Type as TypingType,\n    cast,\n)\n\nfrom typing_extensions import get_args, get_origin\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import (\n    Array,\n    ConfigType,\n    Noneable as ConfigNoneable,\n)\nfrom dagster._core.definitions.events import DynamicOutput, Output, TypeCheck\nfrom dagster._core.definitions.metadata import (\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._seven import is_subclass\n\nfrom ..definitions.resource_requirement import (\n    RequiresResources,\n    ResourceRequirement,\n    TypeResourceRequirement,\n)\nfrom .builtin_config_schemas import BuiltinSchemas\nfrom .config_schema import DagsterTypeLoader\n\nif t.TYPE_CHECKING:\n    from dagster._core.definitions.node_definition import NodeDefinition\n    from dagster._core.execution.context.system import DagsterTypeLoaderContext, TypeCheckContext\n\nTypeCheckFn = t.Callable[["TypeCheckContext", AnyStr], t.Union[TypeCheck, bool]]\n\n\n@whitelist_for_serdes\nclass DagsterTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    LIST = "LIST"\n    NOTHING = "NOTHING"\n    NULLABLE = "NULLABLE"\n    REGULAR = "REGULAR"\n\n\n
[docs]class DagsterType(RequiresResources):\n """Define a type in dagster. These can be used in the inputs and outputs of ops.\n\n Args:\n type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]):\n The function that defines the type check. It takes the value flowing\n through the input or output of the op. If it passes, return either\n ``True`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``True``. If it fails,\n return either ``False`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``False``.\n The first argument must be named ``context`` (or, if unused, ``_``, ``_context``, or ``context_``).\n Use ``required_resource_keys`` for access to resources.\n key (Optional[str]): The unique key to identify types programmatically.\n The key property always has a value. If you omit key to the argument\n to the init function, it instead receives the value of ``name``. If\n neither ``key`` nor ``name`` is provided, a ``CheckError`` is thrown.\n\n In the case of a generic type such as ``List`` or ``Optional``, this is\n generated programmatically based on the type parameters.\n\n For most use cases, name should be set and the key argument should\n not be specified.\n name (Optional[str]): A unique name given by a user. If ``key`` is ``None``, ``key``\n becomes this value. Name is not given in a case where the user does\n not specify a unique name for this type, such as a generic class.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. 
As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n required_resource_keys (Optional[Set[str]]): Resource keys required by the ``type_check_fn``.\n is_builtin (bool): Defaults to False. This is used by tools to display or\n filter built-in types (such as :py:class:`~dagster.String`, :py:class:`~dagster.Int`) to visually distinguish\n them from user-defined types. Meant for internal use.\n kind (DagsterTypeKind): Defaults to None. This is used to determine the kind of runtime type\n for InputDefinition and OutputDefinition type checking.\n typing_type: Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\n value contained within the DagsterType. Meant for internal use.\n """\n\n def __init__(\n self,\n type_check_fn: TypeCheckFn,\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n loader: t.Optional[DagsterTypeLoader] = None,\n required_resource_keys: t.Optional[t.Set[str]] = None,\n kind: DagsterTypeKind = DagsterTypeKind.REGULAR,\n typing_type: t.Any = t.Any,\n metadata: t.Optional[t.Mapping[str, RawMetadataValue]] = None,\n ):\n check.opt_str_param(key, "key")\n check.opt_str_param(name, "name")\n\n check.invariant(not (name is None and key is None), "Must set key or name")\n if name is None:\n key = check.not_none(\n key,\n "If name is not provided, must provide key.",\n )\n self.key, self._name = key, None\n elif key is None:\n name = check.not_none(\n name,\n "If key is not provided, must provide name.",\n )\n self.key, self._name = name, name\n else:\n check.invariant(key and name)\n self.key, self._name = key, name\n\n self._description = check.opt_str_param(description, "description")\n self._loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)\n\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys,\n "required_resource_keys",\n 
)\n\n self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")\n _validate_type_check_fn(self._type_check_fn, self._name)\n\n self.is_builtin = check.bool_param(is_builtin, "is_builtin")\n check.invariant(\n self.display_name is not None,\n f"All types must have a valid display name, got None for key {key}",\n )\n\n self.kind = check.inst_param(kind, "kind", DagsterTypeKind)\n\n self._typing_type = typing_type\n\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n
[docs] @public\n def type_check(self, context: "TypeCheckContext", value: object) -> TypeCheck:\n """Type check the value against the type.\n\n Args:\n context (TypeCheckContext): The context of the type check.\n value (Any): The value to check.\n\n Returns:\n TypeCheck: The result of the type check.\n """\n retval = self._type_check_fn(context, value)\n\n if not isinstance(retval, (bool, TypeCheck)):\n raise DagsterInvariantViolationError(\n f"You have returned {retval!r} of type {type(retval)} from the type "\n f'check function of type "{self.key}". Return value must be instance '\n "of TypeCheck or a bool."\n )\n\n return TypeCheck(success=retval) if isinstance(retval, bool) else retval
\n\n def __eq__(self, other):\n return isinstance(other, DagsterType) and self.key == other.key\n\n def __ne__(self, other):\n return not self.__eq__(other)\n\n def __hash__(self):\n return hash(self.key)\n\n @staticmethod\n def from_builtin_enum(builtin_enum) -> "DagsterType":\n check.invariant(BuiltinEnum.contains(builtin_enum), "must be member of BuiltinEnum")\n return _RUNTIME_MAP[builtin_enum]\n\n @property\n def metadata(self) -> t.Mapping[str, MetadataValue]:\n return self._metadata\n\n @public\n @property\n def required_resource_keys(self) -> TypingAbstractSet[str]:\n """AbstractSet[str]: Set of resource keys required by the type check function."""\n return self._required_resource_keys\n\n @public\n @property\n def display_name(self) -> str:\n """Either the name or key (if name is `None`) of the type, overridden in many subclasses."""\n return cast(str, self._name or self.key)\n\n @public\n @property\n def unique_name(self) -> t.Optional[str]:\n """The unique name of this type. Can be None if the type is not unique, such as container types."""\n # TODO: docstring and body inconsistent-- can this be None or not?\n check.invariant(\n self._name is not None,\n f"unique_name requested but is None for type {self.display_name}",\n )\n return self._name\n\n @public\n @property\n def has_unique_name(self) -> bool:\n """bool: Whether the type has a unique name."""\n return self._name is not None\n\n @public\n @property\n def typing_type(self) -> t.Any:\n """Any: The python typing type for this type."""\n return self._typing_type\n\n @public\n @property\n def loader(self) -> t.Optional[DagsterTypeLoader]:\n """Optional[DagsterTypeLoader]: Loader for this type, if any."""\n return self._loader\n\n @public\n @property\n def description(self) -> t.Optional[str]:\n """Optional[str]: Description of the type, or None if not provided."""\n return self._description\n\n @property\n def inner_types(self) -> t.Sequence["DagsterType"]:\n return []\n\n @property\n def 
loader_schema_key(self) -> t.Optional[str]:\n return self.loader.schema_type.key if self.loader else None\n\n @property\n def type_param_keys(self) -> t.Sequence[str]:\n return []\n\n @property\n def is_nothing(self) -> bool:\n return self.kind == DagsterTypeKind.NOTHING\n\n @property\n def supports_fan_in(self) -> bool:\n return False\n\n def get_inner_type_for_fan_in(self) -> "DagsterType":\n check.failed(\n "DagsterType {name} does not support fan-in, should have checked supports_fan_in before"\n " calling getter.".format(name=self.display_name)\n )\n\n def get_resource_requirements(\n self, _outer_context: TypingOptional[object] = None\n ) -> TypingIterator[ResourceRequirement]:\n for resource_key in sorted(list(self.required_resource_keys)):\n yield TypeResourceRequirement(key=resource_key, type_display_name=self.display_name)\n if self.loader:\n yield from self.loader.get_resource_requirements(outer_context=self.display_name)
\n\n\ndef _validate_type_check_fn(fn: t.Callable, name: t.Optional[str]) -> bool:\n from dagster._seven import get_arg_names\n\n args = get_arg_names(fn)\n\n # py2 doesn't filter out self\n if len(args) >= 1 and args[0] == "self":\n args = args[1:]\n\n if len(args) == 2:\n possible_names = {\n "_",\n "context",\n "_context",\n "context_",\n }\n if args[0] not in possible_names:\n DagsterInvalidDefinitionError(\n f'type_check function on type "{name}" must have first '\n 'argument named "context" (or _, _context, context_).'\n )\n return True\n\n raise DagsterInvalidDefinitionError(\n f'type_check_fn argument on type "{name}" must take 2 arguments, received {len(args)}.'\n )\n\n\nclass BuiltinScalarDagsterType(DagsterType):\n def __init__(self, name: str, type_check_fn: TypeCheckFn, typing_type: t.Type, **kwargs):\n super(BuiltinScalarDagsterType, self).__init__(\n key=name,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=type_check_fn,\n is_builtin=True,\n typing_type=typing_type,\n **kwargs,\n )\n\n # This is passed to the constructor of subclasses as the argument `type_check_fn`-- that's why\n # it exists together with the `type_check_fn` arg.\n def type_check_fn(self, _context, value) -> TypeCheck:\n return self.type_check_scalar_value(value)\n\n @abstractmethod\n def type_check_scalar_value(self, _value) -> TypeCheck:\n raise NotImplementedError()\n\n\ndef _typemismatch_error_str(value: object, expected_type_desc: str) -> str:\n return 'Value "{value}" of python type "{python_type}" must be a {type_desc}.'.format(\n value=value, python_type=type(value).__name__, type_desc=expected_type_desc\n )\n\n\ndef _fail_if_not_of_type(\n value: object, value_type: t.Type[t.Any], value_type_desc: str\n) -> TypeCheck:\n if not isinstance(value, value_type):\n return TypeCheck(success=False, description=_typemismatch_error_str(value, value_type_desc))\n\n return TypeCheck(success=True)\n\n\nclass _Int(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Int, 
self).__init__(\n name="Int",\n loader=BuiltinSchemas.INT_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=int,\n )\n\n def type_check_scalar_value(self, value) -> TypeCheck:\n return _fail_if_not_of_type(value, int, "int")\n\n\nclass _String(BuiltinScalarDagsterType):\n def __init__(self):\n super(_String, self).__init__(\n name="String",\n loader=BuiltinSchemas.STRING_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=str,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\nclass _Float(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Float, self).__init__(\n name="Float",\n loader=BuiltinSchemas.FLOAT_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=float,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, float, "float")\n\n\nclass _Bool(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Bool, self).__init__(\n name="Bool",\n loader=BuiltinSchemas.BOOL_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=bool,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, bool, "bool")\n\n\nclass Anyish(DagsterType):\n def __init__(\n self,\n key: t.Optional[str],\n name: t.Optional[str],\n loader: t.Optional[DagsterTypeLoader] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n ):\n super(Anyish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.ANY,\n loader=loader,\n is_builtin=is_builtin,\n type_check_fn=self.type_check_method,\n description=description,\n typing_type=t.Any,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", _value: object) -> TypeCheck:\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n # Anyish all the way down\n return self\n\n\nclass _Any(Anyish):\n def 
__init__(self):\n super(_Any, self).__init__(\n key="Any",\n name="Any",\n loader=BuiltinSchemas.ANY_INPUT,\n is_builtin=True,\n )\n\n\ndef create_any_type(\n name: str,\n loader: t.Optional[DagsterTypeLoader] = None,\n description: t.Optional[str] = None,\n) -> Anyish:\n return Anyish(\n key=name,\n name=name,\n description=description,\n loader=loader,\n )\n\n\nclass _Nothing(DagsterType):\n def __init__(self):\n super(_Nothing, self).__init__(\n key="Nothing",\n name="Nothing",\n kind=DagsterTypeKind.NOTHING,\n loader=None,\n type_check_fn=self.type_check_method,\n is_builtin=True,\n typing_type=type(None),\n )\n\n def type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n if value is not None:\n return TypeCheck(\n success=False,\n description=f"Value must be None, got a {type(value)}",\n )\n\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n return self\n\n\ndef isinstance_type_check_fn(\n expected_python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n dagster_type_name: str,\n expected_python_type_str: str,\n) -> TypeCheckFn:\n def type_check(_context: "TypeCheckContext", value: object) -> TypeCheck:\n if not isinstance(value, expected_python_type):\n return TypeCheck(\n success=False,\n description=(\n f"Value of type {type(value)} failed type check for Dagster type"\n f" {dagster_type_name}, expected value to be of Python type"\n f" {expected_python_type_str}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check\n\n\n
[docs]class PythonObjectDagsterType(DagsterType):\n """Define a type in dagster whose typecheck is an isinstance check.\n\n Specifically, the type can either be a single python type (e.g. int),\n or a tuple of types (e.g. (int, float)) which is treated as a union.\n\n Examples:\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=int)\n assert ntype.name == 'int'\n assert_success(ntype, 1)\n assert_failure(ntype, 'a')\n\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=(int, float))\n assert ntype.name == 'Union[int, float]'\n assert_success(ntype, 1)\n assert_success(ntype, 1.5)\n assert_failure(ntype, 'a')\n\n\n Args:\n python_type (Union[Type, Tuple[Type, ...]): The dagster typecheck function calls instanceof on\n this type.\n name (Optional[str]): Name the type. Defaults to the name of ``python_type``.\n key (Optional[str]): Key of the type. Defaults to name.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. 
As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n """\n\n def __init__(\n self,\n python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n **kwargs,\n ):\n if isinstance(python_type, tuple):\n self.python_type = check.tuple_param(\n python_type, "python_type", of_shape=tuple(type for item in python_type)\n )\n self.type_str = "Union[{}]".format(\n ", ".join(python_type.__name__ for python_type in python_type)\n )\n typing_type = t.Union[python_type] # type: ignore\n\n else:\n self.python_type = check.class_param(python_type, "python_type")\n self.type_str = cast(str, python_type.__name__)\n typing_type = self.python_type\n name = check.opt_str_param(name, "name", self.type_str)\n key = check.opt_str_param(key, "key", name)\n super(PythonObjectDagsterType, self).__init__(\n key=key,\n name=name,\n type_check_fn=isinstance_type_check_fn(python_type, name, self.type_str),\n typing_type=typing_type,\n **kwargs,\n )
\n\n\nclass NoneableInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type: DagsterType):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n self._inner_loader = check.not_none_param(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = ConfigNoneable(self._inner_loader.schema_type)\n\n @property\n def schema_type(self) -> ConfigType:\n return self._schema_type\n\n def construct_from_config_value(\n self, context: "DagsterTypeLoaderContext", config_value: object\n ) -> object:\n if config_value is None:\n return None\n return self._inner_loader.construct_from_config_value(context, config_value)\n\n\ndef _create_nullable_input_schema(inner_type: DagsterType) -> t.Optional[DagsterTypeLoader]:\n if not inner_type.loader:\n return None\n\n return NoneableInputSchema(inner_type)\n\n\nclass OptionalType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n inner_type = resolve_dagster_type(inner_type)\n\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError(\n "Type Nothing can not be wrapped in List or Optional"\n )\n\n key = "Optional." 
+ cast(str, inner_type.key)\n self.inner_type = inner_type\n super(OptionalType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.NULLABLE,\n type_check_fn=self.type_check_method,\n loader=_create_nullable_input_schema(inner_type),\n # This throws a type error with Py\n typing_type=t.Optional[inner_type.typing_type],\n )\n\n @property\n def display_name(self) -> str:\n return self.inner_type.display_name + "?"\n\n def type_check_method(self, context, value):\n return (\n TypeCheck(success=True) if value is None else self.inner_type.type_check(context, value)\n )\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return self.inner_type.supports_fan_in\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type.get_inner_type_for_fan_in()\n\n\nclass ListInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = Array(inner_dagster_type.loader.schema_type)\n\n @property\n def schema_type(self):\n return self._schema_type\n\n def construct_from_config_value(self, context, config_value):\n convert_item = partial(self._inner_dagster_type.loader.construct_from_config_value, context)\n return list(map(convert_item, config_value))\n\n\ndef _create_list_input_schema(inner_type):\n if not inner_type.loader:\n return None\n\n return ListInputSchema(inner_type)\n\n\nclass ListType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n key = "List." 
+ inner_type.key\n self.inner_type = inner_type\n super(ListType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.LIST,\n type_check_fn=self.type_check_method,\n loader=_create_list_input_schema(inner_type),\n typing_type=t.List[inner_type.typing_type],\n )\n\n @property\n def display_name(self):\n return "[" + self.inner_type.display_name + "]"\n\n def type_check_method(self, context, value):\n value_check = _fail_if_not_of_type(value, list, "list")\n if not value_check.success:\n return value_check\n\n for item in value:\n item_check = self.inner_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type\n\n\nclass DagsterListApi:\n def __getitem__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(resolve_dagster_type(inner_type))\n\n def __call__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(inner_type)\n\n\nList: DagsterListApi = DagsterListApi()\n\n\ndef _List(inner_type):\n check.inst_param(inner_type, "inner_type", DagsterType)\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError("Type Nothing can not be wrapped in List or Optional")\n return ListType(inner_type)\n\n\nclass Stringish(DagsterType):\n def __init__(self, key: t.Optional[str] = None, name: t.Optional[str] = None, **kwargs):\n name = check.opt_str_param(name, "name", type(self).__name__)\n key = check.opt_str_param(key, "key", name)\n super(Stringish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=self.type_check_method,\n loader=BuiltinSchemas.STRING_INPUT,\n typing_type=str,\n **kwargs,\n )\n\n def 
type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\ndef create_string_type(name, description=None):\n return Stringish(name=name, key=name, description=description)\n\n\nAny = _Any()\nBool = _Bool()\nFloat = _Float()\nInt = _Int()\nString = _String()\nNothing = _Nothing()\n\n_RUNTIME_MAP = {\n BuiltinEnum.ANY: Any,\n BuiltinEnum.BOOL: Bool,\n BuiltinEnum.FLOAT: Float,\n BuiltinEnum.INT: Int,\n BuiltinEnum.STRING: String,\n BuiltinEnum.NOTHING: Nothing,\n}\n\n_PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY: t.Dict[type, DagsterType] = {}\n"""Python types corresponding to user-defined RunTime types created using @map_to_dagster_type or\nas_dagster_type are registered here so that we can remap the Python types to runtime types."""\n\n\n
[docs]def make_python_type_usable_as_dagster_type(\n python_type: TypingType[t.Any], dagster_type: DagsterType\n) -> None:\n """Take any existing python type and map it to a dagster type (generally created with\n :py:class:`DagsterType <dagster.DagsterType>`) This can only be called once\n on a given python type.\n """\n check.inst_param(python_type, "python_type", type)\n check.inst_param(dagster_type, "dagster_type", DagsterType)\n registered_dagster_type = _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY.get(python_type)\n\n if registered_dagster_type is None:\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n elif registered_dagster_type is not dagster_type:\n # This would be just a great place to insert a short URL pointing to the type system\n # documentation into the error message\n # https://github.com/dagster-io/dagster/issues/1831\n if isinstance(registered_dagster_type, TypeHintInferredDagsterType):\n raise DagsterInvalidDefinitionError(\n "A Dagster type has already been registered for the Python type "\n f'{python_type}. The Dagster type was "auto-registered" - i.e. a solid definition '\n "used the Python type as an annotation for one of its arguments or for its return "\n "value before make_python_type_usable_as_dagster_type was called, and we "\n "generated a Dagster type to correspond to it. To override the auto-generated "\n "Dagster type, call make_python_type_usable_as_dagster_type before any solid "\n "definitions refer to the Python type."\n )\n else:\n raise DagsterInvalidDefinitionError(\n "A Dagster type has already been registered for the Python type "\n f"{python_type}. make_python_type_usable_as_dagster_type can only "\n "be called once on a python type as it is registering a 1:1 mapping "\n "between that python type and a dagster type."\n )
\n\n\nDAGSTER_INVALID_TYPE_ERROR_MESSAGE = (\n "Invalid type: dagster_type must be an instance of DagsterType or a Python type: "\n "got {dagster_type}{additional_msg}"\n)\n\n\nclass TypeHintInferredDagsterType(DagsterType):\n def __init__(self, python_type: t.Type):\n qualified_name = f"{python_type.__module__}.{python_type.__name__}"\n self.python_type = python_type\n super(TypeHintInferredDagsterType, self).__init__(\n key=f"_TypeHintInferred[{qualified_name}]",\n description=(\n f"DagsterType created from a type hint for the Python type {qualified_name}"\n ),\n type_check_fn=isinstance_type_check_fn(\n python_type, python_type.__name__, qualified_name\n ),\n typing_type=python_type,\n )\n\n @property\n def display_name(self) -> str:\n return self.python_type.__name__\n\n\ndef resolve_dagster_type(dagster_type: object) -> DagsterType:\n # circular dep\n from dagster._utils.typing_api import is_typing_type\n\n from ..definitions.result import MaterializeResult\n from .primitive_mapping import (\n is_supported_runtime_python_builtin,\n remap_python_builtin_for_runtime,\n )\n from .python_dict import (\n Dict as DDict,\n PythonDict,\n )\n from .python_set import DagsterSetApi, PythonSet\n from .python_tuple import DagsterTupleApi, PythonTuple\n from .transform_typing import transform_typing_type\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, DagsterType)),\n f"Do not pass runtime type classes. 
Got {dagster_type}",\n )\n\n # First, check to see if we're using Dagster's generic output type to do the type catching.\n if is_generic_output_annotation(dagster_type):\n type_args = get_args(dagster_type)\n # If no inner type was provided, forward Any type.\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n elif is_dynamic_output_annotation(dagster_type):\n dynamic_out_annotation = get_args(dagster_type)[0]\n type_args = get_args(dynamic_out_annotation)\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n elif dagster_type == MaterializeResult:\n # convert MaterializeResult type annotation to Nothing until returning\n # scalar values via MaterializeResult is supported\n # https://github.com/dagster-io/dagster/issues/16887\n dagster_type = Nothing\n\n # Then, check to see if it is part of python's typing library\n if is_typing_type(dagster_type):\n dagster_type = transform_typing_type(dagster_type)\n if isinstance(dagster_type, DagsterType):\n return dagster_type\n\n # Test for unhashable objects -- this is if, for instance, someone has passed us an instance of\n # a dict where they meant to pass dict or Dict, etc.\n try:\n hash(dagster_type)\n except TypeError as e:\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n additional_msg=(\n ", which isn't hashable. 
Did you pass an instance of a type instead of "\n "the type?"\n ),\n dagster_type=str(dagster_type),\n )\n ) from e\n\n if BuiltinEnum.contains(dagster_type):\n return DagsterType.from_builtin_enum(dagster_type)\n\n if is_supported_runtime_python_builtin(dagster_type):\n return remap_python_builtin_for_runtime(dagster_type)\n\n if dagster_type is None:\n return Any\n\n if dagster_type is DDict:\n return PythonDict\n if isinstance(dagster_type, DagsterTupleApi):\n return PythonTuple\n if isinstance(dagster_type, DagsterSetApi):\n return PythonSet\n if isinstance(dagster_type, DagsterListApi):\n return List(Any)\n\n if isinstance(dagster_type, type):\n return resolve_python_type_to_dagster_type(dagster_type)\n\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n dagster_type=str(dagster_type), additional_msg="."\n )\n )\n\n\ndef is_dynamic_output_annotation(dagster_type: object) -> bool:\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n f"Do not pass runtime type classes. Got {dagster_type}",\n )\n\n if dagster_type == DynamicOutput or get_origin(dagster_type) == DynamicOutput:\n raise DagsterInvariantViolationError(\n "Op annotated with return type DynamicOutput. DynamicOutputs can only be returned in"\n " the context of a List. 
If only one output is needed, use the Output API."\n )\n\n if get_origin(dagster_type) == list and len(get_args(dagster_type)) == 1:\n list_inner_type = get_args(dagster_type)[0]\n return list_inner_type == DynamicOutput or get_origin(list_inner_type) == DynamicOutput\n return False\n\n\ndef is_generic_output_annotation(dagster_type: object) -> bool:\n return dagster_type == Output or get_origin(dagster_type) == Output\n\n\ndef resolve_python_type_to_dagster_type(python_type: t.Type) -> DagsterType:\n """Resolves a Python type to a Dagster type."""\n check.inst_param(python_type, "python_type", type)\n\n if python_type in _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY:\n return _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type]\n else:\n dagster_type = TypeHintInferredDagsterType(python_type)\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n return dagster_type\n\n\nALL_RUNTIME_BUILTINS = list(_RUNTIME_MAP.values())\n\n\ndef construct_dagster_type_dictionary(\n node_defs: Sequence["NodeDefinition"],\n) -> Mapping[str, DagsterType]:\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n type_dict_by_name = {t.unique_name: t for t in ALL_RUNTIME_BUILTINS}\n type_dict_by_key = {t.key: t for t in ALL_RUNTIME_BUILTINS}\n\n def process_node_def(node_def: "NodeDefinition"):\n input_output_types = list(node_def.all_input_output_types())\n for dagster_type in input_output_types:\n # We don't do uniqueness check on key because with classes\n # like Array, Noneable, etc, those are ephemeral objects\n # and it is perfectly fine to have many of them.\n type_dict_by_key[dagster_type.key] = dagster_type\n\n if not dagster_type.has_unique_name:\n continue\n\n if dagster_type.unique_name not in type_dict_by_name:\n type_dict_by_name[dagster_type.unique_name] = dagster_type\n continue\n\n if type_dict_by_name[dagster_type.unique_name] is not dagster_type:\n raise DagsterInvalidDefinitionError(\n (\n 'You have created two 
dagster types with the same name "{type_name}". '\n "Dagster types have must have unique names."\n ).format(type_name=dagster_type.display_name)\n )\n\n if isinstance(node_def, GraphDefinition):\n for child_node_def in node_def.node_defs:\n process_node_def(child_node_def)\n\n for node_def in node_defs:\n process_node_def(node_def)\n\n return type_dict_by_key\n\n\nclass DagsterOptionalApi:\n def __getitem__(self, inner_type: t.Union[t.Type, DagsterType]) -> OptionalType:\n inner_type = resolve_dagster_type(check.not_none_param(inner_type, "inner_type"))\n return OptionalType(inner_type)\n\n\nOptional: DagsterOptionalApi = DagsterOptionalApi()\n
", "current_page_name": "_modules/dagster/_core/types/dagster_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.dagster_type"}, "decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.decorator

\nfrom typing import TYPE_CHECKING, Callable, Optional, Type, TypeVar, Union, overload\n\nimport dagster._check as check\n\nfrom .dagster_type import PythonObjectDagsterType, make_python_type_usable_as_dagster_type\n\nif TYPE_CHECKING:\n    from dagster._core.types.config_schema import DagsterTypeLoader\n\nT_Type = TypeVar("T_Type", bound=Type[object])\n\n\n@overload\ndef usable_as_dagster_type(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    loader: Optional["DagsterTypeLoader"] = ...,\n) -> Callable[[T_Type], T_Type]: ...\n\n\n@overload\ndef usable_as_dagster_type(\n    name: T_Type,\n) -> T_Type: ...\n\n\n
[docs]def usable_as_dagster_type(\n name: Optional[Union[str, T_Type]] = None,\n description: Optional[str] = None,\n loader: Optional["DagsterTypeLoader"] = None,\n) -> Union[T_Type, Callable[[T_Type], T_Type]]:\n """Decorate a Python class to make it usable as a Dagster Type.\n\n This is intended to make it straightforward to annotate existing business logic classes to\n make them dagster types whose typecheck is an isinstance check against that python class.\n\n Args:\n python_type (cls): The python type to make usable as python type.\n name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of\n the ``python_type`` will be used.\n description (Optional[str]): A user-readable description of the type.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n\n Examples:\n .. 
code-block:: python\n\n # dagster_aws.s3.file_manager.S3FileHandle\n @usable_as_dagster_type\n class S3FileHandle(FileHandle):\n def __init__(self, s3_bucket, s3_key):\n self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n self._s3_key = check.str_param(s3_key, 's3_key')\n\n @property\n def s3_bucket(self):\n return self._s3_bucket\n\n @property\n def s3_key(self):\n return self._s3_key\n\n @property\n def path_desc(self):\n return self.s3_path\n\n @property\n def s3_path(self):\n return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n """\n # check for no args, no parens case\n if isinstance(name, type):\n bare_cls = name # with no parens, name is actually the decorated class\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(python_type=bare_cls, name=bare_cls.__name__, description=None),\n )\n return bare_cls\n\n def _with_args(bare_cls: T_Type) -> T_Type:\n check.class_param(bare_cls, "bare_cls")\n new_name = check.opt_str_param(name, "name") if name else bare_cls.__name__\n\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(\n name=new_name,\n description=description,\n python_type=bare_cls,\n loader=loader,\n ),\n )\n return bare_cls\n\n return _with_args
\n
", "current_page_name": "_modules/dagster/_core/types/decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.decorator"}}}, "_serdes": {"config_class": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._serdes.config_class

\nimport importlib\nfrom abc import ABC, abstractmethod\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._utils import convert_dagster_submodule_name\nfrom dagster._utils.yaml_utils import load_run_config_yaml\n\nfrom .serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\n\nif TYPE_CHECKING:\n    from dagster._config.config_schema import UserConfigSchema\n\nT_ConfigurableClass = TypeVar("T_ConfigurableClass")\n\n\nclass ConfigurableClassDataSerializer(NamedTupleSerializer["ConfigurableClassData"]):\n    def after_pack(self, **packed: Any) -> Dict[str, Any]:\n        packed["module_name"] = convert_dagster_submodule_name(packed["module_name"], "public")\n        return packed\n\n\n
[docs]@whitelist_for_serdes(serializer=ConfigurableClassDataSerializer)\nclass ConfigurableClassData(\n NamedTuple(\n "_ConfigurableClassData",\n [\n ("module_name", str),\n ("class_name", str),\n ("config_yaml", str),\n ],\n )\n):\n """Serializable tuple describing where to find a class and the config fragment that should\n be used to instantiate it.\n\n Users should not instantiate this class directly.\n\n Classes intended to be serialized in this way should implement the\n :py:class:`dagster.serdes.ConfigurableClass` mixin.\n """\n\n def __new__(cls, module_name: str, class_name: str, config_yaml: str):\n return super(ConfigurableClassData, cls).__new__(\n cls,\n convert_dagster_submodule_name(check.str_param(module_name, "module_name"), "private"),\n check.str_param(class_name, "class_name"),\n check.str_param(config_yaml, "config_yaml"),\n )\n\n @property\n def config_dict(self) -> Mapping[str, Any]:\n return check.is_dict(load_run_config_yaml(self.config_yaml), key_type=str)\n\n def info_dict(self) -> Mapping[str, Any]:\n return {\n "module": self.module_name,\n "class": self.class_name,\n "config": self.config_dict,\n }\n\n @overload\n def rehydrate(self, as_type: Type[T_ConfigurableClass]) -> T_ConfigurableClass: ...\n\n @overload\n def rehydrate(self, as_type: None = ...) 
-> "ConfigurableClass": ...\n\n def rehydrate(\n self, as_type: Optional[Type[T_ConfigurableClass]] = None\n ) -> Union["ConfigurableClass", T_ConfigurableClass]:\n from dagster._config import process_config, resolve_to_config_type\n from dagster._core.errors import DagsterInvalidConfigError\n\n try:\n module = importlib.import_module(self.module_name)\n except ModuleNotFoundError:\n check.failed(\n f"Couldn't import module {self.module_name} when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n try:\n klass = getattr(module, self.class_name)\n except AttributeError:\n check.failed(\n f"Couldn't find class {self.class_name} in module when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n\n if not issubclass(klass, as_type or ConfigurableClass):\n raise check.CheckError(\n klass,\n f"class {self.class_name} in module {self.module_name}",\n ConfigurableClass,\n )\n\n config_dict = self.config_dict\n result = process_config(resolve_to_config_type(klass.config_type()), config_dict)\n if not result.success:\n raise DagsterInvalidConfigError(\n f"Errors whilst loading configuration for {klass.config_type()}.",\n result.errors,\n config_dict,\n )\n return klass.from_config_value(self, check.not_none(result.value))
\n\n\n
[docs]class ConfigurableClass(ABC):\n """Abstract mixin for classes that can be loaded from config.\n\n This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\n of conditional imports / optional extras_requires in dagster core and b) a magic directory or\n file in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\n run storage, pluggable with a config chunk like:\n\n .. code-block:: yaml\n\n run_storage:\n module: very_cool_package.run_storage\n class: SplendidRunStorage\n config:\n magic_word: "quux"\n\n This same pattern should eventually be viable for other system components, e.g. engines.\n\n The ``ConfigurableClass`` mixin provides the necessary hooks for classes to be instantiated from\n an instance of ``ConfigurableClassData``.\n\n Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\n type such as:\n\n .. code-block:: python\n\n {'module': str, 'class': str, 'config': Field(Permissive())}\n\n """\n\n @property\n @abstractmethod\n def inst_data(self) -> Optional[ConfigurableClassData]:\n """Subclass must be able to return the inst_data as a property if it has been constructed\n through the from_config_value code path.\n """\n\n @classmethod\n @abstractmethod\n def config_type(cls) -> "UserConfigSchema":\n """Get the config type against which to validate a config yaml fragment.\n\n The only place config values matching this type are used is inside `from_config_value`. This\n is an alternative constructor for a class. 
It is a common pattern for the config type to\n match constructor arguments, so `from_config_value`\n\n The config type against which to validate a config yaml fragment\n serialized in an instance of ``ConfigurableClassData``.\n """\n ...\n # We need to raise `NotImplementedError` here because nothing prevents abstract class\n # methods from being called.\n raise NotImplementedError(f"{cls.__name__} must implement the config_type classmethod")\n\n @classmethod\n @abstractmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n """Create an instance of the ConfigurableClass from a validated config value.\n\n The config value used here should be derived from the accompanying `inst_data` argument.\n `inst_data` contains the yaml-serialized config-- this must be parsed and\n validated/normalized, then passed to this method for object instantiation. This is done in\n ConfigurableClassData.rehydrate.\n\n Args:\n config_value (dict): The validated config value to use. Typically this should be the\n ``value`` attribute of a\n :py:class:`~dagster._core.types.evaluator.evaluation.EvaluateValueResult`.\n\n\n A common pattern is for the implementation to align the config_value with the signature\n of the ConfigurableClass's constructor:\n\n .. code-block:: python\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n\n """
\n\n\ndef class_from_code_pointer(module_name: str, class_name: str) -> Type[object]:\n try:\n module = importlib.import_module(module_name)\n except ModuleNotFoundError:\n check.failed(\n "Couldn't import module {module_name} when attempting to load the class {klass}".format(\n module_name=module_name,\n klass=module_name + "." + class_name,\n )\n )\n try:\n return getattr(module, class_name)\n except AttributeError:\n check.failed(\n "Couldn't find class {class_name} in module when attempting to load the "\n "class {klass}".format(\n class_name=class_name,\n klass=module_name + "." + class_name,\n )\n )\n
", "current_page_name": "_modules/dagster/_serdes/config_class", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._serdes.config_class"}}, "_utils": {"alabaster_version": "0.7.13", "alert": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.alert

\nimport datetime\nimport smtplib\nimport ssl\nfrom typing import TYPE_CHECKING, Callable, Optional, Sequence, Union\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions.sensor_definition import DefaultSensorStatus, SensorDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.graph_definition import GraphDefinition\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.run_status_sensor_definition import RunFailureSensorContext\n    from dagster._core.definitions.selector import JobSelector, RepositorySelector\n    from dagster._core.definitions.unresolved_asset_job_definition import (\n        UnresolvedAssetJobDefinition,\n    )\n\n\ndef _default_failure_email_body(context: "RunFailureSensorContext") -> str:\n    from dagster._core.host_representation.external_data import DEFAULT_MODE_NAME\n\n    return "<br>".join(\n        [\n            f"Pipeline {context.dagster_run.job_name} failed!",\n            f"Run ID: {context.dagster_run.run_id}",\n            f"Mode: {DEFAULT_MODE_NAME}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\ndef _default_failure_email_subject(context) -> str:\n    return f"Dagster Run Failed: {context.pipeline_run.job_name}"\n\n\nEMAIL_MESSAGE = """From: {email_from}\nTo: {email_to}\nMIME-Version: 1.0\nContent-type: text/html\nSubject: {email_subject}\n\n{email_body}\n\n<!-- this ensures Gmail doesn't trim the email -->\n<span style="opacity: 0"> {randomness} </span>\n"""\n\n\ndef send_email_via_ssl(\n    email_from: str,\n    email_password: str,\n    email_to: Sequence[str],\n    message: str,\n    smtp_host: str,\n    smtp_port: int,\n):\n    context = ssl.create_default_context()\n    with smtplib.SMTP_SSL(smtp_host, smtp_port, context=context) as server:\n        server.login(email_from, email_password)\n        server.sendmail(email_from, 
email_to, message)\n\n\ndef send_email_via_starttls(\n    email_from: str,\n    email_password: str,\n    email_to: Sequence[str],\n    message: str,\n    smtp_host: str,\n    smtp_port: int,\n):\n    context = ssl.create_default_context()\n    with smtplib.SMTP(smtp_host, smtp_port) as server:\n        server.starttls(context=context)\n        server.login(email_from, email_password)\n        server.sendmail(email_from, email_to, message)\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef make_email_on_run_failure_sensor(\n email_from: str,\n email_password: str,\n email_to: Sequence[str],\n email_body_fn: Callable[["RunFailureSensorContext"], str] = _default_failure_email_body,\n email_subject_fn: Callable[["RunFailureSensorContext"], str] = _default_failure_email_subject,\n smtp_host: str = "smtp.gmail.com",\n smtp_type: str = "SSL",\n smtp_port: Optional[int] = None,\n name: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n "JobDefinition",\n "GraphDefinition",\n "UnresolvedAssetJobDefinition",\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n "JobDefinition",\n "GraphDefinition",\n "UnresolvedAssetJobDefinition",\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> SensorDefinition:\n """Create a job failure sensor that sends email via the SMTP protocol.\n\n Args:\n email_from (str): The sender email address to send the message from.\n email_password (str): The password of the sender.\n email_to (List[str]): The receipt email addresses to send the message to.\n email_body_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` outputs the email body you want to send.\n Defaults to the plain text that contains error message, job name, and run ID.\n email_subject_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` outputs the email subject you want to send.\n Defaults to "Dagster Run Failed: <job_name>".\n smtp_host (str): The hostname of the SMTP server. 
Defaults to "smtp.gmail.com".\n smtp_type (str): The protocol; either "SSL" or "STARTTLS". Defaults to SSL.\n smtp_port (Optional[int]): The SMTP port. Defaults to 465 for SSL, 587 for STARTTLS.\n name: (Optional[str]): The name of the sensor. Defaults to "email_on_job_failure".\n webserver_base_url: (Optional[str]): The base url of your dagster-webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]):\n The jobs that will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails. To monitor jobs in external repositories,\n use RepositorySelector and JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]):\n (deprecated in favor of monitored_jobs) The jobs that will be monitored by this failure\n sensor. Defaults to None, which means the alert will be sent when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n\n Examples:\n .. code-block:: python\n\n email_on_run_failure = make_email_on_run_failure_sensor(\n email_from="no-reply@example.com",\n email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n email_to=["xxx@example.com"],\n )\n\n @repository\n def my_repo():\n return [my_job + email_on_run_failure]\n\n .. 
code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.pipeline_run.job_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n email_on_run_failure = make_email_on_run_failure_sensor(\n email_from="no-reply@example.com",\n email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n email_to=["xxx@example.com"],\n email_body_fn=my_message_fn,\n email_subject_fn=lambda _: "Dagster Alert",\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n from dagster._core.definitions.run_status_sensor_definition import (\n RunFailureSensorContext,\n run_failure_sensor,\n )\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_failure_sensor(\n name=name,\n monitored_jobs=jobs,\n default_status=default_status,\n monitor_all_repositories=monitor_all_repositories,\n )\n def email_on_run_failure(context: RunFailureSensorContext):\n email_body = email_body_fn(context)\n if webserver_base_url:\n email_body += (\n f'<p><a href="{webserver_base_url}/runs/{context.dagster_run.run_id}">View in'\n " the Dagster UI</a></p>"\n )\n\n message = EMAIL_MESSAGE.format(\n email_to=",".join(email_to),\n email_from=email_from,\n email_subject=email_subject_fn(context),\n email_body=email_body,\n randomness=datetime.datetime.now(),\n )\n\n if smtp_type == "SSL":\n send_email_via_ssl(\n email_from, email_password, email_to, message, smtp_host, smtp_port=smtp_port or 465\n )\n elif smtp_type == "STARTTLS":\n send_email_via_starttls(\n email_from, email_password, email_to, message, smtp_host, smtp_port=smtp_port or 587\n )\n else:\n raise DagsterInvalidDefinitionError(f'smtp_type "{smtp_type}" is not supported.')\n\n return email_on_run_failure
\n
", "current_page_name": "_modules/dagster/_utils/alert", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.alert"}, "body": "

Source code for dagster._utils

\nimport _thread as thread\nimport contextlib\nimport contextvars\nimport datetime\nimport errno\nimport functools\nimport inspect\nimport multiprocessing\nimport os\nimport re\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport tempfile\nimport threading\nimport time\nfrom collections import OrderedDict\nfrom datetime import timezone\nfrom enum import Enum\nfrom signal import Signals\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    ContextManager,\n    Dict,\n    Generator,\n    Generic,\n    Hashable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n    overload,\n)\n\nimport packaging.version\nfrom typing_extensions import Literal, TypeAlias, TypeGuard\n\nimport dagster._check as check\nimport dagster._seven as seven\n\nfrom .internal_init import IHasInternalInit as IHasInternalInit\n\nif sys.version_info > (3,):\n    from pathlib import Path\nelse:\n    from pathlib2 import Path\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition.repository_definition import (\n        RepositoryDefinition,\n    )\n    from dagster._core.events import DagsterEvent\n\nK = TypeVar("K")\nT = TypeVar("T")\nU = TypeVar("U")\nV = TypeVar("V")\n\nEPOCH = datetime.datetime.utcfromtimestamp(0)\n\nPICKLE_PROTOCOL = 4\n\n\nDEFAULT_WORKSPACE_YAML_FILENAME = "workspace.yaml"\n\nPrintFn: TypeAlias = Callable[[Any], None]\n\nSingleInstigatorDebugCrashFlags: TypeAlias = Mapping[str, int]\nDebugCrashFlags: TypeAlias = Mapping[str, SingleInstigatorDebugCrashFlags]\n\n\n# Use this to get the "library version" (pre-1.0 version) from the "core version" (post 1.0\n# version). 
16 is from the 0.16.0 that library versions stayed on when core went to 1.0.0.\ndef library_version_from_core_version(core_version: str) -> str:\n    parsed_version = parse_package_version(core_version)\n\n    release = parsed_version.release\n    if release[0] >= 1:\n        library_version = ".".join(["0", str(16 + release[1]), str(release[2])])\n\n        if parsed_version.is_prerelease:\n            library_version = library_version + "".join(\n                [str(pre) for pre in check.not_none(parsed_version.pre)]\n            )\n\n        if parsed_version.is_postrelease:\n            library_version = library_version + "post" + str(parsed_version.post)\n\n        return library_version\n    else:\n        return core_version\n\n\ndef parse_package_version(version_str: str) -> packaging.version.Version:\n    parsed_version = packaging.version.parse(version_str)\n    assert isinstance(parsed_version, packaging.version.Version)\n    return parsed_version\n\n\ndef convert_dagster_submodule_name(name: str, mode: Literal["private", "public"]) -> str:\n    """This function was introduced when all Dagster submodules were marked private by\n    underscore-prefixing the root submodules (e.g. `dagster._core`). The function provides\n    backcompatibility by converting modules between the old and new (i.e. public and private) forms.\n    This is needed when reading older data or communicating with older versions of Dagster.\n    """\n    if mode == "private":\n        return re.sub(r"^dagster\\.([^_])", r"dagster._\\1", name)\n    elif mode == "public":\n        return re.sub(r"^dagster._", "dagster.", name)\n    else:\n        check.failed("`mode` must be 'private' or 'public'")\n\n\n
[docs]def file_relative_path(dunderfile: str, relative_path: str) -> str:\n """Get a path relative to the currently executing Python file.\n\n This function is useful when one needs to load a file that is relative to the position of\n the current file. (Such as when you encode a configuration file path in source file and want\n in runnable in any current working directory)\n\n Args:\n dunderfile (str): Should always be ``__file__``.\n relative_path (str): Path to get relative to the currently executing file.\n\n **Examples**:\n\n .. code-block:: python\n\n file_relative_path(__file__, 'path/relative/to/file')\n\n """\n check.str_param(dunderfile, "dunderfile")\n check.str_param(relative_path, "relative_path")\n\n return os.path.join(os.path.dirname(dunderfile), relative_path)
\n\n\ndef script_relative_path(file_path: str) -> str:\n """Useful for testing with local files. Use a path relative to where the\n test resides and this function will return the absolute path\n of that file. Otherwise it will be relative to script that\n ran the test.\n\n Note: this is function is very, very expensive (on the order of 1\n millisecond per invocation) so this should only be used in performance\n insensitive contexts. Prefer file_relative_path for anything with\n performance constraints.\n\n """\n # from http://bit.ly/2snyC6s\n\n check.str_param(file_path, "file_path")\n scriptdir = inspect.stack()[1][1]\n return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(scriptdir)), file_path))\n\n\n# Adapted from https://github.com/okunishinishi/python-stringcase/blob/master/stringcase.py\ndef camelcase(string: str) -> str:\n check.str_param(string, "string")\n\n string = re.sub(r"^[\\-_\\.]", "", str(string))\n if not string:\n return string\n return str(string[0]).upper() + re.sub(\n r"[\\-_\\.\\s]([a-z])", lambda matched: str(matched.group(1)).upper(), string[1:]\n )\n\n\ndef ensure_single_item(ddict: Mapping[T, U]) -> Tuple[T, U]:\n check.mapping_param(ddict, "ddict")\n check.param_invariant(len(ddict) == 1, "ddict", "Expected dict with single item")\n return next(iter(ddict.items()))\n\n\n@contextlib.contextmanager\ndef pushd(path: str) -> Iterator[str]:\n old_cwd = os.getcwd()\n os.chdir(path)\n try:\n yield path\n finally:\n os.chdir(old_cwd)\n\n\ndef safe_isfile(path: str) -> bool:\n """Backport of Python 3.8 os.path.isfile behavior.\n\n This is intended to backport https://docs.python.org/dev/whatsnew/3.8.html#os-path. I'm not\n sure that there are other ways to provoke this behavior on Unix other than the null byte,\n but there are certainly other ways to do it on Windows. 
Afaict, we won't mask other\n ValueErrors, and the behavior in the status quo ante is rough because we risk throwing an\n unexpected, uncaught ValueError from very deep in our logic.\n """\n try:\n return os.path.isfile(path)\n except ValueError:\n return False\n\n\ndef mkdir_p(path: str) -> str:\n try:\n os.makedirs(path)\n return path\n except OSError as exc: # Python >2.5\n if exc.errno == errno.EEXIST and os.path.isdir(path):\n return path\n else:\n raise\n\n\ndef hash_collection(\n collection: Union[\n Mapping[Hashable, Any], Sequence[Any], AbstractSet[Any], Tuple[Any, ...], NamedTuple\n ]\n) -> int:\n """Hash a mutable collection or immutable collection containing mutable elements.\n\n This is useful for hashing Dagster-specific NamedTuples that contain mutable lists or dicts.\n The default NamedTuple __hash__ function assumes the contents of the NamedTuple are themselves\n hashable, and will throw an error if they are not. This can occur when trying to e.g. compute a\n cache key for the tuple for use with `lru_cache`.\n\n This alternative implementation will recursively process collection elements to convert basic\n lists and dicts to tuples prior to hashing. It is recommended to cache the result:\n\n Example:\n .. 
code-block:: python\n\n def __hash__(self):\n if not hasattr(self, '_hash'):\n self._hash = hash_named_tuple(self)\n return self._hash\n """\n assert isinstance(\n collection, (list, dict, set, tuple)\n ), f"Cannot hash collection of type {type(collection)}"\n return hash(make_hashable(collection))\n\n\n@overload\ndef make_hashable(value: Union[List[Any], Set[Any]]) -> Tuple[Any, ...]: ...\n\n\n@overload\ndef make_hashable(value: Dict[Any, Any]) -> Tuple[Tuple[Any, Any]]: ...\n\n\n@overload\ndef make_hashable(value: Any) -> Any: ...\n\n\ndef make_hashable(value: Any) -> Any:\n if isinstance(value, dict):\n return tuple(sorted((key, make_hashable(value)) for key, value in value.items()))\n elif isinstance(value, (list, tuple, set)):\n return tuple([make_hashable(x) for x in value])\n else:\n return value\n\n\ndef get_prop_or_key(elem, key):\n if isinstance(elem, Mapping):\n return elem.get(key)\n else:\n return getattr(elem, key)\n\n\ndef list_pull(alist, key):\n return list(map(lambda elem: get_prop_or_key(elem, key), alist))\n\n\ndef all_none(kwargs):\n for value in kwargs.values():\n if value is not None:\n return False\n return True\n\n\ndef check_script(path, return_code=0):\n try:\n subprocess.check_output([sys.executable, path])\n except subprocess.CalledProcessError as exc:\n if return_code != 0:\n if exc.returncode == return_code:\n return\n raise\n\n\ndef check_cli_execute_file_job(path, pipeline_fn_name, env_file=None):\n from dagster._core.test_utils import instance_for_test\n\n with instance_for_test():\n cli_cmd = [\n sys.executable,\n "-m",\n "dagster",\n "pipeline",\n "execute",\n "-f",\n path,\n "-a",\n pipeline_fn_name,\n ]\n\n if env_file:\n cli_cmd.append("-c")\n cli_cmd.append(env_file)\n\n try:\n subprocess.check_output(cli_cmd)\n except subprocess.CalledProcessError as cpe:\n print(cpe) # noqa: T201\n raise cpe\n\n\ndef safe_tempfile_path_unmanaged() -> str:\n # This gets a valid temporary file path in the safest possible way, although there 
is still no\n # guarantee that another process will not create a file at this path. The NamedTemporaryFile is\n # deleted when the context manager exits and the file object is closed.\n #\n # This is preferable to using NamedTemporaryFile as a context manager and passing the name\n # attribute of the file object around because NamedTemporaryFiles cannot be opened a second time\n # if already open on Windows NT or later:\n # https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile\n # https://github.com/dagster-io/dagster/issues/1582\n with tempfile.NamedTemporaryFile() as fd:\n path = fd.name\n return Path(path).as_posix()\n\n\n@contextlib.contextmanager\ndef safe_tempfile_path() -> Iterator[str]:\n path = None\n try:\n path = safe_tempfile_path_unmanaged()\n yield path\n finally:\n if path is not None and os.path.exists(path):\n os.unlink(path)\n\n\n@overload\ndef ensure_gen(thing_or_gen: Generator[T, Any, Any]) -> Generator[T, Any, Any]:\n pass\n\n\n@overload\ndef ensure_gen(thing_or_gen: T) -> Generator[T, Any, Any]:\n pass\n\n\ndef ensure_gen(\n thing_or_gen: Union[T, Iterator[T], Generator[T, Any, Any]]\n) -> Generator[T, Any, Any]:\n if not inspect.isgenerator(thing_or_gen):\n thing_or_gen = cast(T, thing_or_gen)\n\n def _gen_thing():\n yield thing_or_gen\n\n return _gen_thing()\n\n return thing_or_gen\n\n\ndef ensure_dir(file_path: str) -> str:\n try:\n os.makedirs(file_path)\n except OSError as e:\n if e.errno != errno.EEXIST:\n raise\n return file_path\n\n\ndef ensure_file(path: str) -> str:\n ensure_dir(os.path.dirname(path))\n if not os.path.exists(path):\n touch_file(path)\n return path\n\n\ndef touch_file(path):\n ensure_dir(os.path.dirname(path))\n with open(path, "a", encoding="utf8"):\n os.utime(path, None)\n\n\ndef _kill_on_event(termination_event):\n termination_event.wait()\n send_interrupt()\n\n\ndef send_interrupt():\n if seven.IS_WINDOWS:\n # This will raise a KeyboardInterrupt in python land - meaning this wont be able 
to\n # interrupt things like sleep()\n thread.interrupt_main()\n else:\n # If on unix send an os level signal to interrupt any situation we may be stuck in\n os.kill(os.getpid(), signal.SIGINT)\n\n\n# Function to be invoked by daemon thread in processes which seek to be cancellable.\n# The motivation for this approach is to be able to exit cleanly on Windows. An alternative\n# path is to change how the processes are opened and send CTRL_BREAK signals, which at\n# the time of authoring seemed a more costly approach.\n#\n# Reading for the curious:\n# * https://stackoverflow.com/questions/35772001/how-to-handle-the-signal-in-python-on-windows-machine\n# * https://stefan.sofa-rockers.org/2013/08/15/handling-sub-process-hierarchies-python-linux-os-x/\ndef start_termination_thread(termination_event):\n check.inst_param(termination_event, "termination_event", ttype=type(multiprocessing.Event()))\n\n int_thread = threading.Thread(\n target=_kill_on_event, args=(termination_event,), name="kill-on-event"\n )\n int_thread.daemon = True\n int_thread.start()\n\n\n# Executes the next() function within an instance of the supplied context manager class\n# (leaving the context before yielding each result)\ndef iterate_with_context(\n context_fn: Callable[[], ContextManager[Any]], iterator: Iterator[T]\n) -> Iterator[T]:\n while True:\n # Allow interrupts during user code so that we can terminate slow/hanging steps\n with context_fn():\n try:\n next_output = next(iterator)\n except StopIteration:\n return\n\n yield next_output\n\n\ndef datetime_as_float(dt: datetime.datetime) -> float:\n check.inst_param(dt, "dt", datetime.datetime)\n return float((dt - EPOCH).total_seconds())\n\n\nT_GeneratedContext = TypeVar("T_GeneratedContext")\n\n\nclass EventGenerationManager(Generic[T_GeneratedContext]):\n """Utility class that wraps an event generator function, that also yields a single instance of\n a typed object. 
All events yielded before the typed object are yielded through the method\n `generate_setup_events` and all events yielded after the typed object are yielded through the\n method `generate_teardown_events`.\n\n This is used to help replace the context managers used in pipeline initialization with\n generators so that we can begin emitting initialization events AND construct a pipeline context\n object, while managing explicit setup/teardown.\n\n This does require calling `generate_setup_events` AND `generate_teardown_events` in order to\n get the typed object.\n """\n\n def __init__(\n self,\n generator: Iterator[Union["DagsterEvent", T_GeneratedContext]],\n object_cls: Type[T_GeneratedContext],\n require_object: Optional[bool] = True,\n ):\n self.generator = check.generator(generator)\n self.object_cls: Type[T_GeneratedContext] = check.class_param(object_cls, "object_cls")\n self.require_object = check.bool_param(require_object, "require_object")\n self.object: Optional[T_GeneratedContext] = None\n self.did_setup = False\n self.did_teardown = False\n\n def generate_setup_events(self) -> Iterator["DagsterEvent"]:\n self.did_setup = True\n try:\n while self.object is None:\n obj = next(self.generator)\n if isinstance(obj, self.object_cls):\n self.object = obj\n else:\n yield obj\n except StopIteration:\n if self.require_object:\n check.inst_param(\n self.object,\n "self.object",\n self.object_cls,\n f"generator never yielded object of type {self.object_cls.__name__}",\n )\n\n def get_object(self) -> T_GeneratedContext:\n if not self.did_setup:\n check.failed("Called `get_object` before `generate_setup_events`")\n return cast(T_GeneratedContext, self.object)\n\n def generate_teardown_events(self) -> Iterator["DagsterEvent"]:\n self.did_teardown = True\n if self.object:\n yield from self.generator\n\n\ndef utc_datetime_from_timestamp(timestamp: float) -> datetime.datetime:\n tz = timezone.utc\n return datetime.datetime.fromtimestamp(timestamp, tz=tz)\n\n\ndef 
utc_datetime_from_naive(dt: datetime.datetime) -> datetime.datetime:\n tz = timezone.utc\n return dt.replace(tzinfo=tz)\n\n\ndef is_enum_value(value: object) -> bool:\n return False if value is None else issubclass(value.__class__, Enum)\n\n\ndef git_repository_root() -> str:\n return subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()\n\n\ndef segfault() -> None:\n """Reliable cross-Python version segfault.\n\n https://bugs.python.org/issue1215#msg143236\n """\n import ctypes\n\n ctypes.string_at(0)\n\n\ndef find_free_port() -> int:\n with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:\n s.bind(("", 0))\n s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n return s.getsockname()[1]\n\n\ndef is_port_in_use(host, port) -> bool:\n # Similar to the socket options that uvicorn uses to bind ports:\n # https://github.com/encode/uvicorn/blob/62f19c1c39929c84968712c371c9b7b96a041dec/uvicorn/config.py#L565-L566\n sock = socket.socket(family=socket.AF_INET)\n sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n try:\n sock.bind((host, port))\n return False\n except socket.error as e:\n return e.errno == errno.EADDRINUSE\n finally:\n sock.close()\n\n\n@contextlib.contextmanager\ndef alter_sys_path(to_add: Sequence[str], to_remove: Sequence[str]) -> Iterator[None]:\n to_restore = [path for path in sys.path]\n\n # remove paths\n for path in to_remove:\n if path in sys.path:\n sys.path.remove(path)\n\n # add paths\n for path in to_add:\n sys.path.insert(0, path)\n\n try:\n yield\n finally:\n sys.path = to_restore\n\n\n@contextlib.contextmanager\ndef restore_sys_modules() -> Iterator[None]:\n sys_modules = {k: v for k, v in sys.modules.items()}\n try:\n yield\n finally:\n to_delete = set(sys.modules) - set(sys_modules)\n for key in to_delete:\n del sys.modules[key]\n\n\ndef process_is_alive(pid: int) -> bool:\n if seven.IS_WINDOWS:\n import psutil\n\n return psutil.pid_exists(pid=pid)\n else:\n try:\n 
subprocess.check_output(["ps", str(pid)])\n except subprocess.CalledProcessError as exc:\n assert exc.returncode == 1\n return False\n return True\n\n\ndef compose(*args):\n """Compose python functions args such that compose(f, g)(x) is equivalent to f(g(x)).""" # noqa: D402\n # reduce using functional composition over all the arguments, with the identity function as\n # initializer\n return functools.reduce(lambda f, g: lambda x: f(g(x)), args, lambda x: x)\n\n\ndef dict_without_keys(ddict, *keys):\n return {key: value for key, value in ddict.items() if key not in set(keys)}\n\n\nclass Counter:\n def __init__(self):\n self._lock = threading.Lock()\n self._counts = OrderedDict()\n super(Counter, self).__init__()\n\n def increment(self, key: str):\n with self._lock:\n self._counts[key] = self._counts.get(key, 0) + 1\n\n def counts(self) -> Mapping[str, int]:\n with self._lock:\n copy = {k: v for k, v in self._counts.items()}\n return copy\n\n\ntraced_counter = contextvars.ContextVar("traced_counts", default=Counter())\n\nT_Callable = TypeVar("T_Callable", bound=Callable)\n\n\ndef traced(func: T_Callable) -> T_Callable:\n """A decorator that keeps track of how many times a function is called."""\n\n @functools.wraps(func)\n def inner(*args, **kwargs):\n counter = traced_counter.get()\n if counter and isinstance(counter, Counter):\n counter.increment(func.__qualname__)\n\n return func(*args, **kwargs)\n\n return cast(T_Callable, inner)\n\n\ndef get_terminate_signal():\n if sys.platform == "win32":\n return signal.SIGTERM\n return signal.SIGKILL\n\n\ndef get_run_crash_explanation(prefix: str, exit_code: int):\n # As per https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess.returncode\n # negative exit code means a posix signal\n if exit_code < 0 and -exit_code in [signal.value for signal in Signals]:\n posix_signal = -exit_code\n signal_str = Signals(posix_signal).name\n exit_clause = f"was terminated by signal {posix_signal} ({signal_str})."\n 
if posix_signal == get_terminate_signal():\n exit_clause = (\n exit_clause\n + " This usually indicates that the process was"\n " killed by the operating system due to running out of"\n " memory. Possible solutions include increasing the"\n " amount of memory available to the run, reducing"\n " the amount of memory used by the ops in the run, or"\n " configuring the executor to run fewer ops concurrently."\n )\n else:\n exit_clause = f"unexpectedly exited with code {exit_code}."\n\n return prefix + " " + exit_clause\n\n\ndef last_file_comp(path: str) -> str:\n return os.path.basename(os.path.normpath(path))\n\n\ndef is_named_tuple_instance(obj: object) -> TypeGuard[NamedTuple]:\n return isinstance(obj, tuple) and hasattr(obj, "_fields")\n\n\ndef is_named_tuple_subclass(klass: Type[object]) -> TypeGuard[Type[NamedTuple]]:\n return isinstance(klass, type) and issubclass(klass, tuple) and hasattr(klass, "_fields")\n\n\n@overload\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = ...,\n repository: Optional["RepositoryDefinition"] = ...,\n error_on_none: Literal[True] = ...,\n) -> "RepositoryDefinition": ...\n\n\n@overload\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = ...,\n repository: Optional["RepositoryDefinition"] = ...,\n error_on_none: Literal[False] = ...,\n) -> Optional["RepositoryDefinition"]: ...\n\n\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = None,\n repository: Optional["RepositoryDefinition"] = None,\n error_on_none: bool = True,\n) -> Optional["RepositoryDefinition"]:\n """Normalizes the arguments that take a RepositoryDefinition or Definitions object to a\n RepositoryDefinition.\n\n This is intended to handle both the case where a single argument takes a\n `Union[RepositoryDefinition, Definitions]` or separate keyword arguments accept\n 
`RepositoryDefinition` or `Definitions`.\n """\n from dagster._core.definitions.definitions_class import Definitions\n\n if (definitions_or_repository and repository) or (\n error_on_none and not (definitions_or_repository or repository)\n ):\n check.failed("Exactly one of `definitions` or `repository_def` must be provided.")\n elif isinstance(definitions_or_repository, Definitions):\n return definitions_or_repository.get_repository_def()\n elif definitions_or_repository:\n return definitions_or_repository\n elif repository:\n return repository\n else:\n return None\n\n\ndef xor(a, b):\n return bool(a) != bool(b)\n\n\ndef tail_file(path_or_fd: Union[str, int], should_stop: Callable[[], bool]) -> Iterator[str]:\n with open(path_or_fd, "r") as output_stream:\n while True:\n line = output_stream.readline()\n if line:\n yield line\n elif should_stop():\n break\n else:\n time.sleep(0.01)\n
", "current_page_name": "_modules/dagster/_utils", "customsidebar": null, "dagster_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.dagster_type

\nfrom typing import Any\n\nfrom dagster._core.definitions.events import Failure, TypeCheck\nfrom dagster._core.definitions.graph_definition import GraphDefinition\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.api import create_execution_plan\nfrom dagster._core.execution.context_creation_job import scoped_job_context\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.types.dagster_type import resolve_dagster_type\n\nfrom .typing_api import is_typing_type\n\n\n
[docs]def check_dagster_type(dagster_type: Any, value: Any) -> TypeCheck:\n """Test a custom Dagster type.\n\n Args:\n dagster_type (Any): The Dagster type to test. Should be one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or\n :py:func:`PythonObjectDagsterType`, or a Python type.\n value (Any): The runtime value to test.\n\n Returns:\n TypeCheck: The result of the type check.\n\n\n Examples:\n .. code-block:: python\n\n assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n """\n if is_typing_type(dagster_type):\n raise DagsterInvariantViolationError(\n f"Must pass in a type from dagster module. You passed {dagster_type} "\n "which is part of python's typing module."\n )\n\n dagster_type = resolve_dagster_type(dagster_type)\n\n job = InMemoryJob(GraphDefinition(node_defs=[], name="empty").to_job())\n job_def = job.get_definition()\n\n instance = DagsterInstance.ephemeral()\n execution_plan = create_execution_plan(job)\n dagster_run = instance.create_run_for_job(job_def)\n with scoped_job_context(execution_plan, job, {}, dagster_run, instance) as context:\n type_check_context = context.for_type(dagster_type)\n try:\n type_check = dagster_type.type_check(type_check_context, value)\n except Failure as failure:\n return TypeCheck(success=False, description=failure.description)\n\n if not isinstance(type_check, TypeCheck):\n raise DagsterInvariantViolationError(\n "Type checks can only return TypeCheck. Type {type_name} returned {value}.".format(\n type_name=dagster_type.display_name, value=repr(type_check)\n )\n )\n return type_check
\n
", "current_page_name": "_modules/dagster/_utils/dagster_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.dagster_type"}, "favicon_url": null, "forked_pdb": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.forked_pdb

\nimport pdb\nimport sys\n\n\n# From https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess\n
[docs]class ForkedPdb(pdb.Pdb):\n """A pdb subclass that may be used from a forked multiprocessing child.\n\n **Examples**:\n\n .. code-block:: python\n\n from dagster._utils.forked_pdb import ForkedPdb\n\n @solid\n def complex_solid(_):\n # some complicated stuff\n\n ForkedPdb().set_trace()\n\n # some other complicated stuff\n\n You can initiate pipeline execution via the webserver and use the pdb debugger to examine/step through\n execution at the breakpoint.\n """\n\n def interaction(self, frame, traceback):\n _stdin = sys.stdin\n try:\n sys.stdin = open("/dev/stdin", encoding="utf8")\n pdb.Pdb.interaction(self, frame, traceback)\n finally:\n sys.stdin = _stdin
\n
", "current_page_name": "_modules/dagster/_utils/forked_pdb", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.forked_pdb"}, "log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.log

\nimport copy\nimport logging\nimport sys\nimport traceback\nfrom typing import Mapping, NamedTuple, Optional\n\nimport coloredlogs\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import deprecated\nfrom dagster._config import Enum, EnumValue\nfrom dagster._core.definitions.logger_definition import logger\nfrom dagster._core.utils import PYTHON_LOGGING_LEVELS_MAPPING, coerce_valid_log_level\n\nLogLevelEnum = Enum("log_level", list(map(EnumValue, PYTHON_LOGGING_LEVELS_MAPPING.keys())))\n\n\nclass JsonFileHandler(logging.Handler):\n    def __init__(self, json_path: str):\n        super(JsonFileHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            log_dict = copy.copy(record.__dict__)\n\n            # This horrific monstrosity is to maintain backwards compatability\n            # with the old behavior of the JsonFileHandler, which the clarify\n            # project has a dependency on. 
It relied on the dagster-defined\n            # properties smashing all the properties of the LogRecord object\n            # and uploads all of those properties to a redshift table for\n            # in order to do analytics on the log\n\n            if "dagster_meta" in log_dict:\n                dagster_meta_dict = log_dict["dagster_meta"]\n                del log_dict["dagster_meta"]\n            else:\n                dagster_meta_dict = {}\n\n            log_dict.update(dagster_meta_dict)\n\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(log_dict)\n                ff.write(text_line + "\\n")\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerMessage(\n    NamedTuple(\n        "_StructuredLoggerMessage",\n        [\n            ("name", str),\n            ("message", str),\n            ("level", int),\n            ("meta", Mapping[object, object]),\n            ("record", logging.LogRecord),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        message: str,\n        level: int,\n        meta: Mapping[object, object],\n        record: logging.LogRecord,\n    ):\n        return super(StructuredLoggerMessage, cls).__new__(\n            cls,\n            check.str_param(name, "name"),\n            check.str_param(message, "message"),\n            coerce_valid_log_level(level),\n            check.mapping_param(meta, "meta"),\n            check.inst_param(record, "record", logging.LogRecord),\n        )\n\n\nclass JsonEventLoggerHandler(logging.Handler):\n    def __init__(self, json_path: str, construct_event_record):\n        super(JsonEventLoggerHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n        self.construct_event_record = 
construct_event_record\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            event_record = self.construct_event_record(record)\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(event_record.to_dict())\n                ff.write(text_line + "\\n")\n\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerHandler(logging.Handler):\n    def __init__(self, callback):\n        super(StructuredLoggerHandler, self).__init__()\n        self.callback = check.is_callable(callback, "callback")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            self.callback(\n                StructuredLoggerMessage(\n                    name=record.name,\n                    message=record.msg,\n                    level=record.levelno,\n                    meta=record.dagster_meta,  # type: ignore\n                    record=record,\n                )\n            )\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\ndef construct_single_handler_logger(name, level, handler):\n    check.str_param(name, "name")\n    check.inst_param(handler, "handler", logging.Handler)\n\n    level = coerce_valid_log_level(level)\n\n    @logger\n    def single_handler_logger(_init_context):\n        klass = logging.getLoggerClass()\n        logger_ = klass(name, level=level)\n        logger_.addHandler(handler)\n        handler.setLevel(level)\n        return logger_\n\n    return single_handler_logger\n\n\n# Base python logger whose messages will be captured as structured Dagster log messages.\nBASE_DAGSTER_LOGGER = 
logging.getLogger(name="dagster")\n\n\n
[docs]def get_dagster_logger(name: Optional[str] = None) -> logging.Logger:\n """Creates a python logger whose output messages will be captured and converted into Dagster log\n messages. This means they will have structured information such as the step_key, run_id, etc.\n embedded into them, and will show up in the Dagster event log.\n\n This can be used as a more convenient alternative to `context.log` in most cases. If log level\n is not set explicitly, defaults to DEBUG.\n\n Args:\n name (Optional[str]): If supplied, will create a logger with the name "dagster.builtin.{name}",\n with properties inherited from the base Dagster logger. If omitted, the returned logger\n will be named "dagster.builtin".\n\n Returns:\n :class:`logging.Logger`: A logger whose output will be captured by Dagster.\n\n Example:\n .. code-block:: python\n\n from dagster import get_dagster_logger, op\n\n @op\n def hello_op():\n log = get_dagster_logger()\n for i in range(5):\n # do something\n log.info(f"Did {i+1} things!")\n\n """\n # enforce that the parent logger will always have a DEBUG log level\n BASE_DAGSTER_LOGGER.setLevel(logging.DEBUG)\n base_builtin = BASE_DAGSTER_LOGGER.getChild("builtin")\n if name:\n return base_builtin.getChild(name)\n return base_builtin
\n\n\ndef define_structured_logger(name, callback, level):\n check.str_param(name, "name")\n check.callable_param(callback, "callback")\n level = coerce_valid_log_level(level)\n\n return construct_single_handler_logger(name, level, StructuredLoggerHandler(callback))\n\n\ndef define_json_file_logger(name, json_path, level):\n check.str_param(name, "name")\n check.str_param(json_path, "json_path")\n level = coerce_valid_log_level(level)\n\n stream_handler = JsonFileHandler(json_path)\n stream_handler.setFormatter(define_default_formatter())\n return construct_single_handler_logger(name, level, stream_handler)\n\n\ndef get_stack_trace_array(exception):\n check.inst_param(exception, "exception", Exception)\n if hasattr(exception, "__traceback__"):\n tb = exception.__traceback__\n else:\n _exc_type, _exc_value, tb = sys.exc_info()\n return traceback.format_tb(tb)\n\n\ndef default_format_string():\n return "%(asctime)s - %(name)s - %(levelname)s - %(message)s"\n\n\ndef default_date_format_string():\n return "%Y-%m-%d %H:%M:%S %z"\n\n\ndef define_default_formatter():\n return logging.Formatter(default_format_string(), default_date_format_string())\n\n\n@deprecated(\n breaking_version="2.0",\n subject="loggers.dagit",\n emit_runtime_warning=False,\n)\ndef configure_loggers(handler="default", log_level="INFO"):\n LOGGING_CONFIG = {\n "version": 1,\n "disable_existing_loggers": False,\n "formatters": {\n "colored": {\n "()": coloredlogs.ColoredFormatter,\n "fmt": default_format_string(),\n "datefmt": default_date_format_string(),\n "field_styles": {"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n "level_styles": {"debug": {}, "error": {"color": "red"}},\n },\n },\n "handlers": {\n "default": {\n "formatter": "colored",\n "class": "logging.StreamHandler",\n "stream": sys.stdout,\n "level": log_level,\n },\n "null": {\n "class": "logging.NullHandler",\n },\n },\n "loggers": {\n "dagster": {\n "handlers": [handler],\n "level": log_level,\n },\n # Only one of 
dagster or dagster-webserver will be used at a time. We configure them\n # both here to avoid a dependency on the dagster-webserver package.\n "dagit": {\n "handlers": [handler],\n "level": log_level,\n },\n "dagster-webserver": {\n "handlers": [handler],\n "level": log_level,\n },\n },\n }\n\n logging.config.dictConfig(LOGGING_CONFIG)\n\n\ndef create_console_logger(name, level):\n klass = logging.getLoggerClass()\n handler = klass(name, level=level)\n coloredlogs.install(\n logger=handler,\n level=level,\n fmt=default_format_string(),\n datefmt=default_date_format_string(),\n field_styles={"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n level_styles={"debug": {}, "error": {"color": "red"}},\n )\n return handler\n
", "current_page_name": "_modules/dagster/_utils/log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.log"}, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils", "warnings": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.warnings

\nimport warnings\nfrom contextlib import contextmanager\nfrom typing import Callable, Iterator, Optional, TypeVar\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import (\n    Decoratable,\n    apply_context_manager_decorator,\n)\n\nT = TypeVar("T")\n\n# ########################\n# ##### DEPRECATED\n# ########################\n\n\ndef normalize_renamed_param(\n    new_val: T,\n    new_arg: str,\n    old_val: T,\n    old_arg: str,\n    coerce_old_to_new: Optional[Callable[[T], T]] = None,\n) -> T:\n    """Utility for managing backwards compatibility of a renamed parameter.\n\n    .. code-block::\n\n       # The name of param `old_flag` is being updated to `new_flag`, but we are temporarily\n       # accepting either param.\n       def is_new(old_flag=None, new_flag=None):\n           return canonicalize_backcompat_args(\n               new_val=new_flag,\n               new_arg='new_flag',\n               old_val=old_flag,\n               old_arg='old_flag',\n               breaking_version='0.9.0',\n               coerce_old_to_new=lambda val: not val,\n           )\n\n    In the above example, if the caller sets both new_flag and old_flag, it will fail by throwing\n    a CheckError. If the caller sets the new_flag, it's returned unaltered. 
If the caller sets\n    old_flag, it will return the old_flag run through the coercion function.\n    """\n    check.str_param(new_arg, "new_arg")\n    check.str_param(old_arg, "old_arg")\n    check.opt_callable_param(coerce_old_to_new, "coerce_old_to_new")\n    if new_val is not None and old_val is not None:\n        check.failed(f'Do not use deprecated "{old_arg}" now that you are using "{new_arg}".')\n    elif old_val is not None:\n        return coerce_old_to_new(old_val) if coerce_old_to_new else old_val\n    else:\n        return new_val\n\n\ndef deprecation_warning(\n    subject: str,\n    breaking_version: str,\n    additional_warn_text: Optional[str] = None,\n    stacklevel: int = 3,\n):\n    warnings.warn(\n        f"{subject} is deprecated and will be removed in {breaking_version}."\n        + ((" " + additional_warn_text) if additional_warn_text else ""),\n        category=DeprecationWarning,\n        stacklevel=stacklevel,\n    )\n\n\n# ########################\n# ##### EXPERIMENTAL\n# ########################\n\nEXPERIMENTAL_WARNING_HELP = (\n    "To mute warnings for experimental functionality, invoke"\n    ' warnings.filterwarnings("ignore", category=dagster.ExperimentalWarning) or use'\n    " one of the other methods described at"\n    " https://docs.python.org/3/library/warnings.html#describing-warning-filters."\n)\n\n\n
[docs]class ExperimentalWarning(Warning):\n pass
\n\n\ndef experimental_warning(\n subject: str, additional_warn_text: Optional[str] = None, stacklevel: int = 3\n) -> None:\n extra_text = f" {additional_warn_text}" if additional_warn_text else ""\n warnings.warn(\n f"{subject} is experimental. It may break in future versions, even between dot"\n f" releases.{extra_text} {EXPERIMENTAL_WARNING_HELP}",\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\n# ########################\n# ##### DISABLE DAGSTER WARNINGS\n# ########################\n\n\n@contextmanager\ndef disable_dagster_warnings() -> Iterator[None]:\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=DeprecationWarning)\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n yield\n\n\nT_Decoratable = TypeVar("T_Decoratable", bound=Decoratable)\n\n\ndef suppress_dagster_warnings(__obj: T_Decoratable) -> T_Decoratable:\n """Mark a method/function as ignoring Dagster-generated warnings. This suppresses any\n `ExperimentalWarnings` or `DeprecationWarnings` when the function is called.\n\n Usage:\n\n .. code-block:: python\n\n @suppress_dagster_warnings\n def invokes_some_experimental_stuff(my_arg):\n my_experimental_function(my_arg)\n """\n return apply_context_manager_decorator(__obj, disable_dagster_warnings)\n
", "current_page_name": "_modules/dagster/_utils/warnings", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.warnings"}}}, "dagster_airbyte": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.asset_defs

\nimport hashlib\nimport inspect\nimport os\nimport re\nfrom abc import abstractmethod\nfrom functools import partial\nfrom itertools import chain\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport yaml\nfrom dagster import (\n    AssetKey,\n    AssetOut,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    Nothing,\n    Output,\n    ResourceDefinition,\n    SourceAsset,\n    _check as check,\n)\nfrom dagster._core.definitions import AssetsDefinition, multi_asset\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.metadata import MetadataValue, TableSchemaMetadataValue\nfrom dagster._core.definitions.metadata.table import TableSchema\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster._core.execution.context.init import build_init_resource_context\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_airbyte.resources import AirbyteCloudResource, AirbyteResource, BaseAirbyteResource\nfrom dagster_airbyte.types import AirbyteTableMetadata\nfrom dagster_airbyte.utils import (\n    generate_materializations,\n    generate_table_schema,\n    is_basic_normalization_operation,\n)\n\n\ndef _table_to_output_name_fn(table: str) -> str:\n    return table.replace("-", "_")\n\n\ndef _build_airbyte_asset_defn_metadata(\n    connection_id: str,\n    destination_tables: Sequence[str],\n    table_to_asset_key_fn: Callable[[str], AssetKey],\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    normalization_tables: Optional[Mapping[str, Set[str]]] = None,\n    upstream_assets: Optional[Iterable[AssetKey]] = None,\n    group_name: Optional[str] = None,\n    
io_manager_key: Optional[str] = None,\n    schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,\n    freshness_policy: Optional[FreshnessPolicy] = None,\n    auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n) -> AssetsDefinitionCacheableData:\n    asset_key_prefix = (\n        check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str) or []\n    )\n\n    # Generate a list of outputs, the set of destination tables plus any affiliated\n    # normalization tables\n    tables = list(\n        chain.from_iterable(\n            chain(\n                [destination_tables], normalization_tables.values() if normalization_tables else []\n            )\n        )\n    )\n\n    outputs = {\n        _table_to_output_name_fn(table): AssetKey(\n            [*asset_key_prefix, *table_to_asset_key_fn(table).path]\n        )\n        for table in tables\n    }\n\n    internal_deps: Dict[str, Set[AssetKey]] = {}\n\n    metadata_encodable_normalization_tables = (\n        {k: list(v) for k, v in normalization_tables.items()} if normalization_tables else {}\n    )\n\n    # If normalization tables are specified, we need to add a dependency from the destination table\n    # to the affilitated normalization table\n    if len(metadata_encodable_normalization_tables) > 0:\n        for base_table, derived_tables in metadata_encodable_normalization_tables.items():\n            for derived_table in derived_tables:\n                internal_deps[derived_table] = {\n                    AssetKey([*asset_key_prefix, *table_to_asset_key_fn(base_table).path])\n                }\n\n    # All non-normalization tables depend on any user-provided upstream assets\n    for table in destination_tables:\n        internal_deps[table] = set(upstream_assets or [])\n\n    return AssetsDefinitionCacheableData(\n        keys_by_input_name=(\n            {asset_key.path[-1]: asset_key for asset_key in upstream_assets}\n            if upstream_assets\n            else 
{}\n        ),\n        keys_by_output_name=outputs,\n        internal_asset_deps=internal_deps,\n        group_name=group_name,\n        key_prefix=asset_key_prefix,\n        can_subset=False,\n        metadata_by_output_name=(\n            {\n                table: {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}\n                for table in tables\n            }\n            if schema_by_table_name\n            else None\n        ),\n        freshness_policies_by_output_name=(\n            {output: freshness_policy for output in outputs} if freshness_policy else None\n        ),\n        auto_materialize_policies_by_output_name=(\n            {output: auto_materialize_policy for output in outputs}\n            if auto_materialize_policy\n            else None\n        ),\n        extra_metadata={\n            "connection_id": connection_id,\n            "group_name": group_name,\n            "destination_tables": destination_tables,\n            "normalization_tables": metadata_encodable_normalization_tables,\n            "io_manager_key": io_manager_key,\n        },\n    )\n\n\ndef _build_airbyte_assets_from_metadata(\n    assets_defn_meta: AssetsDefinitionCacheableData,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]],\n) -> AssetsDefinition:\n    metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)\n    connection_id = cast(str, metadata["connection_id"])\n    group_name = cast(Optional[str], metadata["group_name"])\n    destination_tables = cast(List[str], metadata["destination_tables"])\n    normalization_tables = cast(Mapping[str, List[str]], metadata["normalization_tables"])\n    io_manager_key = cast(Optional[str], metadata["io_manager_key"])\n\n    @multi_asset(\n        name=f"airbyte_sync_{connection_id[:5]}",\n        deps=list((assets_defn_meta.keys_by_input_name or {}).values()),\n        outs={\n            k: AssetOut(\n                key=v,\n                metadata=(\n                    
{\n                        k: cast(TableSchemaMetadataValue, v)\n                        for k, v in assets_defn_meta.metadata_by_output_name.get(k, {}).items()\n                    }\n                    if assets_defn_meta.metadata_by_output_name\n                    else None\n                ),\n                io_manager_key=io_manager_key,\n                freshness_policy=(\n                    assets_defn_meta.freshness_policies_by_output_name.get(k)\n                    if assets_defn_meta.freshness_policies_by_output_name\n                    else None\n                ),\n                dagster_type=Nothing,\n            )\n            for k, v in (assets_defn_meta.keys_by_output_name or {}).items()\n        },\n        internal_asset_deps={\n            k: set(v) for k, v in (assets_defn_meta.internal_asset_deps or {}).items()\n        },\n        compute_kind="airbyte",\n        group_name=group_name,\n        resource_defs=resource_defs,\n    )\n    def _assets(context, airbyte: AirbyteResource):\n        ab_output = airbyte.sync_and_poll(connection_id=connection_id)\n        for materialization in generate_materializations(\n            ab_output, assets_defn_meta.key_prefix or []\n        ):\n            table_name = materialization.asset_key.path[-1]\n            if table_name in destination_tables:\n                yield Output(\n                    value=None,\n                    output_name=_table_to_output_name_fn(table_name),\n                    metadata=materialization.metadata,\n                )\n                # Also materialize any normalization tables affiliated with this destination\n                # e.g. 
nested objects, lists etc\n                if normalization_tables:\n                    for dependent_table in normalization_tables.get(table_name, set()):\n                        yield Output(\n                            value=None,\n                            output_name=_table_to_output_name_fn(dependent_table),\n                        )\n            else:\n                yield materialization\n\n    return _assets\n\n\n
[docs]def build_airbyte_assets(\n connection_id: str,\n destination_tables: Sequence[str],\n asset_key_prefix: Optional[Sequence[str]] = None,\n group_name: Optional[str] = None,\n normalization_tables: Optional[Mapping[str, Set[str]]] = None,\n deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,\n upstream_assets: Optional[Set[AssetKey]] = None,\n schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n stream_to_asset_map: Optional[Mapping[str, str]] = None,\n) -> Sequence[AssetsDefinition]:\n """Builds a set of assets representing the tables created by an Airbyte sync operation.\n\n Args:\n connection_id (str): The Airbyte Connection ID that this op will sync. You can retrieve this\n value from the "Connections" tab of a given connector in the Airbyte UI.\n destination_tables (List[str]): The names of the tables that you want to be represented\n in the Dagster asset graph for this sync. This will generally map to the name of the\n stream in Airbyte, unless a stream prefix has been specified in Airbyte.\n normalization_tables (Optional[Mapping[str, List[str]]]): If you are using Airbyte's\n normalization feature, you may specify a mapping of destination table to a list of\n derived tables that will be created by the normalization process.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([table_name])`.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, str, AssetKey]]]):\n A list of assets to add as sources.\n upstream_assets (Optional[Set[AssetKey]]): Deprecated, use deps instead. 
A list of assets to add as sources.\n freshness_policy (Optional[FreshnessPolicy]): A freshness policy to apply to the assets\n stream_to_asset_map (Optional[Mapping[str, str]]): A mapping of an Airbyte stream name to a Dagster asset.\n This allows the use of the "prefix" setting in Airbyte with special characters that aren't valid asset names.\n """\n if upstream_assets is not None and deps is not None:\n raise DagsterInvalidDefinitionError(\n "Cannot specify both deps and upstream_assets to build_airbyte_assets. Use only deps"\n " instead."\n )\n\n asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n # Generate a list of outputs, the set of destination tables plus any affiliated\n # normalization tables\n tables = chain.from_iterable(\n chain([destination_tables], normalization_tables.values() if normalization_tables else [])\n )\n outputs = {\n table: AssetOut(\n key=AssetKey([*asset_key_prefix, table]),\n metadata=(\n {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}\n if schema_by_table_name\n else None\n ),\n freshness_policy=freshness_policy,\n )\n for table in tables\n }\n\n internal_deps = {}\n\n # If normalization tables are specified, we need to add a dependency from the destination table\n # to the affilitated normalization table\n if normalization_tables:\n for base_table, derived_tables in normalization_tables.items():\n for derived_table in derived_tables:\n internal_deps[derived_table] = {AssetKey([*asset_key_prefix, base_table])}\n\n upstream_deps = deps\n if upstream_assets is not None:\n upstream_deps = list(upstream_assets)\n\n # All non-normalization tables depend on any user-provided upstream assets\n for table in destination_tables:\n internal_deps[table] = set(upstream_deps) if upstream_deps else set()\n\n @multi_asset(\n name=f"airbyte_sync_{connection_id[:5]}",\n deps=upstream_deps,\n outs=outputs,\n internal_asset_deps=internal_deps,\n compute_kind="airbyte",\n 
group_name=group_name,\n )\n def _assets(context, airbyte: BaseAirbyteResource):\n ab_output = airbyte.sync_and_poll(connection_id=connection_id)\n\n # No connection details (e.g. using Airbyte Cloud) means we just assume\n # that the outputs were produced\n if len(ab_output.connection_details) == 0:\n for table_name in destination_tables:\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(table_name),\n )\n if normalization_tables:\n for dependent_table in normalization_tables.get(table_name, set()):\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(dependent_table),\n )\n else:\n for materialization in generate_materializations(\n ab_output, asset_key_prefix, stream_to_asset_map\n ):\n table_name = materialization.asset_key.path[-1]\n if table_name in destination_tables:\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(table_name),\n metadata=materialization.metadata,\n )\n # Also materialize any normalization tables affiliated with this destination\n # e.g. nested objects, lists etc\n if normalization_tables:\n for dependent_table in normalization_tables.get(table_name, set()):\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(dependent_table),\n )\n else:\n yield materialization\n\n return [_assets]
\n\n\ndef _get_schema_types(schema: Mapping[str, Any]) -> Sequence[str]:\n """Given a schema definition, return a list of data types that are valid for this schema."""\n types = schema.get("types") or schema.get("type")\n if not types:\n return []\n if isinstance(types, str):\n return [types]\n return types\n\n\ndef _get_sub_schemas(schema: Mapping[str, Any]) -> Sequence[Mapping[str, Any]]:\n """Returns a list of sub-schema definitions for a given schema. This is used to handle union types."""\n return schema.get("anyOf") or schema.get("oneOf") or [schema]\n\n\ndef _get_normalization_tables_for_schema(\n key: str, schema: Mapping[str, Any], prefix: str = ""\n) -> Mapping[str, AirbyteTableMetadata]:\n """Recursively traverses a schema, returning metadata for the tables that will be created by the Airbyte\n normalization process.\n\n For example, a table `cars` with a nested object field `limited_editions` will produce the tables\n `cars` and `cars_limited_editions`.\n\n For more information on Airbyte's normalization process, see:\n https://docs.airbyte.com/understanding-airbyte/basic-normalization/#nesting\n """\n out: Dict[str, AirbyteTableMetadata] = {}\n # Object types are broken into a new table, as long as they have children\n\n sub_schemas = _get_sub_schemas(schema)\n\n for sub_schema in sub_schemas:\n schema_types = _get_schema_types(sub_schema)\n if not schema_types:\n continue\n\n if "object" in schema_types and len(sub_schema.get("properties", {})) > 0:\n out[prefix + key] = AirbyteTableMetadata(\n schema=generate_table_schema(sub_schema.get("properties", {}))\n )\n for k, v in sub_schema["properties"].items():\n out = merge_dicts(\n out, _get_normalization_tables_for_schema(k, v, f"{prefix}{key}_")\n )\n # Array types are also broken into a new table\n elif "array" in schema_types:\n out[prefix + key] = AirbyteTableMetadata(\n schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {}))\n )\n if sub_schema.get("items", 
{}).get("properties"):\n for k, v in sub_schema["items"]["properties"].items():\n out = merge_dicts(\n out, _get_normalization_tables_for_schema(k, v, f"{prefix}{key}_")\n )\n\n return out\n\n\ndef _clean_name(name: str) -> str:\n """Cleans an input to be a valid Dagster asset name."""\n return re.sub(r"[^a-z0-9]+", "_", name.lower())\n\n\nclass AirbyteConnectionMetadata(\n NamedTuple(\n "_AirbyteConnectionMetadata",\n [\n ("name", str),\n ("stream_prefix", str),\n ("has_basic_normalization", bool),\n ("stream_data", List[Mapping[str, Any]]),\n ],\n )\n):\n """Contains information about an Airbyte connection.\n\n Attributes:\n name (str): The name of the connection.\n stream_prefix (str): A prefix to add to all stream names.\n has_basic_normalization (bool): Whether or not the connection has basic normalization enabled.\n stream_data (List[Mapping[str, Any]]): Unparsed list of dicts with information about each stream.\n """\n\n @classmethod\n def from_api_json(\n cls, contents: Mapping[str, Any], operations: Mapping[str, Any]\n ) -> "AirbyteConnectionMetadata":\n return cls(\n name=contents["name"],\n stream_prefix=contents.get("prefix", ""),\n has_basic_normalization=any(\n is_basic_normalization_operation(op.get("operatorConfiguration", {}))\n for op in operations.get("operations", [])\n ),\n stream_data=contents.get("syncCatalog", {}).get("streams", []),\n )\n\n @classmethod\n def from_config(cls, contents: Mapping[str, Any]) -> "AirbyteConnectionMetadata":\n config_contents = cast(Mapping[str, Any], contents.get("configuration"))\n check.invariant(\n config_contents is not None, "Airbyte connection config is missing 'configuration' key"\n )\n\n return cls(\n name=contents["resource_name"],\n stream_prefix=config_contents.get("prefix", ""),\n has_basic_normalization=any(\n is_basic_normalization_operation(op.get("operator_configuration", {}))\n for op in config_contents.get("operations", [])\n ),\n stream_data=config_contents.get("sync_catalog", 
{}).get("streams", []),\n )\n\n def parse_stream_tables(\n self, return_normalization_tables: bool = False\n ) -> Mapping[str, AirbyteTableMetadata]:\n """Parses the stream data and returns a mapping, with keys representing destination\n tables associated with each enabled stream and values representing any affiliated\n tables created by Airbyte's normalization process, if enabled.\n """\n tables: Dict[str, AirbyteTableMetadata] = {}\n\n enabled_streams = [\n stream for stream in self.stream_data if stream.get("config", {}).get("selected", False)\n ]\n\n for stream in enabled_streams:\n name = cast(str, stream.get("stream", {}).get("name"))\n prefixed_name = f"{self.stream_prefix}{name}"\n\n schema = (\n stream["stream"]["json_schema"]\n if "json_schema" in stream["stream"]\n else stream["stream"]["jsonSchema"]\n )\n normalization_tables: Dict[str, AirbyteTableMetadata] = {}\n schema_props = schema.get("properties", schema.get("items", {}).get("properties", {}))\n if self.has_basic_normalization and return_normalization_tables:\n for k, v in schema_props.items():\n for normalization_table_name, meta in _get_normalization_tables_for_schema(\n k, v, f"{name}_"\n ).items():\n prefixed_norm_table_name = f"{self.stream_prefix}{normalization_table_name}"\n normalization_tables[prefixed_norm_table_name] = meta\n tables[prefixed_name] = AirbyteTableMetadata(\n schema=generate_table_schema(schema_props),\n normalization_tables=normalization_tables,\n )\n\n return tables\n\n\ndef _get_schema_by_table_name(\n stream_table_metadata: Mapping[str, AirbyteTableMetadata]\n) -> Mapping[str, TableSchema]:\n schema_by_base_table_name = [(k, v.schema) for k, v in stream_table_metadata.items()]\n schema_by_normalization_table_name = list(\n chain.from_iterable(\n [\n [\n (k, v.schema)\n for k, v in cast(\n Dict[str, AirbyteTableMetadata], meta.normalization_tables\n ).items()\n ]\n for meta in stream_table_metadata.values()\n ]\n )\n )\n\n return dict(schema_by_normalization_table_name 
+ schema_by_base_table_name)\n\n\nclass AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):\n def __init__(\n self,\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n self._key_prefix = key_prefix\n self._create_assets_for_normalization_tables = create_assets_for_normalization_tables\n self._connection_to_group_fn = connection_to_group_fn\n self._connection_to_io_manager_key_fn = connection_to_io_manager_key_fn\n self._connection_filter = connection_filter\n self._connection_to_asset_key_fn: Callable[[AirbyteConnectionMetadata, str], AssetKey] = (\n connection_to_asset_key_fn or (lambda _, table: AssetKey(path=[table]))\n )\n self._connection_to_freshness_policy_fn = connection_to_freshness_policy_fn or (\n lambda _: None\n )\n self._connection_to_auto_materialize_policy_fn = (\n connection_to_auto_materialize_policy_fn or (lambda _: None)\n )\n\n contents = hashlib.sha1() # so that hexdigest is 40, not 64 bytes\n contents.update(",".join(key_prefix).encode("utf-8"))\n contents.update(str(create_assets_for_normalization_tables).encode("utf-8"))\n if connection_filter:\n contents.update(inspect.getsource(connection_filter).encode("utf-8"))\n\n super().__init__(unique_id=f"airbyte-{contents.hexdigest()}")\n\n @abstractmethod\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n pass\n\n def compute_cacheable_data(self) -> 
Sequence[AssetsDefinitionCacheableData]:\n asset_defn_data: List[AssetsDefinitionCacheableData] = []\n for connection_id, connection in self._get_connections():\n stream_table_metadata = connection.parse_stream_tables(\n self._create_assets_for_normalization_tables\n )\n schema_by_table_name = _get_schema_by_table_name(stream_table_metadata)\n\n table_to_asset_key = partial(self._connection_to_asset_key_fn, connection)\n asset_data_for_conn = _build_airbyte_asset_defn_metadata(\n connection_id=connection_id,\n destination_tables=list(stream_table_metadata.keys()),\n normalization_tables={\n table: set(metadata.normalization_tables.keys())\n for table, metadata in stream_table_metadata.items()\n },\n asset_key_prefix=self._key_prefix,\n group_name=(\n self._connection_to_group_fn(connection.name)\n if self._connection_to_group_fn\n else None\n ),\n io_manager_key=(\n self._connection_to_io_manager_key_fn(connection.name)\n if self._connection_to_io_manager_key_fn\n else None\n ),\n schema_by_table_name=schema_by_table_name,\n table_to_asset_key_fn=table_to_asset_key,\n freshness_policy=self._connection_to_freshness_policy_fn(connection),\n auto_materialize_policy=self._connection_to_auto_materialize_policy_fn(connection),\n )\n\n asset_defn_data.append(asset_data_for_conn)\n\n return asset_defn_data\n\n def _build_definitions_with_resources(\n self,\n data: Sequence[AssetsDefinitionCacheableData],\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n ) -> Sequence[AssetsDefinition]:\n return [_build_airbyte_assets_from_metadata(meta, resource_defs) for meta in data]\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return self._build_definitions_with_resources(data)\n\n\nclass AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition):\n def __init__(\n self,\n airbyte_resource_def: Union[ResourceDefinition, AirbyteResource],\n workspace_id: Optional[str],\n 
key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n super().__init__(\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )\n self._workspace_id = workspace_id\n self._airbyte_instance: AirbyteResource = (\n airbyte_resource_def.process_config_and_initialize()\n if isinstance(airbyte_resource_def, AirbyteResource)\n else airbyte_resource_def(build_init_resource_context())\n )\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n workspace_id = self._workspace_id\n if not workspace_id:\n workspaces = cast(\n List[Dict[str, Any]],\n check.not_none(\n self._airbyte_instance.make_request(endpoint="/workspaces/list", data={})\n ).get("workspaces", []),\n )\n\n check.invariant(len(workspaces) <= 1, "Airbyte instance has more than one workspace")\n check.invariant(len(workspaces) > 0, "Airbyte instance has no workspaces")\n\n workspace_id = workspaces[0].get("workspaceId")\n\n connections = cast(\n List[Dict[str, Any]],\n 
check.not_none(\n self._airbyte_instance.make_request(\n endpoint="/connections/list", data={"workspaceId": workspace_id}\n )\n ).get("connections", []),\n )\n\n output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []\n for connection_json in connections:\n connection_id = cast(str, connection_json.get("connectionId"))\n\n operations_json = cast(\n Dict[str, Any],\n check.not_none(\n self._airbyte_instance.make_request(\n endpoint="/operations/list",\n data={"connectionId": connection_id},\n )\n ),\n )\n connection = AirbyteConnectionMetadata.from_api_json(connection_json, operations_json)\n\n # Filter out connections that don't match the filter function\n if self._connection_filter and not self._connection_filter(connection):\n continue\n\n output_connections.append((connection_id, connection))\n return output_connections\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return super()._build_definitions_with_resources(\n data, {"airbyte": self._airbyte_instance.get_resource_definition()}\n )\n\n\nclass AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition):\n def __init__(\n self,\n project_dir: str,\n workspace_id: Optional[str],\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_directories: Optional[Sequence[str]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n super().__init__(\n 
key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )\n self._workspace_id = workspace_id\n self._project_dir = project_dir\n self._connection_directories = connection_directories\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n connections_dir = os.path.join(self._project_dir, "connections")\n\n output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []\n\n connection_directories = self._connection_directories or os.listdir(connections_dir)\n for connection_name in connection_directories:\n connection_dir = os.path.join(connections_dir, connection_name)\n with open(os.path.join(connection_dir, "configuration.yaml"), encoding="utf-8") as f:\n connection = AirbyteConnectionMetadata.from_config(yaml.safe_load(f.read()))\n\n # Filter out connections that don't match the filter function\n if self._connection_filter and not self._connection_filter(connection):\n continue\n\n if self._workspace_id:\n state_file = f"state_{self._workspace_id}.yaml"\n check.invariant(\n state_file in os.listdir(connection_dir),\n f"Workspace state file {state_file} not found",\n )\n else:\n state_files = [\n filename\n for filename in os.listdir(connection_dir)\n if filename.startswith("state_")\n ]\n check.invariant(\n len(state_files) > 0,\n f"No state files found for connection {connection_name} in {connection_dir}",\n )\n check.invariant(\n len(state_files) <= 1,\n "More than one state file found for connection {} in {}, specify a workspace_id"\n " to disambiguate".format(connection_name, connection_dir),\n )\n state_file = 
state_files[0]\n\n with open(os.path.join(connection_dir, cast(str, state_file)), encoding="utf-8") as f:\n state = yaml.safe_load(f.read())\n connection_id = state.get("resource_id")\n\n output_connections.append((connection_id, connection))\n return output_connections\n\n\n
[docs]def load_assets_from_airbyte_instance(\n airbyte: Union[AirbyteResource, ResourceDefinition],\n workspace_id: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads Airbyte connection assets from a configured AirbyteResource instance. This fetches information\n about defined connections at initialization time, and will error on workspace load if the Airbyte\n instance is not reachable.\n\n Args:\n airbyte (ResourceDefinition): An AirbyteResource configured with the appropriate connection\n details.\n workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only\n required if multiple workspaces exist in your instance.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. If None, no groups will be created. 
Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which takes\n in connection metadata and returns False if the connection should be excluded from the output assets.\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]): Optional function\n which takes in connection metadata and returns a freshness policy for the connection's assets. If None, no freshness policies\n will be applied to the assets.\n connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]): Optional\n function which takes in connection metadata and returns an auto materialization policy for the connection's assets. If None, no\n auto materialization policies will be applied to the assets.\n\n **Examples:**\n\n Loading all Airbyte connections as assets:\n\n .. 
code-block:: python\n\n from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\n airbyte_instance = airbyte_resource.configured(\n {\n "host": "localhost",\n "port": "8000",\n }\n )\n airbyte_assets = load_assets_from_airbyte_instance(airbyte_instance)\n\n Filtering the set of loaded connections:\n\n .. code-block:: python\n\n from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\n airbyte_instance = airbyte_resource.configured(\n {\n "host": "localhost",\n "port": "8000",\n }\n )\n airbyte_assets = load_assets_from_airbyte_instance(\n airbyte_instance,\n connection_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(airbyte, AirbyteCloudResource):\n raise DagsterInvalidInvocationError(\n "load_assets_from_airbyte_instance is not yet supported for AirbyteCloudResource"\n )\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteInstanceCacheableAssetsDefinition(\n airbyte_resource_def=airbyte,\n workspace_id=workspace_id,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )
\n\n\n
[docs]def load_assets_from_airbyte_project(\n project_dir: str,\n workspace_id: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,\n connection_directories: Optional[Sequence[str]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads an Airbyte project into a set of Dagster assets.\n\n Point to the root folder of an Airbyte project synced using the Octavia CLI. For\n more information, see https://github.com/airbytehq/airbyte/tree/master/octavia-cli#octavia-import-all.\n\n Args:\n project_dir (str): The path to the root of your Airbyte project, containing sources, destinations,\n and connections folders.\n workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only\n required if multiple workspace state YAMLfiles exist in the project.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. 
If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which\n takes in connection metadata and returns False if the connection should be excluded from the output assets.\n connection_directories (Optional[List[str]]): Optional list of connection directories to load assets from.\n If omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter\n if the project has many connections or if the connection yaml files are large.\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. 
Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]):\n Optional function which takes in connection metadata and returns a freshness policy for the connection's assets.\n If None, no freshness policies will be applied to the assets.\n connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]):\n Optional function which takes in connection metadata and returns an auto materialization policy for the connection's assets.\n If None, no auto materialization policies will be applied to the assets.\n\n **Examples:**\n\n Loading all Airbyte connections as assets:\n\n .. code-block:: python\n\n from dagster_airbyte import load_assets_from_airbyte_project\n\n airbyte_assets = load_assets_from_airbyte_project(\n project_dir="path/to/airbyte/project",\n )\n\n Filtering the set of loaded connections:\n\n .. code-block:: python\n\n from dagster_airbyte import load_assets_from_airbyte_project\n\n airbyte_assets = load_assets_from_airbyte_project(\n project_dir="path/to/airbyte/project",\n connection_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteYAMLCacheableAssetsDefinition(\n project_dir=project_dir,\n workspace_id=workspace_id,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n 
connection_filter=connection_filter,\n connection_directories=connection_directories,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )
\n
", "current_page_name": "_modules/dagster_airbyte/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.asset_defs"}, "managed": {"generated": {"destinations": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.generated.destinations

\n# ruff: noqa: A001, A002\nfrom typing import Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom dagster_airbyte.managed.types import GeneratedAirbyteDestination\n\n\n
[docs]class DynamodbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n dynamodb_table_name_prefix: str,\n dynamodb_region: str,\n access_key_id: str,\n secret_access_key: str,\n dynamodb_endpoint: Optional[str] = None,\n ):\n """Airbyte Destination for Dynamodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/dynamodb\n\n Args:\n name (str): The name of the destination.\n dynamodb_endpoint (Optional[str]): This is your DynamoDB endpoint url.(if you are working with AWS DynamoDB, just leave empty).\n dynamodb_table_name_prefix (str): The prefix to use when naming DynamoDB tables.\n dynamodb_region (str): The region of the DynamoDB.\n access_key_id (str): The access key id to access the DynamoDB. Airbyte requires Read and Write permissions to the DynamoDB.\n secret_access_key (str): The corresponding secret to the access key id.\n """\n self.dynamodb_endpoint = check.opt_str_param(dynamodb_endpoint, "dynamodb_endpoint")\n self.dynamodb_table_name_prefix = check.str_param(\n dynamodb_table_name_prefix, "dynamodb_table_name_prefix"\n )\n self.dynamodb_region = check.str_param(dynamodb_region, "dynamodb_region")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n super().__init__("Dynamodb", name)
\n\n\n
[docs]class BigqueryDestination(GeneratedAirbyteDestination):\n
[docs] class StandardInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = "HMAC_KEY"\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class GCSStaging:\n
[docs] @public\n def __init__(\n self,\n credential: "BigqueryDestination.HMACKey",\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n keep_files_in_gcs_bucket: Optional[str] = None,\n ):\n self.method = "GCS Staging"\n self.credential = check.inst_param(\n credential, "credential", BigqueryDestination.HMACKey\n )\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.keep_files_in_gcs_bucket = check.opt_str_param(\n keep_files_in_gcs_bucket, "keep_files_in_gcs_bucket"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n project_id: str,\n dataset_location: str,\n dataset_id: str,\n loading_method: Union[\n "BigqueryDestination.StandardInserts", "BigqueryDestination.GCSStaging"\n ],\n credentials_json: Optional[str] = None,\n transformation_priority: Optional[str] = None,\n big_query_client_buffer_size_mb: Optional[int] = None,\n ):\n """Airbyte Destination for Bigquery.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset. Read more here.\n dataset_location (str): The location of the dataset. Warning: Changes made after creation will not be applied. Read more here.\n dataset_id (str): The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.\n loading_method (Union[BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging]): Loading method used to send select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n transformation_priority (Optional[str]): Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Read more about interactive run type here. 
Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don`t count towards your concurrent rate limit. Read more about batch queries here. The default "interactive" value is used if not set explicitly.\n big_query_client_buffer_size_mb (Optional[int]): Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_location = check.str_param(dataset_location, "dataset_location")\n self.dataset_id = check.str_param(dataset_id, "dataset_id")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging),\n )\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n self.transformation_priority = check.opt_str_param(\n transformation_priority, "transformation_priority"\n )\n self.big_query_client_buffer_size_mb = check.opt_int_param(\n big_query_client_buffer_size_mb, "big_query_client_buffer_size_mb"\n )\n super().__init__("Bigquery", name)
\n\n\n
[docs]class RabbitmqDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n routing_key: str,\n ssl: Optional[bool] = None,\n port: Optional[int] = None,\n virtual_host: Optional[str] = None,\n username: Optional[str] = None,\n password: Optional[str] = None,\n exchange: Optional[str] = None,\n ):\n """Airbyte Destination for Rabbitmq.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/rabbitmq\n\n Args:\n name (str): The name of the destination.\n ssl (Optional[bool]): SSL enabled.\n host (str): The RabbitMQ host name.\n port (Optional[int]): The RabbitMQ port.\n virtual_host (Optional[str]): The RabbitMQ virtual host name.\n username (Optional[str]): The username to connect.\n password (Optional[str]): The password to connect.\n exchange (Optional[str]): The exchange name.\n routing_key (str): The routing key.\n """\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.host = check.str_param(host, "host")\n self.port = check.opt_int_param(port, "port")\n self.virtual_host = check.opt_str_param(virtual_host, "virtual_host")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.exchange = check.opt_str_param(exchange, "exchange")\n self.routing_key = check.str_param(routing_key, "routing_key")\n super().__init__("Rabbitmq", name)
\n\n\n
[docs]class KvdbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, bucket_id: str, secret_key: str):\n """Airbyte Destination for Kvdb.\n\n Documentation can be found at https://kvdb.io/docs/api/\n\n Args:\n name (str): The name of the destination.\n bucket_id (str): The ID of your KVdb bucket.\n secret_key (str): Your bucket Secret Key.\n """\n self.bucket_id = check.str_param(bucket_id, "bucket_id")\n self.secret_key = check.str_param(secret_key, "secret_key")\n super().__init__("Kvdb", name)
\n\n\n
[docs]class ClickhouseDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Destination for Clickhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): HTTP port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Clickhouse", name)
\n\n\n
[docs]class AmazonSqsDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n queue_url: str,\n region: str,\n message_delay: Optional[int] = None,\n access_key: Optional[str] = None,\n secret_key: Optional[str] = None,\n message_body_key: Optional[str] = None,\n message_group_id: Optional[str] = None,\n ):\n """Airbyte Destination for Amazon Sqs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/amazon-sqs\n\n Args:\n name (str): The name of the destination.\n queue_url (str): URL of the SQS Queue\n region (str): AWS Region of the SQS Queue\n message_delay (Optional[int]): Modify the Message Delay of the individual message from the Queue's default (seconds).\n access_key (Optional[str]): The Access Key ID of the AWS IAM Role to use for sending messages\n secret_key (Optional[str]): The Secret Key of the AWS IAM Role to use for sending messages\n message_body_key (Optional[str]): Use this property to extract the contents of the named key in the input record to use as the SQS message body. If not set, the entire content of the input record data is used as the message body.\n message_group_id (Optional[str]): The tag that specifies that a message belongs to a specific message group. This parameter applies only to, and is REQUIRED by, FIFO queues.\n """\n self.queue_url = check.str_param(queue_url, "queue_url")\n self.region = check.str_param(region, "region")\n self.message_delay = check.opt_int_param(message_delay, "message_delay")\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.secret_key = check.opt_str_param(secret_key, "secret_key")\n self.message_body_key = check.opt_str_param(message_body_key, "message_body_key")\n self.message_group_id = check.opt_str_param(message_group_id, "message_group_id")\n super().__init__("Amazon Sqs", name)
\n\n\n
[docs]class MariadbColumnstoreDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mariadb Columnstore.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mariadb-columnstore\n\n Args:\n name (str): The name of the destination.\n host (str): The Hostname of the database.\n port (int): The Port of the database.\n database (str): Name of the database.\n username (str): The Username which is used to access the database.\n password (Optional[str]): The Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Mariadb Columnstore", name)
\n\n\n
[docs]class KinesisDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n region: str,\n shardCount: int,\n accessKey: str,\n privateKey: str,\n bufferSize: int,\n ):\n """Airbyte Destination for Kinesis.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/kinesis\n\n Args:\n name (str): The name of the destination.\n endpoint (str): AWS Kinesis endpoint.\n region (str): AWS region. Your account determines the Regions that are available to you.\n shardCount (int): Number of shards to which the data should be streamed.\n accessKey (str): Generate the AWS Access Key for current user.\n privateKey (str): The AWS Private Key - a string of numbers and letters that are unique for each account, also known as a "recovery phrase".\n bufferSize (int): Buffer size for storing kinesis records before being batch streamed.\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.region = check.str_param(region, "region")\n self.shardCount = check.int_param(shardCount, "shardCount")\n self.accessKey = check.str_param(accessKey, "accessKey")\n self.privateKey = check.str_param(privateKey, "privateKey")\n self.bufferSize = check.int_param(bufferSize, "bufferSize")\n super().__init__("Kinesis", name)
\n\n\n
[docs]class AzureBlobStorageDestination(GeneratedAirbyteDestination):\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(self, flattening: str):\n self.format_type = "CSV"\n self.flattening = check.str_param(flattening, "flattening")
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n ):\n self.format_type = "JSONL"
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_account_key: str,\n format: Union[\n "AzureBlobStorageDestination.CSVCommaSeparatedValues",\n "AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON",\n ],\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n azure_blob_storage_container_name: Optional[str] = None,\n azure_blob_storage_output_buffer_size: Optional[int] = None,\n ):\n """Airbyte Destination for Azure Blob Storage.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/azureblobstorage\n\n Args:\n name (str): The name of the destination.\n azure_blob_storage_endpoint_domain_name (Optional[str]): This is Azure Blob Storage endpoint domain name. Leave default value (or leave it empty if run container from command line) to use Microsoft native from example.\n azure_blob_storage_container_name (Optional[str]): The name of the Azure blob storage container. If not exists - will be created automatically. May be empty, then will be created automatically airbytecontainer+timestamp\n azure_blob_storage_account_name (str): The account's name of the Azure Blob Storage.\n azure_blob_storage_account_key (str): The Azure blob storage account key.\n azure_blob_storage_output_buffer_size (Optional[int]): The amount of megabytes to buffer for the output stream to Azure. 
This will impact memory footprint on workers, but may need adjustment for performance and appropriate block size in Azure.\n format (Union[AzureBlobStorageDestination.CSVCommaSeparatedValues, AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON]): Output data format\n """\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_container_name = check.opt_str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_account_key = check.str_param(\n azure_blob_storage_account_key, "azure_blob_storage_account_key"\n )\n self.azure_blob_storage_output_buffer_size = check.opt_int_param(\n azure_blob_storage_output_buffer_size, "azure_blob_storage_output_buffer_size"\n )\n self.format = check.inst_param(\n format,\n "format",\n (\n AzureBlobStorageDestination.CSVCommaSeparatedValues,\n AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON,\n ),\n )\n super().__init__("Azure Blob Storage", name)
\n\n\n
[docs]class KafkaDestination(GeneratedAirbyteDestination):\n
[docs] class PLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")
\n\n
[docs] class SASLPLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] class SASLSSL:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n bootstrap_servers: str,\n topic_pattern: str,\n protocol: Union[\n "KafkaDestination.PLAINTEXT",\n "KafkaDestination.SASLPLAINTEXT",\n "KafkaDestination.SASLSSL",\n ],\n acks: str,\n enable_idempotence: bool,\n compression_type: str,\n batch_size: int,\n linger_ms: str,\n max_in_flight_requests_per_connection: int,\n client_dns_lookup: str,\n buffer_memory: str,\n max_request_size: int,\n retries: int,\n socket_connection_setup_timeout_ms: str,\n socket_connection_setup_timeout_max_ms: str,\n max_block_ms: str,\n request_timeout_ms: int,\n delivery_timeout_ms: int,\n send_buffer_bytes: int,\n receive_buffer_bytes: int,\n test_topic: Optional[str] = None,\n sync_producer: Optional[bool] = None,\n client_id: Optional[str] = None,\n ):\n """Airbyte Destination for Kafka.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/kafka\n\n Args:\n name (str): The name of the destination.\n bootstrap_servers (str): A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping&mdash;this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,.... Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. 
Notice that the topic name will be transformed to a standard naming convention.\n test_topic (Optional[str]): Topic to test if Airbyte can produce messages.\n sync_producer (Optional[bool]): Wait synchronously until the record has been sent to Kafka.\n protocol (Union[KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL]): Protocol used to communicate with brokers.\n client_id (Optional[str]): An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.\n acks (str): The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the durability of records that are sent.\n enable_idempotence (bool): When set to 'true', the producer will ensure that exactly one copy of each message is written in the stream. If 'false', producer retries due to broker failures, etc., may write duplicates of the retried message in the stream.\n compression_type (str): The compression type for all data generated by the producer.\n batch_size (int): The producer will attempt to batch records together into fewer requests whenever multiple records are being sent to the same partition.\n linger_ms (str): The producer groups together any records that arrive in between request transmissions into a single batched request.\n max_in_flight_requests_per_connection (int): The maximum number of unacknowledged requests the client will send on a single connection before blocking. Can be greater than 1, and the maximum value supported with idempotency is 5.\n client_dns_lookup (str): Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. After a disconnection, the next IP is used. 
Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.\n buffer_memory (str): The total bytes of memory the producer can use to buffer records waiting to be sent to the server.\n max_request_size (int): The maximum size of a request in bytes.\n retries (int): Setting a value greater than zero will cause the client to resend any record whose send fails with a potentially transient error.\n socket_connection_setup_timeout_ms (str): The amount of time the client will wait for the socket connection to be established.\n socket_connection_setup_timeout_max_ms (str): The maximum amount of time the client will wait for the socket connection to be established. The connection setup timeout will increase exponentially for each consecutive connection failure up to this maximum.\n max_block_ms (str): The configuration controls how long the KafkaProducer's send(), partitionsFor(), initTransactions(), sendOffsetsToTransaction(), commitTransaction() and abortTransaction() methods will block.\n request_timeout_ms (int): The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.\n delivery_timeout_ms (int): An upper bound on the time to report success or failure after a call to 'send()' returns.\n send_buffer_bytes (int): The size of the TCP send buffer (SO_SNDBUF) to use when sending data. 
If the value is -1, the OS default will be used.\n receive_buffer_bytes (int): The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. If the value is -1, the OS default will be used.\n """\n self.bootstrap_servers = check.str_param(bootstrap_servers, "bootstrap_servers")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.test_topic = check.opt_str_param(test_topic, "test_topic")\n self.sync_producer = check.opt_bool_param(sync_producer, "sync_producer")\n self.protocol = check.inst_param(\n protocol,\n "protocol",\n (KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL),\n )\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.acks = check.str_param(acks, "acks")\n self.enable_idempotence = check.bool_param(enable_idempotence, "enable_idempotence")\n self.compression_type = check.str_param(compression_type, "compression_type")\n self.batch_size = check.int_param(batch_size, "batch_size")\n self.linger_ms = check.str_param(linger_ms, "linger_ms")\n self.max_in_flight_requests_per_connection = check.int_param(\n max_in_flight_requests_per_connection, "max_in_flight_requests_per_connection"\n )\n self.client_dns_lookup = check.str_param(client_dns_lookup, "client_dns_lookup")\n self.buffer_memory = check.str_param(buffer_memory, "buffer_memory")\n self.max_request_size = check.int_param(max_request_size, "max_request_size")\n self.retries = check.int_param(retries, "retries")\n self.socket_connection_setup_timeout_ms = check.str_param(\n socket_connection_setup_timeout_ms, "socket_connection_setup_timeout_ms"\n )\n self.socket_connection_setup_timeout_max_ms = check.str_param(\n socket_connection_setup_timeout_max_ms, "socket_connection_setup_timeout_max_ms"\n )\n self.max_block_ms = check.str_param(max_block_ms, "max_block_ms")\n self.request_timeout_ms = check.int_param(request_timeout_ms, "request_timeout_ms")\n self.delivery_timeout_ms = 
check.int_param(delivery_timeout_ms, "delivery_timeout_ms")\n self.send_buffer_bytes = check.int_param(send_buffer_bytes, "send_buffer_bytes")\n self.receive_buffer_bytes = check.int_param(receive_buffer_bytes, "receive_buffer_bytes")\n super().__init__("Kafka", name)
\n\n\n
[docs]class ElasticsearchDestination(GeneratedAirbyteDestination):\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "none"
\n\n
[docs] class ApiKeySecret:\n
[docs] @public\n def __init__(self, apiKeyId: str, apiKeySecret: str):\n self.method = "secret"\n self.apiKeyId = check.str_param(apiKeyId, "apiKeyId")\n self.apiKeySecret = check.str_param(apiKeySecret, "apiKeySecret")
\n\n
[docs] class UsernamePassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.method = "basic"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n authenticationMethod: Union[\n "ElasticsearchDestination.None_",\n "ElasticsearchDestination.ApiKeySecret",\n "ElasticsearchDestination.UsernamePassword",\n ],\n upsert: Optional[bool] = None,\n ):\n r"""Airbyte Destination for Elasticsearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/elasticsearch\n\n Args:\n name (str): The name of the destination.\n endpoint (str): The full url of the Elasticsearch server\n upsert (Optional[bool]): If a primary key identifier is defined in the source, an upsert will be performed using the primary key value as the elasticsearch doc id. Does not support composite primary keys.\n authenticationMethod (Union[ElasticsearchDestination.None\\\\_, ElasticsearchDestination.ApiKeySecret, ElasticsearchDestination.UsernamePassword]): The type of authentication to be used\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.upsert = check.opt_bool_param(upsert, "upsert")\n self.authenticationMethod = check.inst_param(\n authenticationMethod,\n "authenticationMethod",\n (\n ElasticsearchDestination.None_,\n ElasticsearchDestination.ApiKeySecret,\n ElasticsearchDestination.UsernamePassword,\n ),\n )\n super().__init__("Elasticsearch", name)
\n\n\n
[docs]class MysqlDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mysql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mysql\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Mysql", name)
\n\n\n
[docs]class SftpJsonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n username: str,\n password: str,\n destination_path: str,\n port: Optional[int] = None,\n ):\n """Airbyte Destination for Sftp Json.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/sftp-json\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the SFTP server.\n port (Optional[int]): Port of the SFTP server.\n username (str): Username to use to access the SFTP server.\n password (str): Password associated with the username.\n destination_path (str): Path to the directory where json files will be written.\n """\n self.host = check.str_param(host, "host")\n self.port = check.opt_int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Sftp Json", name)
\n\n\n
[docs]class GcsDestination(GeneratedAirbyteDestination):\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, credential_type: str, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = check.str_param(credential_type, "credential_type")\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: Optional[int] = None):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: Optional[int] = None):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self,\n codec: str,\n compression_level: Optional[int] = None,\n include_checksum: Optional[bool] = None,\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "GcsDestination.NoCompression",\n "GcsDestination.Deflate",\n "GcsDestination.Bzip2",\n "GcsDestination.Xz",\n "GcsDestination.Zstandard",\n "GcsDestination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n GcsDestination.NoCompression,\n GcsDestination.Deflate,\n GcsDestination.Bzip2,\n GcsDestination.Xz,\n GcsDestination.Zstandard,\n GcsDestination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["GcsDestination.NoCompression", "GcsDestination.GZIP"],\n flattening: Optional[str] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.opt_str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (GcsDestination.NoCompression, GcsDestination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["GcsDestination.NoCompression", "GcsDestination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (GcsDestination.NoCompression, GcsDestination.GZIP)\n )
\n\n
[docs] class ParquetColumnarStorage:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Optional[str] = None,\n block_size_mb: Optional[int] = None,\n max_padding_size_mb: Optional[int] = None,\n page_size_kb: Optional[int] = None,\n dictionary_page_size_kb: Optional[int] = None,\n dictionary_encoding: Optional[bool] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.opt_str_param(compression_codec, "compression_codec")\n self.block_size_mb = check.opt_int_param(block_size_mb, "block_size_mb")\n self.max_padding_size_mb = check.opt_int_param(\n max_padding_size_mb, "max_padding_size_mb"\n )\n self.page_size_kb = check.opt_int_param(page_size_kb, "page_size_kb")\n self.dictionary_page_size_kb = check.opt_int_param(\n dictionary_page_size_kb, "dictionary_page_size_kb"\n )\n self.dictionary_encoding = check.opt_bool_param(\n dictionary_encoding, "dictionary_encoding"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n credential: "GcsDestination.HMACKey",\n format: Union[\n "GcsDestination.AvroApacheAvro",\n "GcsDestination.CSVCommaSeparatedValues",\n "GcsDestination.JSONLinesNewlineDelimitedJSON",\n "GcsDestination.ParquetColumnarStorage",\n ],\n gcs_bucket_region: Optional[str] = None,\n ):\n """Airbyte Destination for Gcs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/gcs\n\n Args:\n name (str): The name of the destination.\n gcs_bucket_name (str): You can find the bucket name in the App Engine Admin console Application Settings page, under the label Google Cloud Storage Bucket. Read more here.\n gcs_bucket_path (str): GCS Bucket Path string Subdirectory under the above bucket to sync the data into.\n gcs_bucket_region (Optional[str]): Select a Region of the GCS Bucket. Read more here.\n credential (GcsDestination.HMACKey): An HMAC key is a type of credential and can be associated with a service account or a user account in Cloud Storage. Read more here.\n format (Union[GcsDestination.AvroApacheAvro, GcsDestination.CSVCommaSeparatedValues, GcsDestination.JSONLinesNewlineDelimitedJSON, GcsDestination.ParquetColumnarStorage]): Output data format. One of the following formats must be selected - AVRO format, PARQUET format, CSV format, or JSONL format.\n """\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.gcs_bucket_region = check.opt_str_param(gcs_bucket_region, "gcs_bucket_region")\n self.credential = check.inst_param(credential, "credential", GcsDestination.HMACKey)\n self.format = check.inst_param(\n format,\n "format",\n (\n GcsDestination.AvroApacheAvro,\n GcsDestination.CSVCommaSeparatedValues,\n GcsDestination.JSONLinesNewlineDelimitedJSON,\n GcsDestination.ParquetColumnarStorage,\n ),\n )\n super().__init__("Gcs", name)
\n\n\n
[docs]class CassandraDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n keyspace: str,\n username: str,\n password: str,\n address: str,\n port: int,\n datacenter: Optional[str] = None,\n replication: Optional[int] = None,\n ):\n """Airbyte Destination for Cassandra.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/cassandra\n\n Args:\n name (str): The name of the destination.\n keyspace (str): Default Cassandra keyspace to create data in.\n username (str): Username to use to access Cassandra.\n password (str): Password associated with Cassandra.\n address (str): Address to connect to.\n port (int): Port of Cassandra.\n datacenter (Optional[str]): Datacenter of the cassandra cluster.\n replication (Optional[int]): Indicates to how many nodes the data should be replicated to.\n """\n self.keyspace = check.str_param(keyspace, "keyspace")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.address = check.str_param(address, "address")\n self.port = check.int_param(port, "port")\n self.datacenter = check.opt_str_param(datacenter, "datacenter")\n self.replication = check.opt_int_param(replication, "replication")\n super().__init__("Cassandra", name)
\n\n\n
[docs]class FireboltDestination(GeneratedAirbyteDestination):\n
[docs] class SQLInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "SQL"
\n\n
[docs] class ExternalTableViaS3:\n
[docs] @public\n def __init__(self, s3_bucket: str, s3_region: str, aws_key_id: str, aws_key_secret: str):\n self.method = "S3"\n self.s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self.s3_region = check.str_param(s3_region, "s3_region")\n self.aws_key_id = check.str_param(aws_key_id, "aws_key_id")\n self.aws_key_secret = check.str_param(aws_key_secret, "aws_key_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n password: str,\n database: str,\n loading_method: Union[\n "FireboltDestination.SQLInserts", "FireboltDestination.ExternalTableViaS3"\n ],\n account: Optional[str] = None,\n host: Optional[str] = None,\n engine: Optional[str] = None,\n ):\n """Airbyte Destination for Firebolt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/firebolt\n\n Args:\n name (str): The name of the destination.\n username (str): Firebolt email address you use to login.\n password (str): Firebolt password.\n account (Optional[str]): Firebolt account to login.\n host (Optional[str]): The host name of your Firebolt database.\n database (str): The database to connect to.\n engine (Optional[str]): Engine name or url to connect to.\n loading_method (Union[FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3]): Loading method used to select the way data will be uploaded to Firebolt\n """\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.account = check.opt_str_param(account, "account")\n self.host = check.opt_str_param(host, "host")\n self.database = check.str_param(database, "database")\n self.engine = check.opt_str_param(engine, "engine")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3),\n )\n super().__init__("Firebolt", name)
\n\n\n
[docs]class GoogleSheetsDestination(GeneratedAirbyteDestination):\n
[docs] class AuthenticationViaGoogleOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n spreadsheet_id: str,\n credentials: "GoogleSheetsDestination.AuthenticationViaGoogleOAuth",\n ):\n """Airbyte Destination for Google Sheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/google-sheets\n\n Args:\n name (str): The name of the destination.\n spreadsheet_id (str): The link to your spreadsheet. See this guide for more details.\n credentials (GoogleSheetsDestination.AuthenticationViaGoogleOAuth): Google API Credentials for connecting to Google Sheets and Google Drive APIs\n """\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.credentials = check.inst_param(\n credentials, "credentials", GoogleSheetsDestination.AuthenticationViaGoogleOAuth\n )\n super().__init__("Google Sheets", name)
\n\n\n
[docs]class DatabricksDestination(GeneratedAirbyteDestination):\n
[docs] class AmazonS3:\n
[docs] @public\n def __init__(\n self,\n data_source_type: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n s3_bucket_region: str,\n s3_access_key_id: str,\n s3_secret_access_key: str,\n file_name_pattern: Optional[str] = None,\n ):\n self.data_source_type = check.str_param(data_source_type, "data_source_type")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.s3_access_key_id = check.str_param(s3_access_key_id, "s3_access_key_id")\n self.s3_secret_access_key = check.str_param(\n s3_secret_access_key, "s3_secret_access_key"\n )\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")
\n\n
[docs] class AzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n data_source_type: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_container_name: str,\n azure_blob_storage_sas_token: str,\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n ):\n self.data_source_type = check.str_param(data_source_type, "data_source_type")\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_container_name = check.str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_sas_token = check.str_param(\n azure_blob_storage_sas_token, "azure_blob_storage_sas_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n accept_terms: bool,\n databricks_server_hostname: str,\n databricks_http_path: str,\n databricks_personal_access_token: str,\n data_source: Union[\n "DatabricksDestination.AmazonS3", "DatabricksDestination.AzureBlobStorage"\n ],\n databricks_port: Optional[str] = None,\n database_schema: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n ):\n """Airbyte Destination for Databricks.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/databricks\n\n Args:\n name (str): The name of the destination.\n accept_terms (bool): You must agree to the Databricks JDBC Driver Terms & Conditions to use this connector.\n databricks_server_hostname (str): Databricks Cluster Server Hostname.\n databricks_http_path (str): Databricks Cluster HTTP Path.\n databricks_port (Optional[str]): Databricks Cluster Port.\n databricks_personal_access_token (str): Databricks Personal Access Token for making authenticated requests.\n database_schema (Optional[str]): The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is "public".\n data_source (Union[DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage]): Storage on which the delta lake is built.\n purge_staging_data (Optional[bool]): Default to 'true'. 
Switch it to 'false' for debugging purpose.\n """\n self.accept_terms = check.bool_param(accept_terms, "accept_terms")\n self.databricks_server_hostname = check.str_param(\n databricks_server_hostname, "databricks_server_hostname"\n )\n self.databricks_http_path = check.str_param(databricks_http_path, "databricks_http_path")\n self.databricks_port = check.opt_str_param(databricks_port, "databricks_port")\n self.databricks_personal_access_token = check.str_param(\n databricks_personal_access_token, "databricks_personal_access_token"\n )\n self.database_schema = check.opt_str_param(database_schema, "database_schema")\n self.data_source = check.inst_param(\n data_source,\n "data_source",\n (DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage),\n )\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n super().__init__("Databricks", name)
\n\n\n
[docs]class BigqueryDenormalizedDestination(GeneratedAirbyteDestination):\n
[docs] class StandardInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = "HMAC_KEY"\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class GCSStaging:\n
[docs] @public\n def __init__(\n self,\n credential: "BigqueryDenormalizedDestination.HMACKey",\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n keep_files_in_gcs_bucket: Optional[str] = None,\n ):\n self.method = "GCS Staging"\n self.credential = check.inst_param(\n credential, "credential", BigqueryDenormalizedDestination.HMACKey\n )\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.keep_files_in_gcs_bucket = check.opt_str_param(\n keep_files_in_gcs_bucket, "keep_files_in_gcs_bucket"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n project_id: str,\n dataset_id: str,\n loading_method: Union[\n "BigqueryDenormalizedDestination.StandardInserts",\n "BigqueryDenormalizedDestination.GCSStaging",\n ],\n credentials_json: Optional[str] = None,\n dataset_location: Optional[str] = None,\n big_query_client_buffer_size_mb: Optional[int] = None,\n ):\n """Airbyte Destination for Bigquery Denormalized.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset. Read more here.\n dataset_id (str): The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.\n loading_method (Union[BigqueryDenormalizedDestination.StandardInserts, BigqueryDenormalizedDestination.GCSStaging]): Loading method used to send select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n dataset_location (Optional[str]): The location of the dataset. Warning: Changes made after creation will not be applied. The default "US" value is used if not set explicitly. Read more here.\n big_query_client_buffer_size_mb (Optional[int]): Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. 
Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_id = check.str_param(dataset_id, "dataset_id")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (\n BigqueryDenormalizedDestination.StandardInserts,\n BigqueryDenormalizedDestination.GCSStaging,\n ),\n )\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n self.dataset_location = check.opt_str_param(dataset_location, "dataset_location")\n self.big_query_client_buffer_size_mb = check.opt_int_param(\n big_query_client_buffer_size_mb, "big_query_client_buffer_size_mb"\n )\n super().__init__("Bigquery Denormalized", name)
\n\n\n
[docs]class SqliteDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Sqlite.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/sqlite\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the sqlite.db file. The file will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Sqlite", name)
\n\n\n
[docs]class MongodbDestination(GeneratedAirbyteDestination):\n
[docs] class StandaloneMongoDbInstance:\n
[docs] @public\n def __init__(self, instance: str, host: str, port: int, tls: Optional[bool] = None):\n self.instance = check.str_param(instance, "instance")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.tls = check.opt_bool_param(tls, "tls")
\n\n
[docs] class ReplicaSet:\n
[docs] @public\n def __init__(self, instance: str, server_addresses: str, replica_set: Optional[str] = None):\n self.instance = check.str_param(instance, "instance")\n self.server_addresses = check.str_param(server_addresses, "server_addresses")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")
\n\n
[docs] class MongoDBAtlas:\n
[docs] @public\n def __init__(self, instance: str, cluster_url: str):\n self.instance = check.str_param(instance, "instance")\n self.cluster_url = check.str_param(cluster_url, "cluster_url")
\n\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.authorization = "none"
\n\n
[docs] class LoginPassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.authorization = "login/password"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_type: Union[\n "MongodbDestination.StandaloneMongoDbInstance",\n "MongodbDestination.ReplicaSet",\n "MongodbDestination.MongoDBAtlas",\n ],\n database: str,\n auth_type: Union["MongodbDestination.None_", "MongodbDestination.LoginPassword"],\n ):\n r"""Airbyte Destination for Mongodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mongodb\n\n Args:\n name (str): The name of the destination.\n instance_type (Union[MongodbDestination.StandaloneMongoDbInstance, MongodbDestination.ReplicaSet, MongodbDestination.MongoDBAtlas]): MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.\n database (str): Name of the database.\n auth_type (Union[MongodbDestination.None\\\\_, MongodbDestination.LoginPassword]): Authorization type.\n """\n self.instance_type = check.inst_param(\n instance_type,\n "instance_type",\n (\n MongodbDestination.StandaloneMongoDbInstance,\n MongodbDestination.ReplicaSet,\n MongodbDestination.MongoDBAtlas,\n ),\n )\n self.database = check.str_param(database, "database")\n self.auth_type = check.inst_param(\n auth_type, "auth_type", (MongodbDestination.None_, MongodbDestination.LoginPassword)\n )\n super().__init__("Mongodb", name)
\n\n\n
[docs]class RocksetDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, api_key: str, workspace: str, api_server: Optional[str] = None):\n """Airbyte Destination for Rockset.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/rockset\n\n Args:\n name (str): The name of the destination.\n api_key (str): Rockset api key\n workspace (str): The Rockset workspace in which collections will be created + written to.\n api_server (Optional[str]): Rockset api URL\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.workspace = check.str_param(workspace, "workspace")\n self.api_server = check.opt_str_param(api_server, "api_server")\n super().__init__("Rockset", name)
\n\n\n
[docs]class OracleDestination(GeneratedAirbyteDestination):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class NativeNetworkEncryptionNNE:\n
[docs] @public\n def __init__(self, encryption_algorithm: Optional[str] = None):\n self.encryption_method = "client_nne"\n self.encryption_algorithm = check.opt_str_param(\n encryption_algorithm, "encryption_algorithm"\n )
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n sid: str,\n username: str,\n encryption: Union[\n "OracleDestination.Unencrypted",\n "OracleDestination.NativeNetworkEncryptionNNE",\n "OracleDestination.TLSEncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n schema: Optional[str] = None,\n ):\n """Airbyte Destination for Oracle.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/oracle\n\n Args:\n name (str): The name of the destination.\n host (str): The hostname of the database.\n port (int): The port of the database.\n sid (str): The System Identifier uniquely distinguishes the instance from any other instance on the same computer.\n username (str): The username to access the database. This user must have CREATE USER privileges in the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n schema (Optional[str]): The default schema is used as the target schema for all statements issued from the connection that do not explicitly specify a schema name. The usual value for this field is "airbyte". 
In Oracle, schemas and users are the same thing, so the "user" parameter is used as the login credentials and this is used for the default Airbyte message schema.\n encryption (Union[OracleDestination.Unencrypted, OracleDestination.NativeNetworkEncryptionNNE, OracleDestination.TLSEncryptedVerifyCertificate]): The encryption method which is used when communicating with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.sid = check.str_param(sid, "sid")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.schema = check.opt_str_param(schema, "schema")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (\n OracleDestination.Unencrypted,\n OracleDestination.NativeNetworkEncryptionNNE,\n OracleDestination.TLSEncryptedVerifyCertificate,\n ),\n )\n super().__init__("Oracle", name)
\n\n\n
[docs]class CsvDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Csv.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-csv\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the directory where csv files will be written. The destination uses the local mount "/local" and any data files will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Csv", name)
\n\n\n
[docs]class S3Destination(GeneratedAirbyteDestination):\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self, codec: str, compression_level: int, include_checksum: Optional[bool] = None\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "S3Destination.NoCompression",\n "S3Destination.Deflate",\n "S3Destination.Bzip2",\n "S3Destination.Xz",\n "S3Destination.Zstandard",\n "S3Destination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n S3Destination.NoCompression,\n S3Destination.Deflate,\n S3Destination.Bzip2,\n S3Destination.Xz,\n S3Destination.Zstandard,\n S3Destination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n flattening: str,\n compression: Union["S3Destination.NoCompression", "S3Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (S3Destination.NoCompression, S3Destination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["S3Destination.NoCompression", "S3Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (S3Destination.NoCompression, S3Destination.GZIP)\n )
\n\n
[docs] class ParquetColumnarStorage:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Optional[str] = None,\n block_size_mb: Optional[int] = None,\n max_padding_size_mb: Optional[int] = None,\n page_size_kb: Optional[int] = None,\n dictionary_page_size_kb: Optional[int] = None,\n dictionary_encoding: Optional[bool] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.opt_str_param(compression_codec, "compression_codec")\n self.block_size_mb = check.opt_int_param(block_size_mb, "block_size_mb")\n self.max_padding_size_mb = check.opt_int_param(\n max_padding_size_mb, "max_padding_size_mb"\n )\n self.page_size_kb = check.opt_int_param(page_size_kb, "page_size_kb")\n self.dictionary_page_size_kb = check.opt_int_param(\n dictionary_page_size_kb, "dictionary_page_size_kb"\n )\n self.dictionary_encoding = check.opt_bool_param(\n dictionary_encoding, "dictionary_encoding"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n s3_bucket_region: str,\n format: Union[\n "S3Destination.AvroApacheAvro",\n "S3Destination.CSVCommaSeparatedValues",\n "S3Destination.JSONLinesNewlineDelimitedJSON",\n "S3Destination.ParquetColumnarStorage",\n ],\n access_key_id: Optional[str] = None,\n secret_access_key: Optional[str] = None,\n s3_endpoint: Optional[str] = None,\n s3_path_format: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n ):\n """Airbyte Destination for S3.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/s3\n\n Args:\n name (str): The name of the destination.\n access_key_id (Optional[str]): The access key ID to access the S3 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.\n secret_access_key (Optional[str]): The corresponding secret to the access key ID. Read more here\n s3_bucket_name (str): The name of the S3 bucket. Read more here.\n s3_bucket_path (str): Directory under the S3 bucket where data will be written. Read more here\n s3_bucket_region (str): The region of the S3 bucket. See here for all region codes.\n format (Union[S3Destination.AvroApacheAvro, S3Destination.CSVCommaSeparatedValues, S3Destination.JSONLinesNewlineDelimitedJSON, S3Destination.ParquetColumnarStorage]): Format of the data output. See here for more details\n s3_endpoint (Optional[str]): Your S3 endpoint url. Read more here\n s3_path_format (Optional[str]): Format string on how data will be organized inside the S3 bucket directory. 
Read more here\n file_name_pattern (Optional[str]): The pattern allows you to set the file-name format for the S3 staging file(s)\n """\n self.access_key_id = check.opt_str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.opt_str_param(secret_access_key, "secret_access_key")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.format = check.inst_param(\n format,\n "format",\n (\n S3Destination.AvroApacheAvro,\n S3Destination.CSVCommaSeparatedValues,\n S3Destination.JSONLinesNewlineDelimitedJSON,\n S3Destination.ParquetColumnarStorage,\n ),\n )\n self.s3_endpoint = check.opt_str_param(s3_endpoint, "s3_endpoint")\n self.s3_path_format = check.opt_str_param(s3_path_format, "s3_path_format")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n super().__init__("S3", name)
\n\n\n
[docs]class AwsDatalakeDestination(GeneratedAirbyteDestination):\n
[docs] class IAMRole:\n
[docs] @public\n def __init__(self, role_arn: str):\n self.credentials_title = "IAM Role"\n self.role_arn = check.str_param(role_arn, "role_arn")
\n\n
[docs] class IAMUser:\n
[docs] @public\n def __init__(self, aws_access_key_id: str, aws_secret_access_key: str):\n self.credentials_title = "IAM User"\n self.aws_access_key_id = check.str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n region: str,\n credentials: Union["AwsDatalakeDestination.IAMRole", "AwsDatalakeDestination.IAMUser"],\n bucket_name: str,\n bucket_prefix: str,\n aws_account_id: Optional[str] = None,\n lakeformation_database_name: Optional[str] = None,\n ):\n """Airbyte Destination for Aws Datalake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/aws-datalake\n\n Args:\n name (str): The name of the destination.\n aws_account_id (Optional[str]): target aws account id\n region (str): Region name\n credentials (Union[AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser]): Choose How to Authenticate to AWS.\n bucket_name (str): Name of the bucket\n bucket_prefix (str): S3 prefix\n lakeformation_database_name (Optional[str]): Which database to use\n """\n self.aws_account_id = check.opt_str_param(aws_account_id, "aws_account_id")\n self.region = check.str_param(region, "region")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser),\n )\n self.bucket_name = check.str_param(bucket_name, "bucket_name")\n self.bucket_prefix = check.str_param(bucket_prefix, "bucket_prefix")\n self.lakeformation_database_name = check.opt_str_param(\n lakeformation_database_name, "lakeformation_database_name"\n )\n super().__init__("Aws Datalake", name)
\n\n\n
[docs]class MssqlDestination(GeneratedAirbyteDestination):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "unencrypted"
\n\n
[docs] class EncryptedTrustServerCertificate:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "encrypted_trust_server_certificate"
\n\n
[docs] class EncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, hostNameInCertificate: Optional[str] = None):\n self.ssl_method = "encrypted_verify_certificate"\n self.hostNameInCertificate = check.opt_str_param(\n hostNameInCertificate, "hostNameInCertificate"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n schema: str,\n username: str,\n ssl_method: Union[\n "MssqlDestination.Unencrypted",\n "MssqlDestination.EncryptedTrustServerCertificate",\n "MssqlDestination.EncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mssql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql\n\n Args:\n name (str): The name of the destination.\n host (str): The host name of the MSSQL database.\n port (int): The port of the MSSQL database.\n database (str): The name of the MSSQL database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. The usual value for this field is "public".\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. 
(example: key1=value1&key2=value2&key3=value3).\n ssl_method (Union[MssqlDestination.Unencrypted, MssqlDestination.EncryptedTrustServerCertificate, MssqlDestination.EncryptedVerifyCertificate]): The encryption method which is used to communicate with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl_method = check.inst_param(\n ssl_method,\n "ssl_method",\n (\n MssqlDestination.Unencrypted,\n MssqlDestination.EncryptedTrustServerCertificate,\n MssqlDestination.EncryptedVerifyCertificate,\n ),\n )\n super().__init__("Mssql", name)
\n\n\n
[docs]class PubsubDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, project_id: str, topic_id: str, credentials_json: str):\n """Airbyte Destination for Pubsub.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/pubsub\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target PubSub.\n topic_id (str): The PubSub topic ID in the given GCP project ID.\n credentials_json (str): The contents of the JSON service account key. Check out the docs if you need help generating this key.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.topic_id = check.str_param(topic_id, "topic_id")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n super().__init__("Pubsub", name)
\n\n\n
[docs]class R2Destination(GeneratedAirbyteDestination):\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self, codec: str, compression_level: int, include_checksum: Optional[bool] = None\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "R2Destination.NoCompression",\n "R2Destination.Deflate",\n "R2Destination.Bzip2",\n "R2Destination.Xz",\n "R2Destination.Zstandard",\n "R2Destination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n R2Destination.NoCompression,\n R2Destination.Deflate,\n R2Destination.Bzip2,\n R2Destination.Xz,\n R2Destination.Zstandard,\n R2Destination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n flattening: str,\n compression: Union["R2Destination.NoCompression", "R2Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (R2Destination.NoCompression, R2Destination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["R2Destination.NoCompression", "R2Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (R2Destination.NoCompression, R2Destination.GZIP)\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n access_key_id: str,\n secret_access_key: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n format: Union[\n "R2Destination.AvroApacheAvro",\n "R2Destination.CSVCommaSeparatedValues",\n "R2Destination.JSONLinesNewlineDelimitedJSON",\n ],\n s3_path_format: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n ):\n """Airbyte Destination for R2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/r2\n\n Args:\n name (str): The name of the destination.\n account_id (str): Cloudflare account ID\n access_key_id (str): The access key ID to access the R2 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.\n secret_access_key (str): The corresponding secret to the access key ID. Read more here\n s3_bucket_name (str): The name of the R2 bucket. Read more here.\n s3_bucket_path (str): Directory under the R2 bucket where data will be written.\n format (Union[R2Destination.AvroApacheAvro, R2Destination.CSVCommaSeparatedValues, R2Destination.JSONLinesNewlineDelimitedJSON]): Format of the data output. See here for more details\n s3_path_format (Optional[str]): Format string on how data will be organized inside the R2 bucket directory. 
Read more here\n file_name_pattern (Optional[str]): The pattern allows you to set the file-name format for the R2 staging file(s)\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.format = check.inst_param(\n format,\n "format",\n (\n R2Destination.AvroApacheAvro,\n R2Destination.CSVCommaSeparatedValues,\n R2Destination.JSONLinesNewlineDelimitedJSON,\n ),\n )\n self.s3_path_format = check.opt_str_param(s3_path_format, "s3_path_format")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n super().__init__("R2", name)
\n\n\n
[docs]class JdbcDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n jdbc_url: str,\n password: Optional[str] = None,\n schema: Optional[str] = None,\n ):\n """Airbyte Destination for Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres\n\n Args:\n name (str): The name of the destination.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url (str): JDBC formatted url. See the standard here.\n schema (Optional[str]): If you leave the schema unspecified, JDBC defaults to a schema named "public".\n """\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url = check.str_param(jdbc_url, "jdbc_url")\n self.schema = check.opt_str_param(schema, "schema")\n super().__init__("Jdbc", name)
\n\n\n
[docs]class KeenDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self, name: str, project_id: str, api_key: str, infer_timestamp: Optional[bool] = None\n ):\n """Airbyte Destination for Keen.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/keen\n\n Args:\n name (str): The name of the destination.\n project_id (str): To get Keen Project ID, navigate to the Access tab from the left-hand, side panel and check the Project Details section.\n api_key (str): To get Keen Master API Key, navigate to the Access tab from the left-hand, side panel and check the Project Details section.\n infer_timestamp (Optional[bool]): Allow connector to guess keen.timestamp value based on the streamed data.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.api_key = check.str_param(api_key, "api_key")\n self.infer_timestamp = check.opt_bool_param(infer_timestamp, "infer_timestamp")\n super().__init__("Keen", name)
\n\n\n
[docs]class TidbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Tidb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/tidb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Tidb", name)
\n\n\n
[docs]class FirestoreDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, project_id: str, credentials_json: Optional[str] = None):\n """Airbyte Destination for Firestore.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/firestore\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n super().__init__("Firestore", name)
\n\n\n
[docs]class ScyllaDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n keyspace: str,\n username: str,\n password: str,\n address: str,\n port: int,\n replication: Optional[int] = None,\n ):\n """Airbyte Destination for Scylla.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/scylla\n\n Args:\n name (str): The name of the destination.\n keyspace (str): Default Scylla keyspace to create data in.\n username (str): Username to use to access Scylla.\n password (str): Password associated with Scylla.\n address (str): Address to connect to.\n port (int): Port of Scylla.\n replication (Optional[int]): Indicates to how many nodes the data should be replicated to.\n """\n self.keyspace = check.str_param(keyspace, "keyspace")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.address = check.str_param(address, "address")\n self.port = check.int_param(port, "port")\n self.replication = check.opt_int_param(replication, "replication")\n super().__init__("Scylla", name)
\n\n\n
[docs]class RedisDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self, name: str, host: str, port: int, username: str, password: str, cache_type: str\n ):\n """Airbyte Destination for Redis.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redis\n\n Args:\n name (str): The name of the destination.\n host (str): Redis host to connect to.\n port (int): Port of Redis.\n username (str): Username associated with Redis.\n password (str): Password associated with Redis.\n cache_type (str): Redis cache type to store data in.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.cache_type = check.str_param(cache_type, "cache_type")\n super().__init__("Redis", name)
\n\n\n
[docs]class MqttDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n broker_host: str,\n broker_port: int,\n use_tls: bool,\n topic_pattern: str,\n publisher_sync: bool,\n connect_timeout: int,\n automatic_reconnect: bool,\n clean_session: bool,\n message_retained: bool,\n message_qos: str,\n username: Optional[str] = None,\n password: Optional[str] = None,\n topic_test: Optional[str] = None,\n client: Optional[str] = None,\n ):\n """Airbyte Destination for Mqtt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mqtt\n\n Args:\n name (str): The name of the destination.\n broker_host (str): Host of the broker to connect to.\n broker_port (int): Port of the broker.\n use_tls (bool): Whether to use TLS encryption on the connection.\n username (Optional[str]): User name to use for the connection.\n password (Optional[str]): Password to use for the connection.\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. 
Notice that the topic name will be transformed to a standard naming convention.\n topic_test (Optional[str]): Topic to test if Airbyte can produce messages.\n client (Optional[str]): A client identifier that is unique on the server being connected to.\n publisher_sync (bool): Wait synchronously until the record has been sent to the broker.\n connect_timeout (int): Maximum time interval (in seconds) the client will wait for the network connection to the MQTT server to be established.\n automatic_reconnect (bool): Whether the client will automatically attempt to reconnect to the server if the connection is lost.\n clean_session (bool): Whether the client and server should remember state across restarts and reconnects.\n message_retained (bool): Whether or not the publish message should be retained by the messaging engine.\n message_qos (str): Quality of service used for each message to be delivered.\n """\n self.broker_host = check.str_param(broker_host, "broker_host")\n self.broker_port = check.int_param(broker_port, "broker_port")\n self.use_tls = check.bool_param(use_tls, "use_tls")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.topic_test = check.opt_str_param(topic_test, "topic_test")\n self.client = check.opt_str_param(client, "client")\n self.publisher_sync = check.bool_param(publisher_sync, "publisher_sync")\n self.connect_timeout = check.int_param(connect_timeout, "connect_timeout")\n self.automatic_reconnect = check.bool_param(automatic_reconnect, "automatic_reconnect")\n self.clean_session = check.bool_param(clean_session, "clean_session")\n self.message_retained = check.bool_param(message_retained, "message_retained")\n self.message_qos = check.str_param(message_qos, "message_qos")\n super().__init__("Mqtt", name)
\n\n\n
[docs]class RedshiftDestination(GeneratedAirbyteDestination):\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class NoEncryption:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_type = "none"
\n\n
[docs] class AESCBCEnvelopeEncryption:\n
[docs] @public\n def __init__(self, key_encrypting_key: Optional[str] = None):\n self.encryption_type = "aes_cbc_envelope"\n self.key_encrypting_key = check.opt_str_param(key_encrypting_key, "key_encrypting_key")
\n\n
[docs] class S3Staging:\n
[docs] @public\n def __init__(\n self,\n s3_bucket_name: str,\n s3_bucket_region: str,\n access_key_id: str,\n secret_access_key: str,\n encryption: Union[\n "RedshiftDestination.NoEncryption", "RedshiftDestination.AESCBCEnvelopeEncryption"\n ],\n s3_bucket_path: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n ):\n self.method = "S3 Staging"\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.opt_str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (RedshiftDestination.NoEncryption, RedshiftDestination.AESCBCEnvelopeEncryption),\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n username: str,\n password: str,\n database: str,\n schema: str,\n uploading_method: Union["RedshiftDestination.Standard", "RedshiftDestination.S3Staging"],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Redshift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift\n\n Args:\n name (str): The name of the destination.\n host (str): Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com)\n port (int): Port of the database.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n database (str): Name of the database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is "public".\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n uploading_method (Union[RedshiftDestination.Standard, RedshiftDestination.S3Staging]): The method how the data will be uploaded to the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.uploading_method = check.inst_param(\n uploading_method,\n "uploading_method",\n (RedshiftDestination.Standard, RedshiftDestination.S3Staging),\n )\n super().__init__("Redshift", name)
\n\n\n
[docs]class PulsarDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n brokers: str,\n use_tls: bool,\n topic_type: str,\n topic_tenant: str,\n topic_namespace: str,\n topic_pattern: str,\n compression_type: str,\n send_timeout_ms: int,\n max_pending_messages: int,\n max_pending_messages_across_partitions: int,\n batching_enabled: bool,\n batching_max_messages: int,\n batching_max_publish_delay: int,\n block_if_queue_full: bool,\n topic_test: Optional[str] = None,\n producer_name: Optional[str] = None,\n producer_sync: Optional[bool] = None,\n ):\n """Airbyte Destination for Pulsar.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/pulsar\n\n Args:\n name (str): The name of the destination.\n brokers (str): A list of host/port pairs to use for establishing the initial connection to the Pulsar cluster.\n use_tls (bool): Whether to use TLS encryption on the connection.\n topic_type (str): It identifies type of topic. Pulsar supports two kind of topics: persistent and non-persistent. In persistent topic, all messages are durably persisted on disk (that means on multiple disks unless the broker is standalone), whereas non-persistent topic does not persist message into storage disk.\n topic_tenant (str): The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters.\n topic_namespace (str): The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the namespace level. Each tenant has one or multiple namespaces.\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.\n topic_test (Optional[str]): Topic to test if Airbyte can produce messages.\n producer_name (Optional[str]): Name for the producer. 
If not filled, the system will generate a globally unique name which can be accessed with.\n producer_sync (Optional[bool]): Wait synchronously until the record has been sent to Pulsar.\n compression_type (str): Compression type for the producer.\n send_timeout_ms (int): If a message is not acknowledged by a server before the send-timeout expires, an error occurs (in ms).\n max_pending_messages (int): The maximum size of a queue holding pending messages.\n max_pending_messages_across_partitions (int): The maximum number of pending messages across partitions.\n batching_enabled (bool): Control whether automatic batching of messages is enabled for the producer.\n batching_max_messages (int): Maximum number of messages permitted in a batch.\n batching_max_publish_delay (int): Time period in milliseconds within which the messages sent will be batched.\n block_if_queue_full (bool): If the send operation should block when the outgoing message queue is full.\n """\n self.brokers = check.str_param(brokers, "brokers")\n self.use_tls = check.bool_param(use_tls, "use_tls")\n self.topic_type = check.str_param(topic_type, "topic_type")\n self.topic_tenant = check.str_param(topic_tenant, "topic_tenant")\n self.topic_namespace = check.str_param(topic_namespace, "topic_namespace")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.topic_test = check.opt_str_param(topic_test, "topic_test")\n self.producer_name = check.opt_str_param(producer_name, "producer_name")\n self.producer_sync = check.opt_bool_param(producer_sync, "producer_sync")\n self.compression_type = check.str_param(compression_type, "compression_type")\n self.send_timeout_ms = check.int_param(send_timeout_ms, "send_timeout_ms")\n self.max_pending_messages = check.int_param(max_pending_messages, "max_pending_messages")\n self.max_pending_messages_across_partitions = check.int_param(\n max_pending_messages_across_partitions, "max_pending_messages_across_partitions"\n )\n self.batching_enabled 
= check.bool_param(batching_enabled, "batching_enabled")\n self.batching_max_messages = check.int_param(batching_max_messages, "batching_max_messages")\n self.batching_max_publish_delay = check.int_param(\n batching_max_publish_delay, "batching_max_publish_delay"\n )\n self.block_if_queue_full = check.bool_param(block_if_queue_full, "block_if_queue_full")\n super().__init__("Pulsar", name)
\n\n\n
[docs]class SnowflakeDestination(GeneratedAirbyteDestination):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class KeyPairAuthentication:\n
[docs] @public\n def __init__(\n self,\n private_key: str,\n auth_type: Optional[str] = None,\n private_key_password: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.private_key = check.str_param(private_key, "private_key")\n self.private_key_password = check.opt_str_param(\n private_key_password, "private_key_password"\n )
\n\n
[docs] class UsernameAndPassword:\n
[docs] @public\n def __init__(self, password: str):\n self.password = check.str_param(password, "password")
\n\n
[docs] class SelectAnotherOption:\n
[docs] @public\n def __init__(self, method: str):\n self.method = check.str_param(method, "method")
\n\n
[docs] class RecommendedInternalStaging:\n
[docs] @public\n def __init__(self, method: str):\n self.method = check.str_param(method, "method")
\n\n
[docs] class NoEncryption:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_type = "none"
\n\n
[docs] class AESCBCEnvelopeEncryption:\n
[docs] @public\n def __init__(self, key_encrypting_key: Optional[str] = None):\n self.encryption_type = "aes_cbc_envelope"\n self.key_encrypting_key = check.opt_str_param(key_encrypting_key, "key_encrypting_key")
\n\n
[docs] class AWSS3Staging:\n
[docs] @public\n def __init__(\n self,\n method: str,\n s3_bucket_name: str,\n access_key_id: str,\n secret_access_key: str,\n encryption: Union[\n "SnowflakeDestination.NoEncryption", "SnowflakeDestination.AESCBCEnvelopeEncryption"\n ],\n s3_bucket_region: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n file_name_pattern: Optional[str] = None,\n ):\n self.method = check.str_param(method, "method")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_region = check.opt_str_param(s3_bucket_region, "s3_bucket_region")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (SnowflakeDestination.NoEncryption, SnowflakeDestination.AESCBCEnvelopeEncryption),\n )\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")
\n\n
[docs] class GoogleCloudStorageStaging:\n
[docs] @public\n def __init__(self, method: str, project_id: str, bucket_name: str, credentials_json: str):\n self.method = check.str_param(method, "method")\n self.project_id = check.str_param(project_id, "project_id")\n self.bucket_name = check.str_param(bucket_name, "bucket_name")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] class AzureBlobStorageStaging:\n
[docs] @public\n def __init__(\n self,\n method: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_container_name: str,\n azure_blob_storage_sas_token: str,\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n ):\n self.method = check.str_param(method, "method")\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_container_name = check.str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_sas_token = check.str_param(\n azure_blob_storage_sas_token, "azure_blob_storage_sas_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n role: str,\n warehouse: str,\n database: str,\n schema: str,\n username: str,\n credentials: Union[\n "SnowflakeDestination.OAuth20",\n "SnowflakeDestination.KeyPairAuthentication",\n "SnowflakeDestination.UsernameAndPassword",\n ],\n loading_method: Union[\n "SnowflakeDestination.SelectAnotherOption",\n "SnowflakeDestination.RecommendedInternalStaging",\n "SnowflakeDestination.AWSS3Staging",\n "SnowflakeDestination.GoogleCloudStorageStaging",\n "SnowflakeDestination.AzureBlobStorageStaging",\n ],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Snowflake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/snowflake\n\n Args:\n name (str): The name of the destination.\n host (str): Enter your Snowflake account's locator (in the format ...snowflakecomputing.com)\n role (str): Enter the role that you want to use to access Snowflake\n warehouse (str): Enter the name of the warehouse that you want to sync data into\n database (str): Enter the name of the database you want to sync data into\n schema (str): Enter the name of the default schema\n username (str): Enter the name of the user you want to use to access the database\n jdbc_url_params (Optional[str]): Enter the additional properties to pass to the JDBC URL string when connecting to the database (formatted as key=value pairs separated by the symbol &). 
Example: key1=value1&key2=value2&key3=value3\n loading_method (Union[SnowflakeDestination.SelectAnotherOption, SnowflakeDestination.RecommendedInternalStaging, SnowflakeDestination.AWSS3Staging, SnowflakeDestination.GoogleCloudStorageStaging, SnowflakeDestination.AzureBlobStorageStaging]): Select a data staging method\n """\n self.host = check.str_param(host, "host")\n self.role = check.str_param(role, "role")\n self.warehouse = check.str_param(warehouse, "warehouse")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n SnowflakeDestination.OAuth20,\n SnowflakeDestination.KeyPairAuthentication,\n SnowflakeDestination.UsernameAndPassword,\n ),\n )\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (\n SnowflakeDestination.SelectAnotherOption,\n SnowflakeDestination.RecommendedInternalStaging,\n SnowflakeDestination.AWSS3Staging,\n SnowflakeDestination.GoogleCloudStorageStaging,\n SnowflakeDestination.AzureBlobStorageStaging,\n ),\n )\n super().__init__("Snowflake", name)
\n\n\n
[docs]class PostgresDestination(GeneratedAirbyteDestination):\n
[docs] class Disable:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "disable"
\n\n
[docs] class Allow:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "allow"
\n\n
[docs] class Prefer:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "prefer"
\n\n
[docs] class Require:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "require"
\n\n
[docs] class VerifyCa:\n
[docs] @public\n def __init__(self, ca_certificate: str, client_key_password: Optional[str] = None):\n self.mode = "verify-ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyFull:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: str,\n client_key: str,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-full"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.str_param(client_certificate, "client_certificate")\n self.client_key = check.str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n schema: str,\n username: str,\n ssl_mode: Union[\n "PostgresDestination.Disable",\n "PostgresDestination.Allow",\n "PostgresDestination.Prefer",\n "PostgresDestination.Require",\n "PostgresDestination.VerifyCa",\n "PostgresDestination.VerifyFull",\n ],\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Postgres.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. The usual value for this field is "public".\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL. When activating SSL, please select one of the connection modes.\n ssl_mode (Union[PostgresDestination.Disable, PostgresDestination.Allow, PostgresDestination.Prefer, PostgresDestination.Require, PostgresDestination.VerifyCa, PostgresDestination.VerifyFull]): SSL connection modes. disable - Chose this mode to disable encryption of communication between Airbyte and destination database allow - Chose this mode to enable encryption only when required by the source database prefer - Chose this mode to allow unencrypted connection only if the source database does not support encryption require - Chose this mode to always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Chose this mode to always require encryption and to verify that the source database server has a valid SSL certificate verify-full - This is the most secure mode. 
Chose this mode to always require encryption and to verify the identity of the source database server See more information - in the docs.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n PostgresDestination.Disable,\n PostgresDestination.Allow,\n PostgresDestination.Prefer,\n PostgresDestination.Require,\n PostgresDestination.VerifyCa,\n PostgresDestination.VerifyFull,\n ),\n )\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Postgres", name)
\n\n\n
[docs]class ScaffoldDestinationPythonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, TODO: Optional[str] = None):\n """Airbyte Destination for Scaffold Destination Python.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/scaffold-destination-python\n\n Args:\n name (str): The name of the destination.\n TODO (Optional[str]): FIX ME\n """\n self.TODO = check.opt_str_param(TODO, "TODO")\n super().__init__("Scaffold Destination Python", name)
\n\n\n
[docs]class LocalJsonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Local Json.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-json\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the directory where json files will be written. The files will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Local Json", name)
\n\n\n
[docs]class MeilisearchDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, host: str, api_key: Optional[str] = None):\n """Airbyte Destination for Meilisearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/meilisearch\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the MeiliSearch instance.\n api_key (Optional[str]): MeiliSearch API Key. See the docs for more information on how to obtain this key.\n """\n self.host = check.str_param(host, "host")\n self.api_key = check.opt_str_param(api_key, "api_key")\n super().__init__("Meilisearch", name)
\n
", "current_page_name": "_modules/dagster_airbyte/managed/generated/destinations", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.generated.destinations"}, "sources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.generated.sources

\n# ruff: noqa: A001, A002\nfrom typing import List, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom dagster_airbyte.managed.types import GeneratedAirbyteSource\n\n\n
[docs]class StravaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n athlete_id: int,\n start_date: str,\n auth_type: Optional[str] = None,\n ):\n """Airbyte Source for Strava.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/strava\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Strava developer application.\n client_secret (str): The Client Secret of your Strava developer application.\n refresh_token (str): The Refresh Token with the activity: read_all permissions.\n athlete_id (int): The Athlete ID of your Strava developer application.\n start_date (str): UTC date and time. Any data before this date will not be replicated.\n """\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.athlete_id = check.int_param(athlete_id, "athlete_id")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Strava", name)
\n\n\n
[docs]class AppsflyerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n app_id: str,\n api_token: str,\n start_date: str,\n timezone: Optional[str] = None,\n ):\n """Airbyte Source for Appsflyer.\n\n Args:\n name (str): The name of the destination.\n app_id (str): App identifier as found in AppsFlyer.\n api_token (str): Pull API token for authentication. If you change the account admin, the token changes, and you must update scripts with the new token. Get the API token in the Dashboard.\n start_date (str): The default value to use if no bookmark exists for an endpoint. Raw Reports historical lookback is limited to 90 days.\n timezone (Optional[str]): Time zone in which date times are stored. The project timezone may be found in the App settings in the AppsFlyer console.\n """\n self.app_id = check.str_param(app_id, "app_id")\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.timezone = check.opt_str_param(timezone, "timezone")\n super().__init__("Appsflyer", name)
\n\n\n
[docs]class GoogleWorkspaceAdminReportsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, credentials_json: str, email: str, lookback: Optional[int] = None\n ):\n """Airbyte Source for Google Workspace Admin Reports.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-workspace-admin-reports\n\n Args:\n name (str): The name of the destination.\n credentials_json (str): The contents of the JSON service account key. See the docs for more information on how to generate this key.\n email (str): The email of the user, who has permissions to access the Google Workspace Admin APIs.\n lookback (Optional[int]): Sets the range of time shown in the report. The maximum value allowed by the Google API is 180 days.\n """\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n self.email = check.str_param(email, "email")\n self.lookback = check.opt_int_param(lookback, "lookback")\n super().__init__("Google Workspace Admin Reports", name)
\n\n\n
[docs]class CartSource(GeneratedAirbyteSource):\n
[docs] class CentralAPIRouter:\n
[docs] @public\n def __init__(self, user_name: str, user_secret: str, site_id: str):\n self.auth_type = "CENTRAL_API_ROUTER"\n self.user_name = check.str_param(user_name, "user_name")\n self.user_secret = check.str_param(user_secret, "user_secret")\n self.site_id = check.str_param(site_id, "site_id")
\n\n
[docs] class SingleStoreAccessToken:\n
[docs] @public\n def __init__(self, access_token: str, store_name: str):\n self.auth_type = "SINGLE_STORE_ACCESS_TOKEN"\n self.access_token = check.str_param(access_token, "access_token")\n self.store_name = check.str_param(store_name, "store_name")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["CartSource.CentralAPIRouter", "CartSource.SingleStoreAccessToken"],\n start_date: str,\n ):\n """Airbyte Source for Cart.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/cart\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate the data\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (CartSource.CentralAPIRouter, CartSource.SingleStoreAccessToken),\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Cart", name)
\n\n\n
[docs]class LinkedinAdsSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_method: Optional[str] = None,\n ):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, auth_method: Optional[str] = None):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["LinkedinAdsSource.OAuth20", "LinkedinAdsSource.AccessToken"],\n start_date: str,\n account_ids: Optional[List[int]] = None,\n ):\n """Airbyte Source for Linkedin Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-ads\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date in the format 2020-09-17. Any data before this date will not be replicated.\n account_ids (Optional[List[int]]): Specify the account IDs separated by a space, to pull the data from. Leave empty, if you want to pull the data from all associated accounts. See the LinkedIn Ads docs for more info.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (LinkedinAdsSource.OAuth20, LinkedinAdsSource.AccessToken)\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.account_ids = check.opt_nullable_list_param(account_ids, "account_ids", int)\n super().__init__("Linkedin Ads", name)
\n\n\n
[docs]class MongodbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n user: str,\n password: str,\n auth_source: str,\n replica_set: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Mongodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb\n\n Args:\n name (str): The name of the destination.\n host (str): Host of a Mongo database to be replicated.\n port (int): Port of a Mongo database to be replicated.\n database (str): Database to be replicated.\n user (str): User\n password (str): Password\n auth_source (str): Authentication source where user information is stored. See the Mongo docs for more info.\n replica_set (Optional[str]): The name of the set to filter servers by, when connecting to a replica set (Under this condition, the 'TLS connection' value automatically becomes 'true'). See the Mongo docs for more info.\n ssl (Optional[bool]): If this switch is enabled, TLS connections will be used to connect to MongoDB.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.user = check.str_param(user, "user")\n self.password = check.str_param(password, "password")\n self.auth_source = check.str_param(auth_source, "auth_source")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Mongodb", name)
\n\n\n
[docs]class TimelySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, account_id: str, start_date: str, bearer_token: str):\n """Airbyte Source for Timely.\n\n Args:\n name (str): The name of the destination.\n account_id (str): Timely account id\n start_date (str): start date\n bearer_token (str): Timely bearer token\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.bearer_token = check.str_param(bearer_token, "bearer_token")\n super().__init__("Timely", name)
\n\n\n
[docs]class StockTickerApiTutorialSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, stock_ticker: str, api_key: str):\n """Airbyte Source for Stock Ticker Api Tutorial.\n\n Documentation can be found at https://polygon.io/docs/stocks/get_v2_aggs_grouped_locale_us_market_stocks__date\n\n Args:\n name (str): The name of the destination.\n stock_ticker (str): The stock ticker to track\n api_key (str): The Polygon.io Stocks API key to use to hit the API.\n """\n self.stock_ticker = check.str_param(stock_ticker, "stock_ticker")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Stock Ticker Api Tutorial", name)
\n\n\n
[docs]class WrikeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, access_token: str, wrike_instance: str, start_date: Optional[str] = None\n ):\n """Airbyte Source for Wrike.\n\n Args:\n name (str): The name of the destination.\n access_token (str): Permanent access token. You can find documentation on how to acquire a permanent access token here\n wrike_instance (str): Wrike's instance such as `app-us2.wrike.com`\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Only comments after this date will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.wrike_instance = check.str_param(wrike_instance, "wrike_instance")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Wrike", name)
\n\n\n
[docs]class CommercetoolsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n region: str,\n host: str,\n start_date: str,\n project_key: str,\n client_id: str,\n client_secret: str,\n ):\n """Airbyte Source for Commercetools.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/commercetools\n\n Args:\n name (str): The name of the destination.\n region (str): The region of the platform.\n host (str): The cloud provider your shop is hosted. See: https://docs.commercetools.com/api/authorization\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n project_key (str): The project key\n client_id (str): Id of API Client.\n client_secret (str): The password of secret of API Client.\n """\n self.region = check.str_param(region, "region")\n self.host = check.str_param(host, "host")\n self.start_date = check.str_param(start_date, "start_date")\n self.project_key = check.str_param(project_key, "project_key")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Commercetools", name)
\n\n\n
[docs]class GutendexSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n author_year_start: Optional[str] = None,\n author_year_end: Optional[str] = None,\n copyright: Optional[str] = None,\n languages: Optional[str] = None,\n search: Optional[str] = None,\n sort: Optional[str] = None,\n topic: Optional[str] = None,\n ):\n """Airbyte Source for Gutendex.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gutendex\n\n Args:\n name (str): The name of the destination.\n author_year_start (Optional[str]): (Optional) Defines the minimum birth year of the authors. Books by authors born prior to the start year will not be returned. Supports both positive (CE) or negative (BCE) integer values\n author_year_end (Optional[str]): (Optional) Defines the maximum birth year of the authors. Books by authors born after the end year will not be returned. Supports both positive (CE) or negative (BCE) integer values\n copyright (Optional[str]): (Optional) Use this to find books with a certain copyright status - true for books with existing copyrights, false for books in the public domain in the USA, or null for books with no available copyright information.\n languages (Optional[str]): (Optional) Use this to find books in any of a list of languages. They must be comma-separated, two-character language codes.\n search (Optional[str]): (Optional) Use this to search author names and book titles with given words. They must be separated by a space (i.e. 
%20 in URL-encoded format) and are case-insensitive.\n sort (Optional[str]): (Optional) Use this to sort books - ascending for Project Gutenberg ID numbers from lowest to highest, descending for IDs highest to lowest, or popular (the default) for most popular to least popular by number of downloads.\n topic (Optional[str]): (Optional) Use this to search for a case-insensitive key-phrase in books' bookshelves or subjects.\n """\n self.author_year_start = check.opt_str_param(author_year_start, "author_year_start")\n self.author_year_end = check.opt_str_param(author_year_end, "author_year_end")\n self.copyright = check.opt_str_param(copyright, "copyright")\n self.languages = check.opt_str_param(languages, "languages")\n self.search = check.opt_str_param(search, "search")\n self.sort = check.opt_str_param(sort, "sort")\n self.topic = check.opt_str_param(topic, "topic")\n super().__init__("Gutendex", name)
\n\n\n
[docs]class IterableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Iterable.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/iterable\n\n Args:\n name (str): The name of the destination.\n api_key (str): Iterable API Key. See the docs for more information on how to obtain this key.\n start_date (str): The date from which you'd like to replicate data for Iterable, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Iterable", name)
\n\n\n
[docs]class QuickbooksSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n realm_id: str,\n user_agent: str,\n start_date: str,\n sandbox: bool,\n ):\n """Airbyte Source for Quickbooks Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/quickbooks\n\n Args:\n name (str): The name of the destination.\n client_id (str): Identifies which app is making the request. Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.\n client_secret (str): Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.\n refresh_token (str): A token used when refreshing the access token.\n realm_id (str): Labeled Company ID. The Make API Calls panel is populated with the realm id and the current access token.\n user_agent (str): Process and email for API logging purposes. Example: tap-quickbooks .\n start_date (str): The default value to use if no bookmark exists for an endpoint (rfc3339 date string). E.g, 2021-03-20T00:00:00Z. Any data before this date will not be replicated.\n sandbox (bool): Determines whether to use the sandbox or production environment.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.realm_id = check.str_param(realm_id, "realm_id")\n self.user_agent = check.str_param(user_agent, "user_agent")\n self.start_date = check.str_param(start_date, "start_date")\n self.sandbox = check.bool_param(sandbox, "sandbox")\n super().__init__("Quickbooks Singer", name)
\n\n\n
[docs]class BigcommerceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, store_hash: str, access_token: str):\n """Airbyte Source for Bigcommerce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bigcommerce\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n store_hash (str): The hash code of the store. For https://api.bigcommerce.com/stores/HASH_CODE/v3/, The store's hash code is 'HASH_CODE'.\n access_token (str): Access Token for making authenticated requests.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.store_hash = check.str_param(store_hash, "store_hash")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Bigcommerce", name)
\n\n\n
[docs]class ShopifySource(GeneratedAirbyteSource):\n
[docs] class APIPassword:\n
[docs] @public\n def __init__(self, api_password: str):\n self.auth_method = "api_password"\n self.api_password = check.str_param(api_password, "api_password")
\n\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_method = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n shop: str,\n credentials: Union["ShopifySource.APIPassword", "ShopifySource.OAuth20"],\n start_date: str,\n ):\n """Airbyte Source for Shopify.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/shopify\n\n Args:\n name (str): The name of the destination.\n shop (str): The name of your Shopify store found in the URL. For example, if your URL was https://NAME.myshopify.com, then the name would be 'NAME'.\n credentials (Union[ShopifySource.APIPassword, ShopifySource.OAuth20]): The authorization method to use to retrieve data from Shopify\n start_date (str): The date you would like to replicate data from. Format: YYYY-MM-DD. Any data before this date will not be replicated.\n """\n self.shop = check.str_param(shop, "shop")\n self.credentials = check.inst_param(\n credentials, "credentials", (ShopifySource.APIPassword, ShopifySource.OAuth20)\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Shopify", name)
\n\n\n
[docs]class AppstoreSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, key_id: str, private_key: str, issuer_id: str, vendor: str, start_date: str\n ):\n """Airbyte Source for Appstore Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/appstore\n\n Args:\n name (str): The name of the destination.\n key_id (str): Appstore Key ID. See the docs for more information on how to obtain this key.\n private_key (str): Appstore Private Key. See the docs for more information on how to obtain this key.\n issuer_id (str): Appstore Issuer ID. See the docs for more information on how to obtain this ID.\n vendor (str): Appstore Vendor ID. See the docs for more information on how to obtain this ID.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.key_id = check.str_param(key_id, "key_id")\n self.private_key = check.str_param(private_key, "private_key")\n self.issuer_id = check.str_param(issuer_id, "issuer_id")\n self.vendor = check.str_param(vendor, "vendor")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Appstore Singer", name)
\n\n\n
[docs]class GreenhouseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Greenhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/greenhouse\n\n Args:\n name (str): The name of the destination.\n api_key (str): Greenhouse API Key. See the docs for more information on how to generate this key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Greenhouse", name)
\n\n\n
[docs]class ZoomSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, jwt: str):\n """Airbyte Source for Zoom Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zoom\n\n Args:\n name (str): The name of the destination.\n jwt (str): Zoom JWT Token. See the docs for more information on how to obtain this key.\n """\n self.jwt = check.str_param(jwt, "jwt")\n super().__init__("Zoom Singer", name)
\n\n\n
[docs]class TiktokMarketingSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self, app_id: str, secret: str, access_token: str, auth_type: Optional[str] = None\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.app_id = check.str_param(app_id, "app_id")\n self.secret = check.str_param(secret, "secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class SandboxAccessToken:\n
[docs] @public\n def __init__(self, advertiser_id: str, access_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.advertiser_id = check.str_param(advertiser_id, "advertiser_id")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "TiktokMarketingSource.OAuth20", "TiktokMarketingSource.SandboxAccessToken"\n ],\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n report_granularity: Optional[str] = None,\n ):\n """Airbyte Source for Tiktok Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tiktok-marketing\n\n Args:\n name (str): The name of the destination.\n credentials (Union[TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken]): Authentication method\n start_date (Optional[str]): The Start Date in format: YYYY-MM-DD. Any data before this date will not be replicated. If this parameter is not set, all data will be replicated.\n end_date (Optional[str]): The date until which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DD. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the data till the current date.\n report_granularity (Optional[str]): The granularity used for aggregating performance data in reports. See the docs.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken),\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.report_granularity = check.opt_str_param(report_granularity, "report_granularity")\n super().__init__("Tiktok Marketing", name)
\n\n\n
[docs]class ZendeskChatSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n access_token: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n self.credentials = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str):\n self.credentials = "access_token"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["ZendeskChatSource.OAuth20", "ZendeskChatSource.AccessToken"],\n subdomain: Optional[str] = None,\n ):\n """Airbyte Source for Zendesk Chat.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-chat\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Zendesk Chat API, in the format YYYY-MM-DDT00:00:00Z.\n subdomain (Optional[str]): Required if you access Zendesk Chat from a Zendesk Support subdomain.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.subdomain = check.opt_str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials, "credentials", (ZendeskChatSource.OAuth20, ZendeskChatSource.AccessToken)\n )\n super().__init__("Zendesk Chat", name)
\n\n\n
[docs]class AwsCloudtrailSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, aws_key_id: str, aws_secret_key: str, aws_region_name: str, start_date: str\n ):\n """Airbyte Source for Aws Cloudtrail.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/aws-cloudtrail\n\n Args:\n name (str): The name of the destination.\n aws_key_id (str): AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.\n aws_secret_key (str): AWS CloudTrail Secret Access Key. See the docs for more information on how to obtain this key.\n aws_region_name (str): The default AWS Region to use, for example, us-west-1 or us-west-2. When specifying a Region inline during client initialization, this property is named region_name.\n start_date (str): The date you would like to replicate data. Data in AWS CloudTrail is available for the last 90 days only. Format: YYYY-MM-DD.\n """\n self.aws_key_id = check.str_param(aws_key_id, "aws_key_id")\n self.aws_secret_key = check.str_param(aws_secret_key, "aws_secret_key")\n self.aws_region_name = check.str_param(aws_region_name, "aws_region_name")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Aws Cloudtrail", name)
\n\n\n
[docs]class OktaSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "oauth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "api_token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["OktaSource.OAuth20", "OktaSource.APIToken"],\n domain: Optional[str] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Okta.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/okta\n\n Args:\n name (str): The name of the destination.\n domain (Optional[str]): The Okta domain. See the docs for instructions on how to find it.\n start_date (Optional[str]): UTC date and time in the format YYYY-MM-DDTHH:MM:SSZ. Any data before this date will not be replicated.\n """\n self.domain = check.opt_str_param(domain, "domain")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (OktaSource.OAuth20, OktaSource.APIToken)\n )\n super().__init__("Okta", name)
\n\n\n
[docs]class InsightlySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, token: Optional[str] = None, start_date: Optional[str] = None):\n """Airbyte Source for Insightly.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/insightly\n\n Args:\n name (str): The name of the destination.\n token (Optional[str]): Your Insightly API token.\n start_date (Optional[str]): The date from which you'd like to replicate data for Insightly in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only for incremental streams.\n """\n self.token = check.opt_str_param(token, "token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Insightly", name)
\n\n\n
[docs]class LinkedinPagesSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_method: Optional[str] = None,\n ):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, auth_method: Optional[str] = None):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n org_id: int,\n credentials: Union["LinkedinPagesSource.OAuth20", "LinkedinPagesSource.AccessToken"],\n ):\n """Airbyte Source for Linkedin Pages.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-pages/\n\n Args:\n name (str): The name of the destination.\n org_id (int): Specify the Organization ID\n """\n self.org_id = check.int_param(org_id, "org_id")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (LinkedinPagesSource.OAuth20, LinkedinPagesSource.AccessToken),\n )\n super().__init__("Linkedin Pages", name)
\n\n\n
[docs]class PersistiqSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Persistiq.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/persistiq\n\n Args:\n name (str): The name of the destination.\n api_key (str): PersistIq API Key. See the docs for more information on where to find that key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Persistiq", name)
\n\n\n
[docs]class FreshcallerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n api_key: str,\n start_date: str,\n requests_per_minute: Optional[int] = None,\n sync_lag_minutes: Optional[int] = None,\n ):\n """Airbyte Source for Freshcaller.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshcaller\n\n Args:\n name (str): The name of the destination.\n domain (str): Used to construct Base URL for the Freshcaller APIs\n api_key (str): Freshcaller API Key. See the docs for more information on how to obtain this key.\n requests_per_minute (Optional[int]): The number of requests per minute that this source is allowed to use. There is a rate limit of 50 requests per minute per app per account.\n start_date (str): UTC date and time. Any data created after this date will be replicated.\n sync_lag_minutes (Optional[int]): Lag in minutes for each sync, i.e., at time T, data for the time range [prev_sync_time, T-30] will be fetched\n """\n self.domain = check.str_param(domain, "domain")\n self.api_key = check.str_param(api_key, "api_key")\n self.requests_per_minute = check.opt_int_param(requests_per_minute, "requests_per_minute")\n self.start_date = check.str_param(start_date, "start_date")\n self.sync_lag_minutes = check.opt_int_param(sync_lag_minutes, "sync_lag_minutes")\n super().__init__("Freshcaller", name)
\n\n\n
[docs]class AppfollowSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, ext_id: str, cid: str, api_secret: str, country: str):\n """Airbyte Source for Appfollow.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/appfollow\n\n Args:\n name (str): The name of the destination.\n ext_id (str): for App Store \u2014 this is 9-10 digits identification number; for Google Play \u2014 this is bundle name;\n cid (str): client id provided by Appfollow\n api_secret (str): api secret provided by Appfollow\n country (str): getting data by Country\n """\n self.ext_id = check.str_param(ext_id, "ext_id")\n self.cid = check.str_param(cid, "cid")\n self.api_secret = check.str_param(api_secret, "api_secret")\n self.country = check.str_param(country, "country")\n super().__init__("Appfollow", name)
\n\n\n
[docs]class FacebookPagesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, access_token: str, page_id: str):\n """Airbyte Source for Facebook Pages.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-pages\n\n Args:\n name (str): The name of the destination.\n access_token (str): Facebook Page Access Token\n page_id (str): Page ID\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.page_id = check.str_param(page_id, "page_id")\n super().__init__("Facebook Pages", name)
\n\n\n
[docs]class JiraSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n domain: str,\n email: str,\n projects: Optional[List[str]] = None,\n start_date: Optional[str] = None,\n additional_fields: Optional[List[str]] = None,\n expand_issue_changelog: Optional[bool] = None,\n render_fields: Optional[bool] = None,\n enable_experimental_streams: Optional[bool] = None,\n ):\n """Airbyte Source for Jira.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/jira\n\n Args:\n name (str): The name of the destination.\n api_token (str): Jira API Token. See the docs for more information on how to generate this key.\n domain (str): The Domain for your Jira account, e.g. airbyteio.atlassian.net\n email (str): The user email for your Jira account.\n projects (Optional[List[str]]): List of Jira project keys to replicate data for.\n start_date (Optional[str]): The date from which you'd like to replicate data for Jira in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only in the following incremental streams: issues.\n additional_fields (Optional[List[str]]): List of additional fields to include in replicating issues.\n expand_issue_changelog (Optional[bool]): Expand the changelog when replicating issues.\n render_fields (Optional[bool]): Render issue fields in HTML format in addition to Jira JSON-like format.\n enable_experimental_streams (Optional[bool]): Allow the use of experimental streams which rely on undocumented Jira API endpoints. 
See https://docs.airbyte.com/integrations/sources/jira#experimental-tables for more info.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.domain = check.str_param(domain, "domain")\n self.email = check.str_param(email, "email")\n self.projects = check.opt_nullable_list_param(projects, "projects", str)\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.additional_fields = check.opt_nullable_list_param(\n additional_fields, "additional_fields", str\n )\n self.expand_issue_changelog = check.opt_bool_param(\n expand_issue_changelog, "expand_issue_changelog"\n )\n self.render_fields = check.opt_bool_param(render_fields, "render_fields")\n self.enable_experimental_streams = check.opt_bool_param(\n enable_experimental_streams, "enable_experimental_streams"\n )\n super().__init__("Jira", name)
\n\n\n
[docs]class GoogleSheetsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, service_account_info: str):\n self.auth_type = "Service"\n self.service_account_info = check.str_param(\n service_account_info, "service_account_info"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n spreadsheet_id: str,\n credentials: Union[\n "GoogleSheetsSource.AuthenticateViaGoogleOAuth",\n "GoogleSheetsSource.ServiceAccountKeyAuthentication",\n ],\n row_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for Google Sheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-sheets\n\n Args:\n name (str): The name of the destination.\n spreadsheet_id (str): Enter the link to the Google spreadsheet you want to sync\n row_batch_size (Optional[int]): Number of rows fetched when making a Google Sheet API call. Defaults to 200.\n credentials (Union[GoogleSheetsSource.AuthenticateViaGoogleOAuth, GoogleSheetsSource.ServiceAccountKeyAuthentication]): Credentials for connecting to the Google Sheets API\n """\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.row_batch_size = check.opt_int_param(row_batch_size, "row_batch_size")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleSheetsSource.AuthenticateViaGoogleOAuth,\n GoogleSheetsSource.ServiceAccountKeyAuthentication,\n ),\n )\n super().__init__("Google Sheets", name)
\n\n\n
[docs]class DockerhubSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, docker_username: str):\n """Airbyte Source for Dockerhub.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/dockerhub\n\n Args:\n name (str): The name of the destination.\n docker_username (str): Username of DockerHub person or organization (for https://hub.docker.com/v2/repositories/USERNAME/ API call)\n """\n self.docker_username = check.str_param(docker_username, "docker_username")\n super().__init__("Dockerhub", name)
\n\n\n
[docs]class UsCensusSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, query_path: str, api_key: str, query_params: Optional[str] = None\n ):\n """Airbyte Source for Us Census.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/us-census\n\n Args:\n name (str): The name of the destination.\n query_params (Optional[str]): The query parameters portion of the GET request, without the api key\n query_path (str): The path portion of the GET request\n api_key (str): Your API Key. Get your key here.\n """\n self.query_params = check.opt_str_param(query_params, "query_params")\n self.query_path = check.str_param(query_path, "query_path")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Us Census", name)
\n\n\n
[docs]class KustomerSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, start_date: str):\n """Airbyte Source for Kustomer Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/kustomer\n\n Args:\n name (str): The name of the destination.\n api_token (str): Kustomer API Token. See the docs on how to obtain this\n start_date (str): The date from which you'd like to replicate the data\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Kustomer Singer", name)
\n\n\n
[docs]class AzureTableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n storage_account_name: str,\n storage_access_key: str,\n storage_endpoint_suffix: Optional[str] = None,\n ):\n """Airbyte Source for Azure Table.\n\n Args:\n name (str): The name of the destination.\n storage_account_name (str): The name of your storage account.\n storage_access_key (str): Azure Table Storage Access Key. See the docs for more information on how to obtain this key.\n storage_endpoint_suffix (Optional[str]): Azure Table Storage service account URL suffix. See the docs for more information on how to obtain endpoint suffix\n """\n self.storage_account_name = check.str_param(storage_account_name, "storage_account_name")\n self.storage_access_key = check.str_param(storage_access_key, "storage_access_key")\n self.storage_endpoint_suffix = check.opt_str_param(\n storage_endpoint_suffix, "storage_endpoint_suffix"\n )\n super().__init__("Azure Table", name)
\n\n\n
[docs]class ScaffoldJavaJdbcSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n replication_method: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Scaffold Java Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/scaffold_java_jdbc\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)\n replication_method (str): Replication method to use for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses the Binlog to detect inserts, updates, and deletes. This needs to be configured on the source database itself.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.replication_method = check.str_param(replication_method, "replication_method")\n super().__init__("Scaffold Java Jdbc", name)
\n\n\n
[docs]class TidbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Tidb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tidb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Tidb", name)
\n\n\n
[docs]class QualarooSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n token: str,\n key: str,\n start_date: str,\n survey_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Qualaroo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/qualaroo\n\n Args:\n name (str): The name of the destination.\n token (str): A Qualaroo token. See the docs for instructions on how to generate it.\n key (str): A Qualaroo key. See the docs for instructions on how to generate it.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n survey_ids (Optional[List[str]]): IDs of the surveys from which you'd like to replicate data. If left empty, data from all surveys to which you have access will be replicated.\n """\n self.token = check.str_param(token, "token")\n self.key = check.str_param(key, "key")\n self.start_date = check.str_param(start_date, "start_date")\n self.survey_ids = check.opt_nullable_list_param(survey_ids, "survey_ids", str)\n super().__init__("Qualaroo", name)
\n\n\n
[docs]class YahooFinancePriceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, tickers: str, interval: Optional[str] = None, range: Optional[str] = None\n ):\n """Airbyte Source for Yahoo Finance Price.\n\n Args:\n name (str): The name of the destination.\n tickers (str): Comma-separated identifiers for the stocks to be queried. Whitespaces are allowed.\n interval (Optional[str]): The interval between prices queried.\n range (Optional[str]): The range of prices to be queried.\n """\n self.tickers = check.str_param(tickers, "tickers")\n self.interval = check.opt_str_param(interval, "interval")\n self.range = check.opt_str_param(range, "range")\n super().__init__("Yahoo Finance Price", name)
\n\n\n
[docs]class GoogleAnalyticsV4Source(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOauth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, credentials_json: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth",\n "GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication",\n ],\n start_date: str,\n view_id: str,\n custom_reports: Optional[str] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Analytics V4.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-universal-analytics\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth, GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication]): Credentials for the service\n start_date (str): The date in the format YYYY-MM-DD. Any data before this date will not be replicated.\n view_id (str): The ID for the Google Analytics View you want to fetch data from. This can be found from the Google Analytics Account Explorer.\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.\n window_in_days (Optional[int]): The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. 
The minimum allowed value for this field is 1, and the maximum is 364.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth,\n GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication,\n ),\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.view_id = check.str_param(view_id, "view_id")\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Google Analytics V4", name)
\n\n\n
[docs]class JdbcSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n jdbc_url: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres\n\n Args:\n name (str): The name of the destination.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url (str): JDBC formatted URL. See the standard here.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url = check.str_param(jdbc_url, "jdbc_url")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Jdbc", name)
\n\n\n
[docs]class FakerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n count: int,\n seed: Optional[int] = None,\n records_per_sync: Optional[int] = None,\n records_per_slice: Optional[int] = None,\n ):\n """Airbyte Source for Faker.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/faker\n\n Args:\n name (str): The name of the destination.\n count (int): How many users should be generated in total. This setting does not apply to the purchases or products stream.\n seed (Optional[int]): Manually control the faker random seed to return the same values on subsequent runs (leave -1 for random)\n records_per_sync (Optional[int]): How many fake records will be returned for each sync, for each stream? By default, it will take 2 syncs to create the requested 1000 records.\n records_per_slice (Optional[int]): How many fake records will be in each page (stream slice), before a state message is emitted?\n """\n self.count = check.int_param(count, "count")\n self.seed = check.opt_int_param(seed, "seed")\n self.records_per_sync = check.opt_int_param(records_per_sync, "records_per_sync")\n self.records_per_slice = check.opt_int_param(records_per_slice, "records_per_slice")\n super().__init__("Faker", name)
\n\n\n
[docs]class TplcentralSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n url_base: str,\n client_id: str,\n client_secret: str,\n user_login_id: Optional[int] = None,\n user_login: Optional[str] = None,\n tpl_key: Optional[str] = None,\n customer_id: Optional[int] = None,\n facility_id: Optional[int] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Tplcentral.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tplcentral\n\n Args:\n name (str): The name of the destination.\n user_login_id (Optional[int]): User login ID and/or name is required\n user_login (Optional[str]): User login ID and/or name is required\n start_date (Optional[str]): Date and time together in RFC 3339 format, for example, 2018-11-13T20:20:39+00:00.\n """\n self.url_base = check.str_param(url_base, "url_base")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.user_login_id = check.opt_int_param(user_login_id, "user_login_id")\n self.user_login = check.opt_str_param(user_login, "user_login")\n self.tpl_key = check.opt_str_param(tpl_key, "tpl_key")\n self.customer_id = check.opt_int_param(customer_id, "customer_id")\n self.facility_id = check.opt_int_param(facility_id, "facility_id")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Tplcentral", name)
\n\n\n
[docs]class ClickhouseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Clickhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse\n\n Args:\n name (str): The name of the destination.\n host (str): The host endpoint of the Clickhouse cluster.\n port (int): The port of the database.\n database (str): The name of the database.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Clickhouse", name)
\n\n\n
[docs]class FreshserviceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_name: str, api_key: str, start_date: str):\n """Airbyte Source for Freshservice.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshservice\n\n Args:\n name (str): The name of the destination.\n domain_name (str): The name of your Freshservice domain\n api_key (str): Freshservice API Key. See here. The key is case sensitive.\n start_date (str): UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Freshservice", name)
\n\n\n
[docs]class ZenloopSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n date_from: Optional[str] = None,\n survey_id: Optional[str] = None,\n survey_group_id: Optional[str] = None,\n ):\n """Airbyte Source for Zenloop.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zenloop\n\n Args:\n name (str): The name of the destination.\n api_token (str): Zenloop API Token. You can get the API token in settings page here\n date_from (Optional[str]): Zenloop date_from. Format: 2021-10-24T03:30:30Z or 2021-10-24. Leave empty if only data from current data should be synced\n survey_id (Optional[str]): Zenloop Survey ID. Can be found here. Leave empty to pull answers from all surveys\n survey_group_id (Optional[str]): Zenloop Survey Group ID. Can be found by pulling All Survey Groups via SurveyGroups stream. Leave empty to pull answers from all survey groups\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.date_from = check.opt_str_param(date_from, "date_from")\n self.survey_id = check.opt_str_param(survey_id, "survey_id")\n self.survey_group_id = check.opt_str_param(survey_group_id, "survey_group_id")\n super().__init__("Zenloop", name)
\n\n\n
[docs]class OracleSource(GeneratedAirbyteSource):\n
[docs] class ServiceName:\n
[docs] @public\n def __init__(self, service_name: str, connection_type: Optional[str] = None):\n self.connection_type = check.opt_str_param(connection_type, "connection_type")\n self.service_name = check.str_param(service_name, "service_name")
\n\n
[docs] class SystemIDSID:\n
[docs] @public\n def __init__(self, sid: str, connection_type: Optional[str] = None):\n self.connection_type = check.opt_str_param(connection_type, "connection_type")\n self.sid = check.str_param(sid, "sid")
\n\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class NativeNetworkEncryptionNNE:\n
[docs] @public\n def __init__(self, encryption_algorithm: Optional[str] = None):\n self.encryption_method = "client_nne"\n self.encryption_algorithm = check.opt_str_param(\n encryption_algorithm, "encryption_algorithm"\n )
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n connection_data: Union["OracleSource.ServiceName", "OracleSource.SystemIDSID"],\n username: str,\n encryption: Union[\n "OracleSource.Unencrypted",\n "OracleSource.NativeNetworkEncryptionNNE",\n "OracleSource.TLSEncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n schemas: Optional[List[str]] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Oracle.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/oracle\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database. Oracle Corporations recommends the following port numbers: 1521 - Default listening port for client connections to the listener. 2484 - Recommended and officially registered listening port for client connections to the listener using TCP/IP with SSL\n connection_data (Union[OracleSource.ServiceName, OracleSource.SystemIDSID]): Connect data that will be used for DB connection\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n schemas (Optional[List[str]]): The list of schemas to sync from. Defaults to user. Case sensitive.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. 
(example: key1=value1&key2=value2&key3=value3).\n encryption (Union[OracleSource.Unencrypted, OracleSource.NativeNetworkEncryptionNNE, OracleSource.TLSEncryptedVerifyCertificate]): The encryption method with is used when communicating with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.connection_data = check.inst_param(\n connection_data, "connection_data", (OracleSource.ServiceName, OracleSource.SystemIDSID)\n )\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (\n OracleSource.Unencrypted,\n OracleSource.NativeNetworkEncryptionNNE,\n OracleSource.TLSEncryptedVerifyCertificate,\n ),\n )\n super().__init__("Oracle", name)
\n\n\n
[docs]class KlaviyoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Klaviyo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/klaviyo\n\n Args:\n name (str): The name of the destination.\n api_key (str): Klaviyo API Key. See our docs if you need help finding this key.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Klaviyo", name)
\n\n\n
[docs]class GoogleDirectorySource(GeneratedAirbyteSource):\n
[docs] class SignInViaGoogleOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n credentials_title: Optional[str] = None,\n ):\n self.credentials_title = check.opt_str_param(credentials_title, "credentials_title")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKey:\n
[docs] @public\n def __init__(\n self, credentials_json: str, email: str, credentials_title: Optional[str] = None\n ):\n self.credentials_title = check.opt_str_param(credentials_title, "credentials_title")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "GoogleDirectorySource.SignInViaGoogleOAuth", "GoogleDirectorySource.ServiceAccountKey"\n ],\n ):\n """Airbyte Source for Google Directory.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-directory\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GoogleDirectorySource.SignInViaGoogleOAuth, GoogleDirectorySource.ServiceAccountKey]): Google APIs use the OAuth 2.0 protocol for authentication and authorization. The Source supports Web server application and Service accounts scenarios.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (GoogleDirectorySource.SignInViaGoogleOAuth, GoogleDirectorySource.ServiceAccountKey),\n )\n super().__init__("Google Directory", name)
\n\n\n
[docs]class InstagramSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Instagram.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/instagram\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for User Insights, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n access_token (str): The value of the access token generated. See the docs for more information\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Instagram", name)
\n\n\n
[docs]class ShortioSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_id: str, secret_key: str, start_date: str):\n """Airbyte Source for Shortio.\n\n Documentation can be found at https://developers.short.io/reference\n\n Args:\n name (str): The name of the destination.\n secret_key (str): Short.io Secret Key\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_id = check.str_param(domain_id, "domain_id")\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Shortio", name)
\n\n\n
[docs]class SquareSource(GeneratedAirbyteSource):\n
[docs] class OauthAuthentication:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Oauth"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, api_key: str):\n self.auth_type = "Apikey"\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n is_sandbox: bool,\n credentials: Union["SquareSource.OauthAuthentication", "SquareSource.APIKey"],\n start_date: Optional[str] = None,\n include_deleted_objects: Optional[bool] = None,\n ):\n """Airbyte Source for Square.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/square\n\n Args:\n name (str): The name of the destination.\n is_sandbox (bool): Determines whether to use the sandbox or production environment.\n start_date (Optional[str]): UTC date in the format YYYY-MM-DD. Any data before this date will not be replicated. If not set, all data will be replicated.\n include_deleted_objects (Optional[bool]): In some streams there is an option to include deleted objects (Items, Categories, Discounts, Taxes)\n """\n self.is_sandbox = check.bool_param(is_sandbox, "is_sandbox")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.include_deleted_objects = check.opt_bool_param(\n include_deleted_objects, "include_deleted_objects"\n )\n self.credentials = check.inst_param(\n credentials, "credentials", (SquareSource.OauthAuthentication, SquareSource.APIKey)\n )\n super().__init__("Square", name)
\n\n\n
[docs]class DelightedSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, since: str, api_key: str):\n """Airbyte Source for Delighted.\n\n Args:\n name (str): The name of the destination.\n since (str): The date from which you'd like to replicate the data\n api_key (str): A Delighted API key.\n """\n self.since = check.str_param(since, "since")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Delighted", name)
\n\n\n
[docs]class AmazonSqsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n queue_url: str,\n region: str,\n delete_messages: bool,\n max_batch_size: Optional[int] = None,\n max_wait_time: Optional[int] = None,\n attributes_to_return: Optional[str] = None,\n visibility_timeout: Optional[int] = None,\n access_key: Optional[str] = None,\n secret_key: Optional[str] = None,\n ):\n """Airbyte Source for Amazon Sqs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-sqs\n\n Args:\n name (str): The name of the destination.\n queue_url (str): URL of the SQS Queue\n region (str): AWS Region of the SQS Queue\n delete_messages (bool): If Enabled, messages will be deleted from the SQS Queue after being read. If Disabled, messages are left in the queue and can be read more than once. WARNING: Enabling this option can result in data loss in cases of failure, use with caution, see documentation for more detail.\n max_batch_size (Optional[int]): Max amount of messages to get in one batch (10 max)\n max_wait_time (Optional[int]): Max amount of time in seconds to wait for messages in a single poll (20 max)\n attributes_to_return (Optional[str]): Comma separated list of Mesage Attribute names to return\n visibility_timeout (Optional[int]): Modify the Visibility Timeout of the individual message from the Queue's default (seconds).\n access_key (Optional[str]): The Access Key ID of the AWS IAM Role to use for pulling messages\n secret_key (Optional[str]): The Secret Key of the AWS IAM Role to use for pulling messages\n """\n self.queue_url = check.str_param(queue_url, "queue_url")\n self.region = check.str_param(region, "region")\n self.delete_messages = check.bool_param(delete_messages, "delete_messages")\n self.max_batch_size = check.opt_int_param(max_batch_size, "max_batch_size")\n self.max_wait_time = check.opt_int_param(max_wait_time, "max_wait_time")\n self.attributes_to_return = check.opt_str_param(\n attributes_to_return, "attributes_to_return"\n )\n 
self.visibility_timeout = check.opt_int_param(visibility_timeout, "visibility_timeout")\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.secret_key = check.opt_str_param(secret_key, "secret_key")\n super().__init__("Amazon Sqs", name)
\n\n\n
[docs]class YoutubeAnalyticsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaOAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(self, name: str, credentials: "YoutubeAnalyticsSource.AuthenticateViaOAuth20"):\n """Airbyte Source for Youtube Analytics.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/youtube-analytics\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", YoutubeAnalyticsSource.AuthenticateViaOAuth20\n )\n super().__init__("Youtube Analytics", name)
\n\n\n
[docs]class ScaffoldSourcePythonSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, fix_me: Optional[str] = None):\n """Airbyte Source for Scaffold Source Python.\n\n Args:\n name (str): The name of the destination.\n fix_me (Optional[str]): describe me\n """\n self.fix_me = check.opt_str_param(fix_me, "fix_me")\n super().__init__("Scaffold Source Python", name)
\n\n\n
[docs]class LookerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n client_id: str,\n client_secret: str,\n run_look_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Looker.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/looker\n\n Args:\n name (str): The name of the destination.\n domain (str): Domain for your Looker account, e.g. airbyte.cloud.looker.com,looker.[clientname].com,IP address\n client_id (str): The Client ID is first part of an API3 key that is specific to each Looker user. See the docs for more information on how to generate this key.\n client_secret (str): The Client Secret is second part of an API3 key.\n run_look_ids (Optional[List[str]]): The IDs of any Looks to run\n """\n self.domain = check.str_param(domain, "domain")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.run_look_ids = check.opt_nullable_list_param(run_look_ids, "run_look_ids", str)\n super().__init__("Looker", name)
\n\n\n
[docs]class GitlabSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_url: str,\n private_token: str,\n start_date: str,\n groups: Optional[str] = None,\n projects: Optional[str] = None,\n ):\n """Airbyte Source for Gitlab.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gitlab\n\n Args:\n name (str): The name of the destination.\n api_url (str): Please enter your basic URL from GitLab instance.\n private_token (str): Log into your GitLab account and then generate a personal Access Token.\n groups (Optional[str]): Space-delimited list of groups. e.g. airbyte.io.\n projects (Optional[str]): Space-delimited list of projects. e.g. airbyte.io/documentation meltano/tap-gitlab.\n start_date (str): The date from which you'd like to replicate data for GitLab API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.api_url = check.str_param(api_url, "api_url")\n self.private_token = check.str_param(private_token, "private_token")\n self.groups = check.opt_str_param(groups, "groups")\n self.projects = check.opt_str_param(projects, "projects")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Gitlab", name)
\n\n\n
[docs]class ExchangeRatesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n access_key: str,\n base: Optional[str] = None,\n ignore_weekends: Optional[bool] = None,\n ):\n """Airbyte Source for Exchange Rates.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi\n\n Args:\n name (str): The name of the destination.\n start_date (str): Start getting data from that date.\n access_key (str): Your API Key. See here. The key is case sensitive.\n base (Optional[str]): ISO reference currency. See here. Free plan doesn't support Source Currency Switching, default base currency is EUR\n ignore_weekends (Optional[bool]): Ignore weekends? (Exchanges don't run on weekends)\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_key = check.str_param(access_key, "access_key")\n self.base = check.opt_str_param(base, "base")\n self.ignore_weekends = check.opt_bool_param(ignore_weekends, "ignore_weekends")\n super().__init__("Exchange Rates", name)
\n\n\n
[docs]class AmazonAdsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n region: Optional[str] = None,\n report_wait_timeout: Optional[int] = None,\n report_generation_max_retries: Optional[int] = None,\n start_date: Optional[str] = None,\n profiles: Optional[List[int]] = None,\n state_filter: Optional[List[str]] = None,\n ):\n """Airbyte Source for Amazon Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-ads\n\n Args:\n name (str): The name of the destination.\n client_id (str): The client ID of your Amazon Ads developer application. See the docs for more information.\n client_secret (str): The client secret of your Amazon Ads developer application. See the docs for more information.\n refresh_token (str): Amazon Ads refresh token. See the docs for more information on how to obtain this token.\n region (Optional[str]): Region to pull data from (EU/NA/FE). See docs for more details.\n report_wait_timeout (Optional[int]): Timeout duration in minutes for Reports. Default is 60 minutes.\n report_generation_max_retries (Optional[int]): Maximum retries Airbyte will attempt for fetching report data. Default is 5.\n start_date (Optional[str]): The Start date for collecting reports, should not be more than 60 days in the past. In YYYY-MM-DD format\n profiles (Optional[List[int]]): Profile IDs you want to fetch data for. See docs for more details.\n state_filter (Optional[List[str]]): Reflects the state of the Display, Product, and Brand Campaign streams as enabled, paused, or archived. 
If you do not populate this field, it will be ignored completely.\n """\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.region = check.opt_str_param(region, "region")\n self.report_wait_timeout = check.opt_int_param(report_wait_timeout, "report_wait_timeout")\n self.report_generation_max_retries = check.opt_int_param(\n report_generation_max_retries, "report_generation_max_retries"\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.profiles = check.opt_nullable_list_param(profiles, "profiles", int)\n self.state_filter = check.opt_nullable_list_param(state_filter, "state_filter", str)\n super().__init__("Amazon Ads", name)
\n\n\n
[docs]class MixpanelSource(GeneratedAirbyteSource):\n
[docs] class ServiceAccount:\n
[docs] @public\n def __init__(self, username: str, secret: str):\n self.username = check.str_param(username, "username")\n self.secret = check.str_param(secret, "secret")
\n\n
[docs] class ProjectSecret:\n
[docs] @public\n def __init__(self, api_secret: str):\n self.api_secret = check.str_param(api_secret, "api_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["MixpanelSource.ServiceAccount", "MixpanelSource.ProjectSecret"],\n project_id: Optional[int] = None,\n attribution_window: Optional[int] = None,\n project_timezone: Optional[str] = None,\n select_properties_by_default: Optional[bool] = None,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n region: Optional[str] = None,\n date_window_size: Optional[int] = None,\n ):\n """Airbyte Source for Mixpanel.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mixpanel\n\n Args:\n name (str): The name of the destination.\n credentials (Union[MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret]): Choose how to authenticate to Mixpanel\n project_id (Optional[int]): Your project ID number. See the docs for more information on how to obtain this.\n attribution_window (Optional[int]): A period of time for attributing results to ads and the lookback period after those actions occur during which ad results are counted. Default attribution window is 5 days.\n project_timezone (Optional[str]): Time zone in which integer date times are stored. The project timezone may be found in the project settings in the Mixpanel console.\n select_properties_by_default (Optional[bool]): Setting this config parameter to TRUE ensures that new properties on events and engage records are captured. Otherwise new properties will be ignored.\n start_date (Optional[str]): The date in the format YYYY-MM-DD. Any data before this date will not be replicated. If this option is not set, the connector will replicate data from up to one year ago by default.\n end_date (Optional[str]): The date in the format YYYY-MM-DD. Any data after this date will not be replicated. 
Left empty to always sync to most recent date\n region (Optional[str]): The region of mixpanel domain instance either US or EU.\n date_window_size (Optional[int]): Defines window size in days, that used to slice through data. You can reduce it, if amount of data in each window is too big for your environment.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret),\n )\n self.project_id = check.opt_int_param(project_id, "project_id")\n self.attribution_window = check.opt_int_param(attribution_window, "attribution_window")\n self.project_timezone = check.opt_str_param(project_timezone, "project_timezone")\n self.select_properties_by_default = check.opt_bool_param(\n select_properties_by_default, "select_properties_by_default"\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.region = check.opt_str_param(region, "region")\n self.date_window_size = check.opt_int_param(date_window_size, "date_window_size")\n super().__init__("Mixpanel", name)
\n\n\n
[docs]class OrbitSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, workspace: str, start_date: Optional[str] = None):\n """Airbyte Source for Orbit.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/orbit\n\n Args:\n name (str): The name of the destination.\n api_token (str): Authorizes you to work with Orbit workspaces associated with the token.\n workspace (str): The unique name of the workspace that your API token is associated with.\n start_date (Optional[str]): Date in the format 2022-06-26. Only load members whose last activities are after this date.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.workspace = check.str_param(workspace, "workspace")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Orbit", name)
\n\n\n
[docs]class AmazonSellerPartnerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n lwa_app_id: str,\n lwa_client_secret: str,\n refresh_token: str,\n aws_access_key: str,\n aws_secret_key: str,\n role_arn: str,\n replication_start_date: str,\n aws_environment: str,\n region: str,\n app_id: Optional[str] = None,\n auth_type: Optional[str] = None,\n replication_end_date: Optional[str] = None,\n period_in_days: Optional[int] = None,\n report_options: Optional[str] = None,\n max_wait_seconds: Optional[int] = None,\n ):\n """Airbyte Source for Amazon Seller Partner.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-seller-partner\n\n Args:\n name (str): The name of the destination.\n app_id (Optional[str]): Your Amazon App ID\n lwa_app_id (str): Your Login with Amazon Client ID.\n lwa_client_secret (str): Your Login with Amazon Client Secret.\n refresh_token (str): The Refresh Token obtained via OAuth flow authorization.\n aws_access_key (str): Specifies the AWS access key used as part of the credentials to authenticate the user.\n aws_secret_key (str): Specifies the AWS secret key used as part of the credentials to authenticate the user.\n role_arn (str): Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. (Needs permission to 'Assume Role' STS).\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n replication_end_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data after this date will not be replicated.\n period_in_days (Optional[int]): Will be used for stream slicing for initial full_refresh sync when no updated state is present for reports that support sliced incremental sync.\n report_options (Optional[str]): Additional information passed to reports. This varies by report type. 
Must be a valid json string.\n max_wait_seconds (Optional[int]): Sometimes report can take up to 30 minutes to generate. This will set the limit for how long to wait for a successful report.\n aws_environment (str): An enumeration.\n region (str): An enumeration.\n """\n self.app_id = check.opt_str_param(app_id, "app_id")\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.lwa_app_id = check.str_param(lwa_app_id, "lwa_app_id")\n self.lwa_client_secret = check.str_param(lwa_client_secret, "lwa_client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.aws_access_key = check.str_param(aws_access_key, "aws_access_key")\n self.aws_secret_key = check.str_param(aws_secret_key, "aws_secret_key")\n self.role_arn = check.str_param(role_arn, "role_arn")\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n self.replication_end_date = check.opt_str_param(\n replication_end_date, "replication_end_date"\n )\n self.period_in_days = check.opt_int_param(period_in_days, "period_in_days")\n self.report_options = check.opt_str_param(report_options, "report_options")\n self.max_wait_seconds = check.opt_int_param(max_wait_seconds, "max_wait_seconds")\n self.aws_environment = check.str_param(aws_environment, "aws_environment")\n self.region = check.str_param(region, "region")\n super().__init__("Amazon Seller Partner", name)
\n\n\n
[docs]class CourierSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Courier.\n\n Documentation can be found at https://docs.airbyte.io/integrations/sources/courier\n\n Args:\n name (str): The name of the destination.\n api_key (str): Courier API Key to retrieve your data.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Courier", name)
\n\n\n
[docs]class CloseComSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: Optional[str] = None):\n r"""Airbyte Source for Close Com.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/close-com\n\n Args:\n name (str): The name of the destination.\n api_key (str): Close.com API key (usually starts with 'api\\\\_'; find yours here).\n start_date (Optional[str]): The start date to sync data. Leave blank for full sync. Format: YYYY-MM-DD.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Close Com", name)
\n\n\n
[docs]class BingAdsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n refresh_token: str,\n developer_token: str,\n reports_start_date: str,\n auth_method: Optional[str] = None,\n tenant_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n """Airbyte Source for Bing Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bing-ads\n\n Args:\n name (str): The name of the destination.\n tenant_id (Optional[str]): The Tenant ID of your Microsoft Advertising developer application. Set this to "common" unless you know you need a different value.\n client_id (str): The Client ID of your Microsoft Advertising developer application.\n client_secret (Optional[str]): The Client Secret of your Microsoft Advertising developer application.\n refresh_token (str): Refresh Token to renew the expired Access Token.\n developer_token (str): Developer token associated with user. See more info in the docs.\n reports_start_date (str): The start date from which to begin replicating report data. Any data generated before this date will not be replicated in reports. This is a UTC date in YYYY-MM-DD format.\n """\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.tenant_id = check.opt_str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.developer_token = check.str_param(developer_token, "developer_token")\n self.reports_start_date = check.str_param(reports_start_date, "reports_start_date")\n super().__init__("Bing Ads", name)
\n\n\n
[docs]class PrimetricSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, client_id: str, client_secret: str):\n """Airbyte Source for Primetric.\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Primetric developer application. The Client ID is visible here.\n client_secret (str): The Client Secret of your Primetric developer application. You can manage your client's credentials here.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Primetric", name)
\n\n\n
[docs]class PivotalTrackerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str):\n """Airbyte Source for Pivotal Tracker.\n\n Args:\n name (str): The name of the destination.\n api_token (str): Pivotal Tracker API token\n """\n self.api_token = check.str_param(api_token, "api_token")\n super().__init__("Pivotal Tracker", name)
\n\n\n
[docs]class ElasticsearchSource(GeneratedAirbyteSource):\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "none"
\n\n
[docs] class ApiKeySecret:\n
[docs] @public\n def __init__(self, apiKeyId: str, apiKeySecret: str):\n self.method = "secret"\n self.apiKeyId = check.str_param(apiKeyId, "apiKeyId")\n self.apiKeySecret = check.str_param(apiKeySecret, "apiKeySecret")
\n\n
[docs] class UsernamePassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.method = "basic"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n authenticationMethod: Union[\n "ElasticsearchSource.None_",\n "ElasticsearchSource.ApiKeySecret",\n "ElasticsearchSource.UsernamePassword",\n ],\n ):\n r"""Airbyte Source for Elasticsearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/source/elasticsearch\n\n Args:\n name (str): The name of the destination.\n endpoint (str): The full url of the Elasticsearch server\n authenticationMethod (Union[ElasticsearchSource.None\\\\_, ElasticsearchSource.ApiKeySecret, ElasticsearchSource.UsernamePassword]): The type of authentication to be used\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.authenticationMethod = check.inst_param(\n authenticationMethod,\n "authenticationMethod",\n (\n ElasticsearchSource.None_,\n ElasticsearchSource.ApiKeySecret,\n ElasticsearchSource.UsernamePassword,\n ),\n )\n super().__init__("Elasticsearch", name)
\n\n\n
[docs]class BigquerySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, project_id: str, credentials_json: str, dataset_id: Optional[str] = None\n ):\n """Airbyte Source for Bigquery.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset.\n dataset_id (Optional[str]): The dataset ID to search for tables and views. If you are only loading data from one dataset, setting this option could result in much faster schema discovery.\n credentials_json (str): The contents of your Service Account Key JSON file. See the docs for more information on how to obtain this key.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_id = check.opt_str_param(dataset_id, "dataset_id")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n super().__init__("Bigquery", name)
\n\n\n
[docs]class WoocommerceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n shop: str,\n start_date: str,\n api_key: str,\n api_secret: str,\n conversion_window_days: Optional[int] = None,\n ):\n """Airbyte Source for Woocommerce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/woocommerce\n\n Args:\n name (str): The name of the destination.\n shop (str): The name of the store. For https://EXAMPLE.com, the shop name is 'EXAMPLE.com'.\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n api_key (str): The CUSTOMER KEY for API in WooCommerce shop.\n api_secret (str): The CUSTOMER SECRET for API in WooCommerce shop.\n conversion_window_days (Optional[int]): A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads.\n """\n self.shop = check.str_param(shop, "shop")\n self.start_date = check.str_param(start_date, "start_date")\n self.api_key = check.str_param(api_key, "api_key")\n self.api_secret = check.str_param(api_secret, "api_secret")\n self.conversion_window_days = check.opt_int_param(\n conversion_window_days, "conversion_window_days"\n )\n super().__init__("Woocommerce", name)
\n\n\n
[docs]class SearchMetricsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, api_key: str, client_secret: str, country_code: str, start_date: str\n ):\n """Airbyte Source for Search Metrics.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/seacrh-metrics\n\n Args:\n name (str): The name of the destination.\n country_code (str): The region of the S3 staging bucket to use if utilising a copy strategy.\n start_date (str): Data generated in SearchMetrics after this date will be replicated. This date must be specified in the format YYYY-MM-DDT00:00:00Z.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.country_code = check.str_param(country_code, "country_code")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Search Metrics", name)
\n\n\n
[docs]class TypeformSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, start_date: str, token: str, form_ids: Optional[List[str]] = None\n ):\n """Airbyte Source for Typeform.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/typeform\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format: YYYY-MM-DDTHH:mm:ss[Z]. Any data before this date will not be replicated.\n token (str): The API Token for a Typeform account.\n form_ids (Optional[List[str]]): When this parameter is set, the connector will replicate data only from the input forms. Otherwise, all forms in your Typeform account will be replicated. You can find form IDs in your form URLs. For example, in the URL "https://mysite.typeform.com/to/u6nXL7" the form_id is u6nXL7. You can find form URLs on Share panel\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.token = check.str_param(token, "token")\n self.form_ids = check.opt_nullable_list_param(form_ids, "form_ids", str)\n super().__init__("Typeform", name)
\n\n\n
[docs]class WebflowSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, site_id: str, api_key: str):\n """Airbyte Source for Webflow.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/webflow\n\n Args:\n name (str): The name of the destination.\n site_id (str): The id of the Webflow site you are requesting data from. See https://developers.webflow.com/#sites\n api_key (str): The API token for authenticating to Webflow. See https://university.webflow.com/lesson/intro-to-the-webflow-api\n """\n self.site_id = check.str_param(site_id, "site_id")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Webflow", name)
\n\n\n
[docs]class FireboltSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n password: str,\n database: str,\n account: Optional[str] = None,\n host: Optional[str] = None,\n engine: Optional[str] = None,\n ):\n """Airbyte Source for Firebolt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/firebolt\n\n Args:\n name (str): The name of the destination.\n username (str): Firebolt email address you use to login.\n password (str): Firebolt password.\n account (Optional[str]): Firebolt account to login.\n host (Optional[str]): The host name of your Firebolt database.\n database (str): The database to connect to.\n engine (Optional[str]): Engine name or url to connect to.\n """\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.account = check.opt_str_param(account, "account")\n self.host = check.opt_str_param(host, "host")\n self.database = check.str_param(database, "database")\n self.engine = check.opt_str_param(engine, "engine")\n super().__init__("Firebolt", name)
\n\n\n
[docs]class FaunaSource(GeneratedAirbyteSource):\n
[docs] class Disabled:\n
[docs] @public\n def __init__(\n self,\n ):\n self.deletion_mode = "ignore"
\n\n
[docs] class Enabled:\n
[docs] @public\n def __init__(self, column: str):\n self.deletion_mode = "deleted_field"\n self.column = check.str_param(column, "column")
\n\n
[docs] class Collection:\n
[docs] @public\n def __init__(\n self, page_size: int, deletions: Union["FaunaSource.Disabled", "FaunaSource.Enabled"]\n ):\n self.page_size = check.int_param(page_size, "page_size")\n self.deletions = check.inst_param(\n deletions, "deletions", (FaunaSource.Disabled, FaunaSource.Enabled)\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n port: int,\n scheme: str,\n secret: str,\n collection: "FaunaSource.Collection",\n ):\n """Airbyte Source for Fauna.\n\n Documentation can be found at https://github.com/fauna/airbyte/blob/source-fauna/docs/integrations/sources/fauna.md\n\n Args:\n name (str): The name of the destination.\n domain (str): Domain of Fauna to query. Defaults db.fauna.com. See the docs.\n port (int): Endpoint port.\n scheme (str): URL scheme.\n secret (str): Fauna secret, used when authenticating with the database.\n collection (FaunaSource.Collection): Settings for the Fauna Collection.\n """\n self.domain = check.str_param(domain, "domain")\n self.port = check.int_param(port, "port")\n self.scheme = check.str_param(scheme, "scheme")\n self.secret = check.str_param(secret, "secret")\n self.collection = check.inst_param(collection, "collection", FaunaSource.Collection)\n super().__init__("Fauna", name)
\n\n\n
[docs]class IntercomSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Intercom.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/intercom\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n access_token (str): Access token for making authenticated requests. See the Intercom docs for more information.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Intercom", name)
\n\n\n
[docs]class FreshsalesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_name: str, api_key: str):\n """Airbyte Source for Freshsales.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshsales\n\n Args:\n name (str): The name of the destination.\n domain_name (str): The Name of your Freshsales domain\n api_key (str): Freshsales API Key. See here. The key is case sensitive.\n """\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Freshsales", name)
\n\n\n
[docs]class AdjustSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n dimensions: List[str],\n ingest_start: str,\n metrics: List[str],\n additional_metrics: Optional[List[str]] = None,\n until_today: Optional[bool] = None,\n ):\n """Airbyte Source for Adjust.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/adjust\n\n Args:\n name (str): The name of the destination.\n additional_metrics (Optional[List[str]]): Metrics names that are not pre-defined, such as cohort metrics or app specific metrics.\n api_token (str): Adjust API key, see https://help.adjust.com/en/article/report-service-api-authentication\n dimensions (List[str]): Dimensions allow a user to break down metrics into groups using one or several parameters. For example, the number of installs by date, country and network. See https://help.adjust.com/en/article/reports-endpoint#dimensions for more information about the dimensions.\n ingest_start (str): Data ingest start date.\n metrics (List[str]): Select at least one metric to query.\n until_today (Optional[bool]): Syncs data up until today. Useful when running daily incremental syncs, and duplicates are not desired.\n """\n self.additional_metrics = check.opt_nullable_list_param(\n additional_metrics, "additional_metrics", str\n )\n self.api_token = check.str_param(api_token, "api_token")\n self.dimensions = check.list_param(dimensions, "dimensions", str)\n self.ingest_start = check.str_param(ingest_start, "ingest_start")\n self.metrics = check.list_param(metrics, "metrics", str)\n self.until_today = check.opt_bool_param(until_today, "until_today")\n super().__init__("Adjust", name)
\n\n\n
[docs]class BambooHrSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n api_key: str,\n custom_reports_fields: Optional[str] = None,\n custom_reports_include_default_fields: Optional[bool] = None,\n ):\n """Airbyte Source for Bamboo Hr.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bamboo-hr\n\n Args:\n name (str): The name of the destination.\n subdomain (str): Sub Domain of bamboo hr\n api_key (str): Api key of bamboo hr\n custom_reports_fields (Optional[str]): Comma-separated list of fields to include in custom reports.\n custom_reports_include_default_fields (Optional[bool]): If true, the custom reports endpoint will include the default fields defined here: https://documentation.bamboohr.com/docs/list-of-field-names.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.api_key = check.str_param(api_key, "api_key")\n self.custom_reports_fields = check.opt_str_param(\n custom_reports_fields, "custom_reports_fields"\n )\n self.custom_reports_include_default_fields = check.opt_bool_param(\n custom_reports_include_default_fields, "custom_reports_include_default_fields"\n )\n super().__init__("Bamboo Hr", name)
\n\n\n
[docs]class GoogleAdsSource(GeneratedAirbyteSource):\n
[docs] class GoogleCredentials:\n
[docs] @public\n def __init__(\n self,\n developer_token: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n access_token: Optional[str] = None,\n ):\n self.developer_token = check.str_param(developer_token, "developer_token")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class CustomGAQLQueriesEntry:\n
[docs] @public\n def __init__(self, query: str, table_name: str):\n self.query = check.str_param(query, "query")\n self.table_name = check.str_param(table_name, "table_name")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "GoogleAdsSource.GoogleCredentials",\n customer_id: str,\n start_date: str,\n end_date: Optional[str] = None,\n custom_queries: Optional[List[CustomGAQLQueriesEntry]] = None,\n login_customer_id: Optional[str] = None,\n conversion_window_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-ads\n\n Args:\n name (str): The name of the destination.\n customer_id (str): Comma separated list of (client) customer IDs. Each customer ID must be specified as a 10-digit number without dashes. More instruction on how to find this value in our docs. Metrics streams like AdGroupAdReport cannot be requested for a manager account.\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.\n end_date (Optional[str]): UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.\n login_customer_id (Optional[str]): If your access to the customer account is through a manager account, this field is required and must be set to the customer ID of the manager account (10-digit number without dashes). More information about this field you can see here\n conversion_window_days (Optional[int]): A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads. 
For more information, see Google's documentation.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", GoogleAdsSource.GoogleCredentials\n )\n self.customer_id = check.str_param(customer_id, "customer_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.custom_queries = check.opt_nullable_list_param(\n custom_queries, "custom_queries", GoogleAdsSource.CustomGAQLQueriesEntry\n )\n self.login_customer_id = check.opt_str_param(login_customer_id, "login_customer_id")\n self.conversion_window_days = check.opt_int_param(\n conversion_window_days, "conversion_window_days"\n )\n super().__init__("Google Ads", name)
\n\n\n
[docs]class HellobatonSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, company: str):\n """Airbyte Source for Hellobaton.\n\n Args:\n name (str): The name of the destination.\n api_key (str): authentication key required to access the api endpoints\n company (str): Company name that generates your base api url\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.company = check.str_param(company, "company")\n super().__init__("Hellobaton", name)
\n\n\n
[docs]class SendgridSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, apikey: str, start_time: Union[int, str]):\n """Airbyte Source for Sendgrid.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/sendgrid\n\n Args:\n name (str): The name of the destination.\n apikey (str): API Key, use admin to generate this key.\n start_time (Union[int, str]): Start time in ISO8601 format. Any data before this time point will not be replicated.\n """\n self.apikey = check.str_param(apikey, "apikey")\n self.start_time = check.inst_param(start_time, "start_time", (int, str))\n super().__init__("Sendgrid", name)
\n\n\n
[docs]class MondaySource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n subdomain: Optional[str] = None,\n ):\n self.auth_type = "oauth2.0"\n self.subdomain = check.opt_str_param(subdomain, "subdomain")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "api_token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["MondaySource.OAuth20", "MondaySource.APIToken"]\n ):\n """Airbyte Source for Monday.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/monday\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (MondaySource.OAuth20, MondaySource.APIToken)\n )\n super().__init__("Monday", name)
\n\n\n
[docs]class DixaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, api_token: str, start_date: str, batch_size: Optional[int] = None\n ):\n """Airbyte Source for Dixa.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/dixa\n\n Args:\n name (str): The name of the destination.\n api_token (str): Dixa API token\n start_date (str): The connector pulls records updated from this date onwards.\n batch_size (Optional[int]): Number of days to batch into one request. Max 31.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.batch_size = check.opt_int_param(batch_size, "batch_size")\n super().__init__("Dixa", name)
\n\n\n
[docs]class SalesforceSource(GeneratedAirbyteSource):\n
[docs] class FilterSalesforceObjectsEntry:\n
[docs] @public\n def __init__(self, criteria: str, value: str):\n self.criteria = check.str_param(criteria, "criteria")\n self.value = check.str_param(value, "value")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n is_sandbox: Optional[bool] = None,\n auth_type: Optional[str] = None,\n start_date: Optional[str] = None,\n streams_criteria: Optional[List[FilterSalesforceObjectsEntry]] = None,\n ):\n """Airbyte Source for Salesforce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/salesforce\n\n Args:\n name (str): The name of the destination.\n is_sandbox (Optional[bool]): Toggle if you're using a Salesforce Sandbox\n client_id (str): Enter your Salesforce developer application's Client ID\n client_secret (str): Enter your Salesforce developer application's Client secret\n refresh_token (str): Enter your application's Salesforce Refresh Token used for Airbyte to access your Salesforce account.\n start_date (Optional[str]): Enter the date in the YYYY-MM-DD format. Airbyte will replicate the data added on and after this date. If this field is blank, Airbyte will replicate all data.\n streams_criteria (Optional[List[SalesforceSource.FilterSalesforceObjectsEntry]]): Filter streams relevant to you\n """\n self.is_sandbox = check.opt_bool_param(is_sandbox, "is_sandbox")\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.streams_criteria = check.opt_nullable_list_param(\n streams_criteria, "streams_criteria", SalesforceSource.FilterSalesforceObjectsEntry\n )\n super().__init__("Salesforce", name)
\n\n\n
[docs]class PipedriveSource(GeneratedAirbyteSource):\n
[docs] class SignInViaPipedriveOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKeyAuthentication:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "Token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n authorization: Union[\n "PipedriveSource.SignInViaPipedriveOAuth", "PipedriveSource.APIKeyAuthentication"\n ],\n replication_start_date: str,\n ):\n """Airbyte Source for Pipedrive.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pipedrive\n\n Args:\n name (str): The name of the destination.\n authorization (Union[PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication]): Choose one of the possible authorization method\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. When specified and not None, then stream will behave as incremental\n """\n self.authorization = check.inst_param(\n authorization,\n "authorization",\n (PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication),\n )\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n super().__init__("Pipedrive", name)
\n\n\n
[docs]class FileSource(GeneratedAirbyteSource):\n
[docs] class HTTPSPublicWeb:\n
[docs] @public\n def __init__(self, user_agent: Optional[bool] = None):\n self.storage = "HTTPS"\n self.user_agent = check.opt_bool_param(user_agent, "user_agent")
\n\n
[docs] class GCSGoogleCloudStorage:\n
[docs] @public\n def __init__(self, service_account_json: Optional[str] = None):\n self.storage = "GCS"\n self.service_account_json = check.opt_str_param(\n service_account_json, "service_account_json"\n )
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n ):\n self.storage = "S3"\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] class AzBlobAzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n storage_account: str,\n sas_token: Optional[str] = None,\n shared_key: Optional[str] = None,\n ):\n self.storage = "AzBlob"\n self.storage_account = check.str_param(storage_account, "storage_account")\n self.sas_token = check.opt_str_param(sas_token, "sas_token")\n self.shared_key = check.opt_str_param(shared_key, "shared_key")
\n\n
[docs] class SSHSecureShell:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SSH"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SCPSecureCopyProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SCP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SFTPSecureFileTransferProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SFTP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class LocalFilesystemLimited:\n
[docs] @public\n def __init__(\n self,\n ):\n self.storage = "local"
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset_name: str,\n format: str,\n url: str,\n provider: Union[\n "FileSource.HTTPSPublicWeb",\n "FileSource.GCSGoogleCloudStorage",\n "FileSource.S3AmazonWebServices",\n "FileSource.AzBlobAzureBlobStorage",\n "FileSource.SSHSecureShell",\n "FileSource.SCPSecureCopyProtocol",\n "FileSource.SFTPSecureFileTransferProtocol",\n "FileSource.LocalFilesystemLimited",\n ],\n reader_options: Optional[str] = None,\n ):\n """Airbyte Source for File.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/file\n\n Args:\n name (str): The name of the destination.\n dataset_name (str): The Name of the final table to replicate this file into (should include letters, numbers dash and underscores only).\n format (str): The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs).\n reader_options (Optional[str]): This should be a string in JSON format. It depends on the chosen file format to provide additional options and tune its behavior.\n url (str): The URL path to access the file which should be replicated.\n provider (Union[FileSource.HTTPSPublicWeb, FileSource.GCSGoogleCloudStorage, FileSource.S3AmazonWebServices, FileSource.AzBlobAzureBlobStorage, FileSource.SSHSecureShell, FileSource.SCPSecureCopyProtocol, FileSource.SFTPSecureFileTransferProtocol, FileSource.LocalFilesystemLimited]): The storage Provider or Location of the file(s) which should be replicated.\n """\n self.dataset_name = check.str_param(dataset_name, "dataset_name")\n self.format = check.str_param(format, "format")\n self.reader_options = check.opt_str_param(reader_options, "reader_options")\n self.url = check.str_param(url, "url")\n self.provider = check.inst_param(\n provider,\n "provider",\n (\n FileSource.HTTPSPublicWeb,\n FileSource.GCSGoogleCloudStorage,\n FileSource.S3AmazonWebServices,\n FileSource.AzBlobAzureBlobStorage,\n FileSource.SSHSecureShell,\n 
FileSource.SCPSecureCopyProtocol,\n FileSource.SFTPSecureFileTransferProtocol,\n FileSource.LocalFilesystemLimited,\n ),\n )\n super().__init__("File", name)
\n\n\n
[docs]class GlassfrogSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Glassfrog.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/glassfrog\n\n Args:\n name (str): The name of the destination.\n api_key (str): API key provided by Glassfrog\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Glassfrog", name)
\n\n\n
[docs]class ChartmogulSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str, interval: str):\n """Airbyte Source for Chartmogul.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/chartmogul\n\n Args:\n name (str): The name of the destination.\n api_key (str): Chartmogul API key\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. When feasible, any data before this date will not be replicated.\n interval (str): Some APIs such as Metrics require intervals to cluster data.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.interval = check.str_param(interval, "interval")\n super().__init__("Chartmogul", name)
\n\n\n
[docs]class OrbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_key: str,\n start_date: Optional[str] = None,\n lookback_window_days: Optional[int] = None,\n string_event_properties_keys: Optional[List[str]] = None,\n numeric_event_properties_keys: Optional[List[str]] = None,\n ):\n """Airbyte Source for Orb.\n\n Documentation can be found at https://docs.withorb.com/\n\n Args:\n name (str): The name of the destination.\n api_key (str): Orb API Key, issued from the Orb admin console.\n start_date (Optional[str]): UTC date and time in the format 2022-03-01T00:00:00Z. Any data with created_at before this data will not be synced.\n lookback_window_days (Optional[int]): When set to N, the connector will always refresh resources created within the past N days. By default, updated objects that are not newly created are not incrementally synced.\n string_event_properties_keys (Optional[List[str]]): Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.\n numeric_event_properties_keys (Optional[List[str]]): Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n self.string_event_properties_keys = check.opt_nullable_list_param(\n string_event_properties_keys, "string_event_properties_keys", str\n )\n self.numeric_event_properties_keys = check.opt_nullable_list_param(\n numeric_event_properties_keys, "numeric_event_properties_keys", str\n )\n super().__init__("Orb", name)
\n\n\n
[docs]class CockroachdbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Cockroachdb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/cockroachdb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt client/server communications for increased security.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Cockroachdb", name)
\n\n\n
[docs]class ConfluenceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, domain_name: str, email: str):\n """Airbyte Source for Confluence.\n\n Args:\n name (str): The name of the destination.\n api_token (str): Please follow the Jira confluence for generating an API token: https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/\n domain_name (str): Your Confluence domain name\n email (str): Your Confluence login email\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.email = check.str_param(email, "email")\n super().__init__("Confluence", name)
\n\n\n
[docs]class PlaidSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n api_key: str,\n client_id: str,\n plaid_env: str,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Plaid.\n\n Documentation can be found at https://plaid.com/docs/api/\n\n Args:\n name (str): The name of the destination.\n access_token (str): The end-user's Link access token.\n api_key (str): The Plaid API key to use to hit the API.\n client_id (str): The Plaid client id\n plaid_env (str): The Plaid environment\n start_date (Optional[str]): The date from which you'd like to replicate data for Plaid in the format YYYY-MM-DD. All data generated after this date will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.api_key = check.str_param(api_key, "api_key")\n self.client_id = check.str_param(client_id, "client_id")\n self.plaid_env = check.str_param(plaid_env, "plaid_env")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Plaid", name)
\n\n\n
[docs]class SnapchatMarketingSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n ):\n """Airbyte Source for Snapchat Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/snapchat-marketing\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Snapchat developer application.\n client_secret (str): The Client Secret of your Snapchat developer application.\n refresh_token (str): Refresh Token to renew the expired Access Token.\n start_date (Optional[str]): Date in the format 2022-01-01. Any data before this date will not be replicated.\n end_date (Optional[str]): Date in the format 2017-01-25. Any data after this date will not be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n super().__init__("Snapchat Marketing", name)
\n\n\n
[docs]class MicrosoftTeamsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaMicrosoftOAuth20:\n
[docs] @public\n def __init__(\n self,\n tenant_id: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.tenant_id = check.str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateViaMicrosoft:\n
[docs] @public\n def __init__(\n self,\n tenant_id: str,\n client_id: str,\n client_secret: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.tenant_id = check.str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n period: str,\n credentials: Union[\n "MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20",\n "MicrosoftTeamsSource.AuthenticateViaMicrosoft",\n ],\n ):\n """Airbyte Source for Microsoft Teams.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/microsoft-teams\n\n Args:\n name (str): The name of the destination.\n period (str): Specifies the length of time over which the Team Device Report stream is aggregated. The supported values are: D7, D30, D90, and D180.\n credentials (Union[MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20, MicrosoftTeamsSource.AuthenticateViaMicrosoft]): Choose how to authenticate to Microsoft\n """\n self.period = check.str_param(period, "period")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20,\n MicrosoftTeamsSource.AuthenticateViaMicrosoft,\n ),\n )\n super().__init__("Microsoft Teams", name)
\n\n\n
[docs]class LeverHiringSource(GeneratedAirbyteSource):\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(\n self,\n refresh_token: str,\n auth_type: Optional[str] = None,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "LeverHiringSource.OAuthCredentials",\n start_date: str,\n environment: Optional[str] = None,\n ):\n """Airbyte Source for Lever Hiring.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/lever-hiring\n\n Args:\n name (str): The name of the destination.\n credentials (LeverHiringSource.OAuthCredentials): Choose how to authenticate to Lever Hiring.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Note that it will be used only in the following incremental streams: comments, commits, and issues.\n environment (Optional[str]): The environment in which you'd like to replicate data for Lever. This is used to determine which Lever API endpoint to use.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", LeverHiringSource.OAuthCredentials\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.environment = check.opt_str_param(environment, "environment")\n super().__init__("Lever Hiring", name)
\n\n\n
[docs]class TwilioSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_sid: str,\n auth_token: str,\n start_date: str,\n lookback_window: Optional[int] = None,\n ):\n """Airbyte Source for Twilio.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/twilio\n\n Args:\n name (str): The name of the destination.\n account_sid (str): Twilio account SID\n auth_token (str): Twilio Auth Token.\n start_date (str): UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.\n lookback_window (Optional[int]): How far into the past to look for records. (in minutes)\n """\n self.account_sid = check.str_param(account_sid, "account_sid")\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window = check.opt_int_param(lookback_window, "lookback_window")\n super().__init__("Twilio", name)
\n\n\n
[docs]class StripeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n client_secret: str,\n start_date: str,\n lookback_window_days: Optional[int] = None,\n slice_range: Optional[int] = None,\n ):\n r"""Airbyte Source for Stripe.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/stripe\n\n Args:\n name (str): The name of the destination.\n account_id (str): Your Stripe account ID (starts with 'acct\\\\_', find yours here).\n client_secret (str): Stripe API key (usually starts with 'sk_live\\\\_'; find yours here).\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Only data generated after this date will be replicated.\n lookback_window_days (Optional[int]): When set, the connector will always re-export data from the past N days, where N is the value set here. This is useful if your data is frequently updated after creation. More info here\n slice_range (Optional[int]): The time increment used by the connector when requesting data from the Stripe API. The bigger the value is, the less requests will be made and faster the sync will be. On the other hand, the more seldom the state is persisted.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n self.slice_range = check.opt_int_param(slice_range, "slice_range")\n super().__init__("Stripe", name)
\n\n\n
[docs]class Db2Source(GeneratedAirbyteSource):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str, key_store_password: Optional[str] = None):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")\n self.key_store_password = check.opt_str_param(key_store_password, "key_store_password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n db: str,\n username: str,\n password: str,\n encryption: Union["Db2Source.Unencrypted", "Db2Source.TLSEncryptedVerifyCertificate"],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Db2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/db2\n\n Args:\n name (str): The name of the destination.\n host (str): Host of the Db2.\n port (int): Port of the database.\n db (str): Name of the database.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n encryption (Union[Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate]): Encryption method to use when communicating with the database\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.db = check.str_param(db, "db")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate),\n )\n super().__init__("Db2", name)
\n\n\n
[docs]class SlackSource(GeneratedAirbyteSource):\n
[docs] class DefaultOAuth20Authorization:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n refresh_token: Optional[str] = None,\n ):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class APITokenCredentials:\n
[docs] @public\n def __init__(self, api_token: str):\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n lookback_window: int,\n join_channels: bool,\n credentials: Union[\n "SlackSource.DefaultOAuth20Authorization", "SlackSource.APITokenCredentials"\n ],\n channel_filter: Optional[List[str]] = None,\n ):\n """Airbyte Source for Slack.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/slack\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n lookback_window (int): How far into the past to look for messages in threads.\n join_channels (bool): Whether to join all channels or to sync data only from channels the bot is already in. If false, you'll need to manually add the bot to all the channels from which you'd like to sync messages.\n channel_filter (Optional[List[str]]): A channel name list (without leading '#' char) which limit the channels from which you'd like to sync. Empty list means no filter.\n credentials (Union[SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials]): Choose how to authenticate into Slack\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window = check.int_param(lookback_window, "lookback_window")\n self.join_channels = check.bool_param(join_channels, "join_channels")\n self.channel_filter = check.opt_nullable_list_param(channel_filter, "channel_filter", str)\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials),\n )\n super().__init__("Slack", name)
\n\n\n
[docs]class RechargeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Recharge.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/recharge\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Recharge API, in the format YYYY-MM-DDT00:00:00Z. Any data before this date will not be replicated.\n access_token (str): The value of the Access Token generated. See the docs for more information.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Recharge", name)
\n\n\n
[docs]class OpenweatherSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n lat: str,\n lon: str,\n appid: str,\n units: Optional[str] = None,\n lang: Optional[str] = None,\n ):\n """Airbyte Source for Openweather.\n\n Args:\n name (str): The name of the destination.\n lat (str): Latitude for which you want to get weather condition from. (min -90, max 90)\n lon (str): Longitude for which you want to get weather condition from. (min -180, max 180)\n appid (str): Your OpenWeather API Key. See here. The key is case sensitive.\n units (Optional[str]): Units of measurement. standard, metric and imperial units are available. If you do not use the units parameter, standard units will be applied by default.\n lang (Optional[str]): You can use lang parameter to get the output in your language. The contents of the description field will be translated. See here for the list of supported languages.\n """\n self.lat = check.str_param(lat, "lat")\n self.lon = check.str_param(lon, "lon")\n self.appid = check.str_param(appid, "appid")\n self.units = check.opt_str_param(units, "units")\n self.lang = check.opt_str_param(lang, "lang")\n super().__init__("Openweather", name)
\n\n\n
[docs]class RetentlySource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaRetentlyOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateWithAPIToken:\n
[docs] @public\n def __init__(self, api_key: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "RetentlySource.AuthenticateViaRetentlyOAuth", "RetentlySource.AuthenticateWithAPIToken"\n ],\n ):\n """Airbyte Source for Retently.\n\n Args:\n name (str): The name of the destination.\n credentials (Union[RetentlySource.AuthenticateViaRetentlyOAuth, RetentlySource.AuthenticateWithAPIToken]): Choose how to authenticate to Retently\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (RetentlySource.AuthenticateViaRetentlyOAuth, RetentlySource.AuthenticateWithAPIToken),\n )\n super().__init__("Retently", name)
\n\n\n
[docs]class ScaffoldSourceHttpSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, TODO: str):\n """Airbyte Source for Scaffold Source Http.\n\n Args:\n name (str): The name of the destination.\n TODO (str): describe me\n """\n self.TODO = check.str_param(TODO, "TODO")\n super().__init__("Scaffold Source Http", name)
\n\n\n
[docs]class YandexMetricaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, auth_token: str, counter_id: str, start_date: str, end_date: str):\n """Airbyte Source for Yandex Metrica.\n\n Args:\n name (str): The name of the destination.\n auth_token (str): Your Yandex Metrica API access token\n counter_id (str): Counter ID\n start_date (str): UTC date and time in the format YYYY-MM-DD.\n end_date (str): UTC date and time in the format YYYY-MM-DD.\n """\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.counter_id = check.str_param(counter_id, "counter_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.str_param(end_date, "end_date")\n super().__init__("Yandex Metrica", name)
\n\n\n
[docs]class TalkdeskExploreSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n auth_url: str,\n api_key: str,\n timezone: Optional[str] = None,\n ):\n """Airbyte Source for Talkdesk Explore.\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Talkdesk Explore API, in the format YYYY-MM-DDT00:00:00. All data generated after this date will be replicated.\n timezone (Optional[str]): Timezone to use when generating reports. Only IANA timezones are supported (https://nodatime.org/TimeZones)\n auth_url (str): Talkdesk Auth URL. Only 'client_credentials' auth type supported at the moment.\n api_key (str): Talkdesk API key.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.timezone = check.opt_str_param(timezone, "timezone")\n self.auth_url = check.str_param(auth_url, "auth_url")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Talkdesk Explore", name)
\n\n\n
[docs]class ChargifySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, domain: str):\n """Airbyte Source for Chargify.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/chargify\n\n Args:\n name (str): The name of the destination.\n api_key (str): Chargify API Key.\n domain (str): Chargify domain. Normally this domain follows the following format companyname.chargify.com\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.domain = check.str_param(domain, "domain")\n super().__init__("Chargify", name)
\n\n\n
[docs]class RkiCovidSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str):\n """Airbyte Source for Rki Covid.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/rki-covid\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date in the format 2017-01-25. Any data before this date will not be replicated.\n """\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Rki Covid", name)
\n\n\n
[docs]class PostgresSource(GeneratedAirbyteSource):\n
[docs] class Disable:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "disable"
\n\n
[docs] class Allow:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "allow"
\n\n
[docs] class Prefer:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "prefer"
\n\n
[docs] class Require:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "require"
\n\n
[docs] class VerifyCa:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyFull:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-full"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self,\n replication_slot: str,\n publication: str,\n plugin: Optional[str] = None,\n initial_waiting_seconds: Optional[int] = None,\n ):\n self.method = "CDC"\n self.plugin = check.opt_str_param(plugin, "plugin")\n self.replication_slot = check.str_param(replication_slot, "replication_slot")\n self.publication = check.str_param(publication, "publication")\n self.initial_waiting_seconds = check.opt_int_param(\n initial_waiting_seconds, "initial_waiting_seconds"\n )
\n\n
[docs] class NoTunnel:\n
[docs] @public\n def __init__(\n self,\n ):\n self.tunnel_method = "NO_TUNNEL"
\n\n
[docs] class SSHKeyAuthentication:\n
[docs] @public\n def __init__(self, tunnel_host: str, tunnel_port: int, tunnel_user: str, ssh_key: str):\n self.tunnel_method = "SSH_KEY_AUTH"\n self.tunnel_host = check.str_param(tunnel_host, "tunnel_host")\n self.tunnel_port = check.int_param(tunnel_port, "tunnel_port")\n self.tunnel_user = check.str_param(tunnel_user, "tunnel_user")\n self.ssh_key = check.str_param(ssh_key, "ssh_key")
\n\n
[docs] class PasswordAuthentication:\n
[docs] @public\n def __init__(\n self, tunnel_host: str, tunnel_port: int, tunnel_user: str, tunnel_user_password: str\n ):\n self.tunnel_method = "SSH_PASSWORD_AUTH"\n self.tunnel_host = check.str_param(tunnel_host, "tunnel_host")\n self.tunnel_port = check.int_param(tunnel_port, "tunnel_port")\n self.tunnel_user = check.str_param(tunnel_user, "tunnel_user")\n self.tunnel_user_password = check.str_param(\n tunnel_user_password, "tunnel_user_password"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_mode: Union[\n "PostgresSource.Disable",\n "PostgresSource.Allow",\n "PostgresSource.Prefer",\n "PostgresSource.Require",\n "PostgresSource.VerifyCa",\n "PostgresSource.VerifyFull",\n ],\n replication_method: Union[\n "PostgresSource.Standard", "PostgresSource.LogicalReplicationCDC"\n ],\n tunnel_method: Union[\n "PostgresSource.NoTunnel",\n "PostgresSource.SSHKeyAuthentication",\n "PostgresSource.PasswordAuthentication",\n ],\n schemas: Optional[List[str]] = None,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Postgres.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n schemas (Optional[List[str]]): The list of schemas (case sensitive) to sync from. Defaults to public.\n username (str): Username to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL. When activating SSL, please select one of the connection modes.\n ssl_mode (Union[PostgresSource.Disable, PostgresSource.Allow, PostgresSource.Prefer, PostgresSource.Require, PostgresSource.VerifyCa, PostgresSource.VerifyFull]): SSL connection modes. 
disable - Disables encryption of communication between Airbyte and source database allow - Enables encryption only when required by the source database prefer - allows unencrypted connection only if the source database does not support encryption require - Always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Always require encryption and verifies that the source database server has a valid SSL certificate verify-full - This is the most secure mode. Always require encryption and verifies the identity of the source database server Read more in the docs.\n replication_method (Union[PostgresSource.Standard, PostgresSource.LogicalReplicationCDC]): Replication method for extracting data from the database.\n tunnel_method (Union[PostgresSource.NoTunnel, PostgresSource.SSHKeyAuthentication, PostgresSource.PasswordAuthentication]): Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n PostgresSource.Disable,\n PostgresSource.Allow,\n PostgresSource.Prefer,\n PostgresSource.Require,\n PostgresSource.VerifyCa,\n PostgresSource.VerifyFull,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (PostgresSource.Standard, PostgresSource.LogicalReplicationCDC),\n )\n self.tunnel_method = check.inst_param(\n tunnel_method,\n "tunnel_method",\n (\n PostgresSource.NoTunnel,\n 
PostgresSource.SSHKeyAuthentication,\n PostgresSource.PasswordAuthentication,\n ),\n )\n super().__init__("Postgres", name)
\n\n\n
[docs]class TrelloSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n token: str,\n key: str,\n start_date: str,\n board_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Trello.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/trello\n\n Args:\n name (str): The name of the destination.\n token (str): Trello v API token. See the docs for instructions on how to generate it.\n key (str): Trello API key. See the docs for instructions on how to generate it.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n board_ids (Optional[List[str]]): IDs of the boards to replicate data from. If left empty, data from all boards to which you have access will be replicated.\n """\n self.token = check.str_param(token, "token")\n self.key = check.str_param(key, "key")\n self.start_date = check.str_param(start_date, "start_date")\n self.board_ids = check.opt_nullable_list_param(board_ids, "board_ids", str)\n super().__init__("Trello", name)
\n\n\n
[docs]class PrestashopSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, url: str, access_key: str):\n """Airbyte Source for Prestashop.\n\n Args:\n name (str): The name of the destination.\n url (str): Shop URL without trailing slash (domain name or IP address)\n access_key (str): Your PrestaShop access key. See the docs for info on how to obtain this.\n """\n self.url = check.str_param(url, "url")\n self.access_key = check.str_param(access_key, "access_key")\n super().__init__("Prestashop", name)
\n\n\n
[docs]class PaystackSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n secret_key: str,\n start_date: str,\n lookback_window_days: Optional[int] = None,\n ):\n r"""Airbyte Source for Paystack.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/paystack\n\n Args:\n name (str): The name of the destination.\n secret_key (str): The Paystack API key (usually starts with 'sk_live\\\\_'; find yours here).\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n lookback_window_days (Optional[int]): When set, the connector will always reload data from the past N days, where N is the value set here. This is useful if your data is updated after creation.\n """\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n super().__init__("Paystack", name)
\n\n\n
[docs]class S3Source(GeneratedAirbyteSource):\n
[docs] class CSV:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n delimiter: Optional[str] = None,\n infer_datatypes: Optional[bool] = None,\n quote_char: Optional[str] = None,\n escape_char: Optional[str] = None,\n encoding: Optional[str] = None,\n double_quote: Optional[bool] = None,\n newlines_in_values: Optional[bool] = None,\n additional_reader_options: Optional[str] = None,\n advanced_options: Optional[str] = None,\n block_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.delimiter = check.opt_str_param(delimiter, "delimiter")\n self.infer_datatypes = check.opt_bool_param(infer_datatypes, "infer_datatypes")\n self.quote_char = check.opt_str_param(quote_char, "quote_char")\n self.escape_char = check.opt_str_param(escape_char, "escape_char")\n self.encoding = check.opt_str_param(encoding, "encoding")\n self.double_quote = check.opt_bool_param(double_quote, "double_quote")\n self.newlines_in_values = check.opt_bool_param(newlines_in_values, "newlines_in_values")\n self.additional_reader_options = check.opt_str_param(\n additional_reader_options, "additional_reader_options"\n )\n self.advanced_options = check.opt_str_param(advanced_options, "advanced_options")\n self.block_size = check.opt_int_param(block_size, "block_size")
\n\n
[docs] class Parquet:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n columns: Optional[List[str]] = None,\n batch_size: Optional[int] = None,\n buffer_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.columns = check.opt_nullable_list_param(columns, "columns", str)\n self.batch_size = check.opt_int_param(batch_size, "batch_size")\n self.buffer_size = check.opt_int_param(buffer_size, "buffer_size")
\n\n
[docs] class Avro:\n
[docs] @public\n def __init__(self, filetype: Optional[str] = None):\n self.filetype = check.opt_str_param(filetype, "filetype")
\n\n
[docs] class Jsonl:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n newlines_in_values: Optional[bool] = None,\n unexpected_field_behavior: Optional[str] = None,\n block_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.newlines_in_values = check.opt_bool_param(newlines_in_values, "newlines_in_values")\n self.unexpected_field_behavior = check.opt_str_param(\n unexpected_field_behavior, "unexpected_field_behavior"\n )\n self.block_size = check.opt_int_param(block_size, "block_size")
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n bucket: str,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n path_prefix: Optional[str] = None,\n endpoint: Optional[str] = None,\n ):\n self.bucket = check.str_param(bucket, "bucket")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n self.path_prefix = check.opt_str_param(path_prefix, "path_prefix")\n self.endpoint = check.opt_str_param(endpoint, "endpoint")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset: str,\n path_pattern: str,\n format: Union["S3Source.CSV", "S3Source.Parquet", "S3Source.Avro", "S3Source.Jsonl"],\n provider: "S3Source.S3AmazonWebServices",\n schema: Optional[str] = None,\n ):\n """Airbyte Source for S3.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/s3\n\n Args:\n name (str): The name of the destination.\n dataset (str): The name of the stream you would like this source to output. Can contain letters, numbers, or underscores.\n path_pattern (str): A regular expression which tells the connector which files to replicate. All files which match this pattern will be replicated. Use | to separate multiple patterns. See this page to understand pattern syntax (GLOBSTAR and SPLIT flags are enabled). Use pattern ** to pick up all files.\n format (Union[S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl]): The format of the files you'd like to replicate\n schema (Optional[str]): Optionally provide a schema to enforce, as a valid JSON string. Ensure this is a mapping of { "column" : "type" }, where types are valid JSON Schema datatypes. Leave as {} to auto-infer the schema.\n provider (S3Source.S3AmazonWebServices): Use this to load files from S3 or S3-compatible services\n """\n self.dataset = check.str_param(dataset, "dataset")\n self.path_pattern = check.str_param(path_pattern, "path_pattern")\n self.format = check.inst_param(\n format, "format", (S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl)\n )\n self.schema = check.opt_str_param(schema, "schema")\n self.provider = check.inst_param(provider, "provider", S3Source.S3AmazonWebServices)\n super().__init__("S3", name)
\n\n\n
[docs]class SnowflakeSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n self.auth_type = "OAuth"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class UsernameAndPassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.auth_type = "username/password"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["SnowflakeSource.OAuth20", "SnowflakeSource.UsernameAndPassword"],\n host: str,\n role: str,\n warehouse: str,\n database: str,\n schema: str,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Snowflake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/snowflake\n\n Args:\n name (str): The name of the destination.\n host (str): The host domain of the snowflake instance (must include the account, region, cloud environment, and end with snowflakecomputing.com).\n role (str): The role you created for Airbyte to access Snowflake.\n warehouse (str): The warehouse you created for Airbyte to access data.\n database (str): The database you created for Airbyte to access data.\n schema (str): The source Snowflake schema tables.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SnowflakeSource.OAuth20, SnowflakeSource.UsernameAndPassword),\n )\n self.host = check.str_param(host, "host")\n self.role = check.str_param(role, "role")\n self.warehouse = check.str_param(warehouse, "warehouse")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Snowflake", name)
\n\n\n
[docs]class AmplitudeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, secret_key: str, start_date: str):\n """Airbyte Source for Amplitude.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amplitude\n\n Args:\n name (str): The name of the destination.\n api_key (str): Amplitude API Key. See the setup guide for more information on how to obtain this key.\n secret_key (str): Amplitude Secret Key. See the setup guide for more information on how to obtain this key.\n start_date (str): UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Amplitude", name)
\n\n\n
[docs]class PosthogSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, api_key: str, base_url: Optional[str] = None):\n """Airbyte Source for Posthog.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/posthog\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate the data. Any data before this date will not be replicated.\n api_key (str): API Key. See the docs for information on how to generate this key.\n base_url (Optional[str]): Base PostHog url. Defaults to PostHog Cloud (https://app.posthog.com).\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.api_key = check.str_param(api_key, "api_key")\n self.base_url = check.opt_str_param(base_url, "base_url")\n super().__init__("Posthog", name)
\n\n\n
[docs]class PaypalTransactionSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n is_sandbox: bool,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n """Airbyte Source for Paypal Transaction.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/paypal-transactions\n\n Args:\n name (str): The name of the destination.\n client_id (Optional[str]): The Client ID of your Paypal developer application.\n client_secret (Optional[str]): The Client Secret of your Paypal developer application.\n refresh_token (Optional[str]): The key to refresh the expired access token.\n start_date (str): Start Date for data extraction in ISO format. Date must be in range from 3 years till 12 hrs before present time.\n is_sandbox (bool): Determines whether to use the sandbox or production environment.\n """\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.is_sandbox = check.bool_param(is_sandbox, "is_sandbox")\n super().__init__("Paypal Transaction", name)
\n\n\n
[docs]class MssqlSource(GeneratedAirbyteSource):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "unencrypted"
\n\n
[docs] class EncryptedTrustServerCertificate:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "encrypted_trust_server_certificate"
\n\n
[docs] class EncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, hostNameInCertificate: Optional[str] = None):\n self.ssl_method = "encrypted_verify_certificate"\n self.hostNameInCertificate = check.opt_str_param(\n hostNameInCertificate, "hostNameInCertificate"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "STANDARD"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self, data_to_sync: Optional[str] = None, snapshot_isolation: Optional[str] = None\n ):\n self.method = "CDC"\n self.data_to_sync = check.opt_str_param(data_to_sync, "data_to_sync")\n self.snapshot_isolation = check.opt_str_param(snapshot_isolation, "snapshot_isolation")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_method: Union[\n "MssqlSource.Unencrypted",\n "MssqlSource.EncryptedTrustServerCertificate",\n "MssqlSource.EncryptedVerifyCertificate",\n ],\n replication_method: Union["MssqlSource.Standard", "MssqlSource.LogicalReplicationCDC"],\n schemas: Optional[List[str]] = None,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Mssql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql\n\n Args:\n name (str): The name of the destination.\n host (str): The hostname of the database.\n port (int): The port of the database.\n database (str): The name of the database.\n schemas (Optional[List[str]]): The list of schemas to sync from. Defaults to user. Case sensitive.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n ssl_method (Union[MssqlSource.Unencrypted, MssqlSource.EncryptedTrustServerCertificate, MssqlSource.EncryptedVerifyCertificate]): The encryption method which is used when communicating with the database.\n replication_method (Union[MssqlSource.Standard, MssqlSource.LogicalReplicationCDC]): The replication method used for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses {TBC} to detect inserts, updates, and deletes. 
This needs to be configured on the source database itself.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl_method = check.inst_param(\n ssl_method,\n "ssl_method",\n (\n MssqlSource.Unencrypted,\n MssqlSource.EncryptedTrustServerCertificate,\n MssqlSource.EncryptedVerifyCertificate,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (MssqlSource.Standard, MssqlSource.LogicalReplicationCDC),\n )\n super().__init__("Mssql", name)
\n\n\n
[docs]class ZohoCrmSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n dc_region: str,\n environment: str,\n edition: str,\n start_datetime: Optional[str] = None,\n ):\n """Airbyte Source for Zoho Crm.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zoho-crm\n\n Args:\n name (str): The name of the destination.\n client_id (str): OAuth2.0 Client ID\n client_secret (str): OAuth2.0 Client Secret\n refresh_token (str): OAuth2.0 Refresh Token\n dc_region (str): Please choose the region of your Data Center location. More info by this Link\n environment (str): Please choose the environment\n start_datetime (Optional[str]): ISO 8601, for instance: `YYYY-MM-DD`, `YYYY-MM-DD HH:MM:SS+HH:MM`\n edition (str): Choose your Edition of Zoho CRM to determine API Concurrency Limits\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.dc_region = check.str_param(dc_region, "dc_region")\n self.environment = check.str_param(environment, "environment")\n self.start_datetime = check.opt_str_param(start_datetime, "start_datetime")\n self.edition = check.str_param(edition, "edition")\n super().__init__("Zoho Crm", name)
\n\n\n
[docs]class RedshiftSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: str,\n schemas: Optional[List[str]] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Redshift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift\n\n Args:\n name (str): The name of the destination.\n host (str): Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com).\n port (int): Port of the database.\n database (str): Name of the database.\n schemas (Optional[List[str]]): The list of schemas to sync from. Specify one or more explicitly or keep empty to process all schemas. Schema names are case sensitive.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Redshift", name)
\n\n\n
[docs]class AsanaSource(GeneratedAirbyteSource):\n
[docs] class PATCredentials:\n
[docs] @public\n def __init__(self, personal_access_token: str):\n self.personal_access_token = check.str_param(\n personal_access_token, "personal_access_token"\n )
\n\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["AsanaSource.PATCredentials", "AsanaSource.OAuthCredentials"],\n ):\n """Airbyte Source for Asana.\n\n Args:\n name (str): The name of the destination.\n credentials (Union[AsanaSource.PATCredentials, AsanaSource.OAuthCredentials]): Choose how to authenticate to Github\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (AsanaSource.PATCredentials, AsanaSource.OAuthCredentials)\n )\n super().__init__("Asana", name)
\n\n\n
[docs]class SmartsheetsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n spreadsheet_id: str,\n start_datetime: Optional[str] = None,\n ):\n """Airbyte Source for Smartsheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/smartsheets\n\n Args:\n name (str): The name of the destination.\n access_token (str): The access token to use for accessing your data from Smartsheets. This access token must be generated by a user with at least read access to the data you'd like to replicate. Generate an access token in the Smartsheets main menu by clicking Account > Apps & Integrations > API Access. See the setup guide for information on how to obtain this token.\n spreadsheet_id (str): The spreadsheet ID. Find it by opening the spreadsheet then navigating to File > Properties\n start_datetime (Optional[str]): Only rows modified after this date/time will be replicated. This should be an ISO 8601 string, for instance: `2000-01-01T13:00:00`\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.start_datetime = check.opt_str_param(start_datetime, "start_datetime")\n super().__init__("Smartsheets", name)
\n\n\n
[docs]class MailchimpSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, apikey: str):\n self.auth_type = "apikey"\n self.apikey = check.str_param(apikey, "apikey")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["MailchimpSource.OAuth20", "MailchimpSource.APIKey"]\n ):\n """Airbyte Source for Mailchimp.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mailchimp\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (MailchimpSource.OAuth20, MailchimpSource.APIKey)\n )\n super().__init__("Mailchimp", name)
\n\n\n
[docs]class SentrySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n auth_token: str,\n organization: str,\n project: str,\n hostname: Optional[str] = None,\n discover_fields: Optional[List[str]] = None,\n ):\n """Airbyte Source for Sentry.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/sentry\n\n Args:\n name (str): The name of the destination.\n auth_token (str): Log into Sentry and then create authentication tokens.For self-hosted, you can find or create authentication tokens by visiting "{instance_url_prefix}/settings/account/api/auth-tokens/"\n hostname (Optional[str]): Host name of Sentry API server.For self-hosted, specify your host name here. Otherwise, leave it empty.\n organization (str): The slug of the organization the groups belong to.\n project (str): The name (slug) of the Project you want to sync.\n discover_fields (Optional[List[str]]): Fields to retrieve when fetching discover events\n """\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.hostname = check.opt_str_param(hostname, "hostname")\n self.organization = check.str_param(organization, "organization")\n self.project = check.str_param(project, "project")\n self.discover_fields = check.opt_nullable_list_param(\n discover_fields, "discover_fields", str\n )\n super().__init__("Sentry", name)
\n\n\n
[docs]class MailgunSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n private_key: str,\n domain_region: Optional[str] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Mailgun.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mailgun\n\n Args:\n name (str): The name of the destination.\n private_key (str): Primary account API key to access your Mailgun data.\n domain_region (Optional[str]): Domain region code. 'EU' or 'US' are possible values. The default is 'US'.\n start_date (Optional[str]): UTC date and time in the format 2020-10-01 00:00:00. Any data before this date will not be replicated. If omitted, defaults to 3 days ago.\n """\n self.private_key = check.str_param(private_key, "private_key")\n self.domain_region = check.opt_str_param(domain_region, "domain_region")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Mailgun", name)
\n\n\n
[docs]class OnesignalSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, user_auth_key: str, start_date: str, outcome_names: str):\n """Airbyte Source for Onesignal.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/onesignal\n\n Args:\n name (str): The name of the destination.\n user_auth_key (str): OneSignal User Auth Key, see the docs for more information on how to obtain this key.\n start_date (str): The date from which you'd like to replicate data for OneSignal API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n outcome_names (str): Comma-separated list of names and the value (sum/count) for the returned outcome data. See the docs for more details\n """\n self.user_auth_key = check.str_param(user_auth_key, "user_auth_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.outcome_names = check.str_param(outcome_names, "outcome_names")\n super().__init__("Onesignal", name)
\n\n\n
[docs]class PythonHttpTutorialSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, base: str, access_key: Optional[str] = None):\n """Airbyte Source for Python Http Tutorial.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi\n\n Args:\n name (str): The name of the destination.\n access_key (Optional[str]): API access key used to retrieve data from the Exchange Rates API.\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.\n base (str): ISO reference currency. See here.\n """\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.base = check.str_param(base, "base")\n super().__init__("Python Http Tutorial", name)
\n\n\n
[docs]class AirtableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, base_id: str, tables: List[str]):\n """Airbyte Source for Airtable.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/airtable\n\n Args:\n name (str): The name of the destination.\n api_key (str): The API Key for the Airtable account. See the Support Guide for more information on how to obtain this key.\n base_id (str): The Base ID to integrate the data from. You can find the Base ID following the link Airtable API, log in to your account, select the base you need and find Base ID in the docs.\n tables (List[str]): The list of Tables to integrate.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.base_id = check.str_param(base_id, "base_id")\n self.tables = check.list_param(tables, "tables", str)\n super().__init__("Airtable", name)
\n\n\n
[docs]class MongodbV2Source(GeneratedAirbyteSource):\n
[docs] class StandaloneMongoDbInstance:\n
[docs] @public\n def __init__(self, instance: str, host: str, port: int, tls: Optional[bool] = None):\n self.instance = check.str_param(instance, "instance")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.tls = check.opt_bool_param(tls, "tls")
\n\n
[docs] class ReplicaSet:\n
[docs] @public\n def __init__(self, instance: str, server_addresses: str, replica_set: Optional[str] = None):\n self.instance = check.str_param(instance, "instance")\n self.server_addresses = check.str_param(server_addresses, "server_addresses")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")
\n\n
[docs] class MongoDBAtlas:\n
[docs] @public\n def __init__(self, instance: str, cluster_url: str):\n self.instance = check.str_param(instance, "instance")\n self.cluster_url = check.str_param(cluster_url, "cluster_url")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_type: Union[\n "MongodbV2Source.StandaloneMongoDbInstance",\n "MongodbV2Source.ReplicaSet",\n "MongodbV2Source.MongoDBAtlas",\n ],\n database: str,\n user: Optional[str] = None,\n password: Optional[str] = None,\n auth_source: Optional[str] = None,\n ):\n """Airbyte Source for Mongodb V2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb-v2\n\n Args:\n name (str): The name of the destination.\n instance_type (Union[MongodbV2Source.StandaloneMongoDbInstance, MongodbV2Source.ReplicaSet, MongodbV2Source.MongoDBAtlas]): The MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.\n database (str): The database you want to replicate.\n user (Optional[str]): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n auth_source (Optional[str]): The authentication source where the user information is stored.\n """\n self.instance_type = check.inst_param(\n instance_type,\n "instance_type",\n (\n MongodbV2Source.StandaloneMongoDbInstance,\n MongodbV2Source.ReplicaSet,\n MongodbV2Source.MongoDBAtlas,\n ),\n )\n self.database = check.str_param(database, "database")\n self.user = check.opt_str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.auth_source = check.opt_str_param(auth_source, "auth_source")\n super().__init__("Mongodb V2", name)
\n\n\n
[docs]class FileSecureSource(GeneratedAirbyteSource):\n
[docs] class HTTPSPublicWeb:\n
[docs] @public\n def __init__(self, user_agent: Optional[bool] = None):\n self.storage = "HTTPS"\n self.user_agent = check.opt_bool_param(user_agent, "user_agent")
\n\n
[docs] class GCSGoogleCloudStorage:\n
[docs] @public\n def __init__(self, service_account_json: Optional[str] = None):\n self.storage = "GCS"\n self.service_account_json = check.opt_str_param(\n service_account_json, "service_account_json"\n )
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n ):\n self.storage = "S3"\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] class AzBlobAzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n storage_account: str,\n sas_token: Optional[str] = None,\n shared_key: Optional[str] = None,\n ):\n self.storage = "AzBlob"\n self.storage_account = check.str_param(storage_account, "storage_account")\n self.sas_token = check.opt_str_param(sas_token, "sas_token")\n self.shared_key = check.opt_str_param(shared_key, "shared_key")
\n\n
[docs] class SSHSecureShell:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SSH"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SCPSecureCopyProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SCP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SFTPSecureFileTransferProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SFTP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset_name: str,\n format: str,\n url: str,\n provider: Union[\n "FileSecureSource.HTTPSPublicWeb",\n "FileSecureSource.GCSGoogleCloudStorage",\n "FileSecureSource.S3AmazonWebServices",\n "FileSecureSource.AzBlobAzureBlobStorage",\n "FileSecureSource.SSHSecureShell",\n "FileSecureSource.SCPSecureCopyProtocol",\n "FileSecureSource.SFTPSecureFileTransferProtocol",\n ],\n reader_options: Optional[str] = None,\n ):\n """Airbyte Source for File Secure.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/file\n\n Args:\n name (str): The name of the destination.\n dataset_name (str): The Name of the final table to replicate this file into (should include letters, numbers dash and underscores only).\n format (str): The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs).\n reader_options (Optional[str]): This should be a string in JSON format. It depends on the chosen file format to provide additional options and tune its behavior.\n url (str): The URL path to access the file which should be replicated.\n provider (Union[FileSecureSource.HTTPSPublicWeb, FileSecureSource.GCSGoogleCloudStorage, FileSecureSource.S3AmazonWebServices, FileSecureSource.AzBlobAzureBlobStorage, FileSecureSource.SSHSecureShell, FileSecureSource.SCPSecureCopyProtocol, FileSecureSource.SFTPSecureFileTransferProtocol]): The storage Provider or Location of the file(s) which should be replicated.\n """\n self.dataset_name = check.str_param(dataset_name, "dataset_name")\n self.format = check.str_param(format, "format")\n self.reader_options = check.opt_str_param(reader_options, "reader_options")\n self.url = check.str_param(url, "url")\n self.provider = check.inst_param(\n provider,\n "provider",\n (\n FileSecureSource.HTTPSPublicWeb,\n FileSecureSource.GCSGoogleCloudStorage,\n FileSecureSource.S3AmazonWebServices,\n FileSecureSource.AzBlobAzureBlobStorage,\n 
FileSecureSource.SSHSecureShell,\n FileSecureSource.SCPSecureCopyProtocol,\n FileSecureSource.SFTPSecureFileTransferProtocol,\n ),\n )\n super().__init__("File Secure", name)
\n\n\n
[docs]class ZendeskSupportSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, access_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, email: str, api_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.email = check.str_param(email, "email")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n subdomain: str,\n credentials: Union["ZendeskSupportSource.OAuth20", "ZendeskSupportSource.APIToken"],\n ):\n """Airbyte Source for Zendesk Support.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-support\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Zendesk Support API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n subdomain (str): This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.\n credentials (Union[ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken]): Zendesk service provides two authentication methods. Choose between: `OAuth2.0` or `API token`.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken),\n )\n super().__init__("Zendesk Support", name)
\n\n\n
[docs]class TempoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str):\n """Airbyte Source for Tempo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/\n\n Args:\n name (str): The name of the destination.\n api_token (str): Tempo API Token. Go to Tempo>Settings, scroll down to Data Access and select API integration.\n """\n self.api_token = check.str_param(api_token, "api_token")\n super().__init__("Tempo", name)
\n\n\n
[docs]class BraintreeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n merchant_id: str,\n public_key: str,\n private_key: str,\n environment: str,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Braintree.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/braintree\n\n Args:\n name (str): The name of the destination.\n merchant_id (str): The unique identifier for your entire gateway account. See the docs for more information on how to obtain this ID.\n public_key (str): Braintree Public Key. See the docs for more information on how to obtain this key.\n private_key (str): Braintree Private Key. See the docs for more information on how to obtain this key.\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n environment (str): Environment specifies where the data will come from.\n """\n self.merchant_id = check.str_param(merchant_id, "merchant_id")\n self.public_key = check.str_param(public_key, "public_key")\n self.private_key = check.str_param(private_key, "private_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.environment = check.str_param(environment, "environment")\n super().__init__("Braintree", name)
\n\n\n
[docs]class SalesloftSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, client_id: str, client_secret: str, refresh_token: str, start_date: str\n ):\n """Airbyte Source for Salesloft.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/salesloft\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Salesloft developer application.\n client_secret (str): The Client Secret of your Salesloft developer application.\n refresh_token (str): The token for obtaining a new access token.\n start_date (str): The date from which you'd like to replicate data for Salesloft API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Salesloft", name)
\n\n\n
[docs]class LinnworksSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, application_id: str, application_secret: str, token: str, start_date: str\n ):\n """Airbyte Source for Linnworks.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linnworks\n\n Args:\n name (str): The name of the destination.\n application_id (str): Linnworks Application ID\n application_secret (str): Linnworks Application Secret\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.application_id = check.str_param(application_id, "application_id")\n self.application_secret = check.str_param(application_secret, "application_secret")\n self.token = check.str_param(token, "token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Linnworks", name)
\n\n\n
[docs]class ChargebeeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, site: str, site_api_key: str, start_date: str, product_catalog: str\n ):\n """Airbyte Source for Chargebee.\n\n Documentation can be found at https://apidocs.chargebee.com/docs/api\n\n Args:\n name (str): The name of the destination.\n site (str): The site prefix for your Chargebee instance.\n site_api_key (str): Chargebee API Key. See the docs for more information on how to obtain this key.\n start_date (str): UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.\n product_catalog (str): Product Catalog version of your Chargebee site. Instructions on how to find your version you may find here under `API Version` section.\n """\n self.site = check.str_param(site, "site")\n self.site_api_key = check.str_param(site_api_key, "site_api_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.product_catalog = check.str_param(product_catalog, "product_catalog")\n super().__init__("Chargebee", name)
\n\n\n
[docs]class GoogleAnalyticsDataApiSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOauth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, credentials_json: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n property_id: str,\n credentials: Union[\n "GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth",\n "GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication",\n ],\n date_ranges_start_date: str,\n custom_reports: Optional[str] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Analytics Data Api.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-v4\n\n Args:\n name (str): The name of the destination.\n property_id (str): A Google Analytics GA4 property identifier whose events are tracked. Specified in the URL path and not the body\n credentials (Union[GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth, GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication]): Credentials for the service\n date_ranges_start_date (str): The start date. One of the values Ndaysago, yesterday, today or in the format YYYY-MM-DD\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.\n window_in_days (Optional[int]): The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. 
The minimum allowed value for this field is 1, and the maximum is 364.\n """\n self.property_id = check.str_param(property_id, "property_id")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth,\n GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication,\n ),\n )\n self.date_ranges_start_date = check.str_param(\n date_ranges_start_date, "date_ranges_start_date"\n )\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Google Analytics Data Api", name)
\n\n\n
[docs]class OutreachSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n redirect_uri: str,\n start_date: str,\n ):\n """Airbyte Source for Outreach.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/outreach\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Outreach developer application.\n client_secret (str): The Client Secret of your Outreach developer application.\n refresh_token (str): The token for obtaining the new access token.\n redirect_uri (str): A Redirect URI is the location where the authorization server sends the user once the app has been successfully authorized and granted an authorization code or access token.\n start_date (str): The date from which you'd like to replicate data for Outreach API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.redirect_uri = check.str_param(redirect_uri, "redirect_uri")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Outreach", name)
\n\n\n
[docs]class LemlistSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Lemlist.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/lemlist\n\n Args:\n name (str): The name of the destination.\n api_key (str): Lemlist API key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Lemlist", name)
\n\n\n
[docs]class ApifyDatasetSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, datasetId: str, clean: Optional[bool] = None):\n """Airbyte Source for Apify Dataset.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/apify-dataset\n\n Args:\n name (str): The name of the destination.\n datasetId (str): ID of the dataset you would like to load to Airbyte.\n clean (Optional[bool]): If set to true, only clean items will be downloaded from the dataset. See description of what clean means in Apify API docs. If not sure, set clean to false.\n """\n self.datasetId = check.str_param(datasetId, "datasetId")\n self.clean = check.opt_bool_param(clean, "clean")\n super().__init__("Apify Dataset", name)
\n\n\n
[docs]class RecurlySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_key: str,\n begin_time: Optional[str] = None,\n end_time: Optional[str] = None,\n ):\n """Airbyte Source for Recurly.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/recurly\n\n Args:\n name (str): The name of the destination.\n api_key (str): Recurly API Key. See the docs for more information on how to generate this key.\n begin_time (Optional[str]): ISO8601 timestamp from which the replication from Recurly API will start from.\n end_time (Optional[str]): ISO8601 timestamp to which the replication from Recurly API will stop. Records after that date won't be imported.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.begin_time = check.opt_str_param(begin_time, "begin_time")\n self.end_time = check.opt_str_param(end_time, "end_time")\n super().__init__("Recurly", name)
\n\n\n
[docs]class ZendeskTalkSource(GeneratedAirbyteSource):\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, email: str, api_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.email = check.str_param(email, "email")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, access_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n credentials: Union["ZendeskTalkSource.APIToken", "ZendeskTalkSource.OAuth20"],\n start_date: str,\n ):\n """Airbyte Source for Zendesk Talk.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-talk\n\n Args:\n name (str): The name of the destination.\n subdomain (str): This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.\n credentials (Union[ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20]): Zendesk service provides two authentication methods. Choose between: `OAuth2.0` or `API token`.\n start_date (str): The date from which you'd like to replicate data for Zendesk Talk API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials, "credentials", (ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20)\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Zendesk Talk", name)
\n\n\n
[docs]class SftpSource(GeneratedAirbyteSource):\n
[docs] class PasswordAuthentication:\n
[docs] @public\n def __init__(self, auth_user_password: str):\n self.auth_method = "SSH_PASSWORD_AUTH"\n self.auth_user_password = check.str_param(auth_user_password, "auth_user_password")
\n\n
[docs] class SSHKeyAuthentication:\n
[docs] @public\n def __init__(self, auth_ssh_key: str):\n self.auth_method = "SSH_KEY_AUTH"\n self.auth_ssh_key = check.str_param(auth_ssh_key, "auth_ssh_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n user: str,\n host: str,\n port: int,\n credentials: Union["SftpSource.PasswordAuthentication", "SftpSource.SSHKeyAuthentication"],\n file_types: Optional[str] = None,\n folder_path: Optional[str] = None,\n file_pattern: Optional[str] = None,\n ):\n """Airbyte Source for Sftp.\n\n Documentation can be found at https://docs.airbyte.com/integrations/source/sftp\n\n Args:\n name (str): The name of the destination.\n user (str): The server user\n host (str): The server host address\n port (int): The server port\n credentials (Union[SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication]): The server authentication method\n file_types (Optional[str]): Comma-separated file types. Currently only 'csv' and 'json' types are supported.\n folder_path (Optional[str]): The directory to search files for sync\n file_pattern (Optional[str]): The regular expression to specify files for sync in a chosen Folder Path\n """\n self.user = check.str_param(user, "user")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication),\n )\n self.file_types = check.opt_str_param(file_types, "file_types")\n self.folder_path = check.opt_str_param(folder_path, "folder_path")\n self.file_pattern = check.opt_str_param(file_pattern, "file_pattern")\n super().__init__("Sftp", name)
\n\n\n
[docs]class WhiskyHunterSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n ):\n """Airbyte Source for Whisky Hunter.\n\n Documentation can be found at https://docs.airbyte.io/integrations/sources/whisky-hunter\n\n Args:\n name (str): The name of the destination.\n\n """\n super().__init__("Whisky Hunter", name)
\n\n\n
[docs]class FreshdeskSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n api_key: str,\n requests_per_minute: Optional[int] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Freshdesk.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshdesk\n\n Args:\n name (str): The name of the destination.\n domain (str): Freshdesk domain\n api_key (str): Freshdesk API Key. See the docs for more information on how to obtain this key.\n requests_per_minute (Optional[int]): The number of requests per minute that this source is allowed to use. There is a rate limit of 50 requests per minute per app per account.\n start_date (Optional[str]): UTC date and time. Any data created after this date will be replicated. If this parameter is not set, all data will be replicated.\n """\n self.domain = check.str_param(domain, "domain")\n self.api_key = check.str_param(api_key, "api_key")\n self.requests_per_minute = check.opt_int_param(requests_per_minute, "requests_per_minute")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Freshdesk", name)
\n\n\n
[docs]class GocardlessSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n gocardless_environment: str,\n gocardless_version: str,\n start_date: str,\n ):\n """Airbyte Source for Gocardless.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gocardless\n\n Args:\n name (str): The name of the destination.\n access_token (str): Gocardless API TOKEN\n gocardless_environment (str): Environment you are trying to connect to.\n gocardless_version (str): GoCardless version. This is a date. You can find the latest here: https://developer.gocardless.com/api-reference/#api-usage-making-requests\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.gocardless_environment = check.str_param(\n gocardless_environment, "gocardless_environment"\n )\n self.gocardless_version = check.str_param(gocardless_version, "gocardless_version")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Gocardless", name)
\n\n\n
[docs]class ZuoraSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n tenant_endpoint: str,\n data_query: str,\n client_id: str,\n client_secret: str,\n window_in_days: Optional[str] = None,\n ):\n """Airbyte Source for Zuora.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zuora\n\n Args:\n name (str): The name of the destination.\n start_date (str): Start Date in format: YYYY-MM-DD\n window_in_days (Optional[str]): The amount of days for each data-chunk beginning from start_date. The bigger the value, the faster the fetch. (0.1 - as for couple of hours, 1 - as for a Day; 364 - as for a Year).\n tenant_endpoint (str): Please choose the right endpoint where your Tenant is located. More info by this Link\n data_query (str): Choose between `Live`, or `Unlimited` - the optimized, replicated database at 12 hours freshness for high volume extraction Link\n client_id (str): Your OAuth user Client ID\n client_secret (str): Your OAuth user Client Secret\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.window_in_days = check.opt_str_param(window_in_days, "window_in_days")\n self.tenant_endpoint = check.str_param(tenant_endpoint, "tenant_endpoint")\n self.data_query = check.str_param(data_query, "data_query")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Zuora", name)
\n\n\n
[docs]class MarketoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, domain_url: str, client_id: str, client_secret: str, start_date: str\n ):\n """Airbyte Source for Marketo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/marketo\n\n Args:\n name (str): The name of the destination.\n domain_url (str): Your Marketo Base URL. See the docs for info on how to obtain this.\n client_id (str): The Client ID of your Marketo developer application. See the docs for info on how to obtain this.\n client_secret (str): The Client Secret of your Marketo developer application. See the docs for info on how to obtain this.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_url = check.str_param(domain_url, "domain_url")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Marketo", name)
\n\n\n
[docs]class DriftSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n refresh_token: str,\n credentials: Optional[str] = None,\n ):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["DriftSource.OAuth20", "DriftSource.AccessToken"]\n ):\n """Airbyte Source for Drift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/drift\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (DriftSource.OAuth20, DriftSource.AccessToken)\n )\n super().__init__("Drift", name)
\n\n\n
[docs]class PokeapiSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, pokemon_name: str):\n """Airbyte Source for Pokeapi.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pokeapi\n\n Args:\n name (str): The name of the destination.\n pokemon_name (str): Pokemon requested from the API.\n """\n self.pokemon_name = check.str_param(pokemon_name, "pokemon_name")\n super().__init__("Pokeapi", name)
\n\n\n
[docs]class NetsuiteSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n realm: str,\n consumer_key: str,\n consumer_secret: str,\n token_key: str,\n token_secret: str,\n start_datetime: str,\n object_types: Optional[List[str]] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Netsuite.\n\n Args:\n name (str): The name of the destination.\n realm (str): Netsuite realm e.g. 2344535, as for `production` or 2344535_SB1, as for the `sandbox`\n consumer_key (str): Consumer key associated with your integration\n consumer_secret (str): Consumer secret associated with your integration\n token_key (str): Access token key\n token_secret (str): Access token secret\n object_types (Optional[List[str]]): The API names of the Netsuite objects you want to sync. Setting this speeds up the connection setup process by limiting the number of schemas that need to be retrieved from Netsuite.\n start_datetime (str): Starting point for your data replication, in format of "YYYY-MM-DDTHH:mm:ssZ"\n window_in_days (Optional[int]): The amount of days used to query the data with date chunks. Set smaller value, if you have lots of data.\n """\n self.realm = check.str_param(realm, "realm")\n self.consumer_key = check.str_param(consumer_key, "consumer_key")\n self.consumer_secret = check.str_param(consumer_secret, "consumer_secret")\n self.token_key = check.str_param(token_key, "token_key")\n self.token_secret = check.str_param(token_secret, "token_secret")\n self.object_types = check.opt_nullable_list_param(object_types, "object_types", str)\n self.start_datetime = check.str_param(start_datetime, "start_datetime")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Netsuite", name)
\n\n\n
[docs]class HubplannerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Hubplanner.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/hubplanner\n\n Args:\n name (str): The name of the destination.\n api_key (str): Hubplanner API key. See https://github.com/hubplanner/API#authentication for more details.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Hubplanner", name)
\n\n\n
[docs]class Dv360Source(GeneratedAirbyteSource):\n
[docs] class Oauth2Credentials:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n refresh_token: str,\n token_uri: str,\n client_id: str,\n client_secret: str,\n ):\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.token_uri = check.str_param(token_uri, "token_uri")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "Dv360Source.Oauth2Credentials",\n partner_id: int,\n start_date: str,\n end_date: Optional[str] = None,\n filters: Optional[List[str]] = None,\n ):\n """Airbyte Source for Dv 360.\n\n Args:\n name (str): The name of the destination.\n credentials (Dv360Source.Oauth2Credentials): Oauth2 credentials\n partner_id (int): Partner ID\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated\n end_date (Optional[str]): UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.\n filters (Optional[List[str]]): Filters for the dimensions. Each filter object has 2 keys: 'type' for the name of the dimension to be used as the filter, and 'value' for the value of the filter\n """\n self.credentials = check.inst_param(\n credentials, "credentials", Dv360Source.Oauth2Credentials\n )\n self.partner_id = check.int_param(partner_id, "partner_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.filters = check.opt_nullable_list_param(filters, "filters", str)\n super().__init__("Dv 360", name)
\n\n\n
[docs]class NotionSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, access_token: str):\n self.auth_type = "OAuth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, token: str):\n self.auth_type = "token"\n self.token = check.str_param(token, "token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["NotionSource.OAuth20", "NotionSource.AccessToken"],\n ):\n """Airbyte Source for Notion.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/notion\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00.000Z. Any data before this date will not be replicated.\n credentials (Union[NotionSource.OAuth20, NotionSource.AccessToken]): Pick an authentication method.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (NotionSource.OAuth20, NotionSource.AccessToken)\n )\n super().__init__("Notion", name)
\n\n\n
[docs]class ZendeskSunshineSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, access_token: str):\n self.auth_method = "oauth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str, email: str):\n self.auth_method = "api_token"\n self.api_token = check.str_param(api_token, "api_token")\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n start_date: str,\n credentials: Union["ZendeskSunshineSource.OAuth20", "ZendeskSunshineSource.APIToken"],\n ):\n """Airbyte Source for Zendesk Sunshine.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk_sunshine\n\n Args:\n name (str): The name of the destination.\n subdomain (str): The subdomain for your Zendesk Account.\n start_date (str): The date from which you'd like to replicate data for Zendesk Sunshine API, in the format YYYY-MM-DDT00:00:00Z.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (ZendeskSunshineSource.OAuth20, ZendeskSunshineSource.APIToken),\n )\n super().__init__("Zendesk Sunshine", name)
\n\n\n
[docs]class PinterestSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n refresh_token: str,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_method = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str):\n self.auth_method = "access_token"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["PinterestSource.OAuth20", "PinterestSource.AccessToken"],\n ):\n """Airbyte Source for Pinterest.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pinterest\n\n Args:\n name (str): The name of the destination.\n start_date (str): A date in the format YYYY-MM-DD. If you have not set a date, it would be defaulted to latest allowed date by api (914 days from today).\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (PinterestSource.OAuth20, PinterestSource.AccessToken)\n )\n super().__init__("Pinterest", name)
\n\n\n
[docs]class MetabaseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_api_url: str,\n username: Optional[str] = None,\n password: Optional[str] = None,\n session_token: Optional[str] = None,\n ):\n r"""Airbyte Source for Metabase.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/metabase\n\n Args:\n name (str): The name of the destination.\n instance_api_url (str): URL to your metabase instance API\n session_token (Optional[str]): To generate your session token, you need to run the following command: ``` curl -X POST \\\\ -H "Content-Type: application/json" \\\\ -d '{"username": "person@metabase.com", "password": "fakepassword"}' \\\\ http://localhost:3000/api/session ``` Then copy the value of the `id` field returned by a successful call to that API. Note that by default, sessions are good for 14 days and needs to be regenerated.\n """\n self.instance_api_url = check.str_param(instance_api_url, "instance_api_url")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.session_token = check.opt_str_param(session_token, "session_token")\n super().__init__("Metabase", name)
\n\n\n
[docs]class HubspotSource(GeneratedAirbyteSource):\n
[docs] class OAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.credentials_title = "OAuth Credentials"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, api_key: str):\n self.credentials_title = "API Key Credentials"\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] class PrivateAPP:\n
[docs] @public\n def __init__(self, access_token: str):\n self.credentials_title = "Private App Credentials"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union[\n "HubspotSource.OAuth", "HubspotSource.APIKey", "HubspotSource.PrivateAPP"\n ],\n ):\n """Airbyte Source for Hubspot.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/hubspot\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n credentials (Union[HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP]): Choose how to authenticate to HubSpot.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP),\n )\n super().__init__("Hubspot", name)
\n\n\n
[docs]class HarvestSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaHarvestOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateWithPersonalAccessToken:\n
[docs] @public\n def __init__(self, api_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n replication_start_date: str,\n credentials: Union[\n "HarvestSource.AuthenticateViaHarvestOAuth",\n "HarvestSource.AuthenticateWithPersonalAccessToken",\n ],\n ):\n """Airbyte Source for Harvest.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/harvest\n\n Args:\n name (str): The name of the destination.\n account_id (str): Harvest account ID. Required for all Harvest requests in pair with Personal Access Token\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n credentials (Union[HarvestSource.AuthenticateViaHarvestOAuth, HarvestSource.AuthenticateWithPersonalAccessToken]): Choose how to authenticate to Harvest.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n HarvestSource.AuthenticateViaHarvestOAuth,\n HarvestSource.AuthenticateWithPersonalAccessToken,\n ),\n )\n super().__init__("Harvest", name)
\n\n\n
[docs]class GithubSource(GeneratedAirbyteSource):\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(self, access_token: str):\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class PATCredentials:\n
[docs] @public\n def __init__(self, personal_access_token: str):\n self.personal_access_token = check.str_param(\n personal_access_token, "personal_access_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["GithubSource.OAuthCredentials", "GithubSource.PATCredentials"],\n start_date: str,\n repository: str,\n branch: Optional[str] = None,\n page_size_for_large_streams: Optional[int] = None,\n ):\n """Airbyte Source for Github.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/github\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GithubSource.OAuthCredentials, GithubSource.PATCredentials]): Choose how to authenticate to GitHub\n start_date (str): The date from which you'd like to replicate data from GitHub in the format YYYY-MM-DDT00:00:00Z. For the streams which support this configuration, only data generated on or after the start date will be replicated. This field doesn't apply to all streams, see the docs for more info\n repository (str): Space-delimited list of GitHub organizations/repositories, e.g. `airbytehq/airbyte` for single repository, `airbytehq/*` for get all repositories from organization and `airbytehq/airbyte airbytehq/another-repo` for multiple repositories.\n branch (Optional[str]): Space-delimited list of GitHub repository branches to pull commits for, e.g. `airbytehq/airbyte/master`. If no branches are specified for a repository, the default branch will be pulled.\n page_size_for_large_streams (Optional[int]): The Github connector contains several streams with a large amount of data. The page size of such streams depends on the size of your repository. 
We recommended that you specify values between 10 and 30.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (GithubSource.OAuthCredentials, GithubSource.PATCredentials)\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.repository = check.str_param(repository, "repository")\n self.branch = check.opt_str_param(branch, "branch")\n self.page_size_for_large_streams = check.opt_int_param(\n page_size_for_large_streams, "page_size_for_large_streams"\n )\n super().__init__("Github", name)
\n\n\n
[docs]class E2eTestSource(GeneratedAirbyteSource):\n
[docs] class SingleSchema:\n
[docs] @public\n def __init__(\n self, stream_name: str, stream_schema: str, stream_duplication: Optional[int] = None\n ):\n self.type = "SINGLE_STREAM"\n self.stream_name = check.str_param(stream_name, "stream_name")\n self.stream_schema = check.str_param(stream_schema, "stream_schema")\n self.stream_duplication = check.opt_int_param(stream_duplication, "stream_duplication")
\n\n
[docs] class MultiSchema:\n
[docs] @public\n def __init__(self, stream_schemas: str):\n self.type = "MULTI_STREAM"\n self.stream_schemas = check.str_param(stream_schemas, "stream_schemas")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n max_messages: int,\n mock_catalog: Union["E2eTestSource.SingleSchema", "E2eTestSource.MultiSchema"],\n type: Optional[str] = None,\n seed: Optional[int] = None,\n message_interval_ms: Optional[int] = None,\n ):\n """Airbyte Source for E2e Test.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/e2e-test\n\n Args:\n name (str): The name of the destination.\n max_messages (int): Number of records to emit per stream. Min 1. Max 100 billion.\n seed (Optional[int]): When the seed is unspecified, the current time millis will be used as the seed. Range: [0, 1000000].\n message_interval_ms (Optional[int]): Interval between messages in ms. Min 0 ms. Max 60000 ms (1 minute).\n """\n self.type = check.opt_str_param(type, "type")\n self.max_messages = check.int_param(max_messages, "max_messages")\n self.seed = check.opt_int_param(seed, "seed")\n self.message_interval_ms = check.opt_int_param(message_interval_ms, "message_interval_ms")\n self.mock_catalog = check.inst_param(\n mock_catalog, "mock_catalog", (E2eTestSource.SingleSchema, E2eTestSource.MultiSchema)\n )\n super().__init__("E2e Test", name)
\n\n\n
[docs]class MysqlSource(GeneratedAirbyteSource):\n
[docs] class Preferred:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "preferred"
\n\n
[docs] class Required:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "required"
\n\n
[docs] class VerifyCA:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify_ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyIdentity:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify_identity"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "STANDARD"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self,\n initial_waiting_seconds: Optional[int] = None,\n server_time_zone: Optional[str] = None,\n ):\n self.method = "CDC"\n self.initial_waiting_seconds = check.opt_int_param(\n initial_waiting_seconds, "initial_waiting_seconds"\n )\n self.server_time_zone = check.opt_str_param(server_time_zone, "server_time_zone")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_mode: Union[\n "MysqlSource.Preferred",\n "MysqlSource.Required",\n "MysqlSource.VerifyCA",\n "MysqlSource.VerifyIdentity",\n ],\n replication_method: Union["MysqlSource.Standard", "MysqlSource.LogicalReplicationCDC"],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Mysql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mysql\n\n Args:\n name (str): The name of the destination.\n host (str): The host name of the database.\n port (int): The port to connect to.\n database (str): The database name.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL.\n ssl_mode (Union[MysqlSource.Preferred, MysqlSource.Required, MysqlSource.VerifyCA, MysqlSource.VerifyIdentity]): SSL connection modes. preferred - Automatically attempt SSL connection. If the MySQL server does not support SSL, continue with a regular connection.required - Always connect with SSL. If the MySQL server doesn`t support SSL, the connection will not be established. Certificate Authority (CA) and Hostname are not verified.verify-ca - Always connect with SSL. Verifies CA, but allows connection even if Hostname does not match.Verify Identity - Always connect with SSL. 
Verify both CA and Hostname.Read more in the docs.\n replication_method (Union[MysqlSource.Standard, MysqlSource.LogicalReplicationCDC]): Replication method to use for extracting data from the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n MysqlSource.Preferred,\n MysqlSource.Required,\n MysqlSource.VerifyCA,\n MysqlSource.VerifyIdentity,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (MysqlSource.Standard, MysqlSource.LogicalReplicationCDC),\n )\n super().__init__("Mysql", name)
\n\n\n
[docs]class MyHoursSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n email: str,\n password: str,\n start_date: str,\n logs_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for My Hours.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/my-hours\n\n Args:\n name (str): The name of the destination.\n email (str): Your My Hours username\n password (str): The password associated to the username\n start_date (str): Start date for collecting time logs\n logs_batch_size (Optional[int]): Pagination size used for retrieving logs in days\n """\n self.email = check.str_param(email, "email")\n self.password = check.str_param(password, "password")\n self.start_date = check.str_param(start_date, "start_date")\n self.logs_batch_size = check.opt_int_param(logs_batch_size, "logs_batch_size")\n super().__init__("My Hours", name)
\n\n\n
[docs]class KyribaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n username: str,\n password: str,\n start_date: str,\n end_date: Optional[str] = None,\n ):\n """Airbyte Source for Kyriba.\n\n Args:\n name (str): The name of the destination.\n domain (str): Kyriba domain\n username (str): Username to be used in basic auth\n password (str): Password to be used in basic auth\n start_date (str): The date the sync should start from.\n end_date (Optional[str]): The date the sync should end. If let empty the sync will run to the current date.\n """\n self.domain = check.str_param(domain, "domain")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n super().__init__("Kyriba", name)
\n\n\n
[docs]class GoogleSearchConsoleSource(GeneratedAirbyteSource):\n
[docs] class OAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n access_token: Optional[str] = None,\n ):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, service_account_info: str, email: str):\n self.auth_type = "Service"\n self.service_account_info = check.str_param(\n service_account_info, "service_account_info"\n )\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n site_urls: List[str],\n start_date: str,\n authorization: Union[\n "GoogleSearchConsoleSource.OAuth",\n "GoogleSearchConsoleSource.ServiceAccountKeyAuthentication",\n ],\n end_date: Optional[str] = None,\n custom_reports: Optional[str] = None,\n ):\n """Airbyte Source for Google Search Console.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-search-console\n\n Args:\n name (str): The name of the destination.\n site_urls (List[str]): The URLs of the website property attached to your GSC account. Read more here.\n start_date (str): UTC date in the format 2017-01-25. Any data before this date will not be replicated.\n end_date (Optional[str]): UTC date in the format 2017-01-25. Any data after this date will not be replicated. Must be greater or equal to the start date field.\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Search Console. See the docs for more information about the exact format you can use to fill out this field.\n """\n self.site_urls = check.list_param(site_urls, "site_urls", str)\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.authorization = check.inst_param(\n authorization,\n "authorization",\n (\n GoogleSearchConsoleSource.OAuth,\n GoogleSearchConsoleSource.ServiceAccountKeyAuthentication,\n ),\n )\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n super().__init__("Google Search Console", name)
\n\n\n
[docs]class FacebookMarketingSource(GeneratedAirbyteSource):\n
[docs] class InsightConfig:\n
[docs] @public\n def __init__(\n self,\n name: str,\n fields: Optional[List[str]] = None,\n breakdowns: Optional[List[str]] = None,\n action_breakdowns: Optional[List[str]] = None,\n time_increment: Optional[int] = None,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n insights_lookback_window: Optional[int] = None,\n ):\n self.name = check.str_param(name, "name")\n self.fields = check.opt_nullable_list_param(fields, "fields", str)\n self.breakdowns = check.opt_nullable_list_param(breakdowns, "breakdowns", str)\n self.action_breakdowns = check.opt_nullable_list_param(\n action_breakdowns, "action_breakdowns", str\n )\n self.time_increment = check.opt_int_param(time_increment, "time_increment")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.insights_lookback_window = check.opt_int_param(\n insights_lookback_window, "insights_lookback_window"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n start_date: str,\n access_token: str,\n end_date: Optional[str] = None,\n include_deleted: Optional[bool] = None,\n fetch_thumbnail_images: Optional[bool] = None,\n custom_insights: Optional[List[InsightConfig]] = None,\n page_size: Optional[int] = None,\n insights_lookback_window: Optional[int] = None,\n max_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for Facebook Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-marketing\n\n Args:\n name (str): The name of the destination.\n account_id (str): The Facebook Ad account ID to use when pulling data from the Facebook Marketing API.\n start_date (str): The date from which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n end_date (Optional[str]): The date until which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the latest data.\n access_token (str): The value of the access token generated. See the docs for more information\n include_deleted (Optional[bool]): Include data from deleted Campaigns, Ads, and AdSets\n fetch_thumbnail_images (Optional[bool]): In each Ad Creative, fetch the thumbnail_url and store the result in thumbnail_data_url\n custom_insights (Optional[List[FacebookMarketingSource.InsightConfig]]): A list which contains insights entries, each entry must have a name and can contains fields, breakdowns or action_breakdowns)\n page_size (Optional[int]): Page size used when sending requests to Facebook API to specify number of records per page when response has pagination. 
Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.\n insights_lookback_window (Optional[int]): The attribution window\n max_batch_size (Optional[int]): Maximum batch size used when sending batch requests to Facebook API. Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.access_token = check.str_param(access_token, "access_token")\n self.include_deleted = check.opt_bool_param(include_deleted, "include_deleted")\n self.fetch_thumbnail_images = check.opt_bool_param(\n fetch_thumbnail_images, "fetch_thumbnail_images"\n )\n self.custom_insights = check.opt_nullable_list_param(\n custom_insights, "custom_insights", FacebookMarketingSource.InsightConfig\n )\n self.page_size = check.opt_int_param(page_size, "page_size")\n self.insights_lookback_window = check.opt_int_param(\n insights_lookback_window, "insights_lookback_window"\n )\n self.max_batch_size = check.opt_int_param(max_batch_size, "max_batch_size")\n super().__init__("Facebook Marketing", name)
\n\n\n
[docs]class SurveymonkeySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, access_token: str, start_date: str, survey_ids: Optional[List[str]] = None\n ):\n """Airbyte Source for Surveymonkey.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/surveymonkey\n\n Args:\n name (str): The name of the destination.\n access_token (str): Access Token for making authenticated requests. See the docs for information on how to generate this key.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n survey_ids (Optional[List[str]]): IDs of the surveys from which you'd like to replicate data. If left empty, data from all boards to which you have access will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.survey_ids = check.opt_nullable_list_param(survey_ids, "survey_ids", str)\n super().__init__("Surveymonkey", name)
\n\n\n
[docs]class PardotSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n pardot_business_unit_id: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n start_date: Optional[str] = None,\n is_sandbox: Optional[bool] = None,\n ):\n """Airbyte Source for Pardot.\n\n Args:\n name (str): The name of the destination.\n pardot_business_unit_id (str): Pardot Business ID, can be found at Setup > Pardot > Pardot Account Setup\n client_id (str): The Consumer Key that can be found when viewing your app in Salesforce\n client_secret (str): The Consumer Secret that can be found when viewing your app in Salesforce\n refresh_token (str): Salesforce Refresh Token used for Airbyte to access your Salesforce account. If you don't know what this is, follow this guide to retrieve it.\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Leave blank to skip this filter\n is_sandbox (Optional[bool]): Whether or not the the app is in a Salesforce sandbox. If you do not know what this, assume it is false.\n """\n self.pardot_business_unit_id = check.str_param(\n pardot_business_unit_id, "pardot_business_unit_id"\n )\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.is_sandbox = check.opt_bool_param(is_sandbox, "is_sandbox")\n super().__init__("Pardot", name)
\n\n\n
[docs]class FlexportSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Flexport.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/flexport\n\n Args:\n name (str): The name of the destination.\n\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Flexport", name)
\n\n\n
[docs]class ZenefitsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, token: str):\n """Airbyte Source for Zenefits.\n\n Args:\n name (str): The name of the destination.\n token (str): Use Sync with Zenefits button on the link given on the readme file, and get the token to access the api\n """\n self.token = check.str_param(token, "token")\n super().__init__("Zenefits", name)
\n\n\n
[docs]class KafkaSource(GeneratedAirbyteSource):\n
[docs] class JSON:\n
[docs] @public\n def __init__(self, deserialization_type: Optional[str] = None):\n self.deserialization_type = check.opt_str_param(\n deserialization_type, "deserialization_type"\n )
\n\n
[docs] class AVRO:\n
[docs] @public\n def __init__(\n self,\n deserialization_type: Optional[str] = None,\n deserialization_strategy: Optional[str] = None,\n schema_registry_url: Optional[str] = None,\n schema_registry_username: Optional[str] = None,\n schema_registry_password: Optional[str] = None,\n ):\n self.deserialization_type = check.opt_str_param(\n deserialization_type, "deserialization_type"\n )\n self.deserialization_strategy = check.opt_str_param(\n deserialization_strategy, "deserialization_strategy"\n )\n self.schema_registry_url = check.opt_str_param(\n schema_registry_url, "schema_registry_url"\n )\n self.schema_registry_username = check.opt_str_param(\n schema_registry_username, "schema_registry_username"\n )\n self.schema_registry_password = check.opt_str_param(\n schema_registry_password, "schema_registry_password"\n )
\n\n
[docs] class ManuallyAssignAListOfPartitions:\n
[docs] @public\n def __init__(self, topic_partitions: str):\n self.subscription_type = "assign"\n self.topic_partitions = check.str_param(topic_partitions, "topic_partitions")
\n\n
[docs] class SubscribeToAllTopicsMatchingSpecifiedPattern:\n
[docs] @public\n def __init__(self, topic_pattern: str):\n self.subscription_type = "subscribe"\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")
\n\n
[docs] class PLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")
\n\n
[docs] class SASLPLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] class SASLSSL:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n MessageFormat: Union["KafkaSource.JSON", "KafkaSource.AVRO"],\n bootstrap_servers: str,\n subscription: Union[\n "KafkaSource.ManuallyAssignAListOfPartitions",\n "KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern",\n ],\n protocol: Union[\n "KafkaSource.PLAINTEXT", "KafkaSource.SASLPLAINTEXT", "KafkaSource.SASLSSL"\n ],\n test_topic: Optional[str] = None,\n group_id: Optional[str] = None,\n max_poll_records: Optional[int] = None,\n polling_time: Optional[int] = None,\n client_id: Optional[str] = None,\n enable_auto_commit: Optional[bool] = None,\n auto_commit_interval_ms: Optional[int] = None,\n client_dns_lookup: Optional[str] = None,\n retry_backoff_ms: Optional[int] = None,\n request_timeout_ms: Optional[int] = None,\n receive_buffer_bytes: Optional[int] = None,\n auto_offset_reset: Optional[str] = None,\n repeated_calls: Optional[int] = None,\n max_records_process: Optional[int] = None,\n ):\n """Airbyte Source for Kafka.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/kafka\n\n Args:\n name (str): The name of the destination.\n MessageFormat (Union[KafkaSource.JSON, KafkaSource.AVRO]): The serialization used based on this\n bootstrap_servers (str): A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping&mdash;this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,.... 
Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).\n subscription (Union[KafkaSource.ManuallyAssignAListOfPartitions, KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern]): You can choose to manually assign a list of partitions, or subscribe to all topics matching specified pattern to get dynamically assigned partitions.\n test_topic (Optional[str]): The Topic to test in case the Airbyte can consume messages.\n group_id (Optional[str]): The Group ID is how you distinguish different consumer groups.\n max_poll_records (Optional[int]): The maximum number of records returned in a single call to poll(). Note, that max_poll_records does not impact the underlying fetching behavior. The consumer will cache the records from each fetch request and returns them incrementally from each poll.\n polling_time (Optional[int]): Amount of time Kafka connector should try to poll for messages.\n protocol (Union[KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL]): The Protocol used to communicate with brokers.\n client_id (Optional[str]): An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.\n enable_auto_commit (Optional[bool]): If true, the consumer's offset will be periodically committed in the background.\n auto_commit_interval_ms (Optional[int]): The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if enable.auto.commit is set to true.\n client_dns_lookup (Optional[str]): Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. 
After a disconnection, the next IP is used. Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.\n retry_backoff_ms (Optional[int]): The amount of time to wait before attempting to retry a failed request to a given topic partition. This avoids repeatedly sending requests in a tight loop under some failure scenarios.\n request_timeout_ms (Optional[int]): The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.\n receive_buffer_bytes (Optional[int]): The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. 
If the value is -1, the OS default will be used.\n auto_offset_reset (Optional[str]): What to do when there is no initial offset in Kafka or if the current offset does not exist any more on the server - earliest: automatically reset the offset to the earliest offset, latest: automatically reset the offset to the latest offset, none: throw exception to the consumer if no previous offset is found for the consumer's group, anything else: throw exception to the consumer.\n repeated_calls (Optional[int]): The number of repeated calls to poll() if no messages were received.\n max_records_process (Optional[int]): The Maximum to be processed per execution\n """\n self.MessageFormat = check.inst_param(\n MessageFormat, "MessageFormat", (KafkaSource.JSON, KafkaSource.AVRO)\n )\n self.bootstrap_servers = check.str_param(bootstrap_servers, "bootstrap_servers")\n self.subscription = check.inst_param(\n subscription,\n "subscription",\n (\n KafkaSource.ManuallyAssignAListOfPartitions,\n KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern,\n ),\n )\n self.test_topic = check.opt_str_param(test_topic, "test_topic")\n self.group_id = check.opt_str_param(group_id, "group_id")\n self.max_poll_records = check.opt_int_param(max_poll_records, "max_poll_records")\n self.polling_time = check.opt_int_param(polling_time, "polling_time")\n self.protocol = check.inst_param(\n protocol,\n "protocol",\n (KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL),\n )\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.enable_auto_commit = check.opt_bool_param(enable_auto_commit, "enable_auto_commit")\n self.auto_commit_interval_ms = check.opt_int_param(\n auto_commit_interval_ms, "auto_commit_interval_ms"\n )\n self.client_dns_lookup = check.opt_str_param(client_dns_lookup, "client_dns_lookup")\n self.retry_backoff_ms = check.opt_int_param(retry_backoff_ms, "retry_backoff_ms")\n self.request_timeout_ms = check.opt_int_param(request_timeout_ms, 
"request_timeout_ms")\n self.receive_buffer_bytes = check.opt_int_param(\n receive_buffer_bytes, "receive_buffer_bytes"\n )\n self.auto_offset_reset = check.opt_str_param(auto_offset_reset, "auto_offset_reset")\n self.repeated_calls = check.opt_int_param(repeated_calls, "repeated_calls")\n self.max_records_process = check.opt_int_param(max_records_process, "max_records_process")\n super().__init__("Kafka", name)
\n
", "current_page_name": "_modules/dagster_airbyte/managed/generated/sources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.generated.sources"}}, "reconciliation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.reconciliation

\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster import AssetKey\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\nfrom dagster._core.definitions.events import CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.execution.context.init import build_init_resource_context\nfrom dagster._utils.merger import deep_merge_dicts\nfrom dagster_managed_elements import (\n    ManagedElementCheckResult,\n    ManagedElementDiff,\n    ManagedElementError,\n)\nfrom dagster_managed_elements.types import (\n    SECRET_MASK_VALUE,\n    ManagedElementReconciler,\n    is_key_secret,\n)\nfrom dagster_managed_elements.utils import UNSET, diff_dicts\n\nfrom dagster_airbyte.asset_defs import (\n    AirbyteConnectionMetadata,\n    AirbyteInstanceCacheableAssetsDefinition,\n    _clean_name,\n)\nfrom dagster_airbyte.managed.types import (\n    AirbyteConnection,\n    AirbyteDestination,\n    AirbyteDestinationNamespace,\n    AirbyteSource,\n    AirbyteSyncMode,\n    InitializedAirbyteConnection,\n    InitializedAirbyteDestination,\n    InitializedAirbyteSource,\n)\nfrom dagster_airbyte.resources import AirbyteResource\nfrom dagster_airbyte.utils import is_basic_normalization_operation\n\n\ndef gen_configured_stream_json(\n    source_stream: Mapping[str, Any], user_stream_config: Mapping[str, AirbyteSyncMode]\n) -> Mapping[str, Any]:\n    """Generates an Airbyte API stream defintiion based on the succinct user-provided config and the\n    full stream definition from the source.\n    """\n    config = user_stream_config[source_stream["stream"]["name"]]\n    return deep_merge_dicts(\n        
source_stream,\n        {"config": config.to_json()},\n    )\n\n\ndef _ignore_secrets_compare_fn(k: str, _cv: Any, dv: Any) -> Optional[bool]:\n    if is_key_secret(k):\n        return dv == SECRET_MASK_VALUE\n    return None\n\n\ndef _diff_configs(\n    config_dict: Mapping[str, Any], dst_dict: Mapping[str, Any], ignore_secrets: bool = True\n) -> ManagedElementDiff:\n    return diff_dicts(\n        config_dict=config_dict,\n        dst_dict=dst_dict,\n        custom_compare_fn=_ignore_secrets_compare_fn if ignore_secrets else None,\n    )\n\n\ndef diff_sources(\n    config_src: Optional[AirbyteSource],\n    curr_src: Optional[AirbyteSource],\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteSource objects."""\n    diff = _diff_configs(\n        config_src.source_configuration if config_src else {},\n        curr_src.source_configuration if curr_src else {},\n        ignore_secrets,\n    )\n    if not diff.is_empty():\n        name = config_src.name if config_src else curr_src.name if curr_src else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef diff_destinations(\n    config_dst: Optional[AirbyteDestination],\n    curr_dst: Optional[AirbyteDestination],\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteDestination objects."""\n    diff = _diff_configs(\n        config_dst.destination_configuration if config_dst else {},\n        curr_dst.destination_configuration if curr_dst else {},\n        ignore_secrets,\n    )\n    if not diff.is_empty():\n        name = config_dst.name if config_dst else curr_dst.name if curr_dst else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef conn_dict(conn: Optional[AirbyteConnection]) -> Mapping[str, Any]:\n    if not conn:\n        return {}\n    return {\n        "source": conn.source.name if 
conn.source else "Unknown",\n        "destination": conn.destination.name if conn.destination else "Unknown",\n        "normalize data": conn.normalize_data,\n        "streams": {k: v.to_json() for k, v in conn.stream_config.items()},\n        "destination namespace": (\n            conn.destination_namespace.name\n            if isinstance(conn.destination_namespace, AirbyteDestinationNamespace)\n            else conn.destination_namespace\n        ),\n        "prefix": conn.prefix,\n    }\n\n\nOPTIONAL_STREAM_SETTINGS = ("cursorField", "primaryKey")\n\n\ndef _compare_stream_values(k: str, cv: str, _dv: str):\n    """Don't register a diff for optional stream settings if the value is not set\n    in the user-provided config, this means it will default to the value in the\n    source.\n    """\n    return True if k in OPTIONAL_STREAM_SETTINGS and cv == UNSET else None\n\n\ndef diff_connections(\n    config_conn: Optional[AirbyteConnection], curr_conn: Optional[AirbyteConnection]\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteConnection objects."""\n    diff = diff_dicts(\n        conn_dict(config_conn),\n        conn_dict(curr_conn),\n        custom_compare_fn=_compare_stream_values,\n    )\n    if not diff.is_empty():\n        name = config_conn.name if config_conn else curr_conn.name if curr_conn else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef reconcile_sources(\n    res: AirbyteResource,\n    config_sources: Mapping[str, AirbyteSource],\n    existing_sources: Mapping[str, InitializedAirbyteSource],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n    ignore_secrets: bool,\n) -> Tuple[Mapping[str, InitializedAirbyteSource], ManagedElementCheckResult]:\n    """Generates a diff of the configured and existing sources and reconciles them to match the\n    configured state if dry_run is False.\n    """\n    diff = ManagedElementDiff()\n\n    
initialized_sources: Dict[str, InitializedAirbyteSource] = {}\n    for source_name in set(config_sources.keys()).union(existing_sources.keys()):\n        configured_source = config_sources.get(source_name)\n        existing_source = existing_sources.get(source_name)\n\n        # Ignore sources not mentioned in the user config unless the user specifies to delete\n        if not should_delete and existing_source and not configured_source:\n            initialized_sources[source_name] = existing_source\n            continue\n\n        diff = diff.join(\n            diff_sources(  # type: ignore\n                configured_source,\n                existing_source.source if existing_source else None,\n                ignore_secrets,\n            )\n        )\n\n        if existing_source and (\n            not configured_source or (configured_source.must_be_recreated(existing_source.source))\n        ):\n            initialized_sources[source_name] = existing_source\n            if not dry_run:\n                res.make_request(\n                    endpoint="/sources/delete",\n                    data={"sourceId": existing_source.source_id},\n                )\n            existing_source = None\n\n        if configured_source:\n            defn_id = check.not_none(\n                res.get_source_definition_by_name(configured_source.source_type)\n            )\n            base_source_defn_dict = {\n                "name": configured_source.name,\n                "connectionConfiguration": configured_source.source_configuration,\n            }\n            source_id = ""\n            if existing_source:\n                source_id = existing_source.source_id\n                if not dry_run:\n                    res.make_request(\n                        endpoint="/sources/update",\n                        data={"sourceId": source_id, **base_source_defn_dict},\n                    )\n            else:\n                if not dry_run:\n                    create_result = 
cast(\n                        Dict[str, str],\n                        check.not_none(\n                            res.make_request(\n                                endpoint="/sources/create",\n                                data={\n                                    "sourceDefinitionId": defn_id,\n                                    "workspaceId": workspace_id,\n                                    **base_source_defn_dict,\n                                },\n                            )\n                        ),\n                    )\n                    source_id = create_result["sourceId"]\n\n            if source_name in initialized_sources:\n                # Preserve to be able to initialize old connection object\n                initialized_sources[f"{source_name}_old"] = initialized_sources[source_name]\n            initialized_sources[source_name] = InitializedAirbyteSource(\n                source=configured_source,\n                source_id=source_id,\n                source_definition_id=defn_id,\n            )\n    return initialized_sources, diff\n\n\ndef reconcile_destinations(\n    res: AirbyteResource,\n    config_destinations: Mapping[str, AirbyteDestination],\n    existing_destinations: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n    ignore_secrets: bool,\n) -> Tuple[Mapping[str, InitializedAirbyteDestination], ManagedElementCheckResult]:\n    """Generates a diff of the configured and existing destinations and reconciles them to match the\n    configured state if dry_run is False.\n    """\n    diff = ManagedElementDiff()\n\n    initialized_destinations: Dict[str, InitializedAirbyteDestination] = {}\n    for destination_name in set(config_destinations.keys()).union(existing_destinations.keys()):\n        configured_destination = config_destinations.get(destination_name)\n        existing_destination = existing_destinations.get(destination_name)\n\n        # Ignore 
destinations not mentioned in the user config unless the user specifies to delete\n        if not should_delete and existing_destination and not configured_destination:\n            initialized_destinations[destination_name] = existing_destination\n            continue\n\n        diff = diff.join(\n            diff_destinations(  # type: ignore\n                configured_destination,\n                existing_destination.destination if existing_destination else None,\n                ignore_secrets,\n            )\n        )\n\n        if existing_destination and (\n            not configured_destination\n            or (configured_destination.must_be_recreated(existing_destination.destination))\n        ):\n            initialized_destinations[destination_name] = existing_destination\n            if not dry_run:\n                res.make_request(\n                    endpoint="/destinations/delete",\n                    data={"destinationId": existing_destination.destination_id},\n                )\n            existing_destination = None\n\n        if configured_destination:\n            defn_id = res.get_destination_definition_by_name(\n                configured_destination.destination_type\n            )\n            base_destination_defn_dict = {\n                "name": configured_destination.name,\n                "connectionConfiguration": configured_destination.destination_configuration,\n            }\n            destination_id = ""\n            if existing_destination:\n                destination_id = existing_destination.destination_id\n                if not dry_run:\n                    res.make_request(\n                        endpoint="/destinations/update",\n                        data={"destinationId": destination_id, **base_destination_defn_dict},\n                    )\n            else:\n                if not dry_run:\n                    create_result = cast(\n                        Dict[str, str],\n                        
check.not_none(\n                            res.make_request(\n                                endpoint="/destinations/create",\n                                data={\n                                    "destinationDefinitionId": defn_id,\n                                    "workspaceId": workspace_id,\n                                    **base_destination_defn_dict,\n                                },\n                            )\n                        ),\n                    )\n                    destination_id = create_result["destinationId"]\n\n            if destination_name in initialized_destinations:\n                # Preserve to be able to initialize old connection object\n                initialized_destinations[f"{destination_name}_old"] = initialized_destinations[\n                    destination_name\n                ]\n            initialized_destinations[destination_name] = InitializedAirbyteDestination(\n                destination=configured_destination,\n                destination_id=destination_id,\n                destination_definition_id=defn_id,\n            )\n    return initialized_destinations, diff\n\n\ndef reconcile_config(\n    res: AirbyteResource,\n    objects: Sequence[AirbyteConnection],\n    dry_run: bool = False,\n    should_delete: bool = False,\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Main entry point for the reconciliation process. 
Takes a list of AirbyteConnection objects\n    and a pointer to an Airbyte instance and returns a diff, along with applying the diff\n    if dry_run is False.\n    """\n    with res.cache_requests():\n        config_connections = {conn.name: conn for conn in objects}\n        config_sources = {conn.source.name: conn.source for conn in objects}\n        config_dests = {conn.destination.name: conn.destination for conn in objects}\n\n        workspace_id = res.get_default_workspace()\n\n        existing_sources_raw = cast(\n            Dict[str, List[Dict[str, Any]]],\n            check.not_none(\n                res.make_request(endpoint="/sources/list", data={"workspaceId": workspace_id})\n            ),\n        )\n        existing_dests_raw = cast(\n            Dict[str, List[Dict[str, Any]]],\n            check.not_none(\n                res.make_request(endpoint="/destinations/list", data={"workspaceId": workspace_id})\n            ),\n        )\n\n        existing_sources: Dict[str, InitializedAirbyteSource] = {\n            source_json["name"]: InitializedAirbyteSource.from_api_json(source_json)\n            for source_json in existing_sources_raw.get("sources", [])\n        }\n        existing_dests: Dict[str, InitializedAirbyteDestination] = {\n            destination_json["name"]: InitializedAirbyteDestination.from_api_json(destination_json)\n            for destination_json in existing_dests_raw.get("destinations", [])\n        }\n\n        # First, remove any connections that need to be deleted, so that we can\n        # safely delete any sources/destinations that are no longer referenced\n        # or that need to be recreated.\n        connections_diff = reconcile_connections_pre(\n            res,\n            config_connections,\n            existing_sources,\n            existing_dests,\n            workspace_id,\n            dry_run,\n            should_delete,\n        )\n\n        all_sources, sources_diff = reconcile_sources(\n            res,\n  
          config_sources,\n            existing_sources,\n            workspace_id,\n            dry_run,\n            should_delete,\n            ignore_secrets,\n        )\n        all_dests, dests_diff = reconcile_destinations(\n            res, config_dests, existing_dests, workspace_id, dry_run, should_delete, ignore_secrets\n        )\n\n        # Now that we have updated the set of sources and destinations, we can\n        # recreate or update any connections which depend on them.\n        reconcile_connections_post(\n            res,\n            config_connections,\n            all_sources,\n            all_dests,\n            workspace_id,\n            dry_run,\n        )\n\n        return ManagedElementDiff().join(sources_diff).join(dests_diff).join(connections_diff)  # type: ignore\n\n\ndef reconcile_normalization(\n    res: AirbyteResource,\n    existing_connection_id: Optional[str],\n    destination: InitializedAirbyteDestination,\n    normalization_config: Optional[bool],\n    workspace_id: str,\n) -> Optional[str]:\n    """Reconciles the normalization configuration for a connection.\n\n    If normalization_config is None, then defaults to True on destinations that support normalization\n    and False on destinations that do not.\n    """\n    existing_basic_norm_op_id = None\n    if existing_connection_id:\n        operations = cast(\n            Dict[str, List[Dict[str, str]]],\n            check.not_none(\n                res.make_request(\n                    endpoint="/operations/list",\n                    data={"connectionId": existing_connection_id},\n                )\n            ),\n        )\n        existing_basic_norm_op = next(\n            (\n                operation\n                for operation in operations["operations"]\n                if is_basic_normalization_operation(operation)\n            ),\n            None,\n        )\n        existing_basic_norm_op_id = (\n            existing_basic_norm_op["operationId"] if 
existing_basic_norm_op else None\n        )\n\n    if normalization_config is not False:\n        if destination.destination_definition_id and res.does_dest_support_normalization(\n            destination.destination_definition_id, workspace_id\n        ):\n            if existing_basic_norm_op_id:\n                return existing_basic_norm_op_id\n            else:\n                return cast(\n                    Dict[str, str],\n                    check.not_none(\n                        res.make_request(\n                            endpoint="/operations/create",\n                            data={\n                                "workspaceId": workspace_id,\n                                "name": "Normalization",\n                                "operatorConfiguration": {\n                                    "operatorType": "normalization",\n                                    "normalization": {"option": "basic"},\n                                },\n                            },\n                        )\n                    ),\n                )["operationId"]\n        elif normalization_config is True:\n            raise Exception(\n                f"Destination {destination.destination.name} does not support normalization."\n            )\n\n    return None\n\n\ndef reconcile_connections_pre(\n    res: AirbyteResource,\n    config_connections: Mapping[str, AirbyteConnection],\n    existing_sources: Mapping[str, InitializedAirbyteSource],\n    existing_destinations: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n) -> ManagedElementCheckResult:\n    """Generates the diff for connections, and deletes any connections that are not in the config if\n    dry_run is False.\n\n    It's necessary to do this in two steps because we need to remove connections that depend on\n    sources and destinations that are being deleted or recreated before Airbyte will allow us to\n    delete or recreate 
them.\n    """\n    diff = ManagedElementDiff()\n\n    existing_connections_raw = cast(\n        Dict[str, List[Dict[str, Any]]],\n        check.not_none(\n            res.make_request(endpoint="/connections/list", data={"workspaceId": workspace_id})\n        ),\n    )\n    existing_connections: Dict[str, InitializedAirbyteConnection] = {\n        connection_json["name"]: InitializedAirbyteConnection.from_api_json(\n            connection_json, existing_sources, existing_destinations\n        )\n        for connection_json in existing_connections_raw.get("connections", [])\n    }\n\n    for conn_name in set(config_connections.keys()).union(existing_connections.keys()):\n        config_conn = config_connections.get(conn_name)\n        existing_conn = existing_connections.get(conn_name)\n\n        # Ignore connections not mentioned in the user config unless the user specifies to delete\n        if not should_delete and not config_conn:\n            continue\n\n        diff = diff.join(\n            diff_connections(config_conn, existing_conn.connection if existing_conn else None)  # type: ignore\n        )\n\n        if existing_conn and (\n            not config_conn or config_conn.must_be_recreated(existing_conn.connection)\n        ):\n            if not dry_run:\n                res.make_request(\n                    endpoint="/connections/delete",\n                    data={"connectionId": existing_conn.connection_id},\n                )\n    return diff\n\n\ndef reconcile_connections_post(\n    res: AirbyteResource,\n    config_connections: Mapping[str, AirbyteConnection],\n    init_sources: Mapping[str, InitializedAirbyteSource],\n    init_dests: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n) -> None:\n    """Creates new and modifies existing connections based on the config if dry_run is False."""\n    existing_connections_raw = cast(\n        Dict[str, List[Dict[str, Any]]],\n        check.not_none(\n            
res.make_request(endpoint="/connections/list", data={"workspaceId": workspace_id})\n        ),\n    )\n    existing_connections = {\n        connection_json["name"]: InitializedAirbyteConnection.from_api_json(\n            connection_json, init_sources, init_dests\n        )\n        for connection_json in existing_connections_raw.get("connections", [])\n    }\n\n    for conn_name, config_conn in config_connections.items():\n        existing_conn = existing_connections.get(conn_name)\n\n        normalization_operation_id = None\n        if not dry_run:\n            destination = init_dests[config_conn.destination.name]\n\n            # Enable or disable basic normalization based on config\n            normalization_operation_id = reconcile_normalization(\n                res,\n                existing_connections.get("name", {}).get("connectionId"),\n                destination,\n                config_conn.normalize_data,\n                workspace_id,\n            )\n\n        configured_streams = []\n        if not dry_run:\n            source = init_sources[config_conn.source.name]\n            schema = res.get_source_schema(source.source_id)\n            base_streams = schema["catalog"]["streams"]\n\n            configured_streams = [\n                gen_configured_stream_json(stream, config_conn.stream_config)\n                for stream in base_streams\n                if stream["stream"]["name"] in config_conn.stream_config\n            ]\n\n        connection_base_json = {\n            "name": conn_name,\n            "namespaceDefinition": "source",\n            "namespaceFormat": "${SOURCE_NAMESPACE}",\n            "prefix": "",\n            "operationIds": [normalization_operation_id] if normalization_operation_id else [],\n            "syncCatalog": {"streams": configured_streams},\n            "scheduleType": "manual",\n            "status": "active",\n        }\n\n        if isinstance(config_conn.destination_namespace, 
AirbyteDestinationNamespace):\n            connection_base_json["namespaceDefinition"] = config_conn.destination_namespace.value\n        else:\n            connection_base_json["namespaceDefinition"] = "customformat"\n            connection_base_json["namespaceFormat"] = cast(str, config_conn.destination_namespace)\n\n        if config_conn.prefix:\n            connection_base_json["prefix"] = config_conn.prefix\n\n        if existing_conn:\n            if not dry_run:\n                source = init_sources[config_conn.source.name]\n                res.make_request(\n                    endpoint="/connections/update",\n                    data={\n                        **connection_base_json,\n                        "sourceCatalogId": res.get_source_catalog_id(source.source_id),\n                        "connectionId": existing_conn.connection_id,\n                    },\n                )\n        else:\n            if not dry_run:\n                source = init_sources[config_conn.source.name]\n                destination = init_dests[config_conn.destination.name]\n\n                res.make_request(\n                    endpoint="/connections/create",\n                    data={\n                        **connection_base_json,\n                        "sourceCatalogId": res.get_source_catalog_id(source.source_id),\n                        "sourceId": source.source_id,\n                        "destinationId": destination.destination_id,\n                    },\n                )\n\n\n
[docs]@experimental\nclass AirbyteManagedElementReconciler(ManagedElementReconciler):\n """Reconciles Python-specified Airbyte connections with an Airbyte instance.\n\n Passing the module containing an AirbyteManagedElementReconciler to the dagster-airbyte\n CLI will allow you to check the state of your Python-code-specified Airbyte connections\n against an Airbyte instance, and reconcile them if necessary.\n\n This functionality is experimental and subject to change.\n """\n\n
[docs] @public\n def __init__(\n self,\n airbyte: Union[AirbyteResource, ResourceDefinition],\n connections: Iterable[AirbyteConnection],\n delete_unmentioned_resources: bool = False,\n ):\n """Reconciles Python-specified Airbyte connections with an Airbyte instance.\n\n Args:\n airbyte (Union[AirbyteResource, ResourceDefinition]): The Airbyte resource definition to reconcile against.\n connections (Iterable[AirbyteConnection]): The Airbyte connection objects to reconcile.\n delete_unmentioned_resources (bool): Whether to delete resources that are not mentioned in\n the set of connections provided. When True, all Airbyte instance contents are effectively\n managed by the reconciler. Defaults to False.\n """\n # airbyte = check.inst_param(airbyte, "airbyte", ResourceDefinition)\n\n self._airbyte_instance: AirbyteResource = (\n airbyte\n if isinstance(airbyte, AirbyteResource)\n else airbyte(build_init_resource_context())\n )\n self._connections = list(\n check.iterable_param(connections, "connections", of_type=AirbyteConnection)\n )\n self._delete_unmentioned_resources = check.bool_param(\n delete_unmentioned_resources, "delete_unmentioned_resources"\n )\n\n super().__init__()
\n\n def check(self, **kwargs) -> ManagedElementCheckResult:\n return reconcile_config(\n self._airbyte_instance,\n self._connections,\n dry_run=True,\n should_delete=self._delete_unmentioned_resources,\n ignore_secrets=(not kwargs.get("include_all_secrets", False)),\n )\n\n def apply(self, **kwargs) -> ManagedElementCheckResult:\n return reconcile_config(\n self._airbyte_instance,\n self._connections,\n dry_run=False,\n should_delete=self._delete_unmentioned_resources,\n ignore_secrets=(not kwargs.get("include_all_secrets", False)),\n )
\n\n\nclass AirbyteManagedElementCacheableAssetsDefinition(AirbyteInstanceCacheableAssetsDefinition):\n def __init__(\n self,\n airbyte_resource_def: AirbyteResource,\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connections: Iterable[AirbyteConnection],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n ):\n defined_conn_names = {conn.name for conn in connections}\n super().__init__(\n airbyte_resource_def=airbyte_resource_def,\n workspace_id=None,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=lambda conn: conn.name in defined_conn_names,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n )\n self._connections: List[AirbyteConnection] = list(connections)\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n diff = reconcile_config(self._airbyte_instance, self._connections, dry_run=True)\n if isinstance(diff, ManagedElementDiff) and not diff.is_empty():\n raise ValueError(\n "Airbyte connections are not in sync with provided configuration, diff:\\n{}".format(\n str(diff)\n )\n )\n elif isinstance(diff, ManagedElementError):\n raise ValueError(f"Error checking Airbyte connections: {diff}")\n\n return super()._get_connections()\n\n\n
[docs]@experimental\ndef load_assets_from_connections(\n airbyte: Union[AirbyteResource, ResourceDefinition],\n connections: Iterable[AirbyteConnection],\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads Airbyte connection assets from a configured AirbyteResource instance, checking against a list of AirbyteConnection objects.\n This method will raise an error on repo load if the passed AirbyteConnection objects are not in sync with the Airbyte instance.\n\n Args:\n airbyte (Union[AirbyteResource, ResourceDefinition]): An AirbyteResource configured with the appropriate connection\n details.\n connections (Iterable[AirbyteConnection]): A list of AirbyteConnection objects to build assets for.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The IO manager key to use for all assets. 
Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n IO manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]): Optional function which\n takes in connection metadata and returns a freshness policy for the connection. If None, no freshness policy will be applied.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster_airbyte import (\n AirbyteConnection,\n AirbyteResource,\n load_assets_from_connections,\n )\n\n airbyte_instance = AirbyteResource(\n host: "localhost",\n port: "8000",\n )\n airbyte_connections = [\n AirbyteConnection(...),\n AirbyteConnection(...)\n ]\n airbyte_assets = load_assets_from_connections(airbyte_instance, airbyte_connections)\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteManagedElementCacheableAssetsDefinition(\n airbyte_resource_def=(\n airbyte\n if isinstance(airbyte, AirbyteResource)\n else airbyte(build_init_resource_context())\n ),\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=check.bool_param(\n create_assets_for_normalization_tables, "create_assets_for_normalization_tables"\n ),\n connection_to_group_fn=check.opt_callable_param(\n connection_to_group_fn, "connection_to_group_fn"\n ),\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connections=check.iterable_param(connections, "connections", of_type=AirbyteConnection),\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n )
\n
", "current_page_name": "_modules/dagster_airbyte/managed/reconciliation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.reconciliation"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.types

\nimport json\nfrom abc import ABC\nfrom enum import Enum\nfrom typing import Any, Dict, List, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\n\n
[docs]class AirbyteSyncMode(ABC):\n """Represents the sync mode for a given Airbyte stream, which governs how Airbyte reads\n from a source and writes to a destination.\n\n For more information, see https://docs.airbyte.com/understanding-airbyte/connections/.\n """\n\n def __eq__(self, other: Any) -> bool:\n return isinstance(other, AirbyteSyncMode) and self.to_json() == other.to_json()\n\n def __init__(self, json_repr: Dict[str, Any]):\n self.json_repr = json_repr\n\n def to_json(self) -> Dict[str, Any]:\n return self.json_repr\n\n @classmethod\n def from_json(cls, json_repr: Dict[str, Any]) -> "AirbyteSyncMode":\n return cls(\n {\n k: v\n for k, v in json_repr.items()\n if k in ("syncMode", "destinationSyncMode", "cursorField", "primaryKey")\n }\n )\n\n
[docs] @public\n @classmethod\n def full_refresh_append(cls) -> "AirbyteSyncMode":\n """Syncs the entire data stream from the source, appending rows to the destination.\n\n https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-append/\n """\n return cls({"syncMode": "full_refresh", "destinationSyncMode": "append"})
\n\n
[docs] @public\n @classmethod\n def full_refresh_overwrite(cls) -> "AirbyteSyncMode":\n """Syncs the entire data stream from the source, replaces data in the destination by\n overwriting it.\n\n https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-overwrite\n """\n return cls({"syncMode": "full_refresh", "destinationSyncMode": "overwrite"})
\n\n
[docs] @public\n @classmethod\n def incremental_append(\n cls,\n cursor_field: Optional[str] = None,\n ) -> "AirbyteSyncMode":\n """Syncs only new records from the source, appending rows to the destination.\n May optionally specify the cursor field used to determine which records\n are new.\n\n https://docs.airbyte.com/understanding-airbyte/connections/incremental-append/\n """\n cursor_field = check.opt_str_param(cursor_field, "cursor_field")\n\n return cls(\n {\n "syncMode": "incremental",\n "destinationSyncMode": "append",\n **({"cursorField": [cursor_field]} if cursor_field else {}),\n }\n )
\n\n
[docs] @public\n @classmethod\n def incremental_append_dedup(\n cls,\n cursor_field: Optional[str] = None,\n primary_key: Optional[Union[str, List[str]]] = None,\n ) -> "AirbyteSyncMode":\n """Syncs new records from the source, appending to an append-only history\n table in the destination. Also generates a deduplicated view mirroring the\n source table. May optionally specify the cursor field used to determine\n which records are new, and the primary key used to determine which records\n are duplicates.\n\n https://docs.airbyte.com/understanding-airbyte/connections/incremental-append-dedup/\n """\n cursor_field = check.opt_str_param(cursor_field, "cursor_field")\n if isinstance(primary_key, str):\n primary_key = [primary_key]\n primary_key = check.opt_list_param(primary_key, "primary_key", of_type=str)\n\n return cls(\n {\n "syncMode": "incremental",\n "destinationSyncMode": "append_dedup",\n **({"cursorField": [cursor_field]} if cursor_field else {}),\n **({"primaryKey": [[x] for x in primary_key]} if primary_key else {}),\n }\n )
\n\n\n
[docs]class AirbyteSource:\n """Represents a user-defined Airbyte source.\n\n Args:\n name (str): The display name of the source.\n source_type (str): The type of the source, from Airbyte's list\n of sources https://airbytehq.github.io/category/sources/.\n source_configuration (Mapping[str, Any]): The configuration for the\n source, as defined by Airbyte's API.\n """\n\n
[docs] @public\n def __init__(self, name: str, source_type: str, source_configuration: Mapping[str, Any]):\n self.name = check.str_param(name, "name")\n self.source_type = check.str_param(source_type, "source_type")\n self.source_configuration = check.mapping_param(\n source_configuration, "source_configuration", key_type=str\n )
\n\n def must_be_recreated(self, other: "AirbyteSource") -> bool:\n return self.name != other.name or self.source_type != other.source_type
\n\n\nclass InitializedAirbyteSource:\n """User-defined Airbyte source bound to actual created Airbyte source."""\n\n def __init__(self, source: AirbyteSource, source_id: str, source_definition_id: Optional[str]):\n self.source = source\n self.source_id = source_id\n self.source_definition_id = source_definition_id\n\n @classmethod\n def from_api_json(cls, api_json: Mapping[str, Any]):\n return cls(\n source=AirbyteSource(\n name=api_json["name"],\n source_type=api_json["sourceName"],\n source_configuration=api_json["connectionConfiguration"],\n ),\n source_id=api_json["sourceId"],\n source_definition_id=None,\n )\n\n\n
[docs]class AirbyteDestination:\n """Represents a user-defined Airbyte destination.\n\n Args:\n name (str): The display name of the destination.\n destination_type (str): The type of the destination, from Airbyte's list\n of destinations https://airbytehq.github.io/category/destinations/.\n destination_configuration (Mapping[str, Any]): The configuration for the\n destination, as defined by Airbyte's API.\n """\n\n
[docs] @public\n def __init__(\n self, name: str, destination_type: str, destination_configuration: Mapping[str, Any]\n ):\n self.name = check.str_param(name, "name")\n self.destination_type = check.str_param(destination_type, "destination_type")\n self.destination_configuration = check.mapping_param(\n destination_configuration, "destination_configuration", key_type=str\n )
\n\n def must_be_recreated(self, other: "AirbyteDestination") -> bool:\n return self.name != other.name or self.destination_type != other.destination_type
\n\n\nclass InitializedAirbyteDestination:\n """User-defined Airbyte destination bound to actual created Airbyte destination."""\n\n def __init__(\n self,\n destination: AirbyteDestination,\n destination_id: str,\n destination_definition_id: Optional[str],\n ):\n self.destination = destination\n self.destination_id = destination_id\n self.destination_definition_id = destination_definition_id\n\n @classmethod\n def from_api_json(cls, api_json: Mapping[str, Any]):\n return cls(\n destination=AirbyteDestination(\n name=api_json["name"],\n destination_type=api_json["destinationName"],\n destination_configuration=api_json["connectionConfiguration"],\n ),\n destination_id=api_json["destinationId"],\n destination_definition_id=None,\n )\n\n\nclass AirbyteDestinationNamespace(Enum):\n """Represents the sync mode for a given Airbyte stream."""\n\n SAME_AS_SOURCE = "source"\n DESTINATION_DEFAULT = "destination"\n\n\n
[docs]class AirbyteConnection:\n """A user-defined Airbyte connection, pairing an Airbyte source and destination and configuring\n which streams to sync.\n\n Args:\n name (str): The display name of the connection.\n source (AirbyteSource): The source to sync from.\n destination (AirbyteDestination): The destination to sync to.\n stream_config (Mapping[str, AirbyteSyncMode]): A mapping from stream name to\n the sync mode for that stream, including any additional configuration\n of primary key or cursor field.\n normalize_data (Optional[bool]): Whether to normalize the data in the\n destination.\n destination_namespace (Optional[Union[AirbyteDestinationNamespace, str]]):\n The namespace to sync to in the destination. If set to\n AirbyteDestinationNamespace.SAME_AS_SOURCE, the namespace will be the\n same as the source namespace. If set to\n AirbyteDestinationNamespace.DESTINATION_DEFAULT, the namespace will be\n the default namespace for the destination. If set to a string, the\n namespace will be that string.\n prefix (Optional[str]): A prefix to add to the table names in the destination.\n\n Example:\n .. code-block:: python\n\n from dagster_airbyte.managed.generated.sources import FileSource\n from dagster_airbyte.managed.generated.destinations import LocalJsonDestination\n from dagster_airbyte import AirbyteConnection, AirbyteSyncMode\n\n cereals_csv_source = FileSource(...)\n local_json_destination = LocalJsonDestination(...)\n\n cereals_connection = AirbyteConnection(\n name="download-cereals",\n source=cereals_csv_source,\n destination=local_json_destination,\n stream_config={"cereals": AirbyteSyncMode.full_refresh_overwrite()},\n )\n """\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n source: AirbyteSource,\n destination: AirbyteDestination,\n stream_config: Mapping[str, AirbyteSyncMode],\n normalize_data: Optional[bool] = None,\n destination_namespace: Optional[\n Union[AirbyteDestinationNamespace, str]\n ] = AirbyteDestinationNamespace.SAME_AS_SOURCE,\n prefix: Optional[str] = None,\n ):\n self.name = check.str_param(name, "name")\n self.source = check.inst_param(source, "source", AirbyteSource)\n self.destination = check.inst_param(destination, "destination", AirbyteDestination)\n self.stream_config = check.mapping_param(\n stream_config, "stream_config", key_type=str, value_type=AirbyteSyncMode\n )\n self.normalize_data = check.opt_bool_param(normalize_data, "normalize_data")\n self.destination_namespace = check.opt_inst_param(\n destination_namespace, "destination_namespace", (str, AirbyteDestinationNamespace)\n )\n self.prefix = check.opt_str_param(prefix, "prefix")
\n\n def must_be_recreated(self, other: Optional["AirbyteConnection"]) -> bool:\n return (\n not other\n or self.source.must_be_recreated(other.source)\n or self.destination.must_be_recreated(other.destination)\n )
\n\n\nclass InitializedAirbyteConnection:\n """User-defined Airbyte connection bound to actual created Airbyte connection."""\n\n def __init__(\n self,\n connection: AirbyteConnection,\n connection_id: str,\n ):\n self.connection = connection\n self.connection_id = connection_id\n\n @classmethod\n def from_api_json(\n cls,\n api_dict: Mapping[str, Any],\n init_sources: Mapping[str, InitializedAirbyteSource],\n init_dests: Mapping[str, InitializedAirbyteDestination],\n ):\n source = next(\n (\n source.source\n for source in init_sources.values()\n if source.source_id == api_dict["sourceId"]\n ),\n None,\n )\n dest = next(\n (\n dest.destination\n for dest in init_dests.values()\n if dest.destination_id == api_dict["destinationId"]\n ),\n None,\n )\n\n source = check.not_none(source, f"Could not find source with id {api_dict['sourceId']}")\n dest = check.not_none(\n dest, f"Could not find destination with id {api_dict['destinationId']}"\n )\n\n streams = {\n stream["stream"]["name"]: AirbyteSyncMode.from_json(stream["config"])\n for stream in api_dict["syncCatalog"]["streams"]\n }\n return cls(\n AirbyteConnection(\n name=api_dict["name"],\n source=source,\n destination=dest,\n stream_config=streams,\n normalize_data=len(api_dict["operationIds"]) > 0,\n destination_namespace=(\n api_dict["namespaceFormat"]\n if api_dict["namespaceDefinition"] == "customformat"\n else AirbyteDestinationNamespace(api_dict["namespaceDefinition"])\n ),\n prefix=api_dict["prefix"] if api_dict.get("prefix") else None,\n ),\n api_dict["connectionId"],\n )\n\n\ndef _remove_none_values(obj: Dict[str, Any]) -> Dict[str, Any]:\n return {k: v for k, v in obj.items() if v is not None}\n\n\ndef _dump_class(obj: Any) -> Dict[str, Any]:\n return json.loads(json.dumps(obj, default=lambda o: _remove_none_values(o.__dict__)))\n\n\nclass GeneratedAirbyteSource(AirbyteSource):\n """Base class used by the codegen Airbyte sources. 
This class is not intended to be used directly.\n\n Converts all of its attributes into a source configuration dict which is passed down to the base\n AirbyteSource class.\n """\n\n def __init__(self, source_type: str, name: str):\n source_configuration = _dump_class(self)\n super().__init__(\n name=name, source_type=source_type, source_configuration=source_configuration\n )\n\n\nclass GeneratedAirbyteDestination(AirbyteDestination):\n """Base class used by the codegen Airbyte destinations. This class is not intended to be used directly.\n\n Converts all of its attributes into a destination configuration dict which is passed down to the\n base AirbyteDestination class.\n """\n\n def __init__(self, source_type: str, name: str):\n destination_configuration = _dump_class(self)\n super().__init__(\n name=name,\n destination_type=source_type,\n destination_configuration=destination_configuration,\n )\n
", "current_page_name": "_modules/dagster_airbyte/managed/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.types"}}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.ops

\nfrom typing import Any, Iterable, List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom dagster_airbyte.types import AirbyteOutput\nfrom dagster_airbyte.utils import _get_attempt, generate_materializations\n\nfrom .resources import DEFAULT_POLL_INTERVAL_SECONDS, BaseAirbyteResource\n\n\nclass AirbyteSyncConfig(Config):\n    connection_id: str = Field(\n        ...,\n        description=(\n            "Parsed json dictionary representing the details of the Airbyte connector after the"\n            " sync successfully completes. See the [Airbyte API"\n            " Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview)"\n            " to see detailed information on this response."\n        ),\n    )\n    poll_interval: float = Field(\n        DEFAULT_POLL_INTERVAL_SECONDS,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    poll_timeout: Optional[float] = Field(\n        None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        True,\n        description=(\n            "If True, materializations corresponding to the results of the Airbyte sync will "\n            "be yielded when the op executes."\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        ["airbyte"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n AirbyteOutput,\n description=(\n "Parsed json dictionary representing the details of the Airbyte connector after the"\n " sync successfully completes. See the [Airbyte API"\n " Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview)"\n " to see detailed information on this response."\n ),\n ),\n tags={"kind": "airbyte"},\n)\ndef airbyte_sync_op(\n context, config: AirbyteSyncConfig, airbyte: BaseAirbyteResource\n) -> Iterable[Any]:\n """Executes a Airbyte job sync for a given ``connection_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a AirbyteOutput which contains\n the job details for a given ``connection_id``.\n\n It requires the use of the :py:class:`~dagster_airbyte.airbyte_resource`, which allows it to\n communicate with the Airbyte API.\n\n Examples:\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource, airbyte_sync_op\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n }\n )\n\n sync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_simple_airbyte_job():\n sync_foobar()\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_composed_airbyte_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n airbyte_output = airbyte.sync_and_poll(\n connection_id=config.connection_id,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n yield from generate_materializations(\n airbyte_output, asset_key_prefix=config.asset_key_prefix\n )\n yield Output(\n airbyte_output,\n metadata={\n **_get_attempt(airbyte_output.job_details.get("attempts", [{}])[-1]).get(\n "totalStats", {}\n )\n 
},\n )
\n
", "current_page_name": "_modules/dagster_airbyte/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.resources

\nimport hashlib\nimport json\nimport logging\nimport sys\nimport time\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, List, Mapping, Optional, cast\n\nimport requests\nfrom dagster import (\n    ConfigurableResource,\n    Failure,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._config.pythonic_config import infer_schema_from_config_class\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.merger import deep_merge_dicts\nfrom pydantic import Field\nfrom requests.exceptions import RequestException\n\nfrom dagster_airbyte.types import AirbyteOutput\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n\n\nclass AirbyteState:\n    RUNNING = "running"\n    SUCCEEDED = "succeeded"\n    CANCELLED = "cancelled"\n    PENDING = "pending"\n    FAILED = "failed"\n    ERROR = "error"\n    INCOMPLETE = "incomplete"\n\n\nclass AirbyteResourceState:\n    def __init__(self) -> None:\n        self.request_cache: Dict[str, Optional[Mapping[str, object]]] = {}\n        # Int in case we nest contexts\n        self.cache_enabled = 0\n\n\nclass BaseAirbyteResource(ConfigurableResource):\n    request_max_retries: int = Field(\n        default=3,\n        description=(\n            "The maximum number of times requests to the Airbyte API should be retried "\n            "before failing."\n        ),\n    )\n    request_retry_delay: float = Field(\n        default=0.25,\n        description="Time (in seconds) to wait between each request retry.",\n    )\n    request_timeout: int = Field(\n        default=15,\n        description="Time (in seconds) after which the requests to Airbyte are declared timed out.",\n    )\n    cancel_sync_on_run_termination: bool = Field(\n        default=True,\n        description=(\n            "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. 
This may"\n            " be useful to disable if using Airbyte sources that cannot be cancelled and"\n            " resumed easily, or if your Dagster deployment may experience runner interruptions"\n            " that do not impact your Airbyte deployment."\n        ),\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL_SECONDS,\n        description="Time (in seconds) to wait between checking a sync's status.",\n    )\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    @property\n    @cached_method\n    def _log(self) -> logging.Logger:\n        return get_dagster_logger()\n\n    @property\n    @abstractmethod\n    def api_base_url(self) -> str:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def all_additional_request_params(self) -> Mapping[str, Any]:\n        raise NotImplementedError()\n\n    def make_request(\n        self, endpoint: str, data: Optional[Mapping[str, object]] = None, method: str = "POST"\n    ) -> Optional[Mapping[str, object]]:\n        """Creates and sends a request to the desired Airbyte REST API endpoint.\n\n        Args:\n            endpoint (str): The Airbyte API endpoint to send this request to.\n            data (Optional[str]): JSON-formatted data string to be included in the request.\n\n        Returns:\n            Optional[Dict[str, Any]]: Parsed json data from the response to this request\n        """\n        url = self.api_base_url + endpoint\n        headers = {"accept": "application/json"}\n\n        num_retries = 0\n        while True:\n            try:\n                request_args: Dict[str, Any] = dict(\n                    method=method,\n                    url=url,\n                    headers=headers,\n                    timeout=self.request_timeout,\n                )\n                if data:\n                    request_args["json"] = data\n\n                request_args = deep_merge_dicts(\n                    
request_args,\n                    self.all_additional_request_params,\n                )\n\n                response = requests.request(\n                    **request_args,\n                )\n                response.raise_for_status()\n                if response.status_code == 204:\n                    return None\n                return response.json()\n            except RequestException as e:\n                self._log.error("Request to Airbyte API failed: %s", e)\n                if num_retries == self.request_max_retries:\n                    break\n                num_retries += 1\n                time.sleep(self.request_retry_delay)\n\n        raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n    @abstractmethod\n    def start_sync(self, connection_id: str) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def cancel_job(self, job_id: int):\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def _should_forward_logs(self) -> bool:\n        raise NotImplementedError()\n\n    def sync_and_poll(\n        self,\n        connection_id: str,\n        poll_interval: Optional[float] = None,\n        poll_timeout: Optional[float] = None,\n    ) -> AirbyteOutput:\n        """Initializes a sync operation for the given connector, and polls until it completes.\n\n        Args:\n            connection_id (str): The Airbyte Connector ID. 
You can retrieve this value from the\n                "Connection" tab of a given connection in the Arbyte UI.\n            poll_interval (float): The time (in seconds) that will be waited between successive polls.\n            poll_timeout (float): The maximum time that will waited before this operation is timed\n                out. By default, this will never time out.\n\n        Returns:\n            :py:class:`~AirbyteOutput`:\n                Details of the sync job.\n        """\n        connection_details = self.get_connection_details(connection_id)\n        job_details = self.start_sync(connection_id)\n        job_info = cast(Dict[str, object], job_details.get("job", {}))\n        job_id = cast(int, job_info.get("id"))\n\n        self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n        start = time.monotonic()\n        logged_attempts = 0\n        logged_lines = 0\n        state = None\n\n        try:\n            while True:\n                if poll_timeout and start + poll_timeout < time.monotonic():\n                    raise Failure(\n                        f"Timeout: Airbyte job {job_id} is not ready after the timeout"\n                        f" {poll_timeout} seconds"\n                    )\n                time.sleep(poll_interval or self.poll_interval)\n                job_details = self.get_job_status(connection_id, job_id)\n                attempts = cast(List, job_details.get("attempts", []))\n                cur_attempt = len(attempts)\n                # spit out the available Airbyte log info\n                if cur_attempt:\n                    if self._should_forward_logs:\n                        log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n                        for line in log_lines[logged_lines:]:\n                            sys.stdout.write(line + "\\n")\n                            sys.stdout.flush()\n                        logged_lines = len(log_lines)\n\n             
       # if there's a next attempt, this one will have no more log messages\n                    if logged_attempts < cur_attempt - 1:\n                        logged_lines = 0\n                        logged_attempts += 1\n\n                job_info = cast(Dict[str, object], job_details.get("job", {}))\n                state = job_info.get("status")\n\n                if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n                    continue\n                elif state == AirbyteState.SUCCEEDED:\n                    break\n                elif state == AirbyteState.ERROR:\n                    raise Failure(f"Job failed: {job_id}")\n                elif state == AirbyteState.CANCELLED:\n                    raise Failure(f"Job was cancelled: {job_id}")\n                else:\n                    raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n        finally:\n            # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n            # the python process\n            if (\n                state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)\n                and self.cancel_sync_on_run_termination\n            ):\n                self.cancel_job(job_id)\n\n        return AirbyteOutput(job_details=job_details, connection_details=connection_details)\n\n\nclass AirbyteCloudResource(BaseAirbyteResource):\n    """This resource allows users to programatically interface with the Airbyte Cloud API to launch\n    syncs and monitor their progress.\n\n    **Examples:**\n\n    .. 
code-block:: python\n\n        from dagster import job, EnvVar\n        from dagster_airbyte import AirbyteResource\n\n        my_airbyte_resource = AirbyteCloudResource(\n            api_key=EnvVar("AIRBYTE_API_KEY"),\n        )\n\n        airbyte_assets = build_airbyte_assets(\n            connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n            destination_tables=["releases", "tags", "teams"],\n        )\n\n        defs = Definitions(\n            assets=[airbyte_assets],\n            resources={"airbyte": my_airbyte_resource},\n        )\n    """\n\n    api_key: str = Field(..., description="The Airbyte Cloud API key.")\n\n    @property\n    def api_base_url(self) -> str:\n        return "https://api.airbyte.com/v1"\n\n    @property\n    def all_additional_request_params(self) -> Mapping[str, Any]:\n        return {"headers": {"Authorization": f"Bearer {self.api_key}", "User-Agent": "dagster"}}\n\n    def start_sync(self, connection_id: str) -> Mapping[str, object]:\n        job_sync = check.not_none(\n            self.make_request(\n                endpoint="/jobs",\n                data={\n                    "connectionId": connection_id,\n                    "jobType": "sync",\n                },\n            )\n        )\n        return {"job": {"id": job_sync["jobId"], "status": job_sync["status"]}}\n\n    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n        return {}\n\n    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n        job_status = check.not_none(self.make_request(endpoint=f"/jobs/{job_id}", method="GET"))\n        return {"job": {"id": job_status["jobId"], "status": job_status["status"]}}\n\n    def cancel_job(self, job_id: int):\n        self.make_request(endpoint=f"/jobs/{job_id}", method="DELETE")\n\n    @property\n    def _should_forward_logs(self) -> bool:\n        # Airbyte Cloud does not support streaming logs yet\n        return False\n\n\n
[docs]class AirbyteResource(BaseAirbyteResource):\n """This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job, EnvVar\n from dagster_airbyte import AirbyteResource\n\n my_airbyte_resource = AirbyteResource(\n host=EnvVar("AIRBYTE_HOST"),\n port=EnvVar("AIRBYTE_PORT"),\n # If using basic auth\n username=EnvVar("AIRBYTE_USERNAME"),\n password=EnvVar("AIRBYTE_PASSWORD"),\n )\n\n airbyte_assets = build_airbyte_assets(\n connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n destination_tables=["releases", "tags", "teams"],\n )\n\n defs = Definitions(\n assets=[airbyte_assets],\n resources={"airbyte": my_airbyte_resource},\n )\n """\n\n host: str = Field(description="The Airbyte server address.")\n port: str = Field(description="Port used for the Airbyte server.")\n username: Optional[str] = Field(default=None, description="Username if using basic auth.")\n password: Optional[str] = Field(default=None, description="Password if using basic auth.")\n use_https: bool = Field(\n default=False, description="Whether to use HTTPS to connect to the Airbyte server."\n )\n forward_logs: bool = Field(\n default=True,\n description=(\n "Whether to forward Airbyte logs to the compute log, can be expensive for"\n " long-running syncs."\n ),\n )\n request_additional_params: Mapping[str, Any] = Field(\n default=dict(),\n description=(\n "Any additional kwargs to pass to the requests library when making requests to Airbyte."\n ),\n )\n\n @property\n @cached_method\n def _state(self) -> AirbyteResourceState:\n return AirbyteResourceState()\n\n @property\n @cached_method\n def _log(self) -> logging.Logger:\n return get_dagster_logger()\n\n @property\n def api_base_url(self) -> str:\n return (\n ("https://" if self.use_https else "http://")\n + (f"{self.host}:{self.port}" if self.port else self.host)\n + "/api/v1"\n )\n\n @property\n def 
_should_forward_logs(self) -> bool:\n return self.forward_logs\n\n @contextmanager\n def cache_requests(self):\n """Context manager that enables caching certain requests to the Airbyte API,\n cleared when the context is exited.\n """\n self.clear_request_cache()\n self._state.cache_enabled += 1\n try:\n yield\n finally:\n self.clear_request_cache()\n self._state.cache_enabled -= 1\n\n def clear_request_cache(self) -> None:\n self._state.request_cache = {}\n\n def make_request_cached(self, endpoint: str, data: Optional[Mapping[str, object]]):\n if not self._state.cache_enabled > 0:\n return self.make_request(endpoint, data)\n data_json = json.dumps(data, sort_keys=True)\n sha = hashlib.sha1()\n sha.update(endpoint.encode("utf-8"))\n sha.update(data_json.encode("utf-8"))\n digest = sha.hexdigest()\n\n if digest not in self._state.request_cache:\n self._state.request_cache[digest] = self.make_request(endpoint, data)\n return self._state.request_cache[digest]\n\n @property\n def all_additional_request_params(self) -> Mapping[str, Any]:\n auth_param = (\n {"auth": (self.username, self.password)} if self.username and self.password else {}\n )\n return {**auth_param, **self.request_additional_params}\n\n def make_request(\n self, endpoint: str, data: Optional[Mapping[str, object]]\n ) -> Optional[Mapping[str, object]]:\n """Creates and sends a request to the desired Airbyte REST API endpoint.\n\n Args:\n endpoint (str): The Airbyte API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Optional[Dict[str, Any]]: Parsed json data from the response to this request\n """\n url = self.api_base_url + endpoint\n headers = {"accept": "application/json"}\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n **deep_merge_dicts( # type: ignore\n dict(\n method="POST",\n url=url,\n headers=headers,\n json=data,\n timeout=self.request_timeout,\n auth=(\n (self.username, self.password)\n 
if self.username and self.password\n else None\n ),\n ),\n self.request_additional_params,\n ),\n )\n response.raise_for_status()\n if response.status_code == 204:\n return None\n return response.json()\n except RequestException as e:\n self._log.error("Request to Airbyte API failed: %s", e)\n if num_retries == self.request_max_retries:\n break\n num_retries += 1\n time.sleep(self.request_retry_delay)\n\n raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n def cancel_job(self, job_id: int):\n self.make_request(endpoint="/jobs/cancel", data={"id": job_id})\n\n def get_default_workspace(self) -> str:\n workspaces = cast(\n List[Dict[str, Any]],\n check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(\n "workspaces", []\n ),\n )\n return workspaces[0]["workspaceId"]\n\n def get_source_definition_by_name(self, name: str) -> Optional[str]:\n name_lower = name.lower()\n definitions = self.make_request_cached(endpoint="/source_definitions/list", data={})\n\n return next(\n (\n definition["sourceDefinitionId"]\n for definition in definitions["sourceDefinitions"]\n if definition["name"].lower() == name_lower\n ),\n None,\n )\n\n def get_destination_definition_by_name(self, name: str):\n name_lower = name.lower()\n definitions = cast(\n Dict[str, List[Dict[str, str]]],\n check.not_none(\n self.make_request_cached(endpoint="/destination_definitions/list", data={})\n ),\n )\n return next(\n (\n definition["destinationDefinitionId"]\n for definition in definitions["destinationDefinitions"]\n if definition["name"].lower() == name_lower\n ),\n None,\n )\n\n def get_source_catalog_id(self, source_id: str):\n result = cast(\n Dict[str, Any],\n check.not_none(\n self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})\n ),\n )\n return result["catalogId"]\n\n def get_source_schema(self, source_id: str) -> Mapping[str, Any]:\n return cast(\n Dict[str, Any],\n check.not_none(\n 
self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})\n ),\n )\n\n def does_dest_support_normalization(\n self, destination_definition_id: str, workspace_id: str\n ) -> bool:\n # Airbyte API changed source of truth for normalization in PR\n # https://github.com/airbytehq/airbyte/pull/21005\n norm_dest_def_spec: bool = cast(\n Dict[str, Any],\n check.not_none(\n self.make_request_cached(\n endpoint="/destination_definition_specifications/get",\n data={\n "destinationDefinitionId": destination_definition_id,\n "workspaceId": workspace_id,\n },\n )\n ),\n ).get("supportsNormalization", False)\n\n norm_dest_def: bool = (\n cast(\n Dict[str, Any],\n check.not_none(\n self.make_request_cached(\n endpoint="/destination_definitions/get",\n data={\n "destinationDefinitionId": destination_definition_id,\n },\n )\n ),\n )\n .get("normalizationConfig", {})\n .get("supported", False)\n )\n\n return any([norm_dest_def_spec, norm_dest_def])\n\n def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n if self.forward_logs:\n return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))\n else:\n # the "list all jobs" endpoint doesn't return logs, which actually makes it much more\n # lightweight for long-running syncs with many logs\n out = check.not_none(\n self.make_request(\n endpoint="/jobs/list",\n data={\n "configTypes": ["sync"],\n "configId": connection_id,\n # sync should be the most recent, so pageSize 5 is sufficient\n "pagination": {"pageSize": 5},\n },\n )\n )\n job = next((job for job in cast(List, out["jobs"]) if job["job"]["id"] == job_id), None)\n\n return check.not_none(job)\n\n def start_sync(self, connection_id: str) -> Mapping[str, object]:\n return check.not_none(\n self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})\n )\n\n def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n return check.not_none(\n 
self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})\n )\n\n def sync_and_poll(\n self,\n connection_id: str,\n poll_interval: Optional[float] = None,\n poll_timeout: Optional[float] = None,\n ) -> AirbyteOutput:\n """Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connection_id (str): The Airbyte Connector ID. You can retrieve this value from the\n "Connection" tab of a given connection in the Arbyte UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~AirbyteOutput`:\n Details of the sync job.\n """\n connection_details = self.get_connection_details(connection_id)\n job_details = self.start_sync(connection_id)\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n job_id = cast(int, job_info.get("id"))\n\n self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n start = time.monotonic()\n logged_attempts = 0\n logged_lines = 0\n state = None\n\n try:\n while True:\n if poll_timeout and start + poll_timeout < time.monotonic():\n raise Failure(\n f"Timeout: Airbyte job {job_id} is not ready after the timeout"\n f" {poll_timeout} seconds"\n )\n time.sleep(poll_interval or self.poll_interval)\n job_details = self.get_job_status(connection_id, job_id)\n attempts = cast(List, job_details.get("attempts", []))\n cur_attempt = len(attempts)\n # spit out the available Airbyte log info\n if cur_attempt:\n if self.forward_logs:\n log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n for line in log_lines[logged_lines:]:\n sys.stdout.write(line + "\\n")\n sys.stdout.flush()\n logged_lines = len(log_lines)\n\n # if there's a next attempt, this one will have no more log messages\n if logged_attempts < cur_attempt - 1:\n logged_lines = 0\n 
logged_attempts += 1\n\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n state = job_info.get("status")\n\n if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n continue\n elif state == AirbyteState.SUCCEEDED:\n break\n elif state == AirbyteState.ERROR:\n raise Failure(f"Job failed: {job_id}")\n elif state == AirbyteState.CANCELLED:\n raise Failure(f"Job was cancelled: {job_id}")\n else:\n raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n finally:\n # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n # the python process\n if (\n state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)\n and self.cancel_sync_on_run_termination\n ):\n self.cancel_job(job_id)\n\n return AirbyteOutput(job_details=job_details, connection_details=connection_details)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=AirbyteResource.to_config_schema())\ndef airbyte_resource(context) -> AirbyteResource:\n """This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Airbyte REST API, including expected response JSON\n schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n # If using basic auth\n "username": {"env": "AIRBYTE_USERNAME"},\n "password": {"env": "AIRBYTE_PASSWORD"},\n }\n )\n\n @job(resource_defs={"airbyte":my_airbyte_resource})\n def my_airbyte_job():\n ...\n\n """\n return AirbyteResource.from_resource_context(context)
\n\n\n@dagster_maintained_resource\n@resource(config_schema=infer_schema_from_config_class(AirbyteCloudResource))\ndef airbyte_cloud_resource(context) -> AirbyteCloudResource:\n """This resource allows users to programatically interface with the Airbyte Cloud REST API to launch\n syncs and monitor their progress. Currently, this resource may only be used with the more basic\n `dagster-airbyte` APIs, including the ops and assets.\n\n """\n return AirbyteCloudResource.from_resource_context(context)\n
", "current_page_name": "_modules/dagster_airbyte/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.resources"}}, "dagster_airflow": {"dagster_asset_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_asset_factory

\nfrom typing import AbstractSet, List, Mapping, Optional, Set, Tuple\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dag import DAG\nfrom dagster import (\n    AssetKey,\n    AssetsDefinition,\n    GraphDefinition,\n    OutputMapping,\n    TimeWindowPartitionsDefinition,\n)\nfrom dagster._core.definitions.graph_definition import create_adjacency_lists\nfrom dagster._utils.schedules import is_valid_cron_schedule\n\nfrom dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag\nfrom dagster_airflow.utils import (\n    DagsterAirflowError,\n    normalized_name,\n)\n\n\ndef _build_asset_dependencies(\n    dag: DAG,\n    graph: GraphDefinition,\n    task_ids_by_asset_key: Mapping[AssetKey, AbstractSet[str]],\n    upstream_dependencies_by_asset_key: Mapping[AssetKey, AbstractSet[AssetKey]],\n) -> Tuple[AbstractSet[OutputMapping], Mapping[str, AssetKey], Mapping[str, Set[AssetKey]]]:\n    """Builds the asset dependency graph for a given set of airflow task mappings and a dagster graph."""\n    output_mappings = set()\n    keys_by_output_name = {}\n    internal_asset_deps: dict[str, Set[AssetKey]] = {}\n\n    visited_nodes: dict[str, bool] = {}\n    upstream_deps = set()\n\n    def find_upstream_dependency(node_name: str) -> None:\n        """Uses Depth-Firs-Search to find all upstream asset dependencies\n        as described in task_ids_by_asset_key.\n        """\n        # node has been visited\n        if visited_nodes[node_name]:\n            return\n        # mark node as visted\n        visited_nodes[node_name] = True\n        # traverse upstream nodes\n        for output_handle in graph.dependency_structure.all_upstream_outputs_from_node(node_name):\n            forward_node = output_handle.node_name\n            match = False\n            # find any assets produced by upstream nodes and add them to the internal asset deps\n            for asset_key in task_ids_by_asset_key:\n                if (\n                    
forward_node.replace(f"{normalized_name(dag.dag_id)}__", "")\n                    in task_ids_by_asset_key[asset_key]\n                ):\n                    upstream_deps.add(asset_key)\n                    match = True\n            # don't traverse past nodes that have assets\n            if not match:\n                find_upstream_dependency(forward_node)\n\n    # iterate through each asset to find all upstream asset dependencies\n    for asset_key in task_ids_by_asset_key:\n        asset_upstream_deps = set()\n        for task_id in task_ids_by_asset_key[asset_key]:\n            visited_nodes = {s.name: False for s in graph.nodes}\n            upstream_deps = set()\n            find_upstream_dependency(normalized_name(dag.dag_id, task_id))\n            for dep in upstream_deps:\n                asset_upstream_deps.add(dep)\n            keys_by_output_name[f"result_{normalized_name(dag.dag_id, task_id)}"] = asset_key\n            output_mappings.add(\n                OutputMapping(\n                    graph_output_name=f"result_{normalized_name(dag.dag_id, task_id)}",\n                    mapped_node_name=normalized_name(dag.dag_id, task_id),\n                    mapped_node_output_name="airflow_task_complete",  # Default output name\n                )\n            )\n\n        # the tasks for a given asset should have the same internal deps\n        for task_id in task_ids_by_asset_key[asset_key]:\n            if f"result_{normalized_name(dag.dag_id, task_id)}" in internal_asset_deps:\n                internal_asset_deps[f"result_{normalized_name(dag.dag_id, task_id)}"].update(\n                    asset_upstream_deps\n                )\n            else:\n                internal_asset_deps[f"result_{normalized_name(dag.dag_id, task_id)}"] = (\n                    asset_upstream_deps\n                )\n\n    # add new upstream asset dependencies to the internal deps\n    for asset_key in upstream_dependencies_by_asset_key:\n        for key in 
keys_by_output_name:\n            if keys_by_output_name[key] == asset_key:\n                internal_asset_deps[key].update(upstream_dependencies_by_asset_key[asset_key])\n\n    return (output_mappings, keys_by_output_name, internal_asset_deps)\n\n\n
[docs]def load_assets_from_airflow_dag(\n dag: DAG,\n task_ids_by_asset_key: Mapping[AssetKey, AbstractSet[str]] = {},\n upstream_dependencies_by_asset_key: Mapping[AssetKey, AbstractSet[AssetKey]] = {},\n connections: Optional[List[Connection]] = None,\n) -> List[AssetsDefinition]:\n """[Experimental] Construct Dagster Assets for a given Airflow DAG.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n task_ids_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[str]]]): A mapping from asset\n keys to task ids. Used break up the Airflow Dag into multiple SDAs\n upstream_dependencies_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[AssetKey]]]): A\n mapping from upstream asset keys to assets provided in task_ids_by_asset_key. Used to\n declare new upstream SDA depenencies.\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n List[AssetsDefinition]\n """\n cron_schedule = dag.normalized_schedule_interval\n if cron_schedule is not None and not is_valid_cron_schedule(str(cron_schedule)):\n raise DagsterAirflowError(f"Invalid cron schedule: {cron_schedule} in DAG {dag.dag_id}")\n\n job = make_dagster_job_from_airflow_dag(dag, connections=connections)\n graph = job._graph_def # noqa: SLF001\n start_date = dag.start_date if dag.start_date else dag.default_args.get("start_date")\n if start_date is None:\n raise DagsterAirflowError(f"Invalid start_date: {start_date} in DAG {dag.dag_id}")\n\n # leaf nodes have no downstream nodes\n forward_edges, _ = create_adjacency_lists(graph.nodes, graph.dependency_structure)\n leaf_nodes = {\n node_name.replace(f"{normalized_name(dag.dag_id)}__", "")\n for node_name, downstream_nodes in forward_edges.items()\n if not downstream_nodes\n }\n\n mutated_task_ids_by_asset_key: dict[AssetKey, set[str]] = {}\n\n if task_ids_by_asset_key is None or task_ids_by_asset_key == {}:\n # if no mappings are provided the dag becomes a single SDA\n task_ids_by_asset_key = 
{AssetKey(dag.dag_id): leaf_nodes}\n else:\n # if mappings were provide any unmapped leaf nodes are added to a default asset\n used_nodes: set[str] = set()\n for key in task_ids_by_asset_key:\n used_nodes.update(task_ids_by_asset_key[key])\n\n mutated_task_ids_by_asset_key[AssetKey(dag.dag_id)] = leaf_nodes - used_nodes\n\n for key in task_ids_by_asset_key:\n if key not in mutated_task_ids_by_asset_key:\n mutated_task_ids_by_asset_key[key] = set(task_ids_by_asset_key[key])\n else:\n mutated_task_ids_by_asset_key[key].update(task_ids_by_asset_key[key])\n\n output_mappings, keys_by_output_name, internal_asset_deps = _build_asset_dependencies(\n dag, graph, mutated_task_ids_by_asset_key, upstream_dependencies_by_asset_key\n )\n\n new_graph = graph.copy(\n output_mappings=list(output_mappings),\n )\n\n asset_def = AssetsDefinition.from_graph(\n graph_def=new_graph,\n partitions_def=(\n TimeWindowPartitionsDefinition(\n cron_schedule=str(cron_schedule),\n timezone=dag.timezone.name,\n start=start_date.strftime("%Y-%m-%dT%H:%M:%S"),\n fmt="%Y-%m-%dT%H:%M:%S",\n )\n if cron_schedule is not None\n else None\n ),\n group_name=dag.dag_id,\n keys_by_output_name=keys_by_output_name,\n internal_asset_deps=internal_asset_deps,\n can_subset=True,\n )\n return [asset_def]
\n
", "current_page_name": "_modules/dagster_airflow/dagster_asset_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_asset_factory"}, "dagster_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_factory

\nimport os\nfrom typing import List, Mapping, Optional, Tuple\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dagbag import DagBag\nfrom dagster import (\n    Definitions,\n    JobDefinition,\n    ResourceDefinition,\n    ScheduleDefinition,\n    _check as check,\n)\n\nfrom dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag\nfrom dagster_airflow.dagster_schedule_factory import (\n    _is_dag_is_schedule,\n    make_dagster_schedule_from_airflow_dag,\n)\nfrom dagster_airflow.patch_airflow_example_dag import patch_airflow_example_dag\nfrom dagster_airflow.resources import (\n    make_ephemeral_airflow_db_resource as make_ephemeral_airflow_db_resource,\n)\nfrom dagster_airflow.resources.airflow_ephemeral_db import AirflowEphemeralDatabase\nfrom dagster_airflow.resources.airflow_persistent_db import AirflowPersistentDatabase\nfrom dagster_airflow.utils import (\n    is_airflow_2_loaded_in_environment,\n)\n\n\n
[docs]def make_dagster_definitions_from_airflow_dag_bag(\n dag_bag: DagBag,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster definition corresponding to Airflow DAGs in DagBag.\n\n Usage:\n Create `make_dagster_definition.py`:\n from dagster_airflow import make_dagster_definition_from_airflow_dag_bag\n from airflow_home import my_dag_bag\n\n def make_definition_from_dag_bag():\n return make_dagster_definition_from_airflow_dag_bag(my_dag_bag)\n\n Use Definitions as usual, for example:\n `dagster-webserver -f path/to/make_dagster_definition.py`\n\n Args:\n dag_bag (DagBag): Airflow DagBag Model\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n Definitions\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n schedules, jobs = make_schedules_and_jobs_from_airflow_dag_bag(\n dag_bag=dag_bag,\n connections=connections,\n resource_defs=resource_defs,\n )\n\n return Definitions(\n schedules=schedules,\n jobs=jobs,\n resources=resource_defs,\n )
\n\n\n
[docs]def make_dagster_definitions_from_airflow_dags_path(\n dag_path: str,\n safe_mode: bool = True,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster repository corresponding to Airflow DAGs in dag_path.\n\n Usage:\n Create ``make_dagster_definitions.py``:\n\n .. code-block:: python\n\n from dagster_airflow import make_dagster_definitions_from_airflow_dags_path\n\n def make_definitions_from_dir():\n return make_dagster_definitions_from_airflow_dags_path(\n '/path/to/dags/',\n )\n\n Use RepositoryDefinition as usual, for example:\n ``dagster-webserver -f path/to/make_dagster_repo.py -n make_repo_from_dir``\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n include_examples (bool): True to include Airflow's example DAGs. (default: False)\n safe_mode (bool): True to use Airflow's default heuristic to find files that contain DAGs\n (ie find files that contain both b'DAG' and b'airflow') (default: True)\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n Definitions\n """\n check.str_param(dag_path, "dag_path")\n check.bool_param(safe_mode, "safe_mode")\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n if (\n resource_defs["airflow_db"].resource_fn.__qualname__.split(".")[0]\n == "AirflowEphemeralDatabase"\n ):\n AirflowEphemeralDatabase._initialize_database(connections=connections) # noqa: SLF001\n elif (\n resource_defs["airflow_db"].resource_fn.__qualname__.split(".")[0]\n == "AirflowPersistentDatabase"\n ):\n 
AirflowPersistentDatabase._initialize_database( # noqa: SLF001\n uri=(\n os.getenv("AIRFLOW__DATABASE__SQL_ALCHEMY_CONN", "")\n if is_airflow_2_loaded_in_environment()\n else os.getenv("AIRFLOW__CORE__SQL_ALCHEMY_CONN", "")\n ),\n connections=connections,\n )\n\n dag_bag = DagBag(\n dag_folder=dag_path,\n include_examples=False, # Exclude Airflow example dags\n safe_mode=safe_mode,\n )\n\n return make_dagster_definitions_from_airflow_dag_bag(\n dag_bag=dag_bag,\n connections=connections,\n resource_defs=resource_defs,\n )
\n\n\ndef make_dagster_definitions_from_airflow_example_dags(\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster repository for Airflow's example DAGs.\n\n Usage:\n\n Create `make_dagster_definitions.py`:\n from dagster_airflow import make_dagster_definitions_from_airflow_example_dags\n\n def make_airflow_example_dags():\n return make_dagster_definitions_from_airflow_example_dags()\n\n Use Definitions as usual, for example:\n `dagster-webserver -f path/to/make_dagster_definitions.py`\n\n Args:\n resource_defs: Optional[Mapping[str, ResourceDefinition]]\n Resource definitions to be used with the definitions\n\n Returns:\n Definitions\n """\n dag_bag = DagBag(\n dag_folder="some/empty/folder/with/no/dags", # prevent defaulting to settings.DAGS_FOLDER\n include_examples=True,\n )\n\n # There is a bug in Airflow v1 where the python_callable for task\n # 'search_catalog' is missing a required position argument '_'. It is fixed in airflow v2\n patch_airflow_example_dag(dag_bag)\n\n return make_dagster_definitions_from_airflow_dag_bag(\n dag_bag=dag_bag, resource_defs=resource_defs\n )\n\n\n
[docs]def make_schedules_and_jobs_from_airflow_dag_bag(\n dag_bag: DagBag,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Tuple[List[ScheduleDefinition], List[JobDefinition]]:\n """Construct Dagster Schedules and Jobs corresponding to Airflow DagBag.\n\n Args:\n dag_bag (DagBag): Airflow DagBag Model\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n - List[ScheduleDefinition]: The generated Dagster Schedules\n - List[JobDefinition]: The generated Dagster Jobs\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n\n job_defs = []\n schedule_defs = []\n count = 0\n # To enforce predictable iteration order\n sorted_dag_ids = sorted(dag_bag.dag_ids)\n for dag_id in sorted_dag_ids:\n dag = dag_bag.dags.get(dag_id)\n if not dag:\n continue\n if _is_dag_is_schedule(dag):\n schedule_defs.append(\n make_dagster_schedule_from_airflow_dag(\n dag=dag, tags=None, connections=connections, resource_defs=resource_defs\n )\n )\n else:\n job_defs.append(\n make_dagster_job_from_airflow_dag(\n dag=dag, tags=None, connections=connections, resource_defs=resource_defs\n )\n )\n\n count += 1\n\n return schedule_defs, job_defs
\n
", "current_page_name": "_modules/dagster_airflow/dagster_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_factory"}, "dagster_job_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_job_factory

\nfrom typing import List, Mapping, Optional\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dag import DAG\nfrom dagster import (\n    GraphDefinition,\n    JobDefinition,\n    ResourceDefinition,\n    _check as check,\n)\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.instance import IS_AIRFLOW_INGEST_PIPELINE_STR\n\nfrom dagster_airflow.airflow_dag_converter import get_graph_definition_args\nfrom dagster_airflow.resources import (\n    make_ephemeral_airflow_db_resource as make_ephemeral_airflow_db_resource,\n)\nfrom dagster_airflow.utils import (\n    normalized_name,\n)\n\n\n
[docs]def make_dagster_job_from_airflow_dag(\n dag: DAG,\n tags: Optional[Mapping[str, str]] = None,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> JobDefinition:\n """Construct a Dagster job corresponding to a given Airflow DAG.\n\n Tasks in the resulting job will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Execute job directly. This will set execution_date to the\n time (in UTC) of the run.\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the job tags. This will override\n behavior from (1).\n\n .. code-block:: python\n\n my_dagster_job = make_dagster_job_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n my_dagster_job.execute_in_process()\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the run tags,\n such as in the Dagster UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating job name and op\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n tags (Dict[str, Field]): Job tags. 
Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n connections (List[Connection]): List of Airflow Connections to be created in the Ephemeral\n Airflow DB, if use_emphemeral_airflow_db is False this will be ignored.\n\n Returns:\n JobDefinition: The generated Dagster job\n\n """\n check.inst_param(dag, "dag", DAG)\n tags = check.opt_mapping_param(tags, "tags")\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n\n mutated_tags = dict(tags)\n if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:\n mutated_tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = "true"\n\n mutated_tags = validate_tags(mutated_tags)\n\n node_dependencies, node_defs = get_graph_definition_args(dag=dag)\n\n graph_def = GraphDefinition(\n name=normalized_name(dag.dag_id),\n description="",\n node_defs=node_defs,\n dependencies=node_dependencies,\n tags=mutated_tags,\n )\n\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n job_def = JobDefinition(\n name=normalized_name(dag.dag_id),\n description="",\n graph_def=graph_def,\n resource_defs=resource_defs,\n tags=mutated_tags,\n metadata={},\n op_retry_policy=None,\n version_strategy=None,\n )\n return job_def
\n
", "current_page_name": "_modules/dagster_airflow/dagster_job_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_job_factory"}, "operators": {"dagster_operator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.operators.dagster_operator

\nimport json\n\nfrom airflow.models import BaseOperator\nfrom airflow.utils.decorators import apply_defaults\n\nfrom dagster_airflow.hooks.dagster_hook import DagsterHook\nfrom dagster_airflow.links.dagster_link import LINK_FMT, DagsterLink\nfrom dagster_airflow.utils import is_airflow_2_loaded_in_environment\n\n\n
[docs]class DagsterOperator(BaseOperator):\n """DagsterOperator.\n\n Uses the dagster graphql api to run and monitor dagster jobs on remote dagster infrastructure\n\n Parameters:\n repository_name (str): the name of the repository to use\n repostitory_location_name (str): the name of the repostitory location to use\n job_name (str): the name of the job to run\n run_config (Optional[Dict[str, Any]]): the run config to use for the job run\n dagster_conn_id (Optional[str]): the id of the dagster connection, airflow 2.0+ only\n organization_id (Optional[str]): the id of the dagster cloud organization\n deployment_name (Optional[str]): the name of the dagster cloud deployment\n user_token (Optional[str]): the dagster cloud user token to use\n """\n\n template_fields = ["run_config"]\n template_ext = (".yaml", ".yml", ".json")\n ui_color = "#663399"\n ui_fgcolor = "#e0e3fc"\n operator_extra_links = (DagsterLink(),)\n\n @apply_defaults\n def __init__(\n self,\n dagster_conn_id="dagster_default",\n run_config=None,\n repository_name="",\n repostitory_location_name="",\n job_name="",\n # params for airflow < 2.0.0 were custom connections aren't supported\n deployment_name="prod",\n user_token=None,\n organization_id="",\n url="https://dagster.cloud/",\n *args,\n **kwargs,\n ) -> None:\n super().__init__(*args, **kwargs)\n self.run_id = None\n self.dagster_conn_id = dagster_conn_id if is_airflow_2_loaded_in_environment() else None\n self.run_config = run_config or {}\n self.repository_name = repository_name\n self.repostitory_location_name = repostitory_location_name\n self.job_name = job_name\n\n self.user_token = user_token\n self.url = url\n self.organization_id = organization_id\n self.deployment_name = deployment_name\n\n self.hook = DagsterHook(\n dagster_conn_id=self.dagster_conn_id,\n user_token=self.user_token,\n url=f"{self.url}{self.organization_id}/{self.deployment_name}/graphql",\n )\n\n def _is_json(self, blob):\n try:\n json.loads(blob)\n except ValueError:\n 
return False\n return True\n\n def pre_execute(self, context):\n # force re-rendering to ensure run_config renders any templated\n # content from run_config that couldn't be accessed on init\n setattr(\n self,\n "run_config",\n self.render_template(self.run_config, context),\n )\n\n def on_kill(self):\n self.log.info("Terminating Run")\n self.hook.terminate_run(\n run_id=self.run_id,\n )\n\n def execute(self, context):\n try:\n return self._execute(context)\n except Exception as e:\n raise e\n\n def _execute(self, context):\n self.run_id = self.hook.launch_run(\n repository_name=self.repository_name,\n repostitory_location_name=self.repostitory_location_name,\n job_name=self.job_name,\n run_config=self.run_config,\n )\n # save relevant info in xcom for use in links\n context["task_instance"].xcom_push(key="run_id", value=self.run_id)\n context["task_instance"].xcom_push(\n key="organization_id",\n value=self.hook.organization_id if self.dagster_conn_id else self.organization_id,\n )\n context["task_instance"].xcom_push(\n key="deployment_name",\n value=self.hook.deployment_name if self.dagster_conn_id else self.deployment_name,\n )\n\n self.log.info("Run Starting....")\n self.log.info(\n "Run tracking: %s",\n LINK_FMT.format(\n organization_id=self.hook.organization_id,\n deployment_name=self.hook.deployment_name,\n run_id=self.run_id,\n ),\n )\n self.hook.wait_for_run(\n run_id=self.run_id,\n )
\n\n\n
[docs]class DagsterCloudOperator(DagsterOperator):\n """DagsterCloudOperator.\n\n Uses the dagster cloud graphql api to run and monitor dagster jobs on dagster cloud\n\n Parameters:\n repository_name (str): the name of the repository to use\n repostitory_location_name (str): the name of the repostitory location to use\n job_name (str): the name of the job to run\n run_config (Optional[Dict[str, Any]]): the run config to use for the job run\n dagster_conn_id (Optional[str]): the id of the dagster connection, airflow 2.0+ only\n organization_id (Optional[str]): the id of the dagster cloud organization\n deployment_name (Optional[str]): the name of the dagster cloud deployment\n user_token (Optional[str]): the dagster cloud user token to use\n """
\n
", "current_page_name": "_modules/dagster_airflow/operators/dagster_operator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.operators.dagster_operator"}}, "resources": {"airflow_ephemeral_db": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.resources.airflow_ephemeral_db

\nimport importlib\nimport os\nimport tempfile\nfrom typing import List, Optional\n\nimport airflow\nfrom airflow.models.connection import Connection\nfrom airflow.utils import db\nfrom dagster import (\n    Array,\n    DagsterRun,\n    Field,\n    InitResourceContext,\n    Noneable,\n    ResourceDefinition,\n    _check as check,\n)\n\nfrom dagster_airflow.resources.airflow_db import AirflowDatabase\nfrom dagster_airflow.utils import (\n    Locker,\n    create_airflow_connections,\n    is_airflow_2_loaded_in_environment,\n    serialize_connections,\n)\n\n\nclass AirflowEphemeralDatabase(AirflowDatabase):\n    """A ephemeral Airflow database Dagster resource."""\n\n    def __init__(\n        self, airflow_home_path: str, dagster_run: DagsterRun, dag_run_config: Optional[dict] = None\n    ):\n        self.airflow_home_path = airflow_home_path\n        super().__init__(dagster_run=dagster_run, dag_run_config=dag_run_config)\n\n    @staticmethod\n    def _initialize_database(\n        airflow_home_path: str = os.path.join(tempfile.gettempdir(), "dagster_airflow"),\n        connections: List[Connection] = [],\n    ):\n        os.environ["AIRFLOW_HOME"] = airflow_home_path\n        os.makedirs(airflow_home_path, exist_ok=True)\n        with Locker(airflow_home_path):\n            airflow_initialized = os.path.exists(f"{airflow_home_path}/airflow.db")\n            # because AIRFLOW_HOME has been overriden airflow needs to be reloaded\n            if is_airflow_2_loaded_in_environment():\n                importlib.reload(airflow.configuration)\n                importlib.reload(airflow.settings)\n                importlib.reload(airflow)\n            else:\n                importlib.reload(airflow)\n            if not airflow_initialized:\n                db.initdb()\n                create_airflow_connections(connections)\n\n    @staticmethod\n    def from_resource_context(context: InitResourceContext) -> "AirflowEphemeralDatabase":\n        airflow_home_path = 
os.path.join(tempfile.gettempdir(), f"dagster_airflow_{context.run_id}")\n        AirflowEphemeralDatabase._initialize_database(\n            airflow_home_path=airflow_home_path,\n            connections=[Connection(**c) for c in context.resource_config["connections"]],\n        )\n        return AirflowEphemeralDatabase(\n            airflow_home_path=airflow_home_path,\n            dagster_run=check.not_none(context.dagster_run, "Context must have run"),\n            dag_run_config=context.resource_config.get("dag_run_config"),\n        )\n\n\n
[docs]def make_ephemeral_airflow_db_resource(\n connections: List[Connection] = [], dag_run_config: Optional[dict] = None\n) -> ResourceDefinition:\n """Creates a Dagster resource that provides an ephemeral Airflow database.\n\n Args:\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n dag_run_config (Optional[dict]): dag_run configuration to be used when creating a DagRun\n\n Returns:\n ResourceDefinition: The ephemeral Airflow DB resource\n\n """\n serialized_connections = serialize_connections(connections)\n airflow_db_resource_def = ResourceDefinition(\n resource_fn=AirflowEphemeralDatabase.from_resource_context,\n config_schema={\n "connections": Field(\n Array(inner_type=dict),\n default_value=serialized_connections,\n is_required=False,\n ),\n "dag_run_config": Field(\n Noneable(dict),\n default_value=dag_run_config,\n is_required=False,\n ),\n },\n description="Ephemeral Airflow DB to be used by dagster-airflow ",\n )\n return airflow_db_resource_def
\n
", "current_page_name": "_modules/dagster_airflow/resources/airflow_ephemeral_db", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.resources.airflow_ephemeral_db"}, "airflow_persistent_db": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.resources.airflow_persistent_db

\nimport importlib\nimport os\nfrom typing import List, Optional\n\nimport airflow\nfrom airflow.models.connection import Connection\nfrom dagster import (\n    Array,\n    DagsterRun,\n    Field,\n    InitResourceContext,\n    ResourceDefinition,\n    StringSource,\n    _check as check,\n)\n\nfrom dagster_airflow.resources.airflow_db import AirflowDatabase\nfrom dagster_airflow.utils import (\n    create_airflow_connections,\n    is_airflow_2_loaded_in_environment,\n    serialize_connections,\n)\n\n\nclass AirflowPersistentDatabase(AirflowDatabase):\n    """A persistent Airflow database Dagster resource."""\n\n    def __init__(self, dagster_run: DagsterRun, uri: str, dag_run_config: Optional[dict] = None):\n        self.uri = uri\n        super().__init__(dagster_run=dagster_run, dag_run_config=dag_run_config)\n\n    @staticmethod\n    def _initialize_database(uri: str, connections: List[Connection] = []):\n        if is_airflow_2_loaded_in_environment("2.3.0"):\n            os.environ["AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"] = uri\n            importlib.reload(airflow.configuration)\n            importlib.reload(airflow.settings)\n            importlib.reload(airflow)\n        else:\n            os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = uri\n            importlib.reload(airflow)\n        create_airflow_connections(connections)\n\n    @staticmethod\n    def from_resource_context(context: InitResourceContext) -> "AirflowPersistentDatabase":\n        uri = context.resource_config["uri"]\n        AirflowPersistentDatabase._initialize_database(\n            uri=uri, connections=[Connection(**c) for c in context.resource_config["connections"]]\n        )\n        return AirflowPersistentDatabase(\n            dagster_run=check.not_none(context.dagster_run, "Context must have run"),\n            uri=uri,\n            dag_run_config=context.resource_config["dag_run_config"],\n        )\n\n\n
[docs]def make_persistent_airflow_db_resource(\n uri: str = "",\n connections: List[Connection] = [],\n dag_run_config: Optional[dict] = {},\n) -> ResourceDefinition:\n """Creates a Dagster resource that provides an persistent Airflow database.\n\n\n Usage:\n .. code-block:: python\n\n from dagster_airflow import (\n make_dagster_definitions_from_airflow_dags_path,\n make_persistent_airflow_db_resource,\n )\n postgres_airflow_db = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"\n airflow_db = make_persistent_airflow_db_resource(uri=postgres_airflow_db)\n definitions = make_dagster_definitions_from_airflow_example_dags(\n '/path/to/dags/',\n resource_defs={"airflow_db": airflow_db}\n )\n\n\n Args:\n uri: SQLAlchemy URI of the Airflow DB to be used\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n dag_run_config (Optional[dict]): dag_run configuration to be used when creating a DagRun\n\n Returns:\n ResourceDefinition: The persistent Airflow DB resource\n\n """\n if is_airflow_2_loaded_in_environment():\n os.environ["AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"] = uri\n else:\n os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = uri\n\n serialized_connections = serialize_connections(connections)\n\n airflow_db_resource_def = ResourceDefinition(\n resource_fn=AirflowPersistentDatabase.from_resource_context,\n config_schema={\n "uri": Field(\n StringSource,\n default_value=uri,\n is_required=False,\n ),\n "connections": Field(\n Array(inner_type=dict),\n default_value=serialized_connections,\n is_required=False,\n ),\n "dag_run_config": Field(\n dict,\n default_value=dag_run_config,\n is_required=False,\n ),\n },\n description="Persistent Airflow DB to be used by dagster-airflow ",\n )\n return airflow_db_resource_def
\n
", "current_page_name": "_modules/dagster_airflow/resources/airflow_persistent_db", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.resources.airflow_persistent_db"}}}, "dagster_aws": {"ecs": {"launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.ecs.launcher

\nimport json\nimport logging\nimport os\nimport uuid\nimport warnings\nfrom collections import namedtuple\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import (\n    Array,\n    DagsterRunStatus,\n    Field,\n    Noneable,\n    Permissive,\n    ScalarUnion,\n    StringSource,\n    _check as check,\n)\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.instance import T_DagsterInstance\nfrom dagster._core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import RUN_WORKER_ID_TAG\nfrom dagster._grpc.types import ExecuteRunArgs\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._utils.backoff import backoff\nfrom typing_extensions import Self\n\nfrom ..secretsmanager import get_secrets_from_arns\nfrom .container_context import SHARED_ECS_SCHEMA, SHARED_TASK_DEFINITION_FIELDS, EcsContainerContext\nfrom .tasks import (\n    DagsterEcsTaskDefinitionConfig,\n    get_current_ecs_task,\n    get_current_ecs_task_metadata,\n    get_task_definition_dict_from_current_task,\n    get_task_kwargs_from_current_task,\n)\nfrom .utils import get_task_definition_family, get_task_logs, task_definitions_match\n\nTags = namedtuple("Tags", ["arn", "cluster", "cpu", "memory"])\n\nRUNNING_STATUSES = [\n    "PROVISIONING",\n    "PENDING",\n    "ACTIVATING",\n    "RUNNING",\n    "DEACTIVATING",\n    "STOPPING",\n    "DEPROVISIONING",\n]\nSTOPPED_STATUSES = ["STOPPED"]\n\nDEFAULT_WINDOWS_RESOURCES = {"cpu": "1024", "memory": "2048"}\n\nDEFAULT_LINUX_RESOURCES = {"cpu": "256", "memory": "512"}\n\n\n
[docs]class EcsRunLauncher(RunLauncher[T_DagsterInstance], ConfigurableClass):\n """RunLauncher that starts a task in ECS for each Dagster job run."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n task_definition=None,\n container_name="run",\n secrets=None,\n secrets_tag="dagster",\n env_vars=None,\n include_sidecars=False,\n use_current_ecs_task_config: bool = True,\n run_task_kwargs: Optional[Mapping[str, Any]] = None,\n run_resources: Optional[Dict[str, Any]] = None,\n run_ecs_tags: Optional[List[Dict[str, Optional[str]]]] = None,\n ):\n self._inst_data = inst_data\n self.ecs = boto3.client("ecs")\n self.ec2 = boto3.resource("ec2")\n self.secrets_manager = boto3.client("secretsmanager")\n self.logs = boto3.client("logs")\n\n self.task_definition = None\n self.task_definition_dict = {}\n if isinstance(task_definition, str):\n self.task_definition = task_definition\n elif task_definition and "env" in task_definition:\n check.invariant(\n len(task_definition) == 1,\n "If `task_definition` is set to a dictionary with `env`, `env` must be the only"\n " key.",\n )\n env_var = task_definition["env"]\n self.task_definition = os.getenv(env_var)\n if not self.task_definition:\n raise Exception(\n f"You have attempted to fetch the environment variable {env_var} which is not"\n " set."\n )\n else:\n self.task_definition_dict = task_definition or {}\n\n self.container_name = container_name\n\n self.secrets = check.opt_list_param(secrets, "secrets")\n\n self.env_vars = check.opt_list_param(env_vars, "env_vars")\n\n if self.secrets and all(isinstance(secret, str) for secret in self.secrets):\n warnings.warn(\n "Setting secrets as a list of ARNs is deprecated. 
"\n "Secrets should instead follow the same structure as the ECS API: "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html",\n DeprecationWarning,\n )\n self.secrets = [\n {"name": name, "valueFrom": value_from}\n for name, value_from in get_secrets_from_arns(\n self.secrets_manager, self.secrets\n ).items()\n ]\n\n self.secrets_tags = [secrets_tag] if secrets_tag else []\n self.include_sidecars = include_sidecars\n\n if self.task_definition:\n task_definition = self.ecs.describe_task_definition(taskDefinition=self.task_definition)\n container_names = [\n container.get("name")\n for container in task_definition["taskDefinition"]["containerDefinitions"]\n ]\n check.invariant(\n container_name in container_names,\n f"Cannot override container '{container_name}' in task definition "\n f"'{self.task_definition}' because the container is not defined.",\n )\n self.task_definition = task_definition["taskDefinition"]["taskDefinitionArn"]\n\n self.use_current_ecs_task_config = check.opt_bool_param(\n use_current_ecs_task_config, "use_current_ecs_task_config"\n )\n\n self.run_task_kwargs = check.opt_mapping_param(run_task_kwargs, "run_task_kwargs")\n if run_task_kwargs:\n check.invariant(\n "taskDefinition" not in run_task_kwargs,\n "Use the `taskDefinition` config field to pass in a task definition to run.",\n )\n check.invariant(\n "overrides" not in run_task_kwargs,\n "Task overrides are set by the run launcher and cannot be set in run_task_kwargs.",\n )\n\n expected_keys = [\n key for key in self.ecs.meta.service_model.shape_for("RunTaskRequest").members\n ]\n\n for key in run_task_kwargs:\n check.invariant(\n key in expected_keys, f"Found an unexpected key {key} in run_task_kwargs"\n )\n\n self.run_resources = check.opt_mapping_param(run_resources, "run_resources")\n\n self.run_ecs_tags = check.opt_sequence_param(run_ecs_tags, "run_ecs_tags")\n\n self._current_task_metadata = None\n self._current_task = None\n\n @property\n def 
inst_data(self):\n return self._inst_data\n\n @property\n def task_role_arn(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("task_role_arn")\n\n @property\n def execution_role_arn(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("execution_role_arn")\n\n @property\n def runtime_platform(self) -> Optional[Mapping[str, Any]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("runtime_platform")\n\n @property\n def mount_points(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("mount_points")\n\n @property\n def volumes(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("volumes")\n\n @property\n def repository_credentials(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("repository_credentials")\n\n @property\n def run_sidecar_containers(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("sidecar_containers")\n\n @classmethod\n def config_type(cls):\n return {\n "task_definition": Field(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={\n "log_group": Field(StringSource, is_required=False),\n "sidecar_containers": Field(Array(Permissive({})), is_required=False),\n "requires_compatibilities": Field(Array(str), is_required=False),\n "env": Field(\n str,\n is_required=False,\n description=(\n "Backwards-compatibility for when task_definition was a"\n " StringSource.Can be used to source the task_definition scalar"\n " from an environment variable."\n ),\n ),\n **SHARED_TASK_DEFINITION_FIELDS,\n },\n ),\n is_required=False,\n description=(\n "Either the short name of an existing task 
definition to use when launching new"\n " tasks, or a dictionary configuration to use when creating a task definition"\n " for the run.If neither is provided, the task definition will be created based"\n " on the current task's task definition."\n ),\n ),\n "container_name": Field(\n StringSource,\n is_required=False,\n default_value="run",\n description=(\n "The container name to use when launching new tasks. Defaults to 'run'."\n ),\n ),\n "secrets": Field(\n Array(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={"name": StringSource, "valueFrom": StringSource},\n )\n ),\n is_required=False,\n description=(\n "An array of AWS Secrets Manager secrets. These secrets will "\n "be mounted as environment variables in the container. See "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html."\n ),\n ),\n "secrets_tag": Field(\n Noneable(StringSource),\n is_required=False,\n default_value="dagster",\n description=(\n "AWS Secrets Manager secrets with this tag will be mounted as "\n "environment variables in the container. Defaults to 'dagster'."\n ),\n ),\n "include_sidecars": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "Whether each run should use the same sidecars as the task that launches it. "\n "Defaults to False."\n ),\n ),\n "use_current_ecs_task_config": Field(\n bool,\n is_required=False,\n default_value=True,\n description=(\n "Whether to use the run launcher's current ECS task in order to determine "\n "the cluster and networking configuration for the launched task. Defaults to "\n "True. Should only be called if the run launcher is running within an ECS "\n "task."\n ),\n ),\n "run_task_kwargs": Field(\n Permissive(\n {\n "cluster": Field(\n StringSource,\n is_required=False,\n description="Name of the ECS cluster to launch ECS tasks in.",\n ),\n }\n ),\n is_required=False,\n description=(\n "Additional arguments to include while running the task. 
See"\n " https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ecs.html#ECS.Client.run_task"\n " for the available parameters. The overrides and taskDefinition arguments will"\n " always be set by the run launcher."\n ),\n ),\n **SHARED_ECS_SCHEMA,\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return EcsRunLauncher(inst_data=inst_data, **config_value)\n\n def _set_run_tags(self, run_id: str, cluster: str, task_arn: str):\n tags = {\n "ecs/task_arn": task_arn,\n "ecs/cluster": cluster,\n RUN_WORKER_ID_TAG: str(uuid.uuid4().hex)[0:6],\n }\n self._instance.add_run_tags(run_id, tags)\n\n def build_ecs_tags_for_run_task(self, run, container_context: EcsContainerContext):\n if any(tag["key"] == "dagster/run_id" for tag in container_context.run_ecs_tags):\n raise Exception("Cannot override system ECS tag: dagster/run_id")\n\n return [{"key": "dagster/run_id", "value": run.run_id}, *container_context.run_ecs_tags]\n\n def _get_run_tags(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n tags = run.tags if run else {}\n arn = tags.get("ecs/task_arn")\n cluster = tags.get("ecs/cluster")\n cpu = tags.get("ecs/cpu")\n memory = tags.get("ecs/memory")\n\n return Tags(arn, cluster, cpu, memory)\n\n def _get_command_args(self, run_args: ExecuteRunArgs, context: LaunchRunContext):\n return run_args.get_command_args()\n\n def _get_image_for_run(self, context: LaunchRunContext) -> Optional[str]:\n job_origin = check.not_none(context.job_code_origin)\n return job_origin.repository_origin.container_image\n\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run in an ECS task."""\n run = context.dagster_run\n container_context = EcsContainerContext.create_for_run(run, self)\n\n job_origin = check.not_none(context.job_code_origin)\n\n # ECS limits overrides to 8192 characters including json formatting\n # 
https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_RunTask.html\n # When container_context is serialized as part of the ExecuteRunArgs, we risk\n # going over this limit (for example, if many secrets have been set). This strips\n # the container context off of our job origin because we don't actually need\n # it to launch the run; we only needed it to create the task definition.\n repository_origin = job_origin.repository_origin\n\n stripped_repository_origin = repository_origin._replace(container_context={})\n stripped_job_origin = job_origin._replace(repository_origin=stripped_repository_origin)\n\n args = ExecuteRunArgs(\n job_origin=stripped_job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n )\n command = self._get_command_args(args, context)\n image = self._get_image_for_run(context)\n\n run_task_kwargs = self._run_task_kwargs(run, image, container_context)\n\n # Set cpu or memory overrides\n # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html\n cpu_and_memory_overrides = self.get_cpu_and_memory_overrides(container_context, run)\n\n task_overrides = self._get_task_overrides(container_context, run)\n\n container_overrides: List[Dict[str, Any]] = [\n {\n "name": self._get_container_name(container_context),\n "command": command,\n # containerOverrides expects cpu/memory as integers\n **{k: int(v) for k, v in cpu_and_memory_overrides.items()},\n }\n ]\n\n run_task_kwargs["overrides"] = {\n "containerOverrides": container_overrides,\n # taskOverrides expects cpu/memory as strings\n **cpu_and_memory_overrides,\n **task_overrides,\n }\n run_task_kwargs["tags"] = [\n *run_task_kwargs.get("tags", []),\n *self.build_ecs_tags_for_run_task(run, container_context),\n ]\n\n run_task_kwargs_from_run = self._get_run_task_kwargs_from_run(run)\n run_task_kwargs.update(run_task_kwargs_from_run)\n\n # launchType and capacityProviderStrategy are incompatible - prefer the latter if it is set\n if "launchType" 
in run_task_kwargs and run_task_kwargs.get("capacityProviderStrategy"):\n del run_task_kwargs["launchType"]\n\n # Run a task using the same network configuration as this processes's task.\n response = self.ecs.run_task(**run_task_kwargs)\n\n tasks = response["tasks"]\n\n if not tasks:\n failures = response["failures"]\n failure_messages = []\n for failure in failures:\n arn = failure.get("arn")\n reason = failure.get("reason")\n detail = failure.get("detail")\n\n failure_message = (\n "Task"\n + (f" {arn}" if arn else "")\n + " failed."\n + (f" Failure reason: {reason}" if reason else "")\n + (f" Failure details: {detail}" if detail else "")\n )\n failure_messages.append(failure_message)\n\n raise Exception("\\n".join(failure_messages) if failure_messages else "Task failed.")\n\n arn = tasks[0]["taskArn"]\n cluster_arn = tasks[0]["clusterArn"]\n self._set_run_tags(run.run_id, cluster=cluster_arn, task_arn=arn)\n self.report_launch_events(run, arn, cluster_arn)\n\n def report_launch_events(\n self, run: DagsterRun, arn: Optional[str] = None, cluster: Optional[str] = None\n ):\n # Extracted method to allow for subclasses to customize the launch reporting behavior\n\n metadata = {}\n if arn:\n metadata["ECS Task ARN"] = arn\n if cluster:\n metadata["ECS Cluster"] = cluster\n\n metadata["Run ID"] = run.run_id\n self._instance.report_engine_event(\n message="Launching run in ECS task",\n dagster_run=run,\n engine_event_data=EngineEventData(metadata),\n cls=self.__class__,\n )\n\n def get_cpu_and_memory_overrides(\n self, container_context: EcsContainerContext, run: DagsterRun\n ) -> Mapping[str, str]:\n overrides = {}\n\n cpu = run.tags.get("ecs/cpu", container_context.run_resources.get("cpu"))\n memory = run.tags.get("ecs/memory", container_context.run_resources.get("memory"))\n\n if cpu:\n overrides["cpu"] = cpu\n if memory:\n overrides["memory"] = memory\n\n return overrides\n\n def _get_task_overrides(\n self, container_context: EcsContainerContext, run: 
DagsterRun\n ) -> Mapping[str, Any]:\n tag_overrides = run.tags.get("ecs/task_overrides")\n\n overrides = {}\n\n if tag_overrides:\n overrides = json.loads(tag_overrides)\n\n ephemeral_storage = run.tags.get(\n "ecs/ephemeral_storage", container_context.run_resources.get("ephemeral_storage")\n )\n if ephemeral_storage:\n overrides["ephemeralStorage"] = {"sizeInGiB": int(ephemeral_storage)}\n\n return overrides\n\n def _get_run_task_kwargs_from_run(self, run: DagsterRun) -> Mapping[str, Any]:\n run_task_kwargs = run.tags.get("ecs/run_task_kwargs")\n if run_task_kwargs:\n return json.loads(run_task_kwargs)\n return {}\n\n def terminate(self, run_id):\n tags = self._get_run_tags(run_id)\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n if not (tags.arn and tags.cluster):\n return False\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return False\n\n status = tasks[0].get("lastStatus")\n if status == "STOPPED":\n return False\n\n self.ecs.stop_task(task=tags.arn, cluster=tags.cluster)\n return True\n\n def _get_current_task_metadata(self):\n if self._current_task_metadata is None:\n self._current_task_metadata = get_current_ecs_task_metadata()\n return self._current_task_metadata\n\n def _get_current_task(self):\n if self._current_task is None:\n current_task_metadata = self._get_current_task_metadata()\n self._current_task = get_current_ecs_task(\n self.ecs, current_task_metadata.task_arn, current_task_metadata.cluster\n )\n\n return self._current_task\n\n def _get_run_task_definition_family(self, run: DagsterRun) -> str:\n return get_task_definition_family("run", check.not_none(run.external_job_origin))\n\n def _get_container_name(self, container_context) -> str:\n return container_context.container_name or self.container_name\n\n def _run_task_kwargs(self, run, image, container_context) -> Dict[str, Any]:\n """Return a dictionary of 
args to launch the ECS task, registering a new task\n definition if needed.\n """\n environment = self._environment(container_context)\n environment.append({"name": "DAGSTER_RUN_JOB_NAME", "value": run.job_name})\n\n secrets = self._secrets(container_context)\n\n if container_context.task_definition_arn:\n task_definition = container_context.task_definition_arn\n else:\n family = self._get_run_task_definition_family(run)\n\n if self.task_definition_dict or not self.use_current_ecs_task_config:\n runtime_platform = container_context.runtime_platform\n is_windows = container_context.runtime_platform.get(\n "operatingSystemFamily"\n ) not in {None, "LINUX"}\n\n default_resources = (\n DEFAULT_WINDOWS_RESOURCES if is_windows else DEFAULT_LINUX_RESOURCES\n )\n task_definition_config = DagsterEcsTaskDefinitionConfig(\n family,\n image,\n self._get_container_name(container_context),\n command=None,\n log_configuration=(\n {\n "logDriver": "awslogs",\n "options": {\n "awslogs-group": self.task_definition_dict["log_group"],\n "awslogs-region": self.ecs.meta.region_name,\n "awslogs-stream-prefix": family,\n },\n }\n if self.task_definition_dict.get("log_group")\n else None\n ),\n secrets=secrets if secrets else [],\n environment=environment,\n execution_role_arn=container_context.execution_role_arn,\n task_role_arn=container_context.task_role_arn,\n sidecars=container_context.run_sidecar_containers,\n requires_compatibilities=self.task_definition_dict.get(\n "requires_compatibilities", []\n ),\n cpu=container_context.run_resources.get("cpu", default_resources["cpu"]),\n memory=container_context.run_resources.get(\n "memory", default_resources["memory"]\n ),\n ephemeral_storage=container_context.run_resources.get("ephemeral_storage"),\n runtime_platform=runtime_platform,\n volumes=container_context.volumes,\n mount_points=container_context.mount_points,\n repository_credentials=container_context.repository_credentials,\n )\n task_definition_dict = 
task_definition_config.task_definition_dict()\n else:\n task_definition_dict = get_task_definition_dict_from_current_task(\n self.ecs,\n family,\n self._get_current_task(),\n image,\n self._get_container_name(container_context),\n environment=environment,\n secrets=secrets if secrets else {},\n include_sidecars=self.include_sidecars,\n task_role_arn=container_context.task_role_arn,\n execution_role_arn=container_context.execution_role_arn,\n cpu=container_context.run_resources.get("cpu"),\n memory=container_context.run_resources.get("memory"),\n runtime_platform=container_context.runtime_platform,\n ephemeral_storage=container_context.run_resources.get("ephemeral_storage"),\n volumes=container_context.volumes,\n mount_points=container_context.mount_points,\n additional_sidecars=container_context.run_sidecar_containers,\n repository_credentials=container_context.repository_credentials,\n )\n\n task_definition_config = DagsterEcsTaskDefinitionConfig.from_task_definition_dict(\n task_definition_dict,\n self._get_container_name(container_context),\n )\n\n container_name = self._get_container_name(container_context)\n\n backoff(\n self._reuse_or_register_task_definition,\n retry_on=(Exception,),\n kwargs={\n "desired_task_definition_config": task_definition_config,\n "container_name": container_name,\n "task_definition_dict": task_definition_dict,\n },\n max_retries=5,\n )\n\n task_definition = family\n\n if self.use_current_ecs_task_config:\n current_task_metadata = get_current_ecs_task_metadata()\n current_task = get_current_ecs_task(\n self.ecs, current_task_metadata.task_arn, current_task_metadata.cluster\n )\n task_kwargs = get_task_kwargs_from_current_task(\n self.ec2,\n current_task_metadata.cluster,\n current_task,\n )\n else:\n task_kwargs = {}\n\n return {**task_kwargs, **self.run_task_kwargs, "taskDefinition": task_definition}\n\n def _reuse_task_definition(\n self, desired_task_definition_config: DagsterEcsTaskDefinitionConfig, container_name: str\n ):\n 
family = desired_task_definition_config.family\n\n try:\n existing_task_definition = self.ecs.describe_task_definition(taskDefinition=family)[\n "taskDefinition"\n ]\n except ClientError:\n # task definition does not exist, do not reuse\n return False\n\n return task_definitions_match(\n desired_task_definition_config,\n existing_task_definition,\n container_name=container_name,\n )\n\n def _reuse_or_register_task_definition(\n self,\n desired_task_definition_config: DagsterEcsTaskDefinitionConfig,\n container_name: str,\n task_definition_dict: dict,\n ):\n if not self._reuse_task_definition(desired_task_definition_config, container_name):\n self.ecs.register_task_definition(**task_definition_dict)\n\n def _environment(self, container_context):\n return [\n {"name": key, "value": value}\n for key, value in container_context.get_environment_dict().items()\n ]\n\n def _secrets(self, container_context):\n secrets = container_context.get_secrets_dict(self.secrets_manager)\n return (\n [{"name": key, "valueFrom": value} for key, value in secrets.items()] if secrets else []\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n @property\n def include_cluster_info_in_failure_messages(self):\n return True\n\n def _is_transient_startup_failure(self, run, task):\n if not task.get("stoppedReason"):\n return False\n return (\n run.status == DagsterRunStatus.STARTING\n and "Timeout waiting for network interface provisioning to complete"\n in task.get("stoppedReason")\n )\n\n def check_run_worker_health(self, run: DagsterRun):\n run_worker_id = run.tags.get(RUN_WORKER_ID_TAG)\n\n tags = self._get_run_tags(run.run_id)\n container_context = EcsContainerContext.create_for_run(run, self)\n\n if not (tags.arn and tags.cluster):\n return CheckRunHealthResult(WorkerStatus.UNKNOWN, "", run_worker_id=run_worker_id)\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return 
CheckRunHealthResult(WorkerStatus.UNKNOWN, "", run_worker_id=run_worker_id)\n\n t = tasks[0]\n\n if t.get("lastStatus") in RUNNING_STATUSES:\n return CheckRunHealthResult(WorkerStatus.RUNNING, run_worker_id=run_worker_id)\n elif t.get("lastStatus") in STOPPED_STATUSES:\n failed_containers = []\n for c in t.get("containers"):\n if c.get("exitCode") != 0:\n failed_containers.append(c)\n if len(failed_containers) > 0:\n if len(failed_containers) > 1:\n container_str = "Containers"\n else:\n container_str = "Container"\n\n failure_text = []\n\n cluster_failure_info = (\n f"Task {t.get('taskArn')} failed. Stop code: {t.get('stopCode')}. Stop"\n + f" reason: {t.get('stoppedReason')}."\n + f" {container_str} {[c.get('name') for c in failed_containers]} failed."\n )\n\n logging.warning(\n "Run monitoring detected run worker failure: " + cluster_failure_info\n )\n\n if self.include_cluster_info_in_failure_messages:\n failure_text.append(cluster_failure_info)\n\n logs = []\n\n try:\n logs = get_task_logs(\n self.ecs,\n logs_client=self.logs,\n cluster=tags.cluster,\n task_arn=tags.arn,\n container_name=self._get_container_name(container_context),\n )\n except:\n logging.exception(f"Error trying to get logs for failed task {tags.arn}")\n\n if logs:\n failure_text.append("Run worker logs:\\n" + "\\n".join(logs))\n\n return CheckRunHealthResult(\n WorkerStatus.FAILED,\n "\\n\\n".join(failure_text),\n transient=self._is_transient_startup_failure(run, t),\n run_worker_id=run_worker_id,\n )\n\n return CheckRunHealthResult(WorkerStatus.SUCCESS, run_worker_id=run_worker_id)\n\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, "ECS task health status is unknown.", run_worker_id=run_worker_id\n )
\n
", "current_page_name": "_modules/dagster_aws/ecs/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.ecs.launcher"}}, "emr": {"emr": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.emr

\n# Portions of this file are copied from the Yelp MRJob project:\n#\n#   https://github.com/Yelp/mrjob\n#\n#\n# Copyright 2009-2013 Yelp, David Marin\n# Copyright 2015 Yelp\n# Copyright 2017 Yelp\n# Copyright 2018 Contributors\n# Copyright 2019 Yelp and Contributors\n#\n# Licensed under the Apache License, Version 2.0 (the "License");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an "AS IS" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport gzip\nimport re\nfrom io import BytesIO\nfrom urllib.parse import urlparse\n\nimport boto3\nimport dagster\nimport dagster._check as check\nfrom botocore.exceptions import WaiterError\n\nfrom dagster_aws.utils.mrjob.utils import _boto3_now, _wrap_aws_client, strip_microseconds\n\nfrom .types import EMR_CLUSTER_TERMINATED_STATES, EmrClusterState, EmrStepState\n\n# if we can't create or find our own service role, use the one\n# created by the AWS console and CLI\n_FALLBACK_SERVICE_ROLE = "EMR_DefaultRole"\n\n# if we can't create or find our own instance profile, use the one\n# created by the AWS console and CLI\n_FALLBACK_INSTANCE_PROFILE = "EMR_EC2_DefaultRole"\n\n\n
[docs]class EmrError(Exception):\n pass
\n\n\n
[docs]class EmrJobRunner:\n def __init__(\n self,\n region,\n check_cluster_every=30,\n aws_access_key_id=None,\n aws_secret_access_key=None,\n ):\n """This object encapsulates various utilities for interacting with EMR clusters and invoking\n steps (jobs) on them.\n\n See also :py:class:`~dagster_aws.emr.EmrPySparkResource`, which wraps this job runner in a\n resource for pyspark workloads.\n\n Args:\n region (str): AWS region to use\n check_cluster_every (int, optional): How frequently to poll boto3 APIs for updates.\n Defaults to 30 seconds.\n aws_access_key_id ([type], optional): AWS access key ID. Defaults to None, which will\n use the default boto3 credentials chain.\n aws_secret_access_key ([type], optional): AWS secret access key. Defaults to None, which\n will use the default boto3 credentials chain.\n """\n self.region = check.str_param(region, "region")\n\n # This is in seconds\n self.check_cluster_every = check.int_param(check_cluster_every, "check_cluster_every")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n\n def make_emr_client(self):\n """Creates a boto3 EMR client. 
Construction is wrapped in retries in case client connection\n fails transiently.\n\n Returns:\n botocore.client.EMR: An EMR client\n """\n raw_emr_client = boto3.client(\n "emr",\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n region_name=self.region,\n )\n return _wrap_aws_client(raw_emr_client, min_backoff=self.check_cluster_every)\n\n def cluster_id_from_name(self, cluster_name):\n """Get a cluster ID in the format "j-123ABC123ABC1" given a cluster name "my cool cluster".\n\n Args:\n cluster_name (str): The name of the cluster for which to find an ID\n\n Returns:\n str: The ID of the cluster\n\n Raises:\n EmrError: No cluster with the specified name exists\n """\n check.str_param(cluster_name, "cluster_name")\n\n response = self.make_emr_client().list_clusters().get("Clusters", [])\n for cluster in response:\n if cluster["Name"] == cluster_name:\n return cluster["Id"]\n\n raise EmrError(f"cluster {cluster_name} not found in region {self.region}")\n\n @staticmethod\n def construct_step_dict_for_command(step_name, command, action_on_failure="CONTINUE"):\n """Construct an EMR step definition which uses command-runner.jar to execute a shell command\n on the EMR master.\n\n Args:\n step_name (str): The name of the EMR step (will show up in the EMR UI)\n command (str): The shell command to execute with command-runner.jar\n action_on_failure (str, optional): Configure action on failure (e.g., continue, or\n terminate the cluster). 
Defaults to 'CONTINUE'.\n\n Returns:\n dict: Step definition dict\n """\n check.str_param(step_name, "step_name")\n check.list_param(command, "command", of_type=str)\n check.str_param(action_on_failure, "action_on_failure")\n\n return {\n "Name": step_name,\n "ActionOnFailure": action_on_failure,\n "HadoopJarStep": {"Jar": "command-runner.jar", "Args": command},\n }\n\n def add_tags(self, log, tags, cluster_id):\n """Add tags in the dict tags to cluster cluster_id.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n tags (dict): Dictionary of {'key': 'value'} tags\n cluster_id (str): The ID of the cluster to tag\n """\n check.dict_param(tags, "tags")\n check.str_param(cluster_id, "cluster_id")\n\n tags_items = sorted(tags.items())\n\n self.make_emr_client().add_tags(\n ResourceId=cluster_id, Tags=[dict(Key=k, Value=v) for k, v in tags_items]\n )\n\n log.info(\n "Added EMR tags to cluster %s: %s"\n % (cluster_id, ", ".join("%s=%s" % (tag, value) for tag, value in tags_items))\n )\n\n def run_job_flow(self, log, cluster_config):\n """Create an empty cluster on EMR, and return the ID of that job flow.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_config (dict): Configuration for this EMR job flow. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html\n\n Returns:\n str: The cluster ID, e.g. 
"j-ZKIY4CKQRX72"\n """\n check.dict_param(cluster_config, "cluster_config")\n\n log.debug("Creating Elastic MapReduce cluster")\n emr_client = self.make_emr_client()\n\n log.debug(\n "Calling run_job_flow(%s)"\n % (", ".join("%s=%r" % (k, v) for k, v in sorted(cluster_config.items())))\n )\n cluster_id = emr_client.run_job_flow(**cluster_config)["JobFlowId"]\n\n log.info("Created new cluster %s" % cluster_id)\n\n # set EMR tags for the cluster\n tags_items = cluster_config.get("Tags", [])\n tags = {k: v for k, v in tags_items}\n tags["__dagster_version"] = dagster.__version__\n self.add_tags(log, tags, cluster_id)\n return cluster_id\n\n def describe_cluster(self, cluster_id):\n """Thin wrapper over boto3 describe_cluster.\n\n Args:\n cluster_id (str): Cluster to inspect\n\n Returns:\n dict: The cluster info. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeCluster.html\n """\n check.str_param(cluster_id, "cluster_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_cluster(ClusterId=cluster_id)\n\n def describe_step(self, cluster_id, step_id):\n """Thin wrapper over boto3 describe_step.\n\n Args:\n cluster_id (str): Cluster to inspect\n step_id (str): Step ID to describe\n\n Returns:\n dict: The step info. 
See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeStep.html\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_step(ClusterId=cluster_id, StepId=step_id)\n\n def add_job_flow_steps(self, log, cluster_id, step_defs):\n """Submit the constructed job flow steps to EMR for execution.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): The ID of the cluster\n step_defs (List[dict]): List of steps; see also `construct_step_dict_for_command`\n\n Returns:\n List[str]: list of step IDs.\n """\n check.str_param(cluster_id, "cluster_id")\n check.list_param(step_defs, "step_defs", of_type=dict)\n\n emr_client = self.make_emr_client()\n\n steps_kwargs = dict(JobFlowId=cluster_id, Steps=step_defs)\n log.debug(\n "Calling add_job_flow_steps(%s)"\n % ",".join(("%s=%r" % (k, v)) for k, v in steps_kwargs.items())\n )\n return emr_client.add_job_flow_steps(**steps_kwargs)["StepIds"]\n\n def is_emr_step_complete(self, log, cluster_id, emr_step_id):\n step = self.describe_step(cluster_id, emr_step_id)["Step"]\n step_state = EmrStepState(step["Status"]["State"])\n\n if step_state == EmrStepState.Pending:\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n\n log.info("PENDING (cluster is %s%s)" % (cluster["Status"]["State"], reason_desc))\n return False\n\n elif step_state == EmrStepState.Running:\n time_running_desc = ""\n\n start = step["Status"]["Timeline"].get("StartDateTime")\n if start:\n time_running_desc = " for %s" % strip_microseconds(_boto3_now() - start)\n\n log.info("RUNNING%s" % time_running_desc)\n return False\n\n # we're done, will return at the end of this\n elif step_state == EmrStepState.Completed:\n log.info("COMPLETED")\n return True\n else:\n # step has failed somehow. 
*reason* seems to only be set\n # when job is cancelled (e.g. 'Job terminated')\n reason = _get_reason(step)\n reason_desc = (" (%s)" % reason) if reason else ""\n\n log.info("%s%s" % (step_state.value, reason_desc))\n\n # print cluster status; this might give more context\n # why step didn't succeed\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n log.info(\n "Cluster %s %s %s%s"\n % (\n cluster["Id"],\n "was" if "ED" in cluster["Status"]["State"] else "is",\n cluster["Status"]["State"],\n reason_desc,\n )\n )\n\n if EmrClusterState(cluster["Status"]["State"]) in EMR_CLUSTER_TERMINATED_STATES:\n # was it caused by IAM roles?\n self._check_for_missing_default_iam_roles(log, cluster)\n\n # TODO: extract logs here to surface failure reason\n # See: https://github.com/dagster-io/dagster/issues/1954\n\n if step_state == EmrStepState.Failed:\n log.error("EMR step %s failed" % emr_step_id)\n\n raise EmrError("EMR step %s failed" % emr_step_id)\n\n def _check_for_missing_default_iam_roles(self, log, cluster):\n """If cluster couldn't start due to missing IAM roles, tell user what to do."""\n check.dict_param(cluster, "cluster")\n\n reason = _get_reason(cluster)\n if any(\n reason.endswith("/%s is invalid" % role)\n for role in (_FALLBACK_INSTANCE_PROFILE, _FALLBACK_SERVICE_ROLE)\n ):\n log.warning(\n "IAM roles are missing. See documentation for IAM roles on EMR here: "\n "https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html"\n )\n\n def log_location_for_cluster(self, cluster_id):\n """EMR clusters are typically launched with S3 logging configured. 
This method inspects a\n cluster using boto3 describe_cluster to retrieve the log URI.\n\n Args:\n cluster_id (str): The cluster to inspect.\n\n Raises:\n EmrError: the log URI was missing (S3 log mirroring not enabled for this cluster)\n\n Returns:\n (str, str): log bucket and key\n """\n check.str_param(cluster_id, "cluster_id")\n\n # The S3 log URI is specified per job flow (cluster)\n log_uri = self.describe_cluster(cluster_id)["Cluster"].get("LogUri", None)\n\n # ugh, seriously boto3?! This will come back as string "None"\n if log_uri == "None" or log_uri is None:\n raise EmrError("Log URI not specified, cannot retrieve step execution logs")\n\n # For some reason the API returns an s3n:// protocol log URI instead of s3://\n log_uri = re.sub("^s3n", "s3", log_uri)\n log_uri_parsed = urlparse(log_uri)\n log_bucket = log_uri_parsed.netloc\n log_key_prefix = log_uri_parsed.path.lstrip("/")\n return log_bucket, log_key_prefix\n\n def retrieve_logs_for_step_id(self, log, cluster_id, step_id):\n """Retrieves stdout and stderr logs for the given step ID.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): EMR cluster ID\n step_id (str): EMR step ID for the job that was submitted.\n\n Returns:\n (str, str): Tuple of stdout log string contents, and stderr log string contents\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n log_bucket, log_key_prefix = self.log_location_for_cluster(cluster_id)\n\n prefix = f"{log_key_prefix}{cluster_id}/steps/{step_id}"\n stdout_log = self.wait_for_log(log, log_bucket, f"{prefix}/stdout.gz")\n stderr_log = self.wait_for_log(log, log_bucket, f"{prefix}/stderr.gz")\n return stdout_log, stderr_log\n\n def wait_for_log(self, log, log_bucket, log_key, waiter_delay=30, waiter_max_attempts=20):\n """Wait for gzipped EMR logs to appear on S3. 
Note that EMR syncs logs to S3 every 5\n minutes, so this may take a long time.\n\n Args:\n log_bucket (str): S3 bucket where log is expected to appear\n log_key (str): S3 key for the log file\n waiter_delay (int): How long to wait between attempts to check S3 for the log file\n waiter_max_attempts (int): Number of attempts before giving up on waiting\n\n Raises:\n EmrError: Raised if we waited the full duration and the logs did not appear\n\n Returns:\n str: contents of the log file\n """\n check.str_param(log_bucket, "log_bucket")\n check.str_param(log_key, "log_key")\n check.int_param(waiter_delay, "waiter_delay")\n check.int_param(waiter_max_attempts, "waiter_max_attempts")\n\n log.info(f"Attempting to get log: s3://{log_bucket}/{log_key}")\n\n s3 = _wrap_aws_client(boto3.client("s3"), min_backoff=self.check_cluster_every)\n waiter = s3.get_waiter("object_exists")\n try:\n waiter.wait(\n Bucket=log_bucket,\n Key=log_key,\n WaiterConfig={"Delay": waiter_delay, "MaxAttempts": waiter_max_attempts},\n )\n except WaiterError as err:\n raise EmrError("EMR log file did not appear on S3 after waiting") from err\n\n obj = BytesIO(s3.get_object(Bucket=log_bucket, Key=log_key)["Body"].read())\n gzip_file = gzip.GzipFile(fileobj=obj)\n return gzip_file.read().decode("utf-8")
\n\n\ndef _get_reason(cluster_or_step):\n """Get state change reason message."""\n # StateChangeReason is {} before the first state change\n return cluster_or_step["Status"]["StateChangeReason"].get("Message", "")\n
", "current_page_name": "_modules/dagster_aws/emr/emr", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.emr"}, "pyspark_step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.pyspark_step_launcher

\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport time\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import (\n    Field,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.errors import DagsterInvariantViolationError, raise_execution_interrupts\nfrom dagster._core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster._serdes import deserialize_value\n\nfrom dagster_aws.emr import EmrError, EmrJobRunner, emr_step_main\nfrom dagster_aws.emr.configs_spark import spark_config as get_spark_config\nfrom dagster_aws.utils.mrjob.log4j import parse_hadoop_log4j_records\n\n# On EMR, Spark is installed here\nEMR_SPARK_HOME = "/usr/lib/spark/"\n\nCODE_ZIP_NAME = "code.zip"\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n {\n "spark_config": get_spark_config(),\n "cluster_id": Field(\n StringSource, description="Name of the job flow (cluster) on which to execute."\n ),\n "region_name": Field(StringSource, description="The AWS region that the cluster is in."),\n "action_on_failure": Field(\n str,\n is_required=False,\n default_value="CANCEL_AND_WAIT",\n description=(\n "The EMR action to take when the cluster step fails: "\n "https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html"\n ),\n ),\n "staging_bucket": Field(\n StringSource,\n is_required=True,\n description=(\n "S3 bucket to use for passing files between the plan process and EMR process."\n ),\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="emr_staging",\n description=(\n "S3 key prefix inside the staging_bucket to use for files passed the plan "\n "process and EMR process"\n ),\n ),\n "wait_for_logs": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "If set, the system will wait for EMR logs to appear on S3. Note that logs "\n "are copied every 5 minutes, so enabling this will add several minutes to the job "\n "runtime."\n ),\n ),\n "local_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to the package that contains the job definition(s) whose steps will"\n " execute remotely on EMR. This is a path on the local fileystem of the process"\n " executing the job. 
The expectation is that this package will also be available on"\n " the python path of the launched process running the Spark step on EMR, either"\n " deployed on step launch via the deploy_local_job_package option, referenced on s3"\n " via the s3_job_package_path option, or installed on the cluster via bootstrap"\n " actions."\n ),\n ),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "(legacy) Absolute path to the package that contains the pipeline definition(s)"\n " whose steps will execute remotely on EMR. This is a path on the local fileystem"\n " of the process executing the pipeline. The expectation is that this package will"\n " also be available on the python path of the launched process running the Spark"\n " step on EMR, either deployed on step launch via the deploy_local_pipeline_package"\n " option, referenced on s3 via the s3_pipeline_package_path option, or installed on"\n " the cluster via bootstrap actions."\n ),\n ),\n "deploy_local_job_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description=(\n "If set, before every step run, the launcher will zip up all the code in"\n " local_job_package_path, upload it to s3, and pass it to spark-submit's --py-files"\n " option. This gives the remote process access to up-to-date user code. If not set,"\n " the assumption is that some other mechanism is used for distributing code to the"\n " EMR cluster. If this option is set to True, s3_job_package_path should not also"\n " be set."\n ),\n ),\n "deploy_local_pipeline_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description=(\n "(legacy) If set, before every step run, the launcher will zip up all the code in"\n " local_job_package_path, upload it to s3, and pass it to spark-submit's --py-files"\n " option. This gives the remote process access to up-to-date user code. 
If not set,"\n " the assumption is that some other mechanism is used for distributing code to the"\n " EMR cluster. If this option is set to True, s3_job_package_path should not also"\n " be set."\n ),\n ),\n "s3_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_job_package should not be set to True."\n ),\n ),\n "s3_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_pipeline_package should not be set to True."\n ),\n ),\n }\n)\ndef emr_pyspark_step_launcher(context):\n # Resolve legacy arguments\n if context.resource_config.get("local_job_package_path") and context.resource_config.get(\n "local_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``local_job_package_path`` and legacy version "\n "``local_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. 
Please choose one or the other."\n )\n\n if not context.resource_config.get(\n "local_job_package_path"\n ) and not context.resource_config.get("local_pipeline_package_path"):\n raise DagsterInvariantViolationError(\n "For resource ``emr_pyspark_step_launcher``, no config value provided for required "\n "schema entry ``local_job_package_path``."\n )\n\n local_job_package_path = context.resource_config.get(\n "local_job_package_path"\n ) or context.resource_config.get("local_pipeline_package_path")\n\n if context.resource_config.get("deploy_local_job_package") and context.resource_config.get(\n "deploy_local_pipeline_package"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``deploy_local_job_package`` and legacy version "\n "``deploy_local_pipeline_package`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. Please choose one or the other."\n )\n\n deploy_local_job_package = context.resource_config.get(\n "deploy_local_job_package"\n ) or context.resource_config.get("deploy_local_pipeline_package")\n\n if context.resource_config.get("s3_job_package_path") and context.resource_config.get(\n "s3_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``s3_job_package_path`` and legacy version "\n "``s3_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. 
Please choose one or the other."\n )\n\n s3_job_package_path = context.resource_config.get(\n "s3_job_package_path"\n ) or context.resource_config.get("s3_pipeline_package_path")\n\n return EmrPySparkStepLauncher(\n region_name=context.resource_config.get("region_name"),\n staging_bucket=context.resource_config.get("staging_bucket"),\n staging_prefix=context.resource_config.get("staging_prefix"),\n wait_for_logs=context.resource_config.get("wait_for_logs"),\n action_on_failure=context.resource_config.get("action_on_failure"),\n cluster_id=context.resource_config.get("cluster_id"),\n spark_config=context.resource_config.get("spark_config"),\n local_job_package_path=local_job_package_path,\n deploy_local_job_package=deploy_local_job_package,\n s3_job_package_path=s3_job_package_path,\n )
\n\n\nemr_pyspark_step_launcher.__doc__ = "\\n".join(\n "- **" + option + "**: " + (field.description or "")\n for option, field in emr_pyspark_step_launcher.config_schema.config_type.fields.items() # type: ignore\n)\n\n\nclass EmrPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n region_name,\n staging_bucket,\n staging_prefix,\n wait_for_logs,\n action_on_failure,\n cluster_id,\n spark_config,\n local_job_package_path,\n deploy_local_job_package,\n s3_job_package_path=None,\n ):\n self.region_name = check.str_param(region_name, "region_name")\n self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n self.action_on_failure = check.str_param(action_on_failure, "action_on_failure")\n self.cluster_id = check.str_param(cluster_id, "cluster_id")\n self.spark_config = spark_config\n\n check.invariant(\n not deploy_local_job_package or not s3_job_package_path,\n "If deploy_local_job_package is set to True, s3_job_package_path should not "\n "also be set.",\n )\n\n self.local_job_package_path = check.str_param(\n local_job_package_path, "local_job_package_path"\n )\n self.deploy_local_job_package = check.bool_param(\n deploy_local_job_package, "deploy_local_job_package"\n )\n self.s3_job_package_path = check.opt_str_param(s3_job_package_path, "s3_job_package_path")\n\n self.emr_job_runner = EmrJobRunner(region=self.region_name)\n\n def _post_artifacts(self, log, step_run_ref, run_id, step_key):\n """Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.\n\n For the zip file, consider the following toy example:\n\n # Folder: my_pyspark_project/\n # a.py\n def foo():\n print(1)\n\n # b.py\n def bar():\n print(2)\n\n # main.py\n from a import foo\n from b import bar\n\n foo()\n bar()\n\n This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. 
Then, when running\n `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will\n print 1, 2.\n """\n from dagster_pyspark.utils import build_pyspark_zip\n\n with tempfile.TemporaryDirectory() as temp_dir:\n s3 = boto3.client("s3", region_name=self.region_name)\n\n # Upload step run ref\n def _upload_file_to_s3(local_path, s3_filename):\n key = self._artifact_s3_key(run_id, step_key, s3_filename)\n s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)\n log.debug(f"Uploading file {local_path} to {s3_uri}")\n s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)\n\n # Upload main file.\n # The remote Dagster installation should also have the file, but locating it there\n # could be a pain.\n main_local_path = self._main_file_local_path()\n _upload_file_to_s3(main_local_path, self._main_file_name())\n\n if self.deploy_local_job_package:\n # Zip and upload package containing job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n\n build_pyspark_zip(zip_local_path, self.local_job_package_path)\n _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)\n\n # Create step run ref pickle file\n step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)\n with open(step_run_ref_local_path, "wb") as step_pickle_file:\n pickle.dump(step_run_ref, step_pickle_file)\n\n _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)\n\n def launch_step(self, step_context):\n step_run_ref = step_context_to_step_run_ref(step_context, self.local_job_package_path)\n\n run_id = step_context.dagster_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._post_artifacts(log, step_run_ref, run_id, step_key)\n\n emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.op.name)\n emr_step_id = self.emr_job_runner.add_job_flow_steps(log, self.cluster_id, [emr_step_def])[\n 0\n ]\n\n yield from self.wait_for_completion_and_log(run_id, step_key, emr_step_id, 
step_context)\n\n def wait_for_completion_and_log(self, run_id, step_key, emr_step_id, step_context):\n s3 = boto3.resource("s3", region_name=self.region_name)\n try:\n for event in self.wait_for_completion(step_context, s3, run_id, step_key, emr_step_id):\n yield event\n except EmrError as emr_error:\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n raise emr_error\n\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n\n def wait_for_completion(\n self, step_context, s3, run_id, step_key, emr_step_id, check_interval=15\n ):\n """We want to wait for the EMR steps to complete, and while that's happening, we want to\n yield any events that have been written to S3 for us by the remote process.\n After the the EMR steps complete, we want a final chance to fetch events before finishing\n the step.\n """\n done = False\n all_events = []\n # If this is being called within a `capture_interrupts` context, allow interrupts\n # while waiting for the pyspark execution to complete, so that we can terminate slow or\n # hanging steps\n while not done:\n with raise_execution_interrupts():\n time.sleep(check_interval) # AWS rate-limits us if we poll it too often\n done = self.emr_job_runner.is_emr_step_complete(\n step_context.log, self.cluster_id, emr_step_id\n )\n\n all_events_new = self.read_events(s3, run_id, step_key)\n\n if len(all_events_new) > len(all_events):\n for i in range(len(all_events), len(all_events_new)):\n event = all_events_new[i]\n # write each event from the EMR instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.dagster_event\n all_events = all_events_new\n\n def read_events(self, s3, run_id, step_key):\n events_s3_obj = s3.Object(\n self.staging_bucket, self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME)\n )\n\n try:\n events_data = events_s3_obj.get()["Body"].read()\n return 
deserialize_value(pickle.loads(events_data))\n except ClientError as ex:\n # The file might not be there yet, which is fine\n if ex.response["Error"]["Code"] == "NoSuchKey":\n return []\n else:\n raise ex\n\n def _log_logs_from_s3(self, log, emr_step_id):\n """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs\n them to the given log.\n """\n stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(\n log, self.cluster_id, emr_step_id\n )\n # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for\n # Dagster's logging system.\n records = parse_hadoop_log4j_records(stderr_log)\n for record in records:\n if record.level:\n log.log(\n level=record.level,\n msg="".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),\n )\n else:\n log.debug(f"Spark Driver stderr: {record.message}")\n\n sys.stdout.write(\n "---------- Spark Driver stdout: ----------\\n"\n + stdout_log\n + "\\n"\n + "---------- End of Spark Driver stdout ----------\\n"\n )\n\n def _get_emr_step_def(self, run_id, step_key, solid_name):\n """From the local Dagster instance, construct EMR steps that will kick off execution on a\n remote EMR cluster.\n """\n from dagster_spark.utils import flatten_dict, format_for_cli\n\n action_on_failure = self.action_on_failure\n\n # Execute Solid via spark-submit\n conf = dict(flatten_dict(self.spark_config))\n conf["spark.app.name"] = conf.get("spark.app.name", solid_name)\n\n check.invariant(\n conf.get("spark.master", "yarn") == "yarn",\n desc=(\n "spark.master is configured as %s; cannot set Spark master on EMR to anything "\n 'other than "yarn"'\n )\n % conf.get("spark.master"),\n )\n\n command = (\n [\n EMR_SPARK_HOME + "bin/spark-submit",\n "--master",\n "yarn",\n "--deploy-mode",\n conf.get("spark.submit.deployMode", "client"),\n ]\n + format_for_cli(list(flatten_dict(conf)))\n + [\n "--py-files",\n self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),\n 
self._artifact_s3_uri(run_id, step_key, self._main_file_name()),\n self.staging_bucket,\n self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n ]\n )\n\n return EmrJobRunner.construct_step_dict_for_command(\n "Execute Solid/Op %s" % solid_name, command, action_on_failure=action_on_failure\n )\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return emr_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _artifact_s3_uri(self, run_id, step_key, filename):\n key = self._artifact_s3_key(run_id, self._sanitize_step_key(step_key), filename)\n return f"s3://{self.staging_bucket}/{key}"\n\n def _artifact_s3_key(self, run_id, step_key, filename):\n return "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n
", "current_page_name": "_modules/dagster_aws/emr/pyspark_step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.pyspark_step_launcher"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.types

\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\n\nEbsVolumeType = Enum(\n    name="EbsVolumeType", enum_values=[EnumValue("gp2"), EnumValue("io1"), EnumValue("standard")]\n)\n\n\n
[docs]class EmrClusterState(PyEnum):\n """Cluster state for EMR."""\n\n Starting = "STARTING"\n Bootstrapping = "BOOTSTRAPPING"\n Running = "RUNNING"\n Waiting = "WAITING"\n Terminating = "TERMINATING"\n Terminated = "TERMINATED"\n TerminatedWithErrors = "TERMINATED_WITH_ERRORS"
\n\n\nEMR_CLUSTER_TERMINATED_STATES = [\n EmrClusterState.Terminating,\n EmrClusterState.Terminated,\n EmrClusterState.TerminatedWithErrors,\n]\n\nEMR_CLUSTER_DONE_STATES = EMR_CLUSTER_TERMINATED_STATES + [EmrClusterState.Waiting]\n\n\n
[docs]class EmrStepState(PyEnum):\n """Step state for EMR."""\n\n Pending = "PENDING"\n Running = "RUNNING"\n Continue = "CONTINUE"\n Completed = "COMPLETED"\n Cancelled = "CANCELLED"\n Failed = "FAILED"\n Interrupted = "INTERRUPTED"
\n\n\nEmrActionOnFailure = Enum(\n name="EmrActionOnFailure",\n enum_values=[\n EnumValue("TERMINATE_JOB_FLOW"),\n EnumValue("TERMINATE_CLUSTER"),\n EnumValue("CANCEL_AND_WAIT"),\n EnumValue("CONTINUE"),\n ],\n)\n\nEmrAdjustmentType = Enum(\n name="EmrAdjustmentType",\n enum_values=[\n EnumValue("CHANGE_IN_CAPACITY"),\n EnumValue("PERCENT_CHANGE_IN_CAPACITY"),\n EnumValue("EXACT_CAPACITY"),\n ],\n)\n\nEmrComparisonOperator = Enum(\n name="EmrComparisonOperator",\n enum_values=[\n EnumValue("GREATER_THAN_OR_EQUAL"),\n EnumValue("GREATER_THAN"),\n EnumValue("LESS_THAN"),\n EnumValue("LESS_THAN_OR_EQUAL"),\n ],\n)\n\nEmrInstanceRole = Enum(\n name="EmrInstanceRole", enum_values=[EnumValue("MASTER"), EnumValue("CORE"), EnumValue("TASK")]\n)\n\nEmrMarket = Enum(name="EmrMarket", enum_values=[EnumValue("ON_DEMAND"), EnumValue("SPOT")])\n\nEmrRepoUpgradeOnBoot = Enum(\n name="EmrRepoUpgradeOnBoot", enum_values=[EnumValue("SECURITY"), EnumValue("NONE")]\n)\n\nEmrScaleDownBehavior = Enum(\n name="EmrScaleDownBehavior",\n enum_values=[\n EnumValue("TERMINATE_AT_INSTANCE_HOUR"),\n EnumValue("TERMINATE_AT_TASK_COMPLETION"),\n ],\n)\n\nEmrStatistic = Enum(\n name="EmrStatistic",\n enum_values=[\n EnumValue("SAMPLE_COUNT"),\n EnumValue("AVERAGE"),\n EnumValue("SUM"),\n EnumValue("MINIMUM"),\n EnumValue("MAXIMUM"),\n ],\n)\n\nEmrSupportedProducts = Enum(\n name="EmrSupportedProducts", enum_values=[EnumValue("mapr-m3"), EnumValue("mapr-m5")]\n)\n\nEmrTimeoutAction = Enum(\n name="EmrTimeoutAction",\n enum_values=[EnumValue("SWITCH_TO_ON_DEMAND"), EnumValue("TERMINATE_CLUSTER")],\n)\n\nEmrUnit = Enum(\n name="EmrUnit",\n enum_values=[\n EnumValue("NONE"),\n EnumValue("SECONDS"),\n EnumValue("MICRO_SECONDS"),\n EnumValue("MILLI_SECONDS"),\n EnumValue("BYTES"),\n EnumValue("KILO_BYTES"),\n EnumValue("MEGA_BYTES"),\n EnumValue("GIGA_BYTES"),\n EnumValue("TERA_BYTES"),\n EnumValue("BITS"),\n EnumValue("KILO_BITS"),\n EnumValue("MEGA_BITS"),\n EnumValue("GIGA_BITS"),\n 
EnumValue("TERA_BITS"),\n EnumValue("PERCENT"),\n EnumValue("COUNT"),\n EnumValue("BYTES_PER_SECOND"),\n EnumValue("KILO_BYTES_PER_SECOND"),\n EnumValue("MEGA_BYTES_PER_SECOND"),\n EnumValue("GIGA_BYTES_PER_SECOND"),\n EnumValue("TERA_BYTES_PER_SECOND"),\n EnumValue("BITS_PER_SECOND"),\n EnumValue("KILO_BITS_PER_SECOND"),\n EnumValue("MEGA_BITS_PER_SECOND"),\n EnumValue("GIGA_BITS_PER_SECOND"),\n EnumValue("TERA_BITS_PER_SECOND"),\n EnumValue("COUNT_PER_SECOND"),\n ],\n)\n
", "current_page_name": "_modules/dagster_aws/emr/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.types"}}, "redshift": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.redshift.resources

\nimport abc\nfrom contextlib import contextmanager\nfrom logging import Logger\nfrom typing import Any, Dict, Optional, cast\n\nimport psycopg2\nimport psycopg2.extensions\nfrom dagster import (\n    ConfigurableResource,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\n\nclass RedshiftError(Exception):\n    pass\n\n\nclass BaseRedshiftClient(abc.ABC):\n    @abc.abstractmethod\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        pass\n\n    @abc.abstractmethod\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        pass\n\n\nclass RedshiftClient(BaseRedshiftClient):\n    def __init__(self, conn_args: Dict[str, Any], autocommit: Optional[bool], log: Logger):\n        # Extract parameters from resource config\n        self.conn_args = conn_args\n\n        self.autocommit = autocommit\n        self.log = log\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Synchronously execute a single query against Redshift. Will return a list of rows, where\n        each row is a tuple of values, e.g. SELECT 1 will return [(1,)].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. 
Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                try:\n                    self.log.info(f"Executing query '{query}'")\n                    cursor.execute(query)\n\n                    if fetch_results and cursor.rowcount > 0:\n                        return cursor.fetchall()\n                    else:\n                        self.log.info("Empty result from query")\n\n                except Exception as e:\n                    # If autocommit is disabled or not set (it is disabled by default), Redshift\n                    # will be in the middle of a transaction at exception time, and because of\n                    # the failure the current transaction will not accept any further queries.\n                    #\n                    # This conn.commit() call closes the open transaction before handing off\n                    # control to the error callback, so that the 
user can issue additional\n                    # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n                    # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                    # things are in a usable state in the error callback.\n                    if not self.autocommit:\n                        conn.commit()\n\n                    if error_callback is not None:\n                        error_callback(e, cursor, self.log)\n                    else:\n                        raise\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Synchronously execute a list of queries against Redshift. Will return a list of list of\n        rows, where each row is a tuple of values, e.g. ['SELECT 1', 'SELECT 1'] will return\n        [[(1,)], [(1,)]].\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n            cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. 
If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        results = []\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                for query in queries:\n                    try:\n                        self.log.info(f"Executing query '{query}'")\n                        cursor.execute(query)\n\n                        if fetch_results and cursor.rowcount > 0:\n                            results.append(cursor.fetchall())\n                        else:\n                            results.append([])\n                            self.log.info("Empty result from query")\n\n                    except Exception as e:\n                        # If autocommit is disabled or not set (it is disabled by default), Redshift\n                        # will be in the middle of a transaction at exception time, and because of\n                        # the failure the current transaction will not accept any further queries.\n                        #\n                        # This conn.commit() call closes the open transaction before handing off\n                        # control to the error callback, so that the user can issue additional\n                        # queries. Notably, for e.g. 
pg_last_copy_id() to work, it requires you to\n                        # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                        # things are in a usable state in the error callback.\n                        if not self.autocommit:\n                            conn.commit()\n\n                        if error_callback is not None:\n                            error_callback(e, cursor, self.log)\n                        else:\n                            raise\n\n        if fetch_results:\n            return results\n\n    @contextmanager\n    def _get_conn(self):\n        conn = None\n        try:\n            conn = psycopg2.connect(**self.conn_args)\n            yield conn\n        finally:\n            if conn:\n                conn.close()\n\n    @contextmanager\n    def _get_cursor(self, conn, cursor_factory=None):\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n\n        # Could be none, in which case we should respect the connection default. Otherwise\n        # explicitly set to true/false.\n        if self.autocommit is not None:\n            conn.autocommit = self.autocommit\n\n        with conn:\n            with conn.cursor(cursor_factory=cursor_factory) as cursor:\n                yield cursor\n\n            # If autocommit is set, we'll commit after each and every query execution. 
Otherwise, we\n            # want to do a final commit after we're wrapped up executing the full set of one or more\n            # queries.\n            if not self.autocommit:\n                conn.commit()\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use RedshiftClientResource instead.")\nclass RedshiftResource(RedshiftClient):\n    """This class was used by the function-style Redshift resource."""\n\n\nclass FakeRedshiftClient(BaseRedshiftClient):\n    QUERY_RESULT = [(1,)]\n\n    def __init__(self, log: Logger):\n        # Extract parameters from resource config\n\n        self.log = log\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Fake for execute_query; returns [self.QUERY_RESULT].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. 
Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        self.log.info(f"Executing query '{query}'")\n        if fetch_results:\n            return self.QUERY_RESULT\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Fake for execute_queries; returns [self.QUERY_RESULT] * 3.\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. 
Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        for query in queries:\n            self.log.info(f"Executing query '{query}'")\n        if fetch_results:\n            return [self.QUERY_RESULT] * 3\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use FakeRedshiftClientResource instead.")\nclass FakeRedshiftResource(FakeRedshiftClient):\n    """This class was used by the function-style fake Redshift resource."""\n\n\n
[docs]class RedshiftClientResource(ConfigurableResource):\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n .. code-block:: python\n\n from dagster import Definitions, asset, EnvVar\n from dagster_aws.redshift import RedshiftClientResource\n\n @asset\n def example_redshift_asset(context, redshift: RedshiftClientResource):\n redshift.get_client().execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = RedshiftClientResource(\n host='my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n port=5439,\n user='dagster',\n password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"),\n database='dev',\n )\n\n defs = Definitions(\n assets=[example_redshift_asset],\n resources={'redshift': redshift_configured},\n )\n\n """\n\n host: str = Field(description="Redshift host")\n port: int = Field(default=5439, description="Redshift port")\n user: Optional[str] = Field(default=None, description="Username for Redshift connection")\n password: Optional[str] = Field(default=None, description="Password for Redshift connection")\n database: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default database to use. After login, you can use USE DATABASE to change"\n " the database."\n ),\n )\n autocommit: Optional[bool] = Field(default=None, description="Whether to autocommit queries")\n connect_timeout: int = Field(\n default=5, description="Timeout for connection to Redshift cluster. Defaults to 5 seconds."\n )\n sslmode: str = Field(\n default="require",\n description=(\n "SSL mode to use. 
See the Redshift documentation for reference:"\n " https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> RedshiftClient:\n conn_args = {\n k: getattr(self, k, None)\n for k in (\n "host",\n "port",\n "user",\n "password",\n "database",\n "connect_timeout",\n "sslmode",\n )\n if getattr(self, k, None) is not None\n }\n\n return RedshiftClient(conn_args, self.autocommit, get_dagster_logger())
\n\n\n
[docs]class FakeRedshiftClientResource(RedshiftClientResource):\n def get_client(self) -> FakeRedshiftClient:\n return FakeRedshiftClient(get_dagster_logger())
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=RedshiftClientResource.to_config_schema(),\n description="Resource for connecting to the Redshift data warehouse",\n)\ndef redshift_resource(context) -> RedshiftClient:\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, op\n from dagster_aws.redshift import redshift_resource\n\n @op(required_resource_keys={'redshift'})\n def example_redshift_op(context):\n return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = redshift_resource.configured({\n 'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n 'port': 5439,\n 'user': 'dagster',\n 'password': 'dagster',\n 'database': 'dev',\n })\n context = build_op_context(resources={'redshift': redshift_configured})\n assert example_redshift_op(context) == [(1,)]\n\n """\n return RedshiftClientResource.from_resource_context(context).get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=FakeRedshiftClientResource.to_config_schema(),\n description=(\n "Fake resource for connecting to the Redshift data warehouse. Usage is identical "\n "to the real redshift_resource. Will always return [(1,)] for the single query case and "\n "[[(1,)], [(1,)], [(1,)]] for the multi query case."\n ),\n)\ndef fake_redshift_resource(context) -> FakeRedshiftClient:\n return cast(\n FakeRedshiftClient,\n FakeRedshiftClientResource.from_resource_context(context).get_client(),\n )
\n
", "current_page_name": "_modules/dagster_aws/redshift/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.redshift.resources"}}, "s3": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Mapping, Optional, Sequence\n\nimport boto3\nimport dagster._seven as seven\nfrom botocore.errorfactory import ClientError\nfrom dagster import (\n    Field,\n    Permissive,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_type import Noneable\nfrom dagster._core.storage.captured_log_manager import CapturedLogContext\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom typing_extensions import Self\n\nPOLLING_INTERVAL = 5\n\n\n
[docs]class S3ComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs compute function stdout and stderr to S3.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_aws.s3.compute_log_manager\n class: S3ComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n use_ssl: true\n verify: true\n verify_cert_path: "/path/to/cert/bundle.pem"\n endpoint_url: "http://alternate-s3-host.io"\n skip_empty_files: true\n upload_interval: 30\n upload_extra_args:\n ServerSideEncryption: "AES256"\n show_url_only: false\n region: "us-west-1"\n\n Args:\n bucket (str): The name of the s3 bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n use_ssl (Optional[bool]): Whether or not to use SSL. Default True.\n verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.\n verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if\n `verify` set to False.\n endpoint_url (Optional[str]): Override for the S3 endpoint url.\n skip_empty_files: (Optional[bool]): Skip upload of empty log files.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files to S3. By default, will only upload when the capture is complete.\n upload_extra_args: (Optional[dict]): Extra args for S3 file upload\n show_url_only: (Optional[bool]): Only show the URL of the log file in the UI, instead of fetching and displaying the full content. Default False.\n region: (Optional[str]): The region of the S3 bucket. 
If not specified, will use the default region of the AWS session.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n use_ssl=True,\n verify=True,\n verify_cert_path=None,\n endpoint_url=None,\n skip_empty_files=False,\n upload_interval=None,\n upload_extra_args=None,\n show_url_only=False,\n region=None,\n ):\n _verify = False if not verify else verify_cert_path\n self._s3_session = boto3.resource(\n "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url\n ).meta.client\n self._s3_bucket = check.str_param(bucket, "bucket")\n self._s3_prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n check.opt_dict_param(upload_extra_args, "upload_extra_args")\n self._upload_extra_args = upload_extra_args\n self._show_url_only = show_url_only\n if region is None:\n # if unspecified, use the current session name\n self._region = self._s3_session.meta.region_name\n else:\n self._region = region\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "use_ssl": Field(bool, is_required=False, default_value=True),\n 
"verify": Field(bool, is_required=False, default_value=True),\n "verify_cert_path": Field(StringSource, is_required=False),\n "endpoint_url": Field(StringSource, is_required=False),\n "skip_empty_files": Field(bool, is_required=False, default_value=False),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n "upload_extra_args": Field(\n Permissive(), is_required=False, description="Extra args for S3 file upload"\n ),\n "show_url_only": Field(bool, is_required=False, default_value=False),\n "region": Field(StringSource, is_required=False),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return S3ComputeLogManager(inst_data=inst_data, **config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _s3_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._s3_prefix, "storage", *namespace, filename]\n return "/".join(paths) # s3 path delimiter\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Iterator[CapturedLogContext]:\n with super().capture_logs(log_key) as local_context:\n if not self._show_url_only:\n yield local_context\n else:\n out_key = self._s3_key(log_key, ComputeIOType.STDOUT)\n err_key = self._s3_key(log_key, ComputeIOType.STDERR)\n s3_base = f"https://s3.console.aws.amazon.com/s3/object/{self._s3_bucket}?region={self._region}"\n yield CapturedLogContext(\n local_context.log_key,\n 
external_stdout_url=f"{s3_base}&prefix={out_key}",\n external_stderr_url=f"{s3_base}&prefix={err_key}",\n )\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self.local_manager.delete_logs(log_key=log_key, prefix=prefix)\n\n s3_keys_to_remove = None\n if log_key:\n s3_keys_to_remove = [\n self._s3_key(log_key, ComputeIOType.STDOUT),\n self._s3_key(log_key, ComputeIOType.STDERR),\n self._s3_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._s3_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n elif prefix:\n # add the trailing '' to make sure that ['a'] does not match ['apple']\n s3_prefix = "/".join([self._s3_prefix, "storage", *prefix, ""])\n matching = self._s3_session.list_objects(Bucket=self._s3_bucket, Prefix=s3_prefix)\n s3_keys_to_remove = [obj["Key"] for obj in matching.get("Contents", [])]\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n if s3_keys_to_remove:\n to_delete = [{"Key": key} for key in s3_keys_to_remove]\n self._s3_session.delete_objects(Bucket=self._s3_bucket, Delete={"Objects": to_delete})\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n s3_key = self._s3_key(log_key, io_type)\n return self._s3_session.generate_presigned_url(\n ClientMethod="get_object", Params={"Bucket": self._s3_bucket, "Key": s3_key}\n )\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n s3_key = self._s3_key(log_key, io_type)\n return f"s3://{self._s3_bucket}/{s3_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n try: # https://stackoverflow.com/a/38376288/14656695\n self._s3_session.head_object(Bucket=self._s3_bucket, 
Key=s3_key)\n except ClientError:\n return False\n return True\n\n def upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n\n if (self._skip_empty_files or partial) and os.stat(path).st_size == 0:\n return\n\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n extra_args = {\n "ContentType": "text/plain",\n **(self._upload_extra_args if self._upload_extra_args else {}),\n }\n self._s3_session.upload_fileobj(data, self._s3_bucket, s3_key, ExtraArgs=extra_args)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self._local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n self._s3_session.download_fileobj(self._s3_bucket, s3_key, fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n self._local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_aws/s3/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.compute_log_manager"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class S3FileHandle(FileHandle):\n """A reference to a file on S3."""\n\n def __init__(self, s3_bucket: str, s3_key: str):\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_key = check.str_param(s3_key, "s3_key")\n\n @property\n def s3_bucket(self) -> str:\n """str: The name of the S3 bucket."""\n return self._s3_bucket\n\n @property\n def s3_key(self) -> str:\n """str: The S3 key."""\n return self._s3_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's S3 URL."""\n return self.s3_path\n\n @property\n def s3_path(self) -> str:\n """str: The file's S3 URL."""\n return f"s3://{self.s3_bucket}/{self.s3_key}"
\n\n\nclass S3FileManager(FileManager):\n def __init__(self, s3_session, s3_bucket, s3_base_key):\n self._s3_session = s3_session\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_base_key = check.str_param(s3_base_key, "s3_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n self._s3_session.download_file(\n Bucket=file_handle.s3_bucket, Key=file_handle.s3_key, Filename=temp_name\n )\n self._local_handle_cache[file_handle.s3_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", S3FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.s3_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.s3_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n s3_key = self.get_full_key(str(uuid.uuid4()) + (("." 
+ ext) if ext is not None else ""))\n self._s3_session.put_object(Body=file_obj, Bucket=self._s3_bucket, Key=s3_key)\n return S3FileHandle(self._s3_bucket, s3_key)\n\n def get_full_key(self, file_key):\n return f"{self._s3_base_key}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_aws/s3/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.io_manager

\nimport io\nimport pickle\nfrom typing import Any, Dict, Optional, Union\n\nfrom dagster import (\n    ConfigurableIOManager,\n    InputContext,\n    MetadataValue,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom .resources import S3Resource\n\n\nclass PickledObjectS3IOManager(UPathIOManager):\n    def __init__(\n        self,\n        s3_bucket: str,\n        s3_session: Any,\n        s3_prefix: Optional[str] = None,\n    ):\n        self.bucket = check.str_param(s3_bucket, "s3_bucket")\n        check.opt_str_param(s3_prefix, "s3_prefix")\n        self.s3 = s3_session\n        self.s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, MaxKeys=1)\n        base_path = UPath(s3_prefix) if s3_prefix else None\n        super().__init__(base_path=base_path)\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        try:\n            s3_obj = self.s3.get_object(Bucket=self.bucket, Key=str(path))["Body"].read()\n            return pickle.loads(s3_obj)\n        except self.s3.exceptions.NoSuchKey:\n            raise FileNotFoundError(f"Could not find file {path} in S3 bucket {self.bucket}")\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing S3 object: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        pickled_obj_bytes = io.BytesIO(pickled_obj)\n        self.s3.upload_fileobj(pickled_obj_bytes, self.bucket, str(path))\n\n    def path_exists(self, path: UPath) -> bool:\n        try:\n       
     self.s3.get_object(Bucket=self.bucket, Key=str(path))\n        except self.s3.exceptions.NoSuchKey:\n            return False\n        return True\n\n    def get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading S3 object from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing S3 object at: {self._uri_for_path(path)}"\n\n    def unlink(self, path: UPath) -> None:\n        self.s3.delete_object(Bucket=self.bucket, Key=str(path))\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in S3\n        return None\n\n    def get_metadata(self, context: OutputContext, obj: Any) -> Dict[str, MetadataValue]:\n        path = self._get_path(context)\n        return {"uri": MetadataValue.path(self._uri_for_path(path))}\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        return UPath("storage", super().get_op_output_relative_path(context))\n\n    def _uri_for_path(self, path: UPath) -> str:\n        return f"s3://{self.bucket}/{path}"\n\n\n
[docs]class S3PickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import asset, Definitions\n from dagster_aws.s3 import S3PickleIOManager, S3Resource\n\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": S3PickleIOManager(\n s3_resource=S3Resource(),\n s3_bucket="my-cool-bucket",\n s3_prefix="my-cool-prefix",\n )\n }\n )\n\n """\n\n s3_resource: ResourceDependency[S3Resource]\n s3_bucket: str = Field(description="S3 bucket to use for the file manager.")\n s3_prefix: str = Field(\n default="dagster", description="Prefix to use for the S3 bucket for this file manager."\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @cached_method\n def inner_io_manager(self) -> PickledObjectS3IOManager:\n return PickledObjectS3IOManager(\n s3_bucket=self.s3_bucket,\n s3_session=self.s3_resource.get_client(),\n s3_prefix=self.s3_prefix,\n )\n\n def load_input(self, context: InputContext) -> 
Any:\n return self.inner_io_manager().load_input(context)\n\n def handle_output(self, context: OutputContext, obj: Any) -> None:\n return self.inner_io_manager().handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use S3PickleIOManager instead.",\n)\nclass ConfigurablePickledObjectS3IOManager(S3PickleIOManager):\n """Renamed to S3PickleIOManager. See S3PickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=S3PickleIOManager.to_config_schema(),\n required_resource_keys={"s3"},\n)\ndef s3_pickle_io_manager(init_context):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": s3_pickle_io_manager.configured(\n {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n ),\n "s3": s3_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n @job(\n resource_defs={\n "io_manager": s3_pickle_io_manager.configured(\n {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n ),\n "s3": s3_resource,\n },\n )\n def my_job():\n ...\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3IOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_aws/s3/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.io_manager"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.ops

\nfrom typing import Any, Generator, Mapping\n\nfrom dagster import (\n    AssetMaterialization,\n    Field,\n    FileHandle,\n    In,\n    MetadataValue,\n    Out,\n    Output,\n    StringSource,\n    _check as check,\n    dagster_type_loader,\n    op,\n)\nfrom dagster._core.types.dagster_type import PythonObjectDagsterType\n\nfrom .file_manager import S3FileHandle\n\n\ndef dict_with_fields(name: str, fields: Mapping[str, object]):\n    check.str_param(name, "name")\n    check.mapping_param(fields, "fields", key_type=str)\n    field_names = set(fields.keys())\n\n    @dagster_type_loader(fields)\n    def _input_schema(_context, value):\n        check.dict_param(value, "value")\n        check.param_invariant(set(value.keys()) == field_names, "value")\n        return value\n\n    class _DictWithSchema(PythonObjectDagsterType):\n        def __init__(self):\n            super(_DictWithSchema, self).__init__(python_type=dict, name=name, loader=_input_schema)\n\n    return _DictWithSchema()\n\n\nS3Coordinate = dict_with_fields(\n    "S3Coordinate",\n    fields={\n        "bucket": Field(StringSource, description="S3 bucket name"),\n        "key": Field(StringSource, description="S3 key name"),\n    },\n)\n\n\ndef last_key(key: str) -> str:\n    if "/" not in key:\n        return key\n    comps = key.split("/")\n    return comps[-1]\n\n\n@op(\n    config_schema={\n        "Bucket": Field(\n            StringSource, description="The name of the bucket to upload to.", is_required=True\n        ),\n        "Key": Field(\n            StringSource, description="The name of the key to upload to.", is_required=True\n        ),\n    },\n    ins={"file_handle": In(FileHandle, description="The file to upload.")},\n    out={"s3_file_handle": Out(S3FileHandle)},\n    description="""Take a file handle and upload it to s3. 
Returns an S3FileHandle.""",\n    required_resource_keys={"s3", "file_manager"},\n)\ndef file_handle_to_s3(context, file_handle) -> Generator[Any, None, None]:\n    bucket = context.op_config["Bucket"]\n    key = context.op_config["Key"]\n\n    file_manager = context.resources.file_manager\n    s3 = context.resources.s3\n\n    with file_manager.read(file_handle, "rb") as fileobj:\n        s3.upload_fileobj(fileobj, bucket, key)\n        s3_file_handle = S3FileHandle(bucket, key)\n\n        yield AssetMaterialization(\n            asset_key=s3_file_handle.s3_path,\n            metadata={last_key(key): MetadataValue.path(s3_file_handle.s3_path)},\n        )\n\n        yield Output(value=s3_file_handle, output_name="s3_file_handle")\n
", "current_page_name": "_modules/dagster_aws/s3/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.resources

\nfrom typing import Any, Optional, TypeVar\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\nfrom .file_manager import S3FileManager\nfrom .utils import construct_s3_client\n\nT = TypeVar("T")\n\n\nclass ResourceWithS3Configuration(ConfigurableResource):\n    use_unsigned_session: bool = Field(\n        default=False, description="Specifies whether to use an unsigned S3 session."\n    )\n    region_name: Optional[str] = Field(\n        default=None, description="Specifies a custom region for the S3 session."\n    )\n    endpoint_url: Optional[str] = Field(\n        default=None, description="Specifies a custom endpoint for the S3 session."\n    )\n    max_attempts: int = Field(\n        default=5,\n        description=(\n            "This provides Boto3's retry handler with a value of maximum retry attempts, where the"\n            " initial call counts toward the max_attempts value that you provide."\n        ),\n    )\n    profile_name: Optional[str] = Field(\n        default=None, description="Specifies a profile to connect that session."\n    )\n    use_ssl: bool = Field(\n        default=True, description="Whether or not to use SSL. By default, SSL is used."\n    )\n    verify: Optional[str] = Field(\n        default=None,\n        description=(\n            "Whether or not to verify SSL certificates. 
By default SSL certificates are verified."\n            " You can also specify this argument if you want to use a different CA cert bundle than"\n            " the one used by botocore."\n        ),\n    )\n    aws_access_key_id: Optional[str] = Field(\n        default=None, description="AWS access key ID to use when creating the boto3 session."\n    )\n    aws_secret_access_key: Optional[str] = Field(\n        default=None, description="AWS secret access key to use when creating the boto3 session."\n    )\n    aws_session_token: str = Field(\n        default=None, description="AWS session token to use when creating the boto3 session."\n    )\n\n\n
[docs]class S3Resource(ResourceWithS3Configuration, IAttachDifferentObjectToOpContext):\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n .. code-block:: python\n\n from dagster import job, op, Definitions\n from dagster_aws.s3 import S3Resource\n\n @op\n def example_s3_op(s3: S3Resource):\n return s3.get_client().list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job\n def example_job():\n example_s3_op()\n\n defs = Definitions(\n jobs=[example_job],\n resources={'s3': S3Resource(region_name='us-west-1')}\n )\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> Any:\n return construct_s3_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n endpoint_url=self.endpoint_url,\n use_unsigned_session=self.use_unsigned_session,\n profile_name=self.profile_name,\n use_ssl=self.use_ssl,\n verify=self.verify,\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n aws_session_token=self.aws_session_token,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=S3Resource.to_config_schema())\ndef s3_resource(context) -> Any:\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.s3 import s3_resource\n\n @op(required_resource_keys={'s3'})\n def example_s3_op(context):\n return context.resources.s3.list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job(resource_defs={'s3': s3_resource})\n def example_job():\n example_s3_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 's3': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n s3:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n # through the ordinary boto credential chain.\n use_unsigned_session: false\n # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: True\n endpoint_url: "http://localhost"\n # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for S3 session. Default is default\n # profile as specified in ~/.aws/credentials file\n use_ssl: true\n # Optional[bool]: Whether or not to use SSL. By default, SSL is used.\n verify: None\n # Optional[str]: Whether or not to verify SSL certificates. 
By default SSL certificates are verified.\n # You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore."\n aws_access_key_id: None\n # Optional[str]: The access key to use when creating the client.\n aws_secret_access_key: None\n # Optional[str]: The secret key to use when creating the client.\n aws_session_token: None\n # Optional[str]: The session token to use when creating the client.\n """\n return S3Resource.from_resource_context(context).get_client()
\n\n\n
[docs]class S3FileManagerResource(ResourceWithS3Configuration, IAttachDifferentObjectToOpContext):\n s3_bucket: str = Field(description="S3 bucket to use for the file manager.")\n s3_prefix: str = Field(\n default="dagster", description="Prefix to use for the S3 bucket for this file manager."\n )\n\n def get_client(self) -> S3FileManager:\n return S3FileManager(\n s3_session=construct_s3_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n endpoint_url=self.endpoint_url,\n use_unsigned_session=self.use_unsigned_session,\n profile_name=self.profile_name,\n use_ssl=self.use_ssl,\n verify=self.verify,\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n aws_session_token=self.aws_session_token,\n ),\n s3_bucket=self.s3_bucket,\n s3_base_key=self.s3_prefix,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=S3FileManagerResource.to_config_schema(),\n)\ndef s3_file_manager(context) -> S3FileManager:\n """FileManager that provides abstract access to S3.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n return S3FileManagerResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_aws/s3/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.resources"}}, "secretsmanager": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.secretsmanager.resources

\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Dict, Generator, List, Optional, cast\n\nfrom dagster import (\n    Field as LegacyDagsterField,\n    resource,\n)\nfrom dagster._config.field_utils import Shape\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.test_utils import environ\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\n\nfrom dagster_aws.utils import ResourceWithBoto3Configuration\n\nfrom .secrets import construct_secretsmanager_client, get_secrets_from_arns, get_tagged_secrets\n\nif TYPE_CHECKING:\n    import botocore\n\n\n
[docs]class SecretsManagerResource(ResourceWithBoto3Configuration):\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import SecretsManagerResource\n\n @op\n def example_secretsmanager_op(secretsmanager: SecretsManagerResource):\n return secretsmanager.get_client().get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job\n def example_job():\n example_secretsmanager_op()\n\n defs = Definitions(\n jobs=[example_job],\n resources={\n 'secretsmanager': SecretsManagerResource(\n region_name='us-west-1'\n )\n }\n )\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> "botocore.client.SecretsManager":\n return construct_secretsmanager_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n profile_name=self.profile_name,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(SecretsManagerResource.to_config_schema())\ndef secretsmanager_resource(context) -> "botocore.client.SecretsManager":\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_resource\n\n @op(required_resource_keys={'secretsmanager'})\n def example_secretsmanager_op(context):\n return context.resources.secretsmanager.get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job(resource_defs={'secretsmanager': secretsmanager_resource})\n def example_job():\n example_secretsmanager_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secretsmanager': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n # profile as specified in ~/.aws/credentials file\n\n """\n return SecretsManagerResource.from_resource_context(context).get_client()
\n\n\n
[docs]class SecretsManagerSecretsResource(ResourceWithBoto3Configuration):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op, ResourceParam\n from dagster_aws.secretsmanager import SecretsManagerSecretsResource\n\n @op\n def example_secretsmanager_secrets_op(secrets: SecretsManagerSecretsResource):\n return secrets.fetch_secrets().get("my-secret-name")\n\n @op\n def example_secretsmanager_secrets_op_2(secrets: SecretsManagerSecretsResource):\n with secrets.secrets_in_environment():\n return os.getenv("my-other-secret-name")\n\n @job\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n defs = Definitions(\n jobs=[example_job],\n resources={\n 'secrets': SecretsManagerSecretsResource(\n region_name='us-west-1',\n secrets_tag="dagster",\n add_to_environment=True,\n )\n }\n )\n\n Note that your ops must also declare that they require this resource with or it will not be initialized\n for the execution of their compute functions.\n """\n\n secrets: List[str] = Field(\n default=[], description="An array of AWS Secrets Manager secrets arns to fetch."\n )\n secrets_tag: Optional[str] = Field(\n default=None,\n description="AWS Secrets Manager secrets with this tag will be fetched and made available.",\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def secrets_in_environment(\n self,\n secrets: Optional[List[str]] = None,\n secrets_tag: Optional[str] = None,\n ) -> Generator[Dict[str, str], None, None]:\n """Yields a dict which maps selected SecretsManager secrets to their string values. 
Also\n sets chosen secrets as environment variables.\n\n Args:\n secrets (Optional[List[str]]): An array of AWS Secrets Manager secrets arns to fetch.\n Note that this will override the secrets specified in the resource config.\n secrets_tag (Optional[str]): AWS Secrets Manager secrets with this tag will be fetched\n and made available. Note that this will override the secrets_tag specified in the\n resource config.\n """\n secrets_manager = construct_secretsmanager_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n profile_name=self.profile_name,\n )\n\n secrets_tag_to_fetch = secrets_tag if secrets_tag is not None else self.secrets_tag\n secrets_to_fetch = secrets if secrets is not None else self.secrets\n\n secret_arns = merge_dicts(\n (\n get_tagged_secrets(secrets_manager, [secrets_tag_to_fetch])\n if secrets_tag_to_fetch\n else {}\n ),\n get_secrets_from_arns(secrets_manager, secrets_to_fetch),\n )\n\n secrets_map = {\n name: secrets_manager.get_secret_value(SecretId=arn).get("SecretString")\n for name, arn in secret_arns.items()\n }\n with environ(secrets_map):\n yield secrets_map\n\n def fetch_secrets(\n self,\n secrets: Optional[List[str]] = None,\n secrets_tag: Optional[str] = None,\n ) -> Dict[str, str]:\n """Fetches secrets from AWS Secrets Manager and returns them as a dict.\n\n Args:\n secrets (Optional[List[str]]): An array of AWS Secrets Manager secrets arns to fetch.\n Note that this will override the secrets specified in the resource config.\n secrets_tag (Optional[str]): AWS Secrets Manager secrets with this tag will be fetched\n and made available. Note that this will override the secrets_tag specified in the\n resource config.\n """\n with self.secrets_in_environment(secrets=secrets, secrets_tag=secrets_tag) as secret_values:\n return secret_values
\n\n\nLEGACY_SECRETSMANAGER_SECRETS_SCHEMA = {\n **cast(Shape, SecretsManagerSecretsResource.to_config_schema().as_field().config_type).fields,\n "add_to_environment": LegacyDagsterField(\n bool,\n default_value=False,\n description="Whether to add the secrets to the environment. Defaults to False.",\n ),\n}\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=LEGACY_SECRETSMANAGER_SECRETS_SCHEMA)\n@contextmanager\ndef secretsmanager_secrets_resource(context):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op(context):\n return context.resources.secrets.get("my-secret-name")\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op_2(context):\n return os.getenv("my-other-secret-name")\n\n @job(resource_defs={'secrets': secretsmanager_secrets_resource})\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secrets': {\n 'config': {\n 'region_name': 'us-west-1',\n 'secrets_tag': 'dagster',\n 'add_to_environment': True,\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. 
Default is default\n # profile as specified in ~/.aws/credentials file\n secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n secrets_tag: "dagster"\n # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n # from SecretsManager.\n add_to_environment: true\n # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults\n # to false.\n\n """\n add_to_environment = context.resource_config.get("add_to_environment", False)\n if add_to_environment:\n with SecretsManagerSecretsResource.from_resource_context(\n context\n ).secrets_in_environment() as secrets:\n yield secrets\n else:\n yield SecretsManagerSecretsResource.from_resource_context(context).fetch_secrets()
\n
", "current_page_name": "_modules/dagster_aws/secretsmanager/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.secretsmanager.resources"}}}, "dagster_azure": {"adls2": {"fake_adls2_resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.fake_adls2_resource

\nimport io\nimport random\nfrom typing import Any, Dict, Optional\nfrom unittest import mock\n\nfrom dagster import resource\nfrom dagster._config.pythonic_config import ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\n\nfrom dagster_azure.blob import FakeBlobServiceClient\n\nfrom .utils import ResourceNotFoundError\n\n\n@dagster_maintained_resource\n@resource({"account_name": str})\ndef fake_adls2_resource(context):\n    return FakeADLS2Resource(account_name=context.resource_config["account_name"])\n\n\n
[docs]class FakeADLS2Resource(ConfigurableResource):\n """Stateful mock of an ADLS2Resource for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n account_name: str\n storage_account: Optional[str] = None\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def adls2_client(self) -> "FakeADLS2ServiceClient":\n return FakeADLS2ServiceClient(self.account_name)\n\n @property\n @cached_method\n def blob_client(self) -> FakeBlobServiceClient:\n return FakeBlobServiceClient(self.account_name)\n\n @property\n def lease_client_constructor(self) -> Any:\n return FakeLeaseClient
\n\n\nclass FakeLeaseClient:\n def __init__(self, client):\n self.client = client\n self.id = None\n\n # client needs a ref to self to check if a given lease is valid\n self.client._lease = self # noqa: SLF001\n\n def acquire(self, lease_duration=-1):\n if self.id is None:\n self.id = random.randint(0, 2**9)\n else:\n raise Exception("Lease already held")\n\n def release(self):\n self.id = None\n\n def is_valid(self, lease):\n if self.id is None:\n # no lease is held so any operation is valid\n return True\n return lease == self.id\n\n\nclass FakeADLS2ServiceClient:\n """Stateful mock of an ADLS2 service client for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n def __init__(self, account_name, credential="fake-creds"):\n self._account_name = account_name\n self._credential = mock.MagicMock()\n self._credential.account_key = credential\n self._file_systems = {}\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def credential(self):\n return self._credential\n\n @property\n def file_systems(self):\n return self._file_systems\n\n def get_file_system_client(self, file_system):\n return self._file_systems.setdefault(\n file_system, FakeADLS2FilesystemClient(self.account_name, file_system)\n )\n\n def get_file_client(self, file_system, file_path):\n return self.get_file_system_client(file_system).get_file_client(file_path)\n\n\nclass FakeADLS2FilesystemClient:\n """Stateful mock of an ADLS2 filesystem client for testing."""\n\n def __init__(self, account_name, file_system_name):\n self._file_system: Dict[str, FakeADLS2FileClient] = {}\n self._account_name = account_name\n self._file_system_name = file_system_name\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def file_system_name(self):\n return self._file_system_name\n\n def keys(self):\n return self._file_system.keys()\n\n def get_file_system_properties(self):\n return {"account_name": 
self.account_name, "file_system_name": self.file_system_name}\n\n def has_file(self, path):\n return bool(self._file_system.get(path))\n\n def get_file_client(self, file_path):\n # pass fileclient a ref to self and its name so the file can delete itself\n self._file_system.setdefault(file_path, FakeADLS2FileClient(self, file_path))\n return self._file_system[file_path]\n\n def create_file(self, file):\n # pass fileclient a ref to self and the file's name so the file can delete itself by\n # accessing the self._file_system dict\n self._file_system.setdefault(file, FakeADLS2FileClient(fs_client=self, name=file))\n return self._file_system[file]\n\n def delete_file(self, file):\n for k in list(self._file_system.keys()):\n if k.startswith(file):\n del self._file_system[k]\n\n\nclass FakeADLS2FileClient:\n """Stateful mock of an ADLS2 file client for testing."""\n\n def __init__(self, name, fs_client):\n self.name = name\n self.contents = None\n self._lease = None\n self.fs_client = fs_client\n\n @property\n def lease(self):\n return self._lease if self._lease is None else self._lease.id\n\n def get_file_properties(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n lease_id = None if self._lease is None else self._lease.id\n return {"lease": lease_id}\n\n def upload_data(self, contents, overwrite=False, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n if self.contents is not None or overwrite is True:\n if isinstance(contents, str):\n self.contents = contents.encode("utf8")\n elif isinstance(contents, io.BytesIO):\n self.contents = contents.read()\n elif isinstance(contents, io.StringIO):\n self.contents = contents.read().encode("utf8")\n elif isinstance(contents, bytes):\n self.contents = contents\n else:\n self.contents = contents\n\n def download_file(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n return 
FakeADLS2FileDownloader(contents=self.contents)\n\n def delete_file(self, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n self.fs_client.delete_file(self.name)\n\n\nclass FakeADLS2FileDownloader:\n """Mock of an ADLS2 file downloader for testing."""\n\n def __init__(self, contents):\n self.contents = contents\n\n def readall(self):\n return self.contents\n\n def readinto(self, fileobj):\n fileobj.write(self.contents)\n
", "current_page_name": "_modules/dagster_azure/adls2/fake_adls2_resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.fake_adls2_resource"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class ADLS2FileHandle(FileHandle):\n """A reference to a file on ADLS2."""\n\n def __init__(self, account: str, file_system: str, key: str):\n self._account = check.str_param(account, "account")\n self._file_system = check.str_param(file_system, "file_system")\n self._key = check.str_param(key, "key")\n\n @property\n def account(self):\n """str: The name of the ADLS2 account."""\n return self._account\n\n @property\n def file_system(self):\n """str: The name of the ADLS2 file system."""\n return self._file_system\n\n @property\n def key(self):\n """str: The ADLS2 key."""\n return self._key\n\n @property\n def path_desc(self):\n """str: The file's ADLS2 URL."""\n return self.adls2_path\n\n @property\n def adls2_path(self):\n """str: The file's ADLS2 URL."""\n return f"adfss://{self.file_system}@{self.account}.dfs.core.windows.net/{self.key}"
\n\n\nclass ADLS2FileManager(FileManager):\n def __init__(self, adls2_client, file_system, prefix):\n self._client = adls2_client\n self._file_system = check.str_param(file_system, "file_system")\n self._prefix = check.str_param(prefix, "prefix")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n file = self._client.get_file_client(\n file_system=file_handle.file_system,\n file_path=file_handle.key,\n )\n download = file.download_file()\n with open(temp_name, "wb") as file_obj:\n download.readinto(file_obj)\n self._local_handle_cache[file_handle.adls2_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", ADLS2FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if "b" in mode else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.adls2_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.adls2_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n adls2_key = self.get_full_key(str(uuid.uuid4()) + (("." 
+ ext) if ext is not None else ""))\n adls2_file = self._client.get_file_client(\n file_system=self._file_system, file_path=adls2_key\n )\n adls2_file.upload_data(file_obj, overwrite=True)\n return ADLS2FileHandle(self._client.account_name, self._file_system, adls2_key)\n\n def get_full_key(self, file_key):\n return f"{self._prefix}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_azure/adls2/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.io_manager

\nimport pickle\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Union\n\nfrom dagster import (\n    InputContext,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._config.pythonic_config import ConfigurableIOManager\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom dagster_azure.adls2.resources import ADLS2Resource\nfrom dagster_azure.adls2.utils import ResourceNotFoundError\n\n_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectADLS2IOManager(UPathIOManager):\n    def __init__(\n        self,\n        file_system: Any,\n        adls2_client: Any,\n        blob_client: Any,\n        lease_client_constructor: Any,\n        prefix: str = "dagster",\n    ):\n        self.adls2_client = adls2_client\n        self.file_system_client = self.adls2_client.get_file_system_client(file_system)\n        # We also need a blob client to handle copying as ADLS doesn't have a copy API yet\n        self.blob_client = blob_client\n        self.blob_container_client = self.blob_client.get_container_client(file_system)\n        self.prefix = check.str_param(prefix, "prefix")\n\n        self.lease_client_constructor = lease_client_constructor\n        self.lease_duration = _LEASE_DURATION\n        self.file_system_client.get_file_system_properties()\n        super().__init__(base_path=UPath(self.prefix))\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return UPath("storage", run_id, "files", *output_parts)\n\n    def 
get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading ADLS2 object from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing ADLS2 object at: {self._uri_for_path(path)}"\n\n    def unlink(self, path: UPath) -> None:\n        file_client = self.file_system_client.get_file_client(str(path))\n        with self._acquire_lease(file_client, is_rm=True) as lease:\n            file_client.delete_file(lease=lease, recursive=True)\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in ADLS2\n        return None\n\n    def path_exists(self, path: UPath) -> bool:\n        try:\n            self.file_system_client.get_file_client(str(path)).get_file_properties()\n        except ResourceNotFoundError:\n            return False\n        return True\n\n    def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str:\n        return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format(\n            protocol=protocol,\n            filesystem=self.file_system_client.file_system_name,\n            account=self.file_system_client.account_name,\n            key=path,\n        )\n\n    @contextmanager\n    def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]:\n        lease_client = self.lease_client_constructor(client=client)\n        try:\n            lease_client.acquire(lease_duration=self.lease_duration)\n            yield lease_client.id\n        finally:\n            # cannot release a lease on a file that no longer exists, so need to check\n            if not is_rm:\n                lease_client.release()\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        if context.dagster_type.typing_type == type(None):\n            return None\n        file = self.file_system_client.get_file_client(str(path))\n        stream = file.download_file()\n        
return pickle.loads(stream.readall())\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing ADLS2 key: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        file = self.file_system_client.create_file(str(path))\n        with self._acquire_lease(file) as lease:\n            file.upload_data(pickled_obj, lease=lease, overwrite=True)\n\n\n
[docs]class ADLS2PickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return df[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": ADLS2PickleIOManager(\n adls2_file_system="my-cool-fs",\n adls2_prefix="my-cool-prefix"\n ),\n "adls2": adls2_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n @job(\n resource_defs={\n "io_manager": ADLS2PickleIOManager(\n adls2_file_system="my-cool-fs",\n adls2_prefix="my-cool-prefix"\n ),\n "adls2": adls2_resource,\n },\n )\n def my_job():\n ...\n """\n\n adls2: ResourceDependency[ADLS2Resource]\n adls2_file_system: str = Field(description="ADLS Gen2 file system name.")\n adls2_prefix: str = Field(\n default="dagster", description="ADLS Gen2 file system prefix to write to."\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _internal_io_manager(self) -> PickledObjectADLS2IOManager:\n return PickledObjectADLS2IOManager(\n self.adls2_file_system,\n self.adls2.adls2_client,\n self.adls2.blob_client,\n self.adls2.lease_client_constructor,\n self.adls2_prefix,\n )\n\n def load_input(self, context: "InputContext") -> Any:\n return self._internal_io_manager.load_input(context)\n\n def handle_output(self, context: "OutputContext", obj: Any) -> None:\n self._internal_io_manager.handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use GCSPickleIOManager instead.",\n)\nclass ConfigurablePickledObjectADLS2IOManager(ADLS2PickleIOManager):\n """Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=ADLS2PickleIOManager.to_config_schema(),\n required_resource_keys={"adls2"},\n)\ndef adls2_pickle_io_manager(init_context):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return df[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": adls2_pickle_io_manager.configured(\n {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n ),\n "adls2": adls2_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n @job(\n resource_defs={\n "io_manager": adls2_pickle_io_manager.configured(\n {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n ),\n "adls2": adls2_resource,\n },\n )\n def my_job():\n ...\n """\n adls_resource = init_context.resources.adls2\n adls2_client = adls_resource.adls2_client\n blob_client = adls_resource.blob_client\n lease_client = adls_resource.lease_client_constructor\n pickled_io_manager = PickledObjectADLS2IOManager(\n init_context.resource_config["adls2_file_system"],\n adls2_client,\n blob_client,\n lease_client,\n init_context.resource_config.get("adls2_prefix"),\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_azure/adls2/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.io_manager"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.resources

\nfrom typing import Any, Dict, Union\n\nfrom azure.identity import DefaultAzureCredential\nfrom azure.storage.filedatalake import DataLakeLeaseClient\nfrom dagster import (\n    Config,\n    ConfigurableResource,\n    Field as DagsterField,\n    Permissive,\n    Selector,\n    StringSource,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\nfrom typing_extensions import Literal\n\nfrom dagster_azure.blob.utils import BlobServiceClient, create_blob_client\n\nfrom .file_manager import ADLS2FileManager\nfrom .utils import DataLakeServiceClient, create_adls2_client\n\n\nclass ADLS2SASToken(Config):\n    credential_type: Literal["sas"] = "sas"\n    token: str\n\n\nclass ADLS2Key(Config):\n    credential_type: Literal["key"] = "key"\n    key: str\n\n\nclass ADLS2DefaultAzureCredential(Config):\n    credential_type: Literal["default_azure_credential"] = "default_azure_credential"\n    kwargs: Dict[str, Any]\n\n\nclass ADLS2BaseResource(ConfigurableResource):\n    storage_account: str = Field(description="The storage account name.")\n    credential: Union[ADLS2SASToken, ADLS2Key, ADLS2DefaultAzureCredential] = Field(\n        discriminator="credential_type", description="The credentials with which to authenticate."\n    )\n\n\nDEFAULT_AZURE_CREDENTIAL_CONFIG = DagsterField(\n    Permissive(\n        description="Uses DefaultAzureCredential to authenticate and passed as keyword arguments",\n    )\n)\n\nADLS2_CLIENT_CONFIG = {\n    "storage_account": DagsterField(StringSource, description="The storage account name."),\n    "credential": DagsterField(\n        Selector(\n            {\n                "sas": DagsterField(StringSource, description="SAS token for the account."),\n                "key": DagsterField(StringSource, description="Shared Access Key for the account."),\n               
 "DefaultAzureCredential": DEFAULT_AZURE_CREDENTIAL_CONFIG,\n            }\n        ),\n        description="The credentials with which to authenticate.",\n    ),\n}\n\n\n
[docs]class ADLS2Resource(ADLS2BaseResource):\n """Resource containing clients to access Azure Data Lake Storage Gen2.\n\n Contains a client for both the Data Lake and Blob APIs, to work around the limitations\n of each.\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _raw_credential(self) -> Any:\n if isinstance(self.credential, ADLS2Key):\n return self.credential.key\n elif isinstance(self.credential, ADLS2SASToken):\n return self.credential.token\n else:\n return DefaultAzureCredential(**self.credential.kwargs)\n\n @property\n @cached_method\n def adls2_client(self) -> DataLakeServiceClient:\n return create_adls2_client(self.storage_account, self._raw_credential)\n\n @property\n @cached_method\n def blob_client(self) -> BlobServiceClient:\n return create_blob_client(self.storage_account, self._raw_credential)\n\n @property\n def lease_client_constructor(self) -> Any:\n return DataLakeLeaseClient
\n\n\n# Due to a limitation of the discriminated union type, we can't directly mirror these old\n# config fields in the new resource config. Instead, we'll just use the old config fields\n# to construct the new config and then use that to construct the resource.\n
[docs]@dagster_maintained_resource\n@resource(ADLS2_CLIENT_CONFIG)\ndef adls2_resource(context):\n """Resource that gives ops access to Azure Data Lake Storage Gen2.\n\n The underlying client is a :py:class:`~azure.storage.filedatalake.DataLakeServiceClient`.\n\n Attach this resource definition to a :py:class:`~dagster.JobDefinition` in order to make it\n available to your ops.\n\n Example:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_azure.adls2 import adls2_resource\n\n @op(required_resource_keys={'adls2'})\n def example_adls2_op(context):\n return list(context.resources.adls2.adls2_client.list_file_systems())\n\n @job(resource_defs={"adls2": adls2_resource})\n def my_job():\n example_adls2_op()\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may pass credentials to this resource using either a SAS token, a key or by passing the\n `DefaultAzureCredential` object.\n\n .. code-block:: YAML\n\n resources:\n adls2:\n config:\n storage_account: my_storage_account\n # str: The storage account name.\n credential:\n sas: my_sas_token\n # str: the SAS token for the account.\n key:\n env: AZURE_DATA_LAKE_STORAGE_KEY\n # str: The shared access key for the account.\n DefaultAzureCredential: {}\n # dict: The keyword arguments used for DefaultAzureCredential\n # or leave the object empty for no arguments\n DefaultAzureCredential:\n exclude_environment_credential: true\n\n """\n return _adls2_resource_from_config(context.resource_config)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n merge_dicts(\n ADLS2_CLIENT_CONFIG,\n {\n "adls2_file_system": DagsterField(\n StringSource, description="ADLS Gen2 file system name"\n ),\n "adls2_prefix": DagsterField(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef adls2_file_manager(context):\n """FileManager that provides abstract access to ADLS2.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n adls2_client = _adls2_resource_from_config(context.resource_config).adls2_client\n\n return ADLS2FileManager(\n adls2_client=adls2_client,\n file_system=context.resource_config["adls2_file_system"],\n prefix=context.resource_config["adls2_prefix"],\n )
\n\n\ndef _adls2_resource_from_config(config) -> ADLS2Resource:\n """Args:\n config: A configuration containing the fields in ADLS2_CLIENT_CONFIG.\n\n Returns: An adls2 client.\n """\n storage_account = config["storage_account"]\n if "DefaultAzureCredential" in config["credential"]:\n credential = ADLS2DefaultAzureCredential(\n kwargs=config["credential"]["DefaultAzureCredential"]\n )\n elif "sas" in config["credential"]:\n credential = ADLS2SASToken(token=config["credential"]["sas"])\n else:\n credential = ADLS2Key(key=config["credential"]["key"])\n\n return ADLS2Resource(storage_account=storage_account, credential=credential)\n
", "current_page_name": "_modules/dagster_azure/adls2/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.resources"}}, "blob": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.blob.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._seven as seven\nfrom azure.identity import DefaultAzureCredential\nfrom dagster import (\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    _check as check,\n)\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom typing_extensions import Self\n\nfrom .utils import create_blob_client, generate_blob_sas\n\n\n
[docs]class AzureBlobComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to Azure Blob Storage.\n\n This is also compatible with Azure Data Lake Storage.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_azure.blob.compute_log_manager\n class: AzureBlobComputeLogManager\n config:\n storage_account: my-storage-account\n container: my-container\n credential: sas-token-or-secret-key\n default_azure_credential:\n exclude_environment_credential: true\n prefix: "dagster-test-"\n local_dir: "/tmp/cool"\n upload_interval: 30\n\n Args:\n storage_account (str): The storage account name to which to log.\n container (str): The container (or ADLS2 filesystem) to which to log.\n secret_key (Optional[str]): Secret key for the storage account. SAS tokens are not\n supported because we need a secret key to generate a SAS token for a download URL.\n default_azure_credential (Optional[dict]): Use and configure DefaultAzureCredential.\n Cannot be used with sas token or secret key config.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files blob storage. 
By default, will only upload when the capture is complete.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n storage_account,\n container,\n secret_key=None,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n upload_interval=None,\n default_azure_credential=None,\n ):\n self._storage_account = check.str_param(storage_account, "storage_account")\n self._container = check.str_param(container, "container")\n self._blob_prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n self._default_azure_credential = check.opt_dict_param(\n default_azure_credential, "default_azure_credential"\n )\n check.opt_str_param(secret_key, "secret_key")\n check.invariant(\n secret_key is not None or default_azure_credential is not None,\n "Missing config: need to provide one of secret_key or default_azure_credential",\n )\n\n if default_azure_credential is None:\n self._blob_client = create_blob_client(storage_account, secret_key)\n else:\n credential = DefaultAzureCredential(**self._default_azure_credential)\n self._blob_client = create_blob_client(storage_account, credential)\n\n self._container_client = self._blob_client.get_container_client(container)\n self._download_urls = {}\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, dagster_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs(dagster_run, 
step_key): # noqa: SLF001\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "storage_account": StringSource,\n "container": StringSource,\n "secret_key": Field(StringSource, is_required=False),\n "default_azure_credential": Field(\n Noneable(Permissive(description="keyword arguments for DefaultAzureCredential")),\n is_required=False,\n default_value=None,\n ),\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return AzureBlobComputeLogManager(inst_data=inst_data, **config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _blob_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._blob_prefix, "storage", *namespace, filename]\n return "/".join(paths) # blob path delimiter\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self.local_manager.delete_logs(log_key=log_key, prefix=prefix)\n if log_key:\n prefix_path = "/".join([self._blob_prefix, "storage", *log_key])\n elif prefix:\n # add the trailing '/' to make sure that ['a'] does not match ['apple']\n prefix_path = "/".join([self._blob_prefix, "storage", *prefix, 
""])\n else:\n prefix_path = None\n\n blob_list = {\n b.name for b in list(self._container_client.list_blobs(name_starts_with=prefix_path))\n }\n\n to_remove = None\n if log_key:\n # filter to the known set of keys\n known_keys = [\n self._blob_key(log_key, ComputeIOType.STDOUT),\n self._blob_key(log_key, ComputeIOType.STDERR),\n self._blob_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._blob_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n to_remove = [key for key in known_keys if key in blob_list]\n elif prefix:\n to_remove = list(blob_list)\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n if to_remove:\n self._container_client.delete_blobs(*to_remove)\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n blob_key = self._blob_key(log_key, io_type)\n if blob_key in self._download_urls:\n return self._download_urls[blob_key]\n blob = self._container_client.get_blob_client(blob_key)\n sas = generate_blob_sas(\n self._storage_account,\n self._container,\n blob_key,\n account_key=self._blob_client.credential.account_key,\n )\n url = blob.url + sas\n self._download_urls[blob_key] = url\n return url\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n\n blob_key = self._blob_key(log_key, io_type)\n return f"https://{self._storage_account}.blob.core.windows.net/{self._container}/{blob_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n blob_objects = self._container_client.list_blobs(blob_key)\n exact_matches = [blob for blob in blob_objects if blob.name == blob_key]\n return len(exact_matches) > 0\n\n def 
upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n blob = self._container_client.get_blob_client(blob_key)\n blob.upload_blob(data)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n blob = self._container_client.get_blob_client(blob_key)\n blob.download_blob().readinto(fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)
\n
", "current_page_name": "_modules/dagster_azure/blob/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.blob.compute_log_manager"}}}, "dagster_celery": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery.executor

\nfrom dagster import (\n    Executor,\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._grpc.types import ExecuteStepArgs\nfrom dagster._serdes import pack_value\n\nfrom .config import DEFAULT_CONFIG, dict_wrapper\nfrom .defaults import broker_url, result_backend\n\nCELERY_CONFIG = {\n    "broker": Field(\n        Noneable(StringSource),\n        is_required=False,\n        description=(\n            "The URL of the Celery broker. Default: "\n            "'pyamqp://guest@{os.getenv('DAGSTER_CELERY_BROKER_HOST',"\n            "'localhost')}//'."\n        ),\n    ),\n    "backend": Field(\n        Noneable(StringSource),\n        is_required=False,\n        default_value="rpc://",\n        description="The URL of the Celery results backend. Default: 'rpc://'.",\n    ),\n    "include": Field(\n        [str], is_required=False, description="List of modules every worker should import"\n    ),\n    "config_source": Field(\n        Noneable(Permissive()),\n        is_required=False,\n        description="Additional settings for the Celery app.",\n    ),\n    "retries": get_retries_config(),\n}\n\n\n
[docs]@executor(\n name="celery",\n config_schema=CELERY_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_executor(init_context):\n """Celery-based executor.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery import celery_executor\n\n @job(executor_def=celery_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... 
# argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n """\n return CeleryExecutor(\n broker=init_context.executor_config.get("broker"),\n backend=init_context.executor_config.get("backend"),\n config_source=init_context.executor_config.get("config_source"),\n include=init_context.executor_config.get("include"),\n retries=RetryMode.from_config(init_context.executor_config["retries"]),\n )
\n\n\ndef _submit_task(app, plan_context, step, queue, priority, known_state):\n from .tasks import create_task\n\n execute_step_args = ExecuteStepArgs(\n job_origin=plan_context.reconstructable_job.get_python_origin(),\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n print_serialized_events=True, # Not actually checked by the celery task\n )\n\n task = create_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n executable_dict=plan_context.reconstructable_job.to_dict(),\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_plan",\n )\n\n\nclass CeleryExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self._retries = check.inst_param(retries, "retries", RetryMode)\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from .core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task\n )\n\n @staticmethod\n def for_cli(broker=None, backend=None, include=None, config_source=None):\n return CeleryExecutor(\n retries=RetryMode(RetryMode.DISABLED),\n broker=broker,\n backend=backend,\n include=include,\n config_source=config_source,\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": 
self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n
", "current_page_name": "_modules/dagster_celery/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery.executor"}}, "dagster_celery_docker": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_docker.executor

\nimport os\n\nimport docker.client\nfrom dagster import (\n    DagsterInstance,\n    Executor,\n    Field,\n    Permissive,\n    StringSource,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._cli.api import ExecuteStepArgs\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._serdes import pack_value, serialize_value, unpack_value\nfrom dagster._utils.merger import merge_dicts\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER, core_celery_execution_loop\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_celery.executor import CELERY_CONFIG\n\nCELERY_DOCKER_CONFIG_KEY = "celery-docker"\n\n\ndef celery_docker_config():\n    additional_config = {\n        "docker": Field(\n            {\n                "image": Field(\n                    StringSource,\n                    is_required=False,\n                    description="The docker image to be used for step execution.",\n                ),\n                "registry": Field(\n                    {\n                        "url": Field(StringSource),\n                        "username": Field(StringSource),\n                        "password": Field(StringSource),\n                    },\n                    is_required=False,\n                    description="Information for using a non local/public docker registry",\n                ),\n                "env_vars": Field(\n                    [str],\n                    is_required=False,\n                    description=(\n                        "The list of environment variables names to forward from the celery worker"\n                        " in to the docker container"\n                    ),\n             
   ),\n                "network": Field(\n                    str,\n                    is_required=False,\n                    description=(\n                        "Name of the network this container will be connected to at creation time"\n                    ),\n                ),\n                "container_kwargs": Field(\n                    Permissive(),\n                    is_required=False,\n                    description="Additional keyword args for the docker container",\n                ),\n            },\n            is_required=True,\n            description="The configuration for interacting with docker in the celery worker.",\n        ),\n    }\n\n    cfg = merge_dicts(CELERY_CONFIG, additional_config)\n    return cfg\n\n\n
[docs]@executor(\n name=CELERY_DOCKER_CONFIG_KEY,\n config_schema=celery_docker_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_docker_executor(init_context):\n """Celery-based executor which launches tasks in docker containers.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_docker_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery_docker.executor import celery_docker_executor\n\n @job(executor_def=celery_docker_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n docker:\n image: 'my_repo.com/image_name:latest'\n registry:\n url: 'my_repo.com'\n username: 'my_user'\n password: {env: 'DOCKER_PASSWORD'}\n env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n container_kwargs: # keyword args to be passed to the container. 
example:\n volumes: ['/home/user1/:/mnt/vol2','/var/www:/mnt/vol1']\n\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_docker_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_docker.app` argument.\n """\n exc_cfg = init_context.executor_config\n\n return CeleryDockerExecutor(\n broker=exc_cfg.get("broker"),\n backend=exc_cfg.get("backend"),\n config_source=exc_cfg.get("config_source"),\n include=exc_cfg.get("include"),\n retries=RetryMode.from_config(exc_cfg.get("retries")),\n docker_config=exc_cfg.get("docker"),\n )
\n\n\nclass CeleryDockerExecutor(Executor):\n def __init__(\n self,\n retries,\n docker_config,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.docker_config = check.dict_param(docker_config, "docker_config")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_docker\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_docker(app, plan_context, step, queue, priority, known_state):\n execute_step_args = ExecuteStepArgs(\n job_origin=plan_context.reconstructable_job.get_python_origin(),\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n print_serialized_events=True,\n )\n\n task = create_docker_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n docker_config=plan_context.executor.docker_config,\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_step_docker",\n )\n\n\ndef create_docker_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_docker", **task_kwargs)\n def _execute_step_docker(\n self,\n execute_step_args_packed,\n 
docker_config,\n ):\n """Run step execution in a Docker container."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n ),\n as_type=ExecuteStepArgs,\n )\n\n check.dict_param(docker_config, "docker_config")\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n dagster_run = instance.get_run_by_id(execute_step_args.run_id)\n check.inst(\n dagster_run,\n DagsterRun,\n f"Could not load run {execute_step_args.run_id}",\n )\n step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)\n\n docker_image = (\n docker_config["image"]\n if docker_config.get("image")\n else dagster_run.job_code_origin.repository_origin.container_image\n )\n\n if not docker_image:\n raise Exception("No docker image specified by either the job or the repository")\n\n client = docker.client.from_env()\n\n if docker_config.get("registry"):\n client.login(\n registry=docker_config["registry"]["url"],\n username=docker_config["registry"]["username"],\n password=docker_config["registry"]["password"],\n )\n\n # Post event for starting execution\n engine_event = instance.report_engine_event(\n f"Executing steps {step_keys_str} in Docker container {docker_image}",\n dagster_run,\n EngineEventData(\n {\n "Step keys": step_keys_str,\n "Image": docker_image,\n "Celery worker": self.request.hostname,\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n\n serialized_events = [serialize_value(engine_event)]\n\n docker_env = {}\n if docker_config.get("env_vars"):\n docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}\n\n container_kwargs = check.opt_dict_param(\n docker_config.get("container_kwargs"), "container_kwargs", key_type=str\n )\n\n # set defaults for detach and auto_remove\n container_kwargs["detach"] = container_kwargs.get("detach", False)\n container_kwargs["auto_remove"] = 
container_kwargs.get("auto_remove", True)\n\n # if environment variables are provided via container_kwargs, merge with env_vars\n if container_kwargs.get("environment") is not None:\n e_vars = container_kwargs.get("environment")\n if isinstance(e_vars, dict):\n docker_env.update(e_vars)\n else:\n for v in e_vars:\n key, val = v.split("=")\n docker_env[key] = val\n del container_kwargs["environment"]\n\n try:\n docker_response = client.containers.run(\n docker_image,\n command=execute_step_args.get_command_args(),\n # pass through this worker's environment for things like AWS creds etc.\n environment=docker_env,\n network=docker_config.get("network", None),\n **container_kwargs,\n )\n\n res = docker_response.decode("utf-8")\n except docker.errors.ContainerError as err:\n metadata = {"Job image": docker_image}\n if err.stderr is not None:\n metadata["Docker stderr"] = err.stderr\n\n instance.report_engine_event(\n f"Failed to run steps {step_keys_str} in Docker container {docker_image}",\n dagster_run,\n EngineEventData(metadata),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n raise\n else:\n if res is None:\n raise Exception("No response from execute_step in CeleryDockerExecutor")\n\n events = filter_dagster_events_from_cli_logs(res.split("\\n"))\n serialized_events += [serialize_value(event) for event in events]\n\n return serialized_events\n\n return _execute_step_docker\n
", "current_page_name": "_modules/dagster_celery_docker/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_docker.executor"}}, "dagster_celery_k8s": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_k8s.executor

\nimport logging\nimport os\nimport sys\nimport time\n\nimport kubernetes\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    DagsterInstance,\n    Executor,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._cli.api import ExecuteStepArgs\nfrom dagster._core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster._core.execution.plan.objects import StepFailureData, UserFailureData\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import pack_value, serialize_value, unpack_value\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_k8s import DagsterK8sJobConfig, construct_dagster_k8s_job\nfrom dagster_k8s.client import (\n    DagsterK8sAPIRetryLimitExceeded,\n    DagsterK8sError,\n    DagsterK8sJobStatusException,\n    DagsterK8sTimeoutError,\n    DagsterK8sUnrecoverableAPIError,\n    DagsterKubernetesClient,\n)\nfrom dagster_k8s.job import (\n    UserDefinedDagsterK8sConfig,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\nfrom .launcher import CeleryK8sRunLauncher\n\n\n
[docs]@executor(\n name=CELERY_K8S_CONFIG_KEY,\n config_schema=celery_k8s_executor_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_k8s_job_executor(init_context):\n """Celery-based executor which launches tasks as Kubernetes Jobs.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute dagster jobs\n with variations on these settings.\n\n To use the `celery_k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-celery-k8s/dagster_celery_k8s_tests/example_celery_mode_def.py\n :language: python\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_image: 'my_repo.com/image_name:latest'\n job_namespace: 'some-namespace'\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. 
This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_k8s.app` argument.\n """\n run_launcher = init_context.instance.run_launcher\n exc_cfg = init_context.executor_config\n\n if not isinstance(run_launcher, CeleryK8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a CeleryK8sRunLauncher; configure the "\n "CeleryK8sRunLauncher on your instance to use it.",\n )\n\n job_config = run_launcher.get_k8s_job_config(\n job_image=exc_cfg.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"), exc_config=exc_cfg\n )\n\n # Set on the instance but overrideable here\n broker = run_launcher.broker or exc_cfg.get("broker")\n backend = run_launcher.backend or exc_cfg.get("backend")\n config_source = run_launcher.config_source or exc_cfg.get("config_source")\n include = run_launcher.include or exc_cfg.get("include")\n retries = run_launcher.retries or RetryMode.from_config(exc_cfg.get("retries"))\n\n return CeleryK8sJobExecutor(\n broker=broker,\n backend=backend,\n config_source=config_source,\n include=include,\n retries=retries,\n job_config=job_config,\n job_namespace=exc_cfg.get("job_namespace", run_launcher.job_namespace),\n load_incluster_config=exc_cfg.get("load_incluster_config"),\n kubeconfig_file=exc_cfg.get("kubeconfig_file"),\n repo_location_name=exc_cfg.get("repo_location_name"),\n job_wait_timeout=exc_cfg.get("job_wait_timeout"),\n )
\n\n\nclass CeleryK8sJobExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n job_config=None,\n job_namespace=None,\n load_incluster_config=False,\n kubeconfig_file=None,\n repo_location_name=None,\n job_wait_timeout=None,\n ):\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.job_config = check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = check.bool_param(\n load_incluster_config, "load_incluster_config"\n )\n\n self.kubeconfig_file = check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n self.repo_location_name = check.opt_str_param(repo_location_name, "repo_location_name")\n self.job_wait_timeout = check.float_param(job_wait_timeout, "job_wait_timeout")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from dagster_celery.core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_k8s_job\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_k8s_job(app, plan_context, step, queue, 
priority, known_state):\n user_defined_k8s_config = get_user_defined_k8s_config(step.tags)\n\n job_origin = plan_context.reconstructable_job.get_python_origin()\n\n execute_step_args = ExecuteStepArgs(\n job_origin=job_origin,\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n should_verify_step=True,\n print_serialized_events=True,\n )\n\n job_config = plan_context.executor.job_config\n if not job_config.job_image:\n job_config = job_config.with_image(job_origin.repository_origin.container_image)\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the dagster job")\n\n task = create_k8s_job_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n job_config_dict=job_config.to_dict(),\n job_namespace=plan_context.executor.job_namespace,\n user_defined_k8s_config_dict=user_defined_k8s_config.to_dict(),\n load_incluster_config=plan_context.executor.load_incluster_config,\n job_wait_timeout=plan_context.executor.job_wait_timeout,\n kubeconfig_file=plan_context.executor.kubeconfig_file,\n )\n\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_step_k8s_job",\n )\n\n\ndef construct_step_failure_event_and_handle(dagster_run, step_key, err, instance):\n step_failure_event = DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n job_name=dagster_run.job_name,\n step_key=step_key,\n event_specific_data=StepFailureData(\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n user_failure_data=UserFailureData(label="K8sError"),\n ),\n )\n event_record = EventLogEntry(\n user_message=str(err),\n level=logging.ERROR,\n job_name=dagster_run.job_name,\n run_id=dagster_run.run_id,\n error_info=None,\n step_key=step_key,\n timestamp=time.time(),\n 
dagster_event=step_failure_event,\n )\n instance.handle_new_event(event_record)\n return step_failure_event\n\n\ndef create_k8s_job_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_k8s_job", **task_kwargs)\n def _execute_step_k8s_job(\n self,\n execute_step_args_packed,\n job_config_dict,\n job_namespace,\n load_incluster_config,\n job_wait_timeout,\n user_defined_k8s_config_dict=None,\n kubeconfig_file=None,\n ):\n """Run step execution in a K8s job pod."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n )\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n check.invariant(\n len(execute_step_args.step_keys_to_execute) == 1,\n "Celery K8s task executor can only execute 1 step at a time",\n )\n\n # Celery will serialize this as a list\n job_config = DagsterK8sJobConfig.from_dict(job_config_dict)\n check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n check.str_param(job_namespace, "job_namespace")\n\n check.bool_param(load_incluster_config, "load_incluster_config")\n\n user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(\n user_defined_k8s_config_dict\n )\n check.opt_inst_param(\n user_defined_k8s_config,\n "user_defined_k8s_config",\n UserDefinedDagsterK8sConfig,\n )\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n # For when launched via DinD or running the cluster\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n api_client = DagsterKubernetesClient.production_client()\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n dagster_run = instance.get_run_by_id(execute_step_args.run_id)\n\n check.inst(\n dagster_run,\n DagsterRun,\n f"Could not load run {execute_step_args.run_id}",\n )\n step_key = execute_step_args.step_keys_to_execute[0]\n\n celery_worker_name = self.request.hostname\n 
celery_pod_name = os.environ.get("HOSTNAME")\n instance.report_engine_event(\n f"Task for step {step_key} picked up by Celery",\n dagster_run,\n EngineEventData(\n {\n "Celery worker name": celery_worker_name,\n "Celery worker Kubernetes Pod name": celery_pod_name,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n if dagster_run.status != DagsterRunStatus.STARTED:\n instance.report_engine_event(\n "Not scheduling step because dagster run status is not STARTED",\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Ensure we stay below k8s name length limits\n k8s_name_key = get_k8s_job_name(execute_step_args.run_id, step_key)\n\n retry_state = execute_step_args.known_state.get_retry_state()\n\n if retry_state.get_attempt_count(step_key):\n attempt_number = retry_state.get_attempt_count(step_key)\n job_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n pod_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n else:\n job_name = "dagster-step-%s" % (k8s_name_key)\n pod_name = "dagster-step-%s" % (k8s_name_key)\n\n args = execute_step_args.get_command_args()\n\n labels = {\n "dagster/job": dagster_run.job_name,\n "dagster/op": step_key,\n "dagster/run-id": execute_step_args.run_id,\n }\n if dagster_run.external_job_origin:\n labels["dagster/code-location"] = (\n dagster_run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n job = construct_dagster_k8s_job(\n job_config,\n args,\n job_name,\n user_defined_k8s_config,\n pod_name,\n component="step_worker",\n labels=labels,\n env_vars=[\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": dagster_run.job_name,\n },\n {"name": "DAGSTER_RUN_STEP_KEY", "value": step_key},\n ],\n )\n\n # Running list of events generated from this task execution\n events = []\n\n # Post event for starting execution\n job_name = job.metadata.name\n engine_event = instance.report_engine_event(\n 
f'Executing step "{step_key}" in Kubernetes job {job_name}.',\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n "Job image": job_config.job_image,\n "Image pull policy": job_config.image_pull_policy,\n "Image pull secrets": str(job_config.image_pull_secrets),\n "Service account name": str(job_config.service_account_name),\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n # validated above that step_keys is length 1, and it is not possible to use ETH or\n # execution plan in this function (Celery K8s workers should not access to user code)\n step_key=step_key,\n )\n events.append(engine_event)\n try:\n api_client.batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n if e.reason == "Conflict":\n # There is an existing job with the same name so proceed and see if the existing job succeeded\n instance.report_engine_event(\n "Did not create Kubernetes job {} for step {} since job name already "\n "exists, proceeding with existing job.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n else:\n instance.report_engine_event(\n "Encountered unexpected error while creating Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n api_client.wait_for_job_success(\n job_name=job_name,\n namespace=job_namespace,\n instance=instance,\n run_id=execute_step_args.run_id,\n wait_timeout=job_wait_timeout,\n )\n except (DagsterK8sError, DagsterK8sTimeoutError) as err:\n step_failure_event = construct_step_failure_event_and_handle(\n dagster_run, step_key, err, 
instance=instance\n )\n events.append(step_failure_event)\n except DagsterK8sJobStatusException:\n instance.report_engine_event(\n "Terminating Kubernetes Job because dagster run status is not STARTED",\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n "Kubernetes Job namespace": job_namespace,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n api_client.delete_job(job_name=job_name, namespace=job_namespace)\n return []\n except (\n DagsterK8sUnrecoverableAPIError,\n DagsterK8sAPIRetryLimitExceeded,\n # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in\n # a retry boundary. We still catch it here just in case we missed one so that we can\n # report it to the event log\n kubernetes.client.rest.ApiException,\n ):\n instance.report_engine_event(\n "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n pod_names = api_client.get_pod_names_in_job(job_name, namespace=job_namespace)\n except kubernetes.client.rest.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Post engine event for log retrieval\n engine_event = instance.report_engine_event(\n "Retrieving logs from Kubernetes Job pods",\n dagster_run,\n EngineEventData({"Pod names": "\\n".join(pod_names)}),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n events.append(engine_event)\n\n logs = []\n for pod_name in pod_names:\n try:\n 
raw_logs = api_client.retrieve_pod_logs(pod_name, namespace=job_namespace)\n logs += raw_logs.split("\\n")\n except kubernetes.client.exceptions.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "\n "Pod name {} for step {}. Will attempt to continue with other pods.".format(\n job_name, pod_name, step_key\n ),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n events += filter_dagster_events_from_cli_logs(logs)\n serialized_events = [serialize_value(event) for event in events]\n return serialized_events\n\n return _execute_step_k8s_job\n
", "current_page_name": "_modules/dagster_celery_k8s/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_k8s.executor"}, "launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_k8s.launcher

\nimport sys\nfrom typing import Optional, cast\n\nimport kubernetes\nfrom dagster import (\n    DagsterInvariantViolationError,\n    _check as check,\n)\nfrom dagster._config import process_config, resolve_to_config_type\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.launcher import LaunchRunContext, RunLauncher\nfrom dagster._core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.merger import merge_dicts\nfrom dagster_k8s.client import DagsterKubernetesClient\nfrom dagster_k8s.job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_job_name_from_run_id,\n    get_user_defined_k8s_config,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\n\n\n
[docs]class CeleryK8sRunLauncher(RunLauncher, ConfigurableClass):\n """In contrast to the :py:class:`K8sRunLauncher`, which launches dagster runs as single K8s\n Jobs, this run launcher is intended for use in concert with\n :py:func:`dagster_celery_k8s.celery_k8s_job_executor`.\n\n With this run launcher, execution is delegated to:\n\n 1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\n submits steps to Celery queues for execution;\n 2. The step executions which are submitted to Celery queues are picked up by Celery workers,\n and each step execution spawns a step execution Kubernetes Job. See the implementation\n defined in :py:func:`dagster_celery_k8.executor.create_k8s_job_task`.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: CeleryK8sRunLauncher\n config:\n instance_config_map: "dagster-k8s-instance-config-map"\n dagster_home: "/some/path"\n postgres_password_secret: "dagster-k8s-pg-password"\n broker: "some_celery_broker_url"\n backend: "some_celery_backend_url"\n\n """\n\n def __init__(\n self,\n instance_config_map,\n dagster_home,\n postgres_password_secret,\n load_incluster_config=True,\n kubeconfig_file=None,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n retries=None,\n inst_data: Optional[ConfigurableClassData] = None,\n k8s_client_batch_api=None,\n env_config_maps=None,\n env_secrets=None,\n volume_mounts=None,\n volumes=None,\n service_account_name=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n labels=None,\n fail_pod_on_run_failure=None,\n job_namespace=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n 
kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self.postgres_password_secret = check.str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self.broker = check.opt_str_param(broker, "broker")\n self.backend = check.opt_str_param(backend, "backend")\n self.include = check.opt_list_param(include, "include")\n self.config_source = check.opt_dict_param(config_source, "config_source")\n\n retries = check.opt_dict_param(retries, "retries") or {"enabled": {}}\n self.retries = RetryMode.from_config(retries)\n\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n\n self._service_account_name = check.opt_str_param(\n service_account_name, "service_account_name"\n )\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace", default="default")\n\n super().__init__()\n\n @classmethod\n def config_type(cls):\n from dagster_celery.executor import CELERY_CONFIG\n\n return 
merge_dicts(DagsterK8sJobConfig.config_type_run_launcher(), CELERY_CONFIG)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n\n job_name = get_job_name_from_run_id(run.run_id)\n pod_name = job_name\n exc_config = _get_validated_celery_k8s_executor_config(run.run_config)\n\n job_image_from_executor_config = exc_config.get("job_image")\n\n job_origin = cast(JobPythonOrigin, context.job_code_origin)\n repository_origin = job_origin.repository_origin\n\n job_image = repository_origin.container_image\n\n if job_image:\n if job_image_from_executor_config:\n job_image = job_image_from_executor_config\n self._instance.report_engine_event(\n f"You have specified a job_image {job_image_from_executor_config} in your"\n f" executor configuration, but also {job_image} in your user-code"\n f" deployment. Using the job image {job_image_from_executor_config} from"\n " executor configuration as it takes precedence.",\n run,\n cls=self.__class__,\n )\n else:\n if not job_image_from_executor_config:\n raise DagsterInvariantViolationError(\n "You have not specified a job_image in your executor configuration. To resolve"\n " this error, specify the job_image configuration in the executor config"\n " section in your run config. \\nNote: You may also be seeing this error because"\n " you are using the configured API. 
Using configured with the celery-k8s"\n " executor is not supported at this time, and the job_image must be configured"\n " at the top-level executor config without using configured."\n )\n\n job_image = job_image_from_executor_config\n\n job_config = self.get_k8s_job_config(job_image, exc_config)\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_config.job_image},\n )\n\n user_defined_k8s_config = get_user_defined_k8s_config(run.tags)\n\n from dagster._cli.api import ExecuteRunArgs\n\n run_args = ExecuteRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n labels = {\n "dagster/job": job_origin.job_name,\n "dagster/run-id": run.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config,\n args=run_args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n env_vars=[{"name": "DAGSTER_RUN_JOB_NAME", "value": job_origin.job_name}],\n )\n\n job_namespace = exc_config.get("job_namespace", self.job_namespace)\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": job_namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n self._api_client.batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": job_namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n def get_k8s_job_config(self, job_image, exc_config):\n return DagsterK8sJobConfig(\n 
dagster_home=self.dagster_home,\n instance_config_map=self.instance_config_map,\n postgres_password_secret=self.postgres_password_secret,\n job_image=check.opt_str_param(job_image, "job_image"),\n image_pull_policy=exc_config.get("image_pull_policy", self._image_pull_policy),\n image_pull_secrets=exc_config.get("image_pull_secrets", []) + self._image_pull_secrets,\n service_account_name=exc_config.get("service_account_name", self._service_account_name),\n env_config_maps=exc_config.get("env_config_maps", []) + self._env_config_maps,\n env_secrets=exc_config.get("env_secrets", []) + self._env_secrets,\n volume_mounts=exc_config.get("volume_mounts", []) + self._volume_mounts,\n volumes=exc_config.get("volumes", []) + self._volumes,\n labels=merge_dicts(self._labels, exc_config.get("labels", {})),\n )\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n job_name = get_job_name_from_run_id(run_id)\n\n job_namespace = self.get_namespace_from_run_config(run_id)\n\n try:\n termination_result = self._api_client.delete_job(\n job_name=job_name, namespace=job_namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Dagster Job was terminated successfully.",\n dagster_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message=(\n "Dagster Job was not terminated successfully; delete_job returned {}"\n .format(termination_result)\n ),\n dagster_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message=(\n "Dagster Job was not terminated successfully; encountered error in delete_job"\n ),\n dagster_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n def get_namespace_from_run_config(self, run_id):\n 
check.str_param(run_id, "run_id")\n\n dagster_run = self._instance.get_run_by_id(run_id)\n run_config = dagster_run.run_config\n executor_config = _get_validated_celery_k8s_executor_config(run_config)\n return executor_config.get("job_namespace", self.job_namespace)\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: DagsterRun):\n job_namespace = _get_validated_celery_k8s_executor_config(run.run_config).get(\n "job_namespace", self.job_namespace\n )\n job_name = get_job_name_from_run_id(run.run_id)\n try:\n status = self._api_client.get_job_status(namespace=job_namespace, job_name=job_name)\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n if status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n\n\ndef _get_validated_celery_k8s_executor_config(run_config):\n check.dict_param(run_config, "run_config")\n\n executor_config = run_config.get("execution", {})\n execution_config_schema = resolve_to_config_type(celery_k8s_executor_config())\n\n # In run config on jobs, we don't have an executor key\n if CELERY_K8S_CONFIG_KEY not in executor_config:\n execution_run_config = executor_config.get("config", {})\n else:\n execution_run_config = (run_config["execution"][CELERY_K8S_CONFIG_KEY] or {}).get(\n "config", {}\n )\n\n res = process_config(execution_config_schema, execution_run_config)\n\n check.invariant(\n res.success,\n "Incorrect execution schema provided. Note: You may also be seeing this error "\n "because you are using the configured API. "\n "Using configured with the {config_key} executor is not supported at this time, "\n "and all executor config must be directly in the run config without using configured."\n .format(\n config_key=CELERY_K8S_CONFIG_KEY,\n ),\n )\n\n return res.value\n
", "current_page_name": "_modules/dagster_celery_k8s/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_k8s.launcher"}}, "dagster_census": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.ops

\nfrom dagster import Array, Bool, Field, In, Noneable, Nothing, Out, Output, op\n\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import CensusOutput\nfrom .utils import generate_materialization\n\n\n
[docs]@op(\n required_resource_keys={"census"},\n ins={"start_after": In(Nothing)},\n out=Out(\n CensusOutput,\n description=(\n "Parsed json dictionary representing the details of the Census sync after "\n "the sync successfully completes."\n ),\n ),\n config_schema={\n "sync_id": Field(\n int,\n is_required=True,\n description="Id of the parent sync.",\n ),\n "force_full_sync": Field(\n config=Bool,\n default_value=False,\n description=(\n "If this trigger request should be a Full Sync. "\n "Note that some sync configurations such as Append do not support full syncs."\n ),\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) to wait between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description=(\n "The maximum time to wait before this operation is timed out. By "\n "default, this will never time out."\n ),\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Census sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["census"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "census"},\n)\ndef census_trigger_sync_op(context):\n """Executes a Census sync for a given ``sync_id`` and polls until that sync completes, raising\n an error if it is unsuccessful.\n\n It outputs a :py:class:`~dagster_census.CensusOutput` which contains the details of the Census\n sync after it successfully completes.\n\n It requires the use of the :py:class:`~dagster_census.census_resource`, which allows it to\n communicate with the Census API.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_census import census_resource, census_sync_op\n\n my_census_resource = census_resource.configured(\n {\n "api_key": {"env": "CENSUS_API_KEY"},\n }\n )\n\n sync_foobar = census_sync_op.configured({"sync_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"census": my_census_resource})\n def my_simple_census_job():\n sync_foobar()\n\n """\n census_output = context.resources.census.trigger_sync_and_poll(\n sync_id=context.op_config["sync_id"],\n force_full_sync=context.op_config["force_full_sync"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n yield generate_materialization(\n census_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(census_output)
\n
", "current_page_name": "_modules/dagster_census/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Mapping, Optional\n\nimport requests\nfrom dagster import Failure, Field, StringSource, __version__, get_dagster_logger, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom .types import CensusOutput\n\nCENSUS_API_BASE = "app.getcensus.com/api"\nCENSUS_VERSION = "v1"\n\nDEFAULT_POLL_INTERVAL = 10\n\nSYNC_RUN_STATUSES = {"completed", "failed", "queued", "skipped", "working"}\n\n\n
[docs]class CensusResource:\n """This class exposes methods on top of the Census REST API."""\n\n def __init__(\n self,\n api_key: str,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n log: logging.Logger = get_dagster_logger(),\n ):\n self.api_key = api_key\n\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._log = log\n\n @property\n def _api_key(self):\n if self.api_key.startswith("secret-token:"):\n return self.api_key\n return "secret-token:" + self.api_key\n\n @property\n def api_base_url(self) -> str:\n return f"https://{CENSUS_API_BASE}/{CENSUS_VERSION}"\n\n def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Creates and sends a request to the desired Census API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").\n endpoint (str): The Census API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n url = f"{self.api_base_url}/{endpoint}"\n headers = {\n "User-Agent": f"dagster-census/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n auth=HTTPBasicAuth("bearer", self._api_key),\n data=data,\n )\n response.raise_for_status()\n return response.json()\n except RequestException as e:\n self._log.error("Request to Census API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure(f"Max retries ({self._request_max_retries}) exceeded with url: {url}.")\n\n def get_sync(self, sync_id: int) -> Mapping[str, Any]:\n """Gets details about a given sync from the Census API.\n\n Args:\n sync_id (int): The Census 
Sync ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"syncs/{sync_id}")\n\n def get_source(self, source_id: int) -> Mapping[str, Any]:\n """Gets details about a given source from the Census API.\n\n Args:\n source_id (int): The Census Source ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"sources/{source_id}")\n\n def get_destination(self, destination_id: int) -> Mapping[str, Any]:\n """Gets details about a given destination from the Census API.\n\n Args:\n destination_id (int): The Census Destination ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"destinations/{destination_id}")\n\n def get_sync_run(self, sync_run_id: int) -> Mapping[str, Any]:\n """Gets details about a specific sync run from the Census API.\n\n Args:\n sync_run_id (int): The Census Sync Run ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"sync_runs/{sync_run_id}")\n\n def poll_sync_run(\n self,\n sync_run_id: int,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Mapping[str, Any]:\n """Given a Census sync run, poll until the run is complete.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. 
By default, this will never time out.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n log_url = f"https://app.getcensus.com/syncs_runs/{sync_run_id}"\n poll_start = datetime.datetime.now()\n\n while True:\n time.sleep(poll_interval)\n response_dict = self.get_sync_run(sync_run_id)\n if "data" not in response_dict.keys():\n raise ValueError(\n f"Getting status of sync failed, please visit Census Logs at {log_url} to see"\n " more."\n )\n\n sync_status = response_dict["data"]["status"]\n sync_id = response_dict["data"]["sync_id"]\n\n if sync_status not in SYNC_RUN_STATUSES:\n raise ValueError(\n f"Unexpected response status '{sync_status}'; "\n f"must be one of {','.join(sorted(SYNC_RUN_STATUSES))}. "\n "See Management API docs for more information: "\n "https://docs.getcensus.com/basics/developers/api/sync-runs"\n )\n\n if sync_status in {"queued", "working"}:\n self._log.debug(\n f"Sync {sync_id} still running after {datetime.datetime.now() - poll_start}."\n )\n continue\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for sync '{sync_id}' timed out after"\n f" {datetime.datetime.now() - poll_start}."\n )\n\n break\n\n self._log.debug(\n f"Sync {sync_id} has finished running after {datetime.datetime.now() - poll_start}."\n )\n self._log.info(f"View sync details here: {log_url}.")\n\n return response_dict\n\n def trigger_sync(self, sync_id: int, force_full_sync: bool = False) -> Mapping[str, Any]:\n """Trigger an asynchronous run for a specific sync.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n force_full_sync (bool): If the Sync should perform a full sync\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n data = {"force_full_sync": force_full_sync}\n return self.make_request(\n method="POST", endpoint=f"syncs/{sync_id}/trigger", data=json.dumps(data)\n )\n\n def trigger_sync_and_poll(\n self,\n sync_id: 
int,\n force_full_sync: bool = False,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> CensusOutput:\n """Trigger a run for a specific sync and poll until it has completed.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n force_full_sync (bool): If the Sync should perform a full sync\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~CensusOutput`:\n Object containing details about the sync run and the sync details\n """\n sync_details = self.get_sync(sync_id=sync_id)\n source_details = self.get_source(\n source_id=sync_details["data"]["source_attributes"]["connection_id"]\n )["data"]\n destination_details = self.get_destination(\n destination_id=sync_details["data"]["destination_attributes"]["connection_id"]\n )["data"]\n\n trigger_sync_resp = self.trigger_sync(sync_id=sync_id, force_full_sync=force_full_sync)\n sync_run_details = self.poll_sync_run(\n sync_run_id=trigger_sync_resp["data"]["sync_run_id"],\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )["data"]\n return CensusOutput(\n sync_run=sync_run_details,\n source=source_details,\n destination=destination_details,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n is_required=True,\n description="Census API Key.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description=(\n "The maximum number of times requests to the Census API should be retried "\n "before failing."\n ),\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps manage Census connectors",\n)\ndef census_resource(context) -> CensusResource:\n """This resource allows users to programatically interface with the Census REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_census import census_resource\n\n my_census_resource = census_resource.configured(\n {\n "api_key": {"env": "CENSUS_API_KEY"},\n }\n )\n\n @job(resource_defs={"census":my_census_resource})\n def my_census_job():\n ...\n\n """\n return CensusResource(\n api_key=context.resource_config["api_key"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
", "current_page_name": "_modules/dagster_census/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.types

\nfrom typing import Any, Mapping, NamedTuple\n\n\n
[docs]class CensusOutput(\n NamedTuple(\n "_CensusOutput",\n [\n ("sync_run", Mapping[str, Any]),\n ("source", Mapping[str, Any]),\n ("destination", Mapping[str, Any]),\n ],\n )\n):\n """Contains recorded information about the state of a Census sync after a sync completes.\n\n Attributes:\n sync_run (Dict[str, Any]):\n The details of the specific sync run.\n source (Dict[str, Any]):\n Information about the source for the Census sync.\n destination (Dict[str, Any]):\n Information about the destination for the Census sync.\n """
\n
", "current_page_name": "_modules/dagster_census/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.types"}}, "dagster_dask": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dask.executor

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dask\nimport dask.distributed\nfrom dagster import (\n    Executor,\n    Field,\n    Permissive,\n    Selector,\n    StringSource,\n    _check as check,\n    _seven,\n    multiple_process_executor_requirements,\n)\nfrom dagster._core.definitions.executor_definition import executor\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.errors import raise_execution_interrupts\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.api import create_execution_plan, execute_plan\nfrom dagster._core.execution.context.system import PlanOrchestrationContext\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._utils import iterate_with_context\n\n# Dask resource requirements are specified under this key\nDASK_RESOURCE_REQUIREMENTS_KEY = "dagster-dask/resource_requirements"\n\n\n
[docs]@executor(\n name="dask",\n requirements=multiple_process_executor_requirements(),\n config_schema={\n "cluster": Field(\n Selector(\n {\n "existing": Field(\n {"address": StringSource},\n description="Connect to an existing scheduler.",\n ),\n "local": Field(\n Permissive(), is_required=False, description="Local cluster configuration."\n ),\n "yarn": Field(\n Permissive(), is_required=False, description="YARN cluster configuration."\n ),\n "ssh": Field(\n Permissive(), is_required=False, description="SSH cluster configuration."\n ),\n "pbs": Field(\n Permissive(), is_required=False, description="PBS cluster configuration."\n ),\n "moab": Field(\n Permissive(), is_required=False, description="Moab cluster configuration."\n ),\n "sge": Field(\n Permissive(), is_required=False, description="SGE cluster configuration."\n ),\n "lsf": Field(\n Permissive(), is_required=False, description="LSF cluster configuration."\n ),\n "slurm": Field(\n Permissive(), is_required=False, description="SLURM cluster configuration."\n ),\n "oar": Field(\n Permissive(), is_required=False, description="OAR cluster configuration."\n ),\n "kube": Field(\n Permissive(),\n is_required=False,\n description="Kubernetes cluster configuration.",\n ),\n }\n )\n )\n },\n)\ndef dask_executor(init_context):\n """Dask-based executor.\n\n The 'cluster' can be one of the following:\n ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').\n\n If the Dask executor is used without providing executor-specific config, a local Dask cluster\n will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`\n with :py:class:`dask.distributed.LocalCluster() <dask:distributed.LocalCluster>`).\n\n The Dask executor optionally takes the following config:\n\n .. 
code-block:: none\n\n cluster:\n {\n local?: # takes distributed.LocalCluster parameters\n {\n timeout?: 5, # Timeout duration for initial connection to the scheduler\n n_workers?: 4 # Number of workers to start\n threads_per_worker?: 1 # Number of threads per each worker\n }\n }\n\n To use the `dask_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dask import dask_executor\n\n @job(executor_def=dask_executor)\n def dask_enabled_job():\n pass\n\n """\n ((cluster_type, cluster_configuration),) = init_context.executor_config["cluster"].items()\n return DaskExecutor(cluster_type, cluster_configuration)
\n\n\ndef query_on_dask_worker(\n dependencies: Any,\n recon_job: ReconstructableJob,\n dagster_run: DagsterRun,\n run_config: Optional[Mapping[str, object]],\n step_keys: Optional[Sequence[str]],\n instance_ref: InstanceRef,\n known_state: Optional[KnownExecutionState],\n) -> Sequence[DagsterEvent]:\n """Note that we need to pass "dependencies" to ensure Dask sequences futures during task\n scheduling, even though we do not use this argument within the function.\n """\n with DagsterInstance.from_ref(instance_ref) as instance:\n subset_job = recon_job.get_subset(op_selection=dagster_run.resolved_op_selection)\n\n execution_plan = create_execution_plan(\n subset_job,\n run_config=run_config,\n step_keys_to_execute=step_keys,\n known_state=known_state,\n )\n\n return execute_plan(\n execution_plan, subset_job, instance, dagster_run, run_config=run_config\n )\n\n\ndef get_dask_resource_requirements(tags: Mapping[str, str]):\n check.mapping_param(tags, "tags", key_type=str, value_type=str)\n req_str = tags.get(DASK_RESOURCE_REQUIREMENTS_KEY)\n if req_str is not None:\n return _seven.json.loads(req_str)\n\n return {}\n\n\nclass DaskExecutor(Executor):\n def __init__(self, cluster_type, cluster_configuration):\n self.cluster_type = check.opt_str_param(cluster_type, "cluster_type", default="local")\n self.cluster_configuration = check.opt_dict_param(\n cluster_configuration, "cluster_configuration"\n )\n\n @property\n def retries(self):\n return RetryMode.DISABLED\n\n def execute(self, plan_context: PlanOrchestrationContext, execution_plan: ExecutionPlan):\n check.inst_param(plan_context, "plan_context", PlanOrchestrationContext)\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.param_invariant(\n isinstance(plan_context.executor, DaskExecutor),\n "plan_context",\n f"Expected executor to be DaskExecutor got {plan_context.executor}",\n )\n\n check.invariant(\n plan_context.instance.is_persistent,\n "Dask execution requires a persistent 
DagsterInstance",\n )\n\n step_levels = execution_plan.get_steps_to_execute_by_level()\n\n job_name = plan_context.job_name\n\n instance = plan_context.instance\n\n cluster_type = self.cluster_type\n if cluster_type == "existing":\n # address passed directly to Client() below to connect to existing Scheduler\n cluster = self.cluster_configuration["address"]\n elif cluster_type == "local":\n from dask.distributed import LocalCluster\n\n cluster = LocalCluster(**self.build_dict(job_name))\n elif cluster_type == "yarn":\n from dask_yarn import YarnCluster\n\n cluster = YarnCluster(**self.build_dict(job_name))\n elif cluster_type == "ssh":\n from dask.distributed import SSHCluster\n\n cluster = SSHCluster(**self.build_dict(job_name))\n elif cluster_type == "pbs":\n from dask_jobqueue import PBSCluster\n\n cluster = PBSCluster(**self.build_dict(job_name))\n elif cluster_type == "moab":\n from dask_jobqueue import MoabCluster\n\n cluster = MoabCluster(**self.build_dict(job_name))\n elif cluster_type == "sge":\n from dask_jobqueue import SGECluster\n\n cluster = SGECluster(**self.build_dict(job_name))\n elif cluster_type == "lsf":\n from dask_jobqueue import LSFCluster\n\n cluster = LSFCluster(**self.build_dict(job_name))\n elif cluster_type == "slurm":\n from dask_jobqueue import SLURMCluster\n\n cluster = SLURMCluster(**self.build_dict(job_name))\n elif cluster_type == "oar":\n from dask_jobqueue import OARCluster\n\n cluster = OARCluster(**self.build_dict(job_name))\n elif cluster_type == "kube":\n from dask_kubernetes import KubeCluster\n\n cluster = KubeCluster(**self.build_dict(job_name))\n else:\n raise ValueError(\n "Must be providing one of the following ('existing', 'local', 'yarn', 'ssh',"\n f" 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"\n )\n\n with dask.distributed.Client(cluster) as client:\n execution_futures = []\n execution_futures_dict = {}\n\n for step_level in step_levels:\n for step in step_level:\n # We ensure correctness 
in sequencing by letting Dask schedule futures and\n # awaiting dependencies within each step.\n dependencies = []\n for step_input in step.step_inputs:\n for key in step_input.dependency_keys:\n dependencies.append(execution_futures_dict[key])\n\n run_config = plan_context.run_config\n\n dask_task_name = "%s.%s" % (job_name, step.key)\n\n recon_job = plan_context.reconstructable_job\n\n future = client.submit(\n query_on_dask_worker,\n dependencies,\n recon_job,\n plan_context.dagster_run,\n run_config,\n [step.key],\n instance.get_ref(),\n execution_plan.known_state,\n key=dask_task_name,\n resources=get_dask_resource_requirements(step.tags),\n )\n\n execution_futures.append(future)\n execution_futures_dict[step.key] = future\n\n # This tells Dask to awaits the step executions and retrieve their results to the\n # master\n futures = dask.distributed.as_completed(execution_futures, with_results=True)\n\n # Allow interrupts while waiting for the results from Dask\n for future, result in iterate_with_context(raise_execution_interrupts, futures):\n for step_event in result:\n check.inst(step_event, DagsterEvent)\n yield step_event\n\n def build_dict(self, job_name):\n """Returns a dict we can use for kwargs passed to dask client instantiation.\n\n Intended to be used like:\n\n with dask.distributed.Client(**cfg.build_dict()) as client:\n << use client here >>\n\n """\n if self.cluster_type in ["yarn", "pbs", "moab", "sge", "lsf", "slurm", "oar", "kube"]:\n dask_cfg = {"name": job_name}\n else:\n dask_cfg = {}\n\n if self.cluster_configuration:\n for k, v in self.cluster_configuration.items():\n dask_cfg[k] = v\n\n # if address is set, don't add LocalCluster args\n # context: https://github.com/dask/distributed/issues/3313\n if (self.cluster_type == "local") and ("address" not in dask_cfg):\n # We set threads_per_worker because Dagster is not thread-safe. 
Even though\n # environments=True by default, there is a clever piece of machinery\n # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution\n # multithreaded by default when the number of available cores is greater than 4.\n # See: https://github.com/dagster-io/dagster/issues/2181\n # We may want to try to figure out a way to enforce this on remote Dask clusters against\n # which users run Dagster workloads.\n dask_cfg["threads_per_worker"] = 1\n\n return dask_cfg\n
", "current_page_name": "_modules/dagster_dask/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dask.executor"}}, "dagster_databricks": {"databricks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.databricks

\nimport base64\nimport logging\nimport time\nfrom typing import IO, Any, Mapping, Optional, Tuple, Union, cast\n\nimport dagster\nimport dagster._check as check\nimport dagster_pyspark\nimport databricks_api\nimport databricks_cli.sdk\nimport requests.exceptions\nfrom dagster._annotations import deprecated, public\nfrom databricks.sdk import WorkspaceClient\nfrom databricks.sdk.service import compute, jobs\nfrom typing_extensions import Final\n\nimport dagster_databricks\n\nfrom .types import (\n    DatabricksRunState,\n)\nfrom .version import __version__\n\n# wait at most 24 hours by default for run execution\nDEFAULT_RUN_MAX_WAIT_TIME_SEC: Final = 24 * 60 * 60\n\n\n
[docs]class DatabricksError(Exception):\n pass
\n\n\n
[docs]class DatabricksClient:\n """A thin wrapper over the Databricks REST API."""\n\n def __init__(\n self,\n host: str,\n token: Optional[str] = None,\n oauth_client_id: Optional[str] = None,\n oauth_client_secret: Optional[str] = None,\n workspace_id: Optional[str] = None,\n ):\n self.host = host\n self.workspace_id = workspace_id\n\n self._workspace_client = WorkspaceClient(\n host=host,\n token=token,\n client_id=oauth_client_id,\n client_secret=oauth_client_secret,\n product="dagster-databricks",\n product_version=__version__,\n )\n\n # TODO: This is the old shim client that we were previously using. Arguably this is\n # confusing for users to use since this is an unofficial wrapper around the documented\n # Databricks REST API. We should consider removing this in the next minor release.\n if token:\n self._client = databricks_api.DatabricksAPI(host=host, token=token)\n self.__setup_user_agent(self._client.client)\n # TODO: This is the old `databricks_cli` client that was previously recommended by Databricks.\n # It is no longer supported and should be removed in favour of `databricks-sdk` in the next\n # minor release.\n self._api_client = databricks_cli.sdk.ApiClient(host=host, token=token)\n self.__setup_user_agent(self._api_client)\n else:\n self._client = None\n self._api_client = None\n\n def __setup_user_agent(\n self,\n client: Union[WorkspaceClient, databricks_api.DatabricksAPI, databricks_cli.sdk.ApiClient],\n ) -> None:\n """Overrides the user agent for the Databricks API client."""\n client.default_headers["user-agent"] = f"dagster-databricks/{__version__}"\n\n @deprecated(\n breaking_version="0.21.0", additional_warn_text="Use `workspace_client` property instead."\n )\n @public\n @property\n def client(self) -> databricks_api.DatabricksAPI:\n """Retrieve the legacy Databricks API client. 
Note: accessing this property will throw an exception if oauth\n credentials are used to initialize the DatabricksClient, because oauth credentials are not supported by the\n legacy Databricks API client.\n """\n if self._client is None:\n raise ValueError(\n "Legacy Databricks API client from `databricks-api` was not initialized because"\n " oauth credentials were used instead of an access token. This legacy Databricks"\n " API client is not supported when using oauth credentials. Use the"\n " `workspace_client` property instead."\n )\n return self._client\n\n @client.setter\n def client(self, value: Optional[databricks_api.DatabricksAPI]) -> None:\n self._client = value\n\n @deprecated(\n breaking_version="0.21.0", additional_warn_text="Use `workspace_client` property instead."\n )\n @public\n @property\n def api_client(self) -> databricks_cli.sdk.ApiClient:\n """Retrieve a reference to the underlying Databricks API client. For more information,\n see the `Databricks Python API <https://docs.databricks.com/dev-tools/python-api.html>`_.\n Noe: accessing this property will throw an exception if oauth credentials are used to initialize the\n DatabricksClient, because oauth credentials are not supported by the legacy Databricks API client.\n **Examples:**.\n\n .. 
code-block:: python\n\n from dagster import op\n from databricks_cli.jobs.api import JobsApi\n from databricks_cli.runs.api import RunsApi\n from databricks.sdk import WorkspaceClient\n\n @op(required_resource_keys={"databricks_client"})\n def op1(context):\n # Initialize the Databricks Jobs API\n jobs_client = JobsApi(context.resources.databricks_client.api_client)\n runs_client = RunsApi(context.resources.databricks_client.api_client)\n client = context.resources.databricks_client.api_client\n\n # Example 1: Run a Databricks job with some parameters.\n jobs_client.run_now(...)\n client.jobs.run_now(...)\n\n # Example 2: Trigger a one-time run of a Databricks workload.\n runs_client.submit_run(...)\n client.jobs.submit(...)\n\n # Example 3: Get an existing run.\n runs_client.get_run(...)\n client.jobs.get_run(...)\n\n # Example 4: Cancel a run.\n runs_client.cancel_run(...)\n client.jobs.cancel_run(...)\n\n Returns:\n ApiClient: The authenticated Databricks API client.\n """\n if self._api_client is None:\n raise ValueError(\n "Legacy Databricks API client from `databricks-cli` was not initialized because"\n " oauth credentials were used instead of an access token. This legacy Databricks"\n " API client is not supported when using oauth credentials. Use the"\n " `workspace_client` property instead."\n )\n return self._api_client\n\n @public\n @property\n def workspace_client(self) -> WorkspaceClient:\n """Retrieve a reference to the underlying Databricks Workspace client. For more information,\n see the `Databricks SDK for Python <https://docs.databricks.com/dev-tools/sdk-python.html>`_.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import op\n from databricks.sdk import WorkspaceClient\n\n @op(required_resource_keys={"databricks_client"})\n def op1(context):\n # Initialize the Databricks Jobs API\n client = context.resources.databricks_client.api_client\n\n # Example 1: Run a Databricks job with some parameters.\n client.jobs.run_now(...)\n\n # Example 2: Trigger a one-time run of a Databricks workload.\n client.jobs.submit(...)\n\n # Example 3: Get an existing run.\n client.jobs.get_run(...)\n\n # Example 4: Cancel a run.\n client.jobs.cancel_run(...)\n\n Returns:\n WorkspaceClient: The authenticated Databricks SDK Workspace Client.\n """\n return self._workspace_client\n\n def read_file(self, dbfs_path: str, block_size: int = 1024**2) -> bytes:\n """Read a file from DBFS to a **byte string**."""\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n\n data = b""\n bytes_read = 0\n dbfs_service = self.workspace_client.dbfs\n\n jdoc = dbfs_service.read(path=dbfs_path, length=block_size)\n data += base64.b64decode(jdoc.data)\n while jdoc.bytes_read == block_size:\n bytes_read += jdoc.bytes_read\n jdoc = dbfs_service.read(path=dbfs_path, offset=bytes_read, length=block_size)\n data += base64.b64decode(jdoc.data)\n\n return data\n\n def put_file(\n self, file_obj: IO, dbfs_path: str, overwrite: bool = False, block_size: int = 1024**2\n ) -> None:\n """Upload an arbitrary large file to DBFS.\n\n This doesn't use the DBFS `Put` API because that endpoint is limited to 1MB.\n """\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n\n dbfs_service = self.workspace_client.dbfs\n\n create_response = dbfs_service.create(path=dbfs_path, overwrite=overwrite)\n handle = create_response.handle\n\n block = file_obj.read(block_size)\n while block:\n data = base64.b64encode(block).decode("utf-8")\n dbfs_service.add_block(data=data, handle=handle)\n block = file_obj.read(block_size)\n\n dbfs_service.close(handle=handle)\n\n def get_run_state(self, 
databricks_run_id: int) -> "DatabricksRunState":\n """Get the state of a run by Databricks run ID.\n\n Return a `DatabricksRunState` object. Note that the `result_state`\n attribute may be `None` if the run hasn't yet terminated.\n """\n run = self.workspace_client.jobs.get_run(databricks_run_id)\n return DatabricksRunState.from_databricks(run.state)\n\n def poll_run_state(\n self,\n logger: logging.Logger,\n start_poll_time: float,\n databricks_run_id: int,\n max_wait_time_sec: float,\n verbose_logs: bool = True,\n ) -> bool:\n run_state = self.get_run_state(databricks_run_id)\n\n if run_state.has_terminated():\n if run_state.is_successful():\n logger.info(f"Run `{databricks_run_id}` completed successfully.")\n return True\n if run_state.is_skipped():\n logger.info(f"Run `{databricks_run_id}` was skipped.")\n return True\n else:\n error_message = (\n f"Run `{databricks_run_id}` failed with result state:"\n f" `{run_state.result_state}`. Message: {run_state.state_message}."\n )\n logger.error(error_message)\n raise DatabricksError(error_message)\n else:\n if verbose_logs:\n logger.debug(f"Run `{databricks_run_id}` in state {run_state}.")\n if time.time() - start_poll_time > max_wait_time_sec:\n raise DatabricksError(\n f"Run `{databricks_run_id}` took more than {max_wait_time_sec}s to complete."\n " Failing the run."\n )\n return False\n\n def wait_for_run_to_complete(\n self,\n logger: logging.Logger,\n databricks_run_id: int,\n poll_interval_sec: float,\n max_wait_time_sec: int,\n verbose_logs: bool = True,\n ) -> None:\n logger.info(f"Waiting for Databricks run `{databricks_run_id}` to complete...")\n\n start_poll_time = time.time()\n while True:\n if self.poll_run_state(\n logger=logger,\n start_poll_time=start_poll_time,\n databricks_run_id=databricks_run_id,\n max_wait_time_sec=max_wait_time_sec,\n verbose_logs=verbose_logs,\n ):\n return\n\n time.sleep(poll_interval_sec)
\n\n\nclass DatabricksJobRunner:\n """Submits jobs created using Dagster config to Databricks, and monitors their progress.\n\n Attributes:\n host (str): Databricks host, e.g. https://uksouth.azuredatabricks.net.\n token (str): Databricks authentication token.\n poll_interval_sec (float): How often to poll Databricks for run status.\n max_wait_time_sec (int): How long to wait for a run to complete before failing.\n """\n\n def __init__(\n self,\n host: str,\n token: Optional[str] = None,\n oauth_client_id: Optional[str] = None,\n oauth_client_secret: Optional[str] = None,\n poll_interval_sec: float = 5,\n max_wait_time_sec: int = DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n ):\n self.host = check.str_param(host, "host")\n check.invariant(\n token is None or (oauth_client_id is None and oauth_client_secret is None),\n "Must provide either databricks_token or oauth_credentials, but cannot provide both",\n )\n self.token = check.opt_str_param(token, "token")\n self.oauth_client_id = check.opt_str_param(oauth_client_id, "oauth_client_id")\n self.oauth_client_secret = check.opt_str_param(oauth_client_secret, "oauth_client_secret")\n self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")\n self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")\n\n self._client: DatabricksClient = DatabricksClient(\n host=self.host,\n token=self.token,\n oauth_client_id=oauth_client_id,\n oauth_client_secret=oauth_client_secret,\n )\n\n @property\n def client(self) -> DatabricksClient:\n """Return the underlying `DatabricksClient` object."""\n return self._client\n\n def submit_run(self, run_config: Mapping[str, Any], task: Mapping[str, Any]) -> int:\n """Submit a new run using the 'Runs submit' API."""\n existing_cluster_id = run_config["cluster"].get("existing")\n\n new_cluster = run_config["cluster"].get("new")\n\n # The Databricks API needs different keys to be present in API calls depending\n # on new/existing cluster, so we need to process 
the new_cluster\n # config first.\n if new_cluster:\n new_cluster = new_cluster.copy()\n\n nodes = new_cluster.pop("nodes")\n if "instance_pool_id" in nodes:\n new_cluster["instance_pool_id"] = nodes["instance_pool_id"]\n else:\n node_types = nodes["node_types"]\n new_cluster["node_type_id"] = node_types["node_type_id"]\n if "driver_node_type_id" in node_types:\n new_cluster["driver_node_type_id"] = node_types["driver_node_type_id"]\n\n cluster_size = new_cluster.pop("size")\n if "num_workers" in cluster_size:\n new_cluster["num_workers"] = cluster_size["num_workers"]\n else:\n new_cluster["autoscale"] = cluster_size["autoscale"]\n\n tags = new_cluster.get("custom_tags", {})\n if isinstance(tags, list):\n tags = {x["key"]: x["value"] for x in tags}\n tags["__dagster_version"] = dagster.__version__\n new_cluster["custom_tags"] = tags\n\n check.invariant(\n existing_cluster_id is not None or new_cluster is not None,\n "Invalid value for run_config.cluster",\n )\n\n # We'll always need some libraries, namely dagster/dagster_databricks/dagster_pyspark,\n # since they're imported by our scripts.\n # Add them if they're not already added by users in config.\n libraries = list(run_config.get("libraries", []))\n install_default_libraries = run_config.get("install_default_libraries", True)\n if install_default_libraries:\n python_libraries = {\n x["pypi"]["package"].split("==")[0].replace("_", "-")\n for x in libraries\n if "pypi" in x\n }\n\n for library_name, library in [\n ("dagster", dagster),\n ("dagster-databricks", dagster_databricks),\n ("dagster-pyspark", dagster_pyspark),\n ]:\n if library_name not in python_libraries:\n libraries.append(\n {"pypi": {"package": f"{library_name}=={library.__version__}"}}\n )\n\n # Only one task should be able to be chosen really; make sure of that here.\n check.invariant(\n sum(\n task.get(key) is not None\n for key in [\n "notebook_task",\n "spark_python_task",\n "spark_jar_task",\n "spark_submit_task",\n ]\n )\n == 1,\n "Multiple 
tasks specified in Databricks run",\n )\n\n return self.client.workspace_client.jobs.submit(\n run_name=run_config.get("run_name"),\n tasks=[\n jobs.SubmitTask.from_dict(\n {\n "new_cluster": new_cluster,\n "existing_cluster_id": existing_cluster_id,\n # "libraries": [compute.Library.from_dict(lib) for lib in libraries],\n "libraries": libraries,\n **task,\n "task_key": "dagster-task",\n },\n )\n ],\n ).bind()["run_id"]\n\n def retrieve_logs_for_run_id(\n self, log: logging.Logger, databricks_run_id: int\n ) -> Optional[Tuple[Optional[str], Optional[str]]]:\n """Retrieve the stdout and stderr logs for a run."""\n run = self.client.workspace_client.jobs.get_run(databricks_run_id)\n\n # Run.cluster_instance can be None. In that case, fall back to cluster instance on first\n # task. Currently pyspark step launcher runs jobs with singleton tasks.\n cluster_instance = run.cluster_instance or run.tasks[0].cluster_instance\n cluster_id = check.inst(\n cluster_instance.cluster_id,\n str,\n "cluster_id should be string like `1234-123456-abcdefgh` got:"\n f" `{cluster_instance.cluster_id}`",\n )\n cluster = self.client.workspace_client.clusters.get(cluster_id)\n log_config = cluster.cluster_log_conf\n if log_config is None:\n log.warn(\n f"Logs not configured for cluster {cluster_id} used for run {databricks_run_id}"\n )\n return None\n if cast(Optional[compute.S3StorageInfo], log_config.s3) is not None:\n logs_prefix = log_config.s3.destination\n log.warn("Retrieving S3 logs not yet implemented")\n return None\n elif cast(Optional[compute.DbfsStorageInfo], log_config.dbfs) is not None:\n logs_prefix = log_config.dbfs.destination\n stdout = self.wait_for_dbfs_logs(log, logs_prefix, cluster_id, "stdout")\n stderr = self.wait_for_dbfs_logs(log, logs_prefix, cluster_id, "stderr")\n return stdout, stderr\n\n def wait_for_dbfs_logs(\n self,\n log: logging.Logger,\n prefix: str,\n cluster_id: str,\n filename: str,\n waiter_delay: int = 10,\n waiter_max_attempts: int = 10,\n ) -> 
Optional[str]:\n """Attempt up to `waiter_max_attempts` attempts to get logs from DBFS."""\n path = "/".join([prefix, cluster_id, "driver", filename])\n log.info(f"Retrieving logs from {path}")\n num_attempts = 0\n while num_attempts <= waiter_max_attempts:\n try:\n logs = self.client.read_file(path)\n return logs.decode("utf-8")\n except requests.exceptions.HTTPError:\n num_attempts += 1\n time.sleep(waiter_delay)\n log.warn("Could not retrieve cluster logs!")\n
", "current_page_name": "_modules/dagster_databricks/databricks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.databricks"}, "databricks_pyspark_step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.databricks_pyspark_step_launcher

\nimport gzip\nimport io\nimport os.path\nimport pickle\nimport sys\nimport tempfile\nimport time\nimport zlib\nfrom typing import Any, Dict, Iterator, Mapping, Optional, Sequence, cast\n\nfrom dagster import (\n    Bool,\n    Field,\n    IntSource,\n    Noneable,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.definitions.step_launcher import StepLauncher, StepRunRef\nfrom dagster._core.errors import raise_execution_interrupts\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.execution.context.system import StepExecutionContext\nfrom dagster._core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._serdes import deserialize_value\nfrom dagster._utils.backoff import backoff\nfrom dagster_pyspark.utils import build_pyspark_zip\nfrom databricks.sdk.core import DatabricksError\nfrom databricks.sdk.service import jobs\n\nfrom dagster_databricks import databricks_step_main\nfrom dagster_databricks.databricks import (\n    DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n    DatabricksJobRunner,\n)\n\nfrom .configs import (\n    define_databricks_env_variables,\n    define_databricks_permissions,\n    define_databricks_secrets_config,\n    define_databricks_storage_config,\n    define_databricks_submit_run_config,\n    define_oauth_credentials,\n)\n\nCODE_ZIP_NAME = "code.zip"\nPICKLED_CONFIG_FILE_NAME = "config.pkl"\nDAGSTER_SYSTEM_ENV_VARS = {\n    "DAGSTER_CLOUD_DEPLOYMENT_NAME",\n    "DAGSTER_CLOUD_IS_BRANCH_DEPLOYMENT",\n    "DAGSTER_CLOUD_GIT_SHA",\n    "DAGSTER_CLOUD_GIT_TIMESTAMP",\n    "DAGSTER_CLOUD_GIT_AUTHOR_EMAIL",\n    "DAGSTER_CLOUD_GIT_AUTHOR_NAME",\n    
"DAGSTER_CLOUD_GIT_MESSAGE",\n    "DAGSTER_CLOUD_GIT_BRANCH",\n    "DAGSTER_CLOUD_GIT_REPO",\n    "DAGSTER_CLOUD_PULL_REQUEST_ID",\n    "DAGSTER_CLOUD_PULL_REQUEST_STATUS",\n}\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n {\n "run_config": define_databricks_submit_run_config(),\n "permissions": define_databricks_permissions(),\n "databricks_host": Field(\n StringSource,\n is_required=True,\n description="Databricks host, e.g. uksouth.azuredatabricks.com",\n ),\n "databricks_token": Field(\n Noneable(StringSource),\n default_value=None,\n description="Databricks access token",\n ),\n "oauth_credentials": define_oauth_credentials(),\n "env_variables": define_databricks_env_variables(),\n "secrets_to_env_variables": define_databricks_secrets_config(),\n "storage": define_databricks_storage_config(),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to root python package containing your Dagster code. If you set this"\n " value to a directory lower than the root package, and have user relative imports"\n " in your code (e.g. `from .foo import bar`), it's likely you'll encounter an"\n " import error on the remote step. Before every step run, the launcher will zip up"\n " the code in this local path, upload it to DBFS, and unzip it into the Python path"\n " of the remote Spark process. This gives the remote process access to up-to-date"\n " user code."\n ),\n ),\n "local_dagster_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to root python package containing your Dagster code. If you set this"\n " value to a directory lower than the root package, and have user relative imports"\n " in your code (e.g. `from .foo import bar`), it's likely you'll encounter an"\n " import error on the remote step. Before every step run, the launcher will zip up"\n " the code in this local path, upload it to DBFS, and unzip it into the Python path"\n " of the remote Spark process. 
This gives the remote process access to up-to-date"\n " user code."\n ),\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="/dagster_staging",\n description="Directory in DBFS to use for uploaded job code. Must be absolute.",\n ),\n "wait_for_logs": Field(\n Bool,\n is_required=False,\n default_value=False,\n description=(\n "If set, and if the specified cluster is configured to export logs, the system will"\n " wait after job completion for the logs to appear in the configured location. Note"\n " that logs are copied every 5 minutes, so enabling this will add several minutes"\n " to the job runtime. NOTE: this integration will export stdout/stderrfrom the"\n " remote Databricks process automatically, so this option is not generally"\n " necessary."\n ),\n ),\n "max_completion_wait_time_seconds": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n description=(\n "If the Databricks job run takes more than this many seconds, then "\n "consider it failed and terminate the step."\n ),\n ),\n "poll_interval_sec": Field(\n float,\n is_required=False,\n default_value=5.0,\n description=(\n "How frequently Dagster will poll Databricks to determine the state of the job."\n ),\n ),\n "verbose_logs": Field(\n bool,\n default_value=True,\n description=(\n "Determines whether to display debug logs emitted while job is being polled. It can"\n " be helpful for Dagster UI performance to set to False when running long-running"\n " or fan-out Databricks jobs, to avoid forcing the UI to fetch large amounts of"\n " debug logs."\n ),\n ),\n "add_dagster_env_variables": Field(\n bool,\n default_value=True,\n description=(\n "Automatically add Dagster system environment variables. This option is only"\n " applicable when the code being executed is deployed on Dagster Cloud. 
It will be"\n " ignored when the environment variables provided by Dagster Cloud are not present."\n ),\n ),\n }\n)\ndef databricks_pyspark_step_launcher(\n context: InitResourceContext,\n) -> "DatabricksPySparkStepLauncher":\n """Resource for running ops as a Databricks Job.\n\n When this resource is used, the op will be executed in Databricks using the 'Run Submit'\n API. Pipeline code will be zipped up and copied to a directory in DBFS along with the op's\n execution context.\n\n Use the 'run_config' configuration to specify the details of the Databricks cluster used, and\n the 'storage' key to configure persistent storage on that cluster. Storage is accessed by\n setting the credentials in the Spark context, as documented `here for S3`_ and `here for ADLS`_.\n\n .. _`here for S3`: https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context\n .. _`here for ADLS`: https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n """\n return DatabricksPySparkStepLauncher(**context.resource_config)
\n\n\nclass DatabricksPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n run_config: Mapping[str, Any],\n permissions: Mapping[str, Any],\n databricks_host: str,\n secrets_to_env_variables: Sequence[Mapping[str, Any]],\n staging_prefix: str,\n wait_for_logs: bool,\n max_completion_wait_time_seconds: int,\n databricks_token: Optional[str] = None,\n oauth_credentials: Optional[Mapping[str, str]] = None,\n env_variables: Optional[Mapping[str, str]] = None,\n storage: Optional[Mapping[str, Any]] = None,\n poll_interval_sec: int = 5,\n local_pipeline_package_path: Optional[str] = None,\n local_dagster_job_package_path: Optional[str] = None,\n verbose_logs: bool = True,\n add_dagster_env_variables: bool = True,\n ):\n self.run_config = check.mapping_param(run_config, "run_config")\n self.permissions = check.mapping_param(permissions, "permissions")\n self.databricks_host = check.str_param(databricks_host, "databricks_host")\n\n check.invariant(\n databricks_token is not None or oauth_credentials is not None,\n "Must provide either databricks_token or oauth_credentials",\n )\n check.invariant(\n databricks_token is None or oauth_credentials is None,\n "Must provide either databricks_token or oauth_credentials, but cannot provide both",\n )\n self.databricks_token = check.opt_str_param(databricks_token, "databricks_token")\n oauth_credentials = check.opt_mapping_param(\n oauth_credentials,\n "oauth_credentials",\n key_type=str,\n value_type=str,\n )\n\n self.secrets = check.sequence_param(\n secrets_to_env_variables, "secrets_to_env_variables", dict\n )\n self.env_variables = check.opt_mapping_param(env_variables, "env_variables")\n self.storage = check.opt_mapping_param(storage, "storage")\n check.invariant(\n local_dagster_job_package_path is not None or local_pipeline_package_path is not None,\n "Missing config: need to provide either 'local_dagster_job_package_path' or"\n " 'local_pipeline_package_path' config entry",\n )\n check.invariant(\n 
local_dagster_job_package_path is None or local_pipeline_package_path is None,\n "Error in config: Provided both 'local_dagster_job_package_path' and"\n " 'local_pipeline_package_path' entries. Need to specify one or the other.",\n )\n self.local_dagster_job_package_path = check.str_param(\n local_pipeline_package_path or local_dagster_job_package_path,\n "local_dagster_job_package_path",\n )\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n check.invariant(staging_prefix.startswith("/"), "staging_prefix must be an absolute path")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n\n self.databricks_runner = DatabricksJobRunner(\n host=databricks_host,\n token=databricks_token,\n oauth_client_id=oauth_credentials.get("client_id"),\n oauth_client_secret=oauth_credentials.get("client_secret"),\n poll_interval_sec=poll_interval_sec,\n max_wait_time_sec=max_completion_wait_time_seconds,\n )\n self.verbose_logs = check.bool_param(verbose_logs, "verbose_logs")\n self.add_dagster_env_variables = check.bool_param(\n add_dagster_env_variables, "add_dagster_env_variables"\n )\n\n def launch_step(self, step_context: StepExecutionContext) -> Iterator[DagsterEvent]:\n step_run_ref = step_context_to_step_run_ref(\n step_context, self.local_dagster_job_package_path\n )\n run_id = step_context.dagster_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._upload_artifacts(log, step_run_ref, run_id, step_key)\n\n task = self._get_databricks_task(run_id, step_key)\n databricks_run_id = self.databricks_runner.submit_run(self.run_config, task)\n\n if self.permissions:\n self._grant_permissions(log, databricks_run_id)\n\n try:\n # If this is being called within a `capture_interrupts` context, allow interrupts while\n # waiting for the execution to complete, so that we can terminate slow or hanging steps\n with raise_execution_interrupts():\n yield from self.step_events_iterator(step_context, step_key, 
databricks_run_id)\n except:\n # if executon is interrupted before the step is completed, cancel the run\n self.databricks_runner.client.workspace_client.jobs.cancel_run(databricks_run_id)\n raise\n finally:\n self.log_compute_logs(log, run_id, step_key)\n # this is somewhat obsolete\n if self.wait_for_logs:\n self._log_logs_from_cluster(log, databricks_run_id)\n\n def log_compute_logs(self, log: DagsterLogManager, run_id: str, step_key: str) -> None:\n try:\n stdout = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stdout")\n ).decode()\n log.info(f"Captured stdout for step {step_key}:")\n log.info(stdout)\n sys.stdout.write(stdout)\n except Exception as e:\n log.error(\n f"Encountered exception {e} when attempting to load stdout logs for step"\n f" {step_key}. Check the databricks console for more info."\n )\n try:\n stderr = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stderr")\n ).decode()\n log.info(f"Captured stderr for step {step_key}:")\n log.info(stderr)\n sys.stderr.write(stderr)\n except Exception as e:\n log.error(\n f"Encountered exception {e} when attempting to load stderr logs for step"\n f" {step_key}. Check the databricks console for more info."\n )\n\n def step_events_iterator(\n self, step_context: StepExecutionContext, step_key: str, databricks_run_id: int\n ) -> Iterator[DagsterEvent]:\n """The launched Databricks job writes all event records to a specific dbfs file. This iterator\n regularly reads the contents of the file, adds any events that have not yet been seen to\n the instance, and yields any DagsterEvents.\n\n By doing this, we simulate having the remote Databricks process able to directly write to\n the local DagsterInstance. 
Importantly, this means that timestamps (and all other record\n properties) will be sourced from the Databricks process, rather than recording when this\n process happens to log them.\n """\n check.int_param(databricks_run_id, "databricks_run_id")\n processed_events = 0\n start_poll_time = time.time()\n done = False\n step_context.log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n while not done:\n with raise_execution_interrupts():\n if self.verbose_logs:\n step_context.log.debug(\n "Waiting %.1f seconds...", self.databricks_runner.poll_interval_sec\n )\n time.sleep(self.databricks_runner.poll_interval_sec)\n try:\n done = self.databricks_runner.client.poll_run_state(\n logger=step_context.log,\n start_poll_time=start_poll_time,\n databricks_run_id=databricks_run_id,\n max_wait_time_sec=self.databricks_runner.max_wait_time_sec,\n verbose_logs=self.verbose_logs,\n )\n finally:\n all_events = self.get_step_events(\n step_context.run_id, step_key, step_context.previous_attempt_count\n )\n # we get all available records on each poll, but we only want to process the\n # ones we haven't seen before\n for event in all_events[processed_events:]:\n # write each event from the DataBricks instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.get_dagster_event()\n processed_events = len(all_events)\n\n step_context.log.info(f"Databricks run {databricks_run_id} completed.")\n\n def get_step_events(\n self, run_id: str, step_key: str, retry_number: int\n ) -> Sequence[EventLogEntry]:\n path = self._dbfs_path(run_id, step_key, f"{retry_number}_{PICKLED_EVENTS_FILE_NAME}")\n\n def _get_step_records() -> Sequence[EventLogEntry]:\n serialized_records = self.databricks_runner.client.read_file(path)\n if not serialized_records:\n return []\n return cast(\n Sequence[EventLogEntry],\n deserialize_value(pickle.loads(gzip.decompress(serialized_records))),\n )\n\n try:\n # reading from dbfs 
while it writes can be flaky\n # allow for retry if we get malformed data\n return backoff(\n fn=_get_step_records,\n retry_on=(pickle.UnpicklingError, OSError, zlib.error, EOFError),\n max_retries=4,\n )\n # if you poll before the Databricks process has had a chance to create the file,\n # we expect to get this error\n except DatabricksError as e:\n if e.error_code == "RESOURCE_DOES_NOT_EXIST":\n return []\n raise\n\n def _grant_permissions(\n self, log: DagsterLogManager, databricks_run_id: int, request_retries: int = 3\n ) -> None:\n client = self.databricks_runner.client.workspace_client\n # Retrieve run info\n cluster_id = None\n for i in range(1, request_retries + 1):\n run_info = client.jobs.get_run(databricks_run_id)\n # if a new job cluster is created, the cluster_instance key may not be immediately present in the run response\n try:\n cluster_id = run_info.cluster_instance.cluster_id\n break\n except:\n log.warning(\n f"Failed to retrieve cluster info for databricks_run_id {databricks_run_id}. "\n f"Retrying {i} of {request_retries} times."\n )\n time.sleep(5)\n if not cluster_id:\n log.warning(\n f"Failed to retrieve cluster info for databricks_run_id {databricks_run_id} "\n f"{request_retries} times. Skipping permission updates..."\n )\n return\n\n # Update job permissions\n if "job_permissions" in self.permissions:\n job_permissions = self._format_permissions(self.permissions["job_permissions"])\n job_id = run_info.job_id # type: ignore # (??)\n log.debug(f"Updating job permissions with following json: {job_permissions}")\n client.permissions.update("jobs", job_id, access_control_list=job_permissions)\n log.info("Successfully updated cluster permissions")\n\n # Update cluster permissions\n if "cluster_permissions" in self.permissions:\n if "existing" in self.run_config["cluster"]:\n raise ValueError(\n "Attempting to update permissions of an existing cluster. 
"\n "This is dangerous and thus unsupported."\n )\n cluster_permissions = self._format_permissions(self.permissions["cluster_permissions"])\n log.debug(f"Updating cluster permissions with following json: {cluster_permissions}")\n client.permissions.update(\n "clusters", cluster_id, access_control_list=cluster_permissions\n )\n log.info("Successfully updated cluster permissions")\n\n def _format_permissions(\n self, input_permissions: Mapping[str, Sequence[Mapping[str, str]]]\n ) -> Sequence[Mapping[str, str]]:\n access_control_list = []\n for permission, accessors in input_permissions.items():\n access_control_list.extend(\n [\n jobs.JobAccessControlRequest.from_dict(\n {"permission_level": permission, **accessor}\n )\n for accessor in accessors\n ]\n )\n return access_control_list\n\n def _get_databricks_task(self, run_id: str, step_key: str) -> Mapping[str, Any]:\n """Construct the 'task' parameter to be submitted to the Databricks API.\n\n This will create a 'spark_python_task' dict where `python_file` is a path on DBFS\n pointing to the 'databricks_step_main.py' file, and `parameters` is an array with a single\n element, a path on DBFS pointing to the picked `step_run_ref` data.\n\n See https://docs.databricks.com/dev-tools/api/latest/jobs.html#jobssparkpythontask.\n """\n python_file = self._dbfs_path(run_id, step_key, self._main_file_name())\n parameters = [\n self._internal_dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, CODE_ZIP_NAME),\n ]\n return {"spark_python_task": {"python_file": python_file, "parameters": parameters}}\n\n def _upload_artifacts(\n self, log: DagsterLogManager, step_run_ref: StepRunRef, run_id: str, step_key: str\n ) -> None:\n """Upload the step run ref and pyspark code to DBFS to run as a job."""\n log.info("Uploading main file to DBFS")\n main_local_path = self._main_file_local_path()\n with 
open(main_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, self._main_file_name()), overwrite=True\n )\n\n log.info("Uploading dagster job to DBFS")\n with tempfile.TemporaryDirectory() as temp_dir:\n # Zip and upload package containing dagster job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n build_pyspark_zip(zip_local_path, self.local_dagster_job_package_path)\n with open(zip_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME), overwrite=True\n )\n\n log.info("Uploading step run ref file to DBFS")\n step_pickle_file = io.BytesIO()\n\n pickle.dump(step_run_ref, step_pickle_file)\n step_pickle_file.seek(0)\n self.databricks_runner.client.put_file(\n step_pickle_file,\n self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n overwrite=True,\n )\n\n databricks_config = self.create_remote_config()\n log.info("Uploading Databricks configuration to DBFS")\n databricks_config_file = io.BytesIO()\n pickle.dump(databricks_config, databricks_config_file)\n databricks_config_file.seek(0)\n self.databricks_runner.client.put_file(\n databricks_config_file,\n self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n overwrite=True,\n )\n\n def get_dagster_env_variables(self) -> Dict[str, str]:\n out = {}\n if self.add_dagster_env_variables:\n for var in DAGSTER_SYSTEM_ENV_VARS:\n if os.getenv(var):\n out.update({var: os.getenv(var)})\n return out\n\n def create_remote_config(self) -> "DatabricksConfig":\n env_variables = self.get_dagster_env_variables()\n env_variables.update(self.env_variables)\n databricks_config = DatabricksConfig(\n env_variables=env_variables,\n storage=self.storage,\n secrets=self.secrets,\n )\n return databricks_config\n\n def _log_logs_from_cluster(self, log: DagsterLogManager, run_id: int) -> None:\n logs = self.databricks_runner.retrieve_logs_for_run_id(log, run_id)\n if logs is 
None:\n return\n stdout, stderr = logs\n if stderr:\n log.info(stderr)\n if stdout:\n log.info(stdout)\n\n def _main_file_name(self) -> str:\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self) -> str:\n return databricks_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _dbfs_path(self, run_id: str, step_key: str, filename: str) -> str:\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return f"dbfs://{path}"\n\n def _internal_dbfs_path(self, run_id: str, step_key: str, filename: str) -> str:\n """Scripts running on Databricks should access DBFS at /dbfs/."""\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return f"/dbfs/{path}"\n\n\nclass DatabricksConfig:\n """Represents configuration required by Databricks to run jobs.\n\n Instances of this class will be created when a Databricks step is launched and will contain\n all configuration and secrets required to set up storage and environment variables within\n the Databricks environment. The instance will be serialized and uploaded to Databricks\n by the step launcher, then deserialized as part of the 'main' script when the job is running\n in Databricks.\n\n The `setup` method handles the actual setup prior to op execution on the Databricks side.\n\n This config is separated out from the regular Dagster run config system because the setup\n is done by the 'main' script before entering a Dagster context (i.e. 
using `run_step_from_ref`).\n We use a separate class to avoid coupling the setup to the format of the `step_run_ref` object.\n """\n\n def __init__(\n self,\n env_variables: Mapping[str, str],\n storage: Mapping[str, Any],\n secrets: Sequence[Mapping[str, Any]],\n ):\n """Create a new DatabricksConfig object.\n\n `storage` and `secrets` should be of the same shape as the `storage` and\n `secrets_to_env_variables` config passed to `databricks_pyspark_step_launcher`.\n """\n self.env_variables = env_variables\n self.storage = storage\n self.secrets = secrets\n\n def setup(self, dbutils: Any, sc: Any) -> None:\n """Set up storage and environment variables on Databricks.\n\n The `dbutils` and `sc` arguments must be passed in by the 'main' script, as they\n aren't accessible by any other modules.\n """\n self.setup_storage(dbutils, sc)\n self.setup_environment(dbutils)\n\n def setup_storage(self, dbutils: Any, sc: Any) -> None:\n """Set up storage using either S3 or ADLS2."""\n if "s3" in self.storage:\n self.setup_s3_storage(self.storage["s3"], dbutils, sc)\n elif "adls2" in self.storage:\n self.setup_adls2_storage(self.storage["adls2"], dbutils, sc)\n\n def setup_s3_storage(self, s3_storage: Mapping[str, Any], dbutils: Any, sc: Any) -> None:\n """Obtain AWS credentials from Databricks secrets and export so both Spark and boto can use them."""\n scope = s3_storage["secret_scope"]\n\n access_key = dbutils.secrets.get(scope=scope, key=s3_storage["access_key_key"])\n secret_key = dbutils.secrets.get(scope=scope, key=s3_storage["secret_key_key"])\n\n # Spark APIs will use this.\n # See https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context.\n sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", access_key) # noqa: SLF001\n sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", secret_key) # noqa: SLF001\n\n # Boto will use these.\n os.environ["AWS_ACCESS_KEY_ID"] = access_key\n 
os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key\n\n def setup_adls2_storage(self, adls2_storage: Mapping[str, Any], dbutils: Any, sc: Any) -> None:\n """Obtain an Azure Storage Account key from Databricks secrets and export so Spark can use it."""\n storage_account_key = dbutils.secrets.get(\n scope=adls2_storage["secret_scope"], key=adls2_storage["storage_account_key_key"]\n )\n # Spark APIs will use this.\n # See https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n # sc is globally defined in the Databricks runtime and points to the Spark context\n sc._jsc.hadoopConfiguration().set( # noqa: SLF001\n "fs.azure.account.key.{}.dfs.core.windows.net".format(\n adls2_storage["storage_account_name"]\n ),\n storage_account_key,\n )\n\n def setup_environment(self, dbutils: Any) -> None:\n """Setup any environment variables required by the run.\n\n Extract any secrets in the run config and export them as environment variables.\n\n This is important for any `StringSource` config since the environment variables\n won't ordinarily be available in the Databricks execution environment.\n """\n for env_k, env_v in self.env_variables.items():\n os.environ[env_k] = env_v\n\n for secret in self.secrets:\n name = secret["name"]\n key = secret["key"]\n scope = secret["scope"]\n print(f"Exporting {name} from Databricks secret {key}, scope {scope}") # noqa: T201\n val = dbutils.secrets.get(scope=scope, key=key)\n os.environ[name] = val\n
", "current_page_name": "_modules/dagster_databricks/databricks_pyspark_step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.databricks_pyspark_step_launcher"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.ops

\nfrom typing import TYPE_CHECKING, Optional\n\nfrom dagster import (\n    In,\n    Nothing,\n    OpExecutionContext,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom databricks.sdk.service import jobs\nfrom pydantic import Field\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n# wait at most 24 hours by default for run execution\nDEFAULT_MAX_WAIT_TIME_SECONDS = 24 * 60 * 60\nfrom dagster import Config\n\nif TYPE_CHECKING:\n    from .databricks import DatabricksClient\n\n\n
[docs]def create_databricks_run_now_op(\n databricks_job_id: int,\n databricks_job_configuration: Optional[dict] = None,\n poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,\n max_wait_time_seconds: float = DEFAULT_MAX_WAIT_TIME_SECONDS,\n name: Optional[str] = None,\n databricks_resource_key: str = "databricks",\n) -> OpDefinition:\n """Creates an op that launches an existing databricks job.\n\n As config, the op accepts a blob of the form described in Databricks' Job API:\n https://docs.databricks.com/api-explorer/workspace/jobs/runnow. The only required field is\n ``job_id``, which is the ID of the job to be executed. Additional fields can be used to specify\n override parameters for the Databricks Job.\n\n Arguments:\n databricks_job_id (int): The ID of the Databricks Job to be executed.\n databricks_job_configuration (dict): Configuration for triggering a new job run of a\n Databricks Job. See https://docs.databricks.com/api-explorer/workspace/jobs/runnow\n for the full configuration.\n poll_interval_seconds (float): How often to poll the Databricks API to check whether the\n Databricks job has finished running.\n max_wait_time_seconds (float): How long to wait for the Databricks job to finish running\n before raising an error.\n name (Optional[str]): The name of the op. If not provided, the name will be\n _databricks_run_now_op.\n databricks_resource_key (str): The name of the resource key used by this op. If not\n provided, the resource key will be "databricks".\n\n Returns:\n OpDefinition: An op definition to run the Databricks Job.\n\n Example:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_databricks import create_databricks_run_now_op, DatabricksClientResource\n\n DATABRICKS_JOB_ID = 1234\n\n\n run_now_op = create_databricks_run_now_op(\n databricks_job_id=DATABRICKS_JOB_ID,\n databricks_job_configuration={\n "python_params": [\n "--input",\n "schema.db.input_table",\n "--output",\n "schema.db.output_table",\n ],\n },\n )\n\n @job(\n resource_defs={\n "databricks": DatabricksClientResource(\n host=EnvVar("DATABRICKS_HOST"),\n token=EnvVar("DATABRICKS_TOKEN")\n )\n }\n )\n def do_stuff():\n run_now_op()\n """\n _poll_interval_seconds = poll_interval_seconds\n _max_wait_time_seconds = max_wait_time_seconds\n\n class DatabricksRunNowOpConfig(Config):\n poll_interval_seconds: float = Field(\n default=_poll_interval_seconds,\n description="Check whether the Databricks Job is done at this interval, in seconds.",\n )\n max_wait_time_seconds: int = Field(\n default=_max_wait_time_seconds,\n description=(\n "If the Databricks Job is not complete after this length of time, in seconds,"\n " raise an error."\n ),\n )\n\n @op(\n ins={"start_after": In(Nothing)},\n required_resource_keys={databricks_resource_key},\n tags={"kind": "databricks"},\n name=name,\n )\n def _databricks_run_now_op(context: OpExecutionContext, config: DatabricksRunNowOpConfig):\n databricks: DatabricksClient = getattr(context.resources, databricks_resource_key)\n jobs_service = databricks.workspace_client.jobs\n\n run = jobs_service.run_now(\n job_id=databricks_job_id,\n **(databricks_job_configuration or {}),\n )\n run_id = run.bind()["run_id"]\n\n get_run_response = jobs_service.get_run(run_id=run_id)\n\n context.log.info(\n f"Launched databricks job run for '{get_run_response.run_name}' (`{run_id}`). URL:"\n f" {get_run_response.run_page_url}. 
Waiting to run to complete."\n )\n\n databricks.wait_for_run_to_complete(\n logger=context.log,\n databricks_run_id=run_id,\n poll_interval_sec=config.poll_interval_seconds,\n max_wait_time_sec=config.max_wait_time_seconds,\n )\n\n return _databricks_run_now_op
\n\n\n
[docs]def create_databricks_submit_run_op(\n databricks_job_configuration: dict,\n poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,\n max_wait_time_seconds: float = DEFAULT_MAX_WAIT_TIME_SECONDS,\n name: Optional[str] = None,\n databricks_resource_key: str = "databricks",\n) -> OpDefinition:\n """Creates an op that submits a one-time run of a set of tasks on Databricks.\n\n As config, the op accepts a blob of the form described in Databricks' Job API:\n https://docs.databricks.com/api-explorer/workspace/jobs/submit.\n\n Arguments:\n databricks_job_configuration (dict): Configuration for submitting a one-time run of a set\n of tasks on Databricks. See https://docs.databricks.com/api-explorer/workspace/jobs/submit\n for the full configuration.\n poll_interval_seconds (float): How often to poll the Databricks API to check whether the\n Databricks job has finished running.\n max_wait_time_seconds (float): How long to wait for the Databricks job to finish running\n before raising an error.\n name (Optional[str]): The name of the op. If not provided, the name will be\n _databricks_submit_run_op.\n databricks_resource_key (str): The name of the resource key used by this op. If not\n provided, the resource key will be "databricks".\n\n Returns:\n OpDefinition: An op definition to submit a one-time run of a set of tasks on Databricks.\n\n Example:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_databricks import create_databricks_submit_run_op, DatabricksClientResource\n\n\n submit_run_op = create_databricks_submit_run_op(\n databricks_job_configuration={\n "new_cluster": {\n "spark_version": '2.1.0-db3-scala2.11',\n "num_workers": 2\n },\n "notebook_task": {\n "notebook_path": "/Users/dagster@example.com/PrepareData",\n },\n }\n )\n\n @job(\n resource_defs={\n "databricks": DatabricksClientResource(\n host=EnvVar("DATABRICKS_HOST"),\n token=EnvVar("DATABRICKS_TOKEN")\n )\n }\n )\n def do_stuff():\n submit_run_op()\n """\n check.invariant(\n bool(databricks_job_configuration),\n "Configuration for the one-time Databricks Job is required.",\n )\n\n _poll_interval_seconds = poll_interval_seconds\n _max_wait_time_seconds = max_wait_time_seconds\n\n class DatabricksSubmitRunOpConfig(Config):\n poll_interval_seconds: float = Field(\n default=_poll_interval_seconds,\n description="Check whether the Databricks Job is done at this interval, in seconds.",\n )\n max_wait_time_seconds: int = Field(\n default=_max_wait_time_seconds,\n description=(\n "If the Databricks Job is not complete after this length of time, in seconds,"\n " raise an error."\n ),\n )\n\n @op(\n ins={"start_after": In(Nothing)},\n required_resource_keys={databricks_resource_key},\n tags={"kind": "databricks"},\n name=name,\n )\n def _databricks_submit_run_op(\n context: OpExecutionContext, config: DatabricksSubmitRunOpConfig\n ) -> None:\n databricks: DatabricksClient = getattr(context.resources, databricks_resource_key)\n jobs_service = databricks.workspace_client.jobs\n\n run = jobs_service.submit(\n tasks=[jobs.SubmitTask.from_dict(databricks_job_configuration)],\n )\n run_id: int = run.bind()["run_id"]\n\n get_run_response = jobs_service.get_run(run_id=run_id)\n\n context.log.info(\n f"Launched databricks job run for '{get_run_response.run_name}' (`{run_id}`). URL:"\n f" {get_run_response.run_page_url}. 
Waiting to run to complete."\n )\n\n databricks.wait_for_run_to_complete(\n logger=context.log,\n databricks_run_id=run_id,\n poll_interval_sec=config.poll_interval_seconds,\n max_wait_time_sec=config.max_wait_time_seconds,\n )\n\n return _databricks_submit_run_op
\n
", "current_page_name": "_modules/dagster_databricks/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.ops"}, "pipes": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.pipes

\nimport base64\nimport json\nimport os\nimport random\nimport string\nimport sys\nimport time\nfrom contextlib import ExitStack, contextmanager\nfrom typing import Iterator, Literal, Mapping, Optional, TextIO\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.resource_annotation import ResourceParam\nfrom dagster._core.errors import DagsterPipesExecutionError\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dagster._core.pipes.client import (\n    PipesClient,\n    PipesClientCompletedInvocation,\n    PipesContextInjector,\n    PipesMessageReader,\n)\nfrom dagster._core.pipes.utils import (\n    PipesBlobStoreMessageReader,\n    PipesBlobStoreStdioReader,\n    PipesChunkedStdioReader,\n    open_pipes_session,\n)\nfrom dagster_pipes import (\n    DAGSTER_PIPES_MESSAGES_ENV_VAR,\n    PipesContextData,\n    PipesExtras,\n    PipesParams,\n)\nfrom databricks.sdk import WorkspaceClient\nfrom databricks.sdk.service import files, jobs\nfrom pydantic import Field\n\n# Number of seconds between status checks on Databricks jobs launched by the\n# `PipesDatabricksClient`.\n_RUN_POLL_INTERVAL = 5\n\n\n@experimental\nclass _PipesDatabricksClient(PipesClient):\n    """Pipes client for databricks.\n\n    Args:\n        client (WorkspaceClient): A databricks `WorkspaceClient` object.\n        env (Optional[Mapping[str,str]]: An optional dict of environment variables to pass to the\n            databricks job.\n        context_injector (Optional[PipesContextInjector]): A context injector to use to inject\n            context into the k8s container process. Defaults to :py:class:`PipesDbfsContextInjector`.\n        message_reader (Optional[PipesMessageReader]): A message reader to use to read messages\n            from the databricks job. 
Defaults to :py:class:`PipesDbfsMessageReader`.\n    """\n\n    env: Optional[Mapping[str, str]] = Field(\n        default=None,\n        description="An optional dict of environment variables to pass to the subprocess.",\n    )\n\n    def __init__(\n        self,\n        client: WorkspaceClient,\n        env: Optional[Mapping[str, str]] = None,\n        context_injector: Optional[PipesContextInjector] = None,\n        message_reader: Optional[PipesMessageReader] = None,\n    ):\n        self.client = client\n        self.env = env\n        self.context_injector = check.opt_inst_param(\n            context_injector,\n            "context_injector",\n            PipesContextInjector,\n        ) or PipesDbfsContextInjector(client=self.client)\n        self.message_reader = check.opt_inst_param(\n            message_reader,\n            "message_reader",\n            PipesMessageReader,\n        ) or PipesDbfsMessageReader(\n            client=self.client,\n            stdout_reader=PipesDbfsStdioReader(\n                client=self.client, remote_log_name="stdout", target_stream=sys.stdout\n            ),\n            stderr_reader=PipesDbfsStdioReader(\n                client=self.client, remote_log_name="stderr", target_stream=sys.stderr\n            ),\n        )\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    def run(\n        self,\n        *,\n        context: OpExecutionContext,\n        extras: Optional[PipesExtras] = None,\n        task: jobs.SubmitTask,\n        submit_args: Optional[Mapping[str, str]] = None,\n    ) -> PipesClientCompletedInvocation:\n        """Synchronously execute a Databricks job with the pipes protocol.\n\n        Args:\n            task (databricks.sdk.service.jobs.SubmitTask): Specification of the databricks\n                task to run. 
Environment variables used by dagster-pipes will be set under the\n                `spark_env_vars` key of the `new_cluster` field (if there is an existing dictionary\n                here, the EXT environment variables will be merged in). Everything else will be\n                passed unaltered under the `tasks` arg to `WorkspaceClient.jobs.submit`.\n            context (OpExecutionContext): The context from the executing op or asset.\n            extras (Optional[PipesExtras]): An optional dict of extra parameters to pass to the\n                subprocess.\n            submit_args (Optional[Mapping[str, str]]): Additional keyword arguments that will be\n                forwarded as-is to `WorkspaceClient.jobs.submit`.\n\n        Returns:\n            PipesClientCompletedInvocation: Wrapper containing results reported by the external\n                process.\n        """\n        with open_pipes_session(\n            context=context,\n            extras=extras,\n            context_injector=self.context_injector,\n            message_reader=self.message_reader,\n        ) as pipes_session:\n            submit_task_dict = task.as_dict()\n            submit_task_dict["new_cluster"]["spark_env_vars"] = {\n                **submit_task_dict["new_cluster"].get("spark_env_vars", {}),\n                **(self.env or {}),\n                **pipes_session.get_bootstrap_env_vars(),\n            }\n            cluster_log_root = pipes_session.get_bootstrap_params()[\n                DAGSTER_PIPES_MESSAGES_ENV_VAR\n            ].get("cluster_log_root")\n            if cluster_log_root is not None:\n                submit_task_dict["new_cluster"]["cluster_log_conf"] = {\n                    "dbfs": {"destination": f"dbfs:{cluster_log_root}"}\n                }\n            task = jobs.SubmitTask.from_dict(submit_task_dict)\n            run_id = self.client.jobs.submit(\n                tasks=[task],\n                **(submit_args or {}),\n            ).bind()["run_id"]\n\n 
           while True:\n                run = self.client.jobs.get_run(run_id)\n                context.log.info(\n                    f"Databricks run {run_id} current state: {run.state.life_cycle_state}"\n                )\n                if run.state.life_cycle_state in (\n                    jobs.RunLifeCycleState.TERMINATED,\n                    jobs.RunLifeCycleState.SKIPPED,\n                ):\n                    if run.state.result_state == jobs.RunResultState.SUCCESS:\n                        break\n                    else:\n                        raise DagsterPipesExecutionError(\n                            f"Error running Databricks job: {run.state.state_message}"\n                        )\n                elif run.state.life_cycle_state == jobs.RunLifeCycleState.INTERNAL_ERROR:\n                    raise DagsterPipesExecutionError(\n                        f"Error running Databricks job: {run.state.state_message}"\n                    )\n                time.sleep(_RUN_POLL_INTERVAL)\n        return PipesClientCompletedInvocation(tuple(pipes_session.get_results()))\n\n\nPipesDatabricksClient = ResourceParam[_PipesDatabricksClient]\n\n_CONTEXT_FILENAME = "context.json"\n\n\n@contextmanager\ndef dbfs_tempdir(dbfs_client: files.DbfsAPI) -> Iterator[str]:\n    dirname = "".join(random.choices(string.ascii_letters, k=30))\n    tempdir = f"/tmp/{dirname}"\n    dbfs_client.mkdirs(tempdir)\n    try:\n        yield tempdir\n    finally:\n        dbfs_client.delete(tempdir, recursive=True)\n\n\n
[docs]@experimental\nclass PipesDbfsContextInjector(PipesContextInjector):\n """A context injector that injects context into a Databricks job by writing a JSON file to DBFS.\n\n Args:\n client (WorkspaceClient): A databricks `WorkspaceClient` object.\n """\n\n def __init__(self, *, client: WorkspaceClient):\n super().__init__()\n self.dbfs_client = files.DbfsAPI(client.api_client)\n\n @contextmanager\n def inject_context(self, context: "PipesContextData") -> Iterator[PipesParams]:\n """Inject context to external environment by writing it to an automatically-generated\n DBFS temporary file as JSON and exposing the path to the file.\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to locate and\n load the injected context data.\n """\n with dbfs_tempdir(self.dbfs_client) as tempdir:\n path = os.path.join(tempdir, _CONTEXT_FILENAME)\n contents = base64.b64encode(json.dumps(context).encode("utf-8")).decode("utf-8")\n self.dbfs_client.put(path, contents=contents, overwrite=True)\n yield {"path": path}\n\n def no_messages_debug_text(self) -> str:\n return (\n "Attempted to inject context via a temporary file in dbfs. Expected"\n " PipesDbfsContextLoader to be explicitly passed to open_dagster_pipes in the external"\n " process."\n )
\n\n\n
[docs]@experimental\nclass PipesDbfsMessageReader(PipesBlobStoreMessageReader):\n """Message reader that reads messages by periodically reading message chunks from an\n automatically-generated temporary directory on DBFS.\n\n If `stdout_reader` or `stderr_reader` are passed, this reader will also start them when\n `read_messages` is called. If they are not passed, then the reader performs no stdout/stderr\n forwarding.\n\n Args:\n interval (float): interval in seconds between attempts to download a chunk\n client (WorkspaceClient): A databricks `WorkspaceClient` object.\n cluster_log_root (Optional[str]): The root path on DBFS where the cluster logs are written.\n If set, this will be used to read stderr/stdout logs.\n stdout_reader (Optional[PipesBlobStoreStdioReader]): A reader for reading stdout logs.\n stderr_reader (Optional[PipesBlobStoreStdioReader]): A reader for reading stderr logs.\n """\n\n def __init__(\n self,\n *,\n interval: float = 10,\n client: WorkspaceClient,\n stdout_reader: Optional[PipesBlobStoreStdioReader] = None,\n stderr_reader: Optional[PipesBlobStoreStdioReader] = None,\n ):\n super().__init__(\n interval=interval, stdout_reader=stdout_reader, stderr_reader=stderr_reader\n )\n self.dbfs_client = files.DbfsAPI(client.api_client)\n\n @contextmanager\n def get_params(self) -> Iterator[PipesParams]:\n with ExitStack() as stack:\n params: PipesParams = {}\n params["path"] = stack.enter_context(dbfs_tempdir(self.dbfs_client))\n if self.stdout_reader or self.stderr_reader:\n params["cluster_log_root"] = stack.enter_context(dbfs_tempdir(self.dbfs_client))\n yield params\n\n def download_messages_chunk(self, index: int, params: PipesParams) -> Optional[str]:\n message_path = os.path.join(params["path"], f"{index}.json")\n try:\n raw_message = self.dbfs_client.read(message_path)\n # Files written to dbfs using the Python IO interface used in PipesDbfsMessageWriter are\n # base64-encoded.\n return 
base64.b64decode(raw_message.data).decode("utf-8")\n # An error here is an expected result, since an IOError will be thrown if the next message\n # chunk doesn't yet exist. Swallowing the error here is equivalent to doing a no-op on a\n # status check showing a non-existent file.\n except IOError:\n return None\n\n def no_messages_debug_text(self) -> str:\n return (\n "Attempted to read messages from a temporary file in dbfs. Expected"\n " PipesDbfsMessageWriter to be explicitly passed to open_dagster_pipes in the external"\n " process."\n )
\n\n\n@experimental\nclass PipesDbfsStdioReader(PipesChunkedStdioReader):\n """Reader that reads stdout/stderr logs from DBFS.\n\n Args:\n interval (float): interval in seconds between attempts to download a log chunk\n remote_log_name (Literal["stdout", "stderr"]): The name of the log file to read.\n target_stream (TextIO): The stream to which to forward log chunk that have been read.\n client (WorkspaceClient): A databricks `WorkspaceClient` object.\n """\n\n def __init__(\n self,\n *,\n interval: float = 10,\n remote_log_name: Literal["stdout", "stderr"],\n target_stream: TextIO,\n client: WorkspaceClient,\n ):\n super().__init__(interval=interval, target_stream=target_stream)\n self.dbfs_client = files.DbfsAPI(client.api_client)\n self.remote_log_name = remote_log_name\n self.log_position = 0\n self.log_path = None\n\n def download_log_chunk(self, params: PipesParams) -> Optional[str]:\n log_path = self._get_log_path(params)\n if log_path is None:\n return None\n else:\n try:\n read_response = self.dbfs_client.read(log_path)\n assert read_response.data\n content = base64.b64decode(read_response.data).decode("utf-8")\n chunk = content[self.log_position :]\n self.log_position = len(content)\n return chunk\n except IOError:\n return None\n\n def is_ready(self, params: PipesParams) -> bool:\n return self._get_log_path(params) is not None\n\n # The directory containing logs will not exist until either 5 minutes have elapsed or the\n # job has finished.\n def _get_log_path(self, params: PipesParams) -> Optional[str]:\n if self.log_path is None:\n log_root_path = os.path.join(params["cluster_log_root"])\n child_dirs = list(self.dbfs_client.list(log_root_path))\n if len(child_dirs) > 0:\n self.log_path = f"dbfs:{child_dirs[0].path}/driver/{self.remote_log_name}"\n return self.log_path\n
", "current_page_name": "_modules/dagster_databricks/pipes", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.pipes"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.resources

\nfrom typing import Any, Optional\n\nfrom dagster import (\n    Config,\n    ConfigurableResource,\n    IAttachDifferentObjectToOpContext,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field, root_validator\n\nfrom .databricks import DatabricksClient\n\n\nclass OauthCredentials(Config):\n    """OAuth credentials for Databricks.\n\n    See https://docs.databricks.com/dev-tools/api/latest/authentication.html#oauth-2-0.\n    """\n\n    client_id: str = Field(description="OAuth client ID")\n    client_secret: str = Field(description="OAuth client secret")\n\n\n
[docs]class DatabricksClientResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource which provides a Python client for interacting with Databricks within an\n op or asset.\n """\n\n host: str = Field(description="Databricks host, e.g. https://uksouth.azuredatabricks.com")\n token: Optional[str] = Field(default=None, description="Databricks access token")\n oauth_credentials: Optional[OauthCredentials] = Field(\n default=None,\n description=(\n "Databricks OAuth credentials for using a service principal. See"\n " https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0"\n ),\n )\n workspace_id: Optional[str] = Field(\n default=None,\n description=(\n "DEPRECATED: The Databricks workspace ID, as described in"\n " https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids."\n " This is no longer used and will be removed in a 0.21."\n ),\n )\n\n @root_validator()\n def has_token_or_oauth_credentials(cls, values):\n token = values.get("token")\n oauth_credentials = values.get("oauth_credentials")\n if not token and not oauth_credentials:\n raise ValueError("Must provide either token or oauth_credentials")\n if token and oauth_credentials:\n raise ValueError("Must provide either token or oauth_credentials, not both")\n return values\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> DatabricksClient:\n if self.oauth_credentials:\n client_id = self.oauth_credentials.client_id\n client_secret = self.oauth_credentials.client_secret\n else:\n client_id = None\n client_secret = None\n\n return DatabricksClient(\n host=self.host,\n token=self.token,\n oauth_client_id=client_id,\n oauth_client_secret=client_secret,\n workspace_id=self.workspace_id,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatabricksClientResource.to_config_schema())\ndef databricks_client(init_context) -> DatabricksClient:\n return DatabricksClientResource.from_resource_context(init_context).get_client()
\n
", "current_page_name": "_modules/dagster_databricks/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.resources"}}, "dagster_datadog": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_datadog.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom datadog import DogStatsd, initialize, statsd\nfrom pydantic import Field\n\n\nclass DatadogClient:\n    # Mirroring levels from the dogstatsd library\n    OK, WARNING, CRITICAL, UNKNOWN = (\n        DogStatsd.OK,\n        DogStatsd.WARNING,\n        DogStatsd.CRITICAL,\n        DogStatsd.UNKNOWN,\n    )\n\n    def __init__(self, api_key: str, app_key: str):\n        self.api_key = api_key\n        self.app_key = app_key\n        initialize(api_key=api_key, app_key=app_key)\n\n        # Pull in methods from the dogstatsd library\n        for method in [\n            "event",\n            "gauge",\n            "increment",\n            "decrement",\n            "histogram",\n            "distribution",\n            "set",\n            "service_check",\n            "timed",\n            "timing",\n        ]:\n            setattr(self, method, getattr(statsd, method))\n\n\n
[docs]class DatadogResource(ConfigurableResource):\n """This resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n .. code-block:: python\n\n @op\n def datadog_op(datadog_client: ResourceParam[DatadogClient]):\n datadog_client.event('Man down!', 'This server needs assistance.')\n datadog_client.gauge('users.online', 1001, tags=["protocol:http"])\n datadog_client.increment('page.views')\n datadog_client.decrement('page.views')\n datadog_client.histogram('album.photo.count', 26, tags=["gender:female"])\n datadog_client.distribution('album.photo.count', 26, tags=["color:blue"])\n datadog_client.set('visitors.uniques', 999, tags=["browser:ie"])\n datadog_client.service_check('svc.check_name', datadog_client.WARNING)\n datadog_client.timing("query.response.time", 1234)\n\n # Use timed decorator\n @datadog_client.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job\n def job_for_datadog_op() -> None:\n datadog_op()\n\n job_for_datadog_op.execute_in_process(\n resources={"datadog_client": DatadogResource(api_key="FOO", app_key="BAR")}\n )\n\n """\n\n api_key: str = Field(\n description=(\n "Datadog API key. See https://docs.datadoghq.com/account_management/api-app-keys/"\n )\n )\n app_key: str = Field(\n description=(\n "Datadog application key. See"\n " https://docs.datadoghq.com/account_management/api-app-keys/."\n )\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> DatadogClient:\n return DatadogClient(self.api_key, self.app_key)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=DatadogResource.to_config_schema(),\n description="This resource is for publishing to DataDog",\n)\ndef datadog_resource(context) -> DatadogClient:\n """This legacy resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n Prefer using :py:class:`DatadogResource`.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n .. code-block:: python\n\n @op(required_resource_keys={'datadog'})\n def datadog_op(context):\n dd = context.resources.datadog\n\n dd.event('Man down!', 'This server needs assistance.')\n dd.gauge('users.online', 1001, tags=["protocol:http"])\n dd.increment('page.views')\n dd.decrement('page.views')\n dd.histogram('album.photo.count', 26, tags=["gender:female"])\n dd.distribution('album.photo.count', 26, tags=["color:blue"])\n dd.set('visitors.uniques', 999, tags=["browser:ie"])\n dd.service_check('svc.check_name', dd.WARNING)\n dd.timing("query.response.time", 1234)\n\n # Use timed decorator\n @dd.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job(resource_defs={'datadog': datadog_resource})\n def dd_job():\n datadog_op()\n\n result = dd_job.execute_in_process(\n run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n )\n\n """\n return DatadogResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_datadog/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_datadog.resources"}}, "dagster_datahub": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_datahub.resources

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import InitResourceContext, resource\nfrom dagster._config.pythonic_config import Config, ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom datahub.emitter.kafka_emitter import (\n    DEFAULT_MCE_KAFKA_TOPIC,\n    DEFAULT_MCP_KAFKA_TOPIC,\n    MCE_KEY,\n    MCP_KEY,\n    DatahubKafkaEmitter,\n    KafkaEmitterConfig,\n)\nfrom datahub.emitter.rest_emitter import DatahubRestEmitter\nfrom pydantic import Field\n\n\n
[docs]class DatahubRESTEmitterResource(ConfigurableResource):\n connection: str = Field(description="Datahub GMS Server")\n token: Optional[str] = Field(default=None, description="Personal Access Token")\n connect_timeout_sec: Optional[float] = None\n read_timeout_sec: Optional[float] = None\n retry_status_codes: Optional[List[int]] = None\n retry_methods: Optional[List[str]] = None\n retry_max_times: Optional[int] = None\n extra_headers: Optional[Dict[str, str]] = None\n ca_certificate_path: Optional[str] = None\n server_telemetry_id: Optional[str] = None # No-op - no longer accepted in DatahubRestEmitter\n disable_ssl_verification: bool = False\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_emitter(self) -> DatahubRestEmitter:\n return DatahubRestEmitter(\n gms_server=self.connection,\n token=self.token,\n connect_timeout_sec=self.connect_timeout_sec,\n read_timeout_sec=self.read_timeout_sec,\n retry_status_codes=self.retry_status_codes,\n retry_methods=self.retry_methods,\n retry_max_times=self.retry_max_times,\n extra_headers=self.extra_headers,\n ca_certificate_path=self.ca_certificate_path,\n disable_ssl_verification=self.disable_ssl_verification,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatahubRESTEmitterResource.to_config_schema())\ndef datahub_rest_emitter(init_context: InitResourceContext) -> DatahubRestEmitter:\n emitter = DatahubRestEmitter(\n gms_server=init_context.resource_config.get("connection"),\n token=init_context.resource_config.get("token"),\n connect_timeout_sec=init_context.resource_config.get("connect_timeout_sec"),\n read_timeout_sec=init_context.resource_config.get("read_timeout_sec"),\n retry_status_codes=init_context.resource_config.get("retry_status_codes"),\n retry_methods=init_context.resource_config.get("retry_methods"),\n retry_max_times=init_context.resource_config.get("retry_max_times"),\n extra_headers=init_context.resource_config.get("extra_headers"),\n ca_certificate_path=init_context.resource_config.get("ca_certificate_path"),\n disable_ssl_verification=init_context.resource_config.get("disable_ssl_verification"),\n )\n # Attempt to hit the server to ensure the resource is properly configured\n emitter.test_connection()\n return emitter
\n\n\nclass DatahubConnection(Config):\n bootstrap: str = Field(description="Kafka Boostrap Servers. Comma delimited")\n schema_registry_url: str = Field(description="Schema Registry Location.")\n schema_registry_config: Dict[str, Any] = Field(\n default={}, description="Extra Schema Registry Config."\n )\n\n\n
[docs]class DatahubKafkaEmitterResource(ConfigurableResource):\n connection: DatahubConnection\n topic: Optional[str] = None\n topic_routes: Dict[str, str] = Field(\n default={\n MCE_KEY: DEFAULT_MCE_KAFKA_TOPIC,\n MCP_KEY: DEFAULT_MCP_KAFKA_TOPIC,\n }\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_emitter(self) -> DatahubKafkaEmitter:\n return DatahubKafkaEmitter(\n KafkaEmitterConfig.parse_obj(\n {k: v for k, v in self._convert_to_config_dictionary().items() if v is not None}\n )\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatahubKafkaEmitterResource.to_config_schema())\ndef datahub_kafka_emitter(init_context: InitResourceContext) -> DatahubKafkaEmitter:\n return DatahubKafkaEmitter(KafkaEmitterConfig.parse_obj(init_context.resource_config))
\n
", "current_page_name": "_modules/dagster_datahub/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_datahub.resources"}}, "dagster_dbt": {"asset_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_decorator

\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    AssetCheckSpec,\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    BackfillPolicy,\n    DagsterInvalidDefinitionError,\n    Nothing,\n    PartitionsDefinition,\n    multi_asset,\n)\n\nfrom .asset_utils import (\n    DAGSTER_DBT_TRANSLATOR_METADATA_KEY,\n    MANIFEST_METADATA_KEY,\n    default_asset_check_fn,\n    default_code_version_fn,\n    get_deps,\n)\nfrom .dagster_dbt_translator import DagsterDbtTranslator, DbtManifestWrapper\nfrom .dbt_manifest import DbtManifestParam, validate_manifest\nfrom .utils import (\n    ASSET_RESOURCE_TYPES,\n    get_dbt_resource_props_by_dbt_unique_id_from_manifest,\n    output_name_fn,\n    select_unique_ids_from_manifest,\n)\n\n\n
[docs]def dbt_assets(\n *,\n manifest: DbtManifestParam,\n select: str = "fqn:*",\n exclude: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n dagster_dbt_translator: DagsterDbtTranslator = DagsterDbtTranslator(),\n backfill_policy: Optional[BackfillPolicy] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n) -> Callable[..., AssetsDefinition]:\n """Create a definition for how to compute a set of dbt resources, described by a manifest.json.\n When invoking dbt commands using :py:class:`~dagster_dbt.DbtCliResource`'s\n :py:meth:`~dagster_dbt.DbtCliResource.cli` method, Dagster events are emitted by calling\n ``yield from`` on the event stream returned by :py:meth:`~dagster_dbt.DbtCliInvocation.stream`.\n\n Args:\n manifest (Union[Mapping[str, Any], str, Path]): The contents of a manifest.json file\n or the path to a manifest.json file. A manifest.json contains a representation of a\n dbt project (models, tests, macros, etc). We use this representation to create\n corresponding Dagster assets.\n select (str): A dbt selection string for the models in a project that you want\n to include. Defaults to ``fqn:*``.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the dbt assets.\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. 
to asset keys and asset metadata.\n backfill_policy (Optional[BackfillPolicy]): If a partitions_def is defined, this determines\n how to execute backfills that target multiple partitions.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the assets.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n\n Examples:\n Running ``dbt build`` for a dbt project:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n Running dbt commands with flags:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build", "--full-refresh"], context=context).stream()\n\n Running dbt commands with ``--vars``:\n\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_vars = {"key": "value"}\n\n yield from dbt.cli(["build", "--vars", json.dumps(dbt_vars)], context=context).stream()\n\n Retrieving dbt artifacts after running a dbt command:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_build_invocation = dbt.cli(["build"], context=context)\n\n yield from dbt_build_invocation.stream()\n\n run_results_json = dbt_build_invocation.get_artifact("run_results.json")\n\n Running multiple dbt commands for a dbt project:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n yield from dbt.cli(["test"], context=context).stream()\n\n Customizing the Dagster asset metadata inferred from a dbt project using :py:class:`~dagster_dbt.DagsterDbtTranslator`:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n ...\n\n\n @dbt_assets(\n manifest=Path("target", "manifest.json"),\n dagster_dbt_translator=CustomDagsterDbtTranslator(),\n )\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n Invoking another Dagster :py:class:`~dagster.ResourceDefinition` alongside dbt:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n from dagster_slack import SlackResource\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, slack: SlackResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n slack_client = slack.get_client()\n slack_client.chat_postMessage(channel="#my-channel", text="dbt build succeeded!")\n\n Defining and accessing Dagster :py:class:`~dagster.Config` alongside dbt:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext, Config\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\n class MyDbtConfig(Config):\n full_refresh: bool\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, config: MyDbtConfig):\n dbt_build_args = ["build"]\n if config.full_refresh:\n dbt_build_args += ["--full-refresh"]\n\n yield from dbt.cli(dbt_build_args, context=context).stream()\n\n Defining Dagster :py:class:`~dagster.PartitionDefinition` alongside dbt:\n\n\n .. 
code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster import AssetExecutionContext, DailyPartitionDefinition\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(\n manifest=Path("target", "manifest.json"),\n partitions_def=DailyPartitionsDefinition(start_date="2023-01-01")\n )\n def partitionshop_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n time_window = context.asset_partitions_time_window_for_output(\n list(context.selected_output_names)[0]\n )\n\n dbt_vars = {\n "min_date": time_window.start.isoformat(),\n "max_date": time_window.end.isoformat()\n }\n dbt_build_args = ["build", "--vars", json.dumps(dbt_vars)]\n\n yield from dbt.cli(dbt_build_args, context=context).stream()\n\n """\n check.inst_param(\n dagster_dbt_translator,\n "dagster_dbt_translator",\n DagsterDbtTranslator,\n additional_message=(\n "Ensure that the argument is an instantiated class that subclasses"\n " DagsterDbtTranslator."\n ),\n )\n manifest = validate_manifest(manifest)\n\n unique_ids = select_unique_ids_from_manifest(\n select=select, exclude=exclude or "", manifest_json=manifest\n )\n node_info_by_dbt_unique_id = get_dbt_resource_props_by_dbt_unique_id_from_manifest(manifest)\n deps = get_deps(\n dbt_nodes=node_info_by_dbt_unique_id,\n selected_unique_ids=unique_ids,\n asset_resource_types=ASSET_RESOURCE_TYPES,\n )\n (\n non_argument_deps,\n outs,\n internal_asset_deps,\n check_specs,\n ) = get_dbt_multi_asset_args(\n dbt_nodes=node_info_by_dbt_unique_id,\n deps=deps,\n io_manager_key=io_manager_key,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n )\n\n if op_tags and "dagster-dbt/select" in op_tags:\n raise DagsterInvalidDefinitionError(\n "To specify a dbt selection, use the 'select' argument, not 'dagster-dbt/select'"\n " with op_tags"\n )\n\n if op_tags and "dagster-dbt/exclude" in op_tags:\n raise DagsterInvalidDefinitionError(\n "To specify a dbt exclusion, use the 'exclude' argument, not 
'dagster-dbt/exclude'"\n " with op_tags"\n )\n\n resolved_op_tags = {\n **({"dagster-dbt/select": select} if select else {}),\n **({"dagster-dbt/exclude": exclude} if exclude else {}),\n **(op_tags if op_tags else {}),\n }\n\n def inner(fn) -> AssetsDefinition:\n asset_definition = multi_asset(\n outs=outs,\n internal_asset_deps=internal_asset_deps,\n deps=non_argument_deps,\n compute_kind="dbt",\n partitions_def=partitions_def,\n can_subset=True,\n op_tags=resolved_op_tags,\n check_specs=check_specs,\n backfill_policy=backfill_policy,\n )(fn)\n\n return asset_definition\n\n return inner
\n\n\ndef get_dbt_multi_asset_args(\n dbt_nodes: Mapping[str, Any],\n deps: Mapping[str, FrozenSet[str]],\n io_manager_key: Optional[str],\n manifest: Mapping[str, Any],\n dagster_dbt_translator: DagsterDbtTranslator,\n) -> Tuple[\n Sequence[AssetKey],\n Dict[str, AssetOut],\n Dict[str, Set[AssetKey]],\n Sequence[AssetCheckSpec],\n]:\n non_argument_deps: Set[AssetKey] = set()\n outs: Dict[str, AssetOut] = {}\n internal_asset_deps: Dict[str, Set[AssetKey]] = {}\n check_specs: Sequence[AssetCheckSpec] = []\n\n for unique_id, parent_unique_ids in deps.items():\n dbt_resource_props = dbt_nodes[unique_id]\n\n output_name = output_name_fn(dbt_resource_props)\n asset_key = dagster_dbt_translator.get_asset_key(dbt_resource_props)\n\n outs[output_name] = AssetOut(\n key=asset_key,\n dagster_type=Nothing,\n io_manager_key=io_manager_key,\n description=dagster_dbt_translator.get_description(dbt_resource_props),\n is_required=False,\n metadata={ # type: ignore\n **dagster_dbt_translator.get_metadata(dbt_resource_props),\n MANIFEST_METADATA_KEY: DbtManifestWrapper(manifest=manifest),\n DAGSTER_DBT_TRANSLATOR_METADATA_KEY: dagster_dbt_translator,\n },\n group_name=dagster_dbt_translator.get_group_name(dbt_resource_props),\n code_version=default_code_version_fn(dbt_resource_props),\n freshness_policy=dagster_dbt_translator.get_freshness_policy(dbt_resource_props),\n auto_materialize_policy=dagster_dbt_translator.get_auto_materialize_policy(\n dbt_resource_props\n ),\n )\n\n test_unique_ids = [\n child_unique_id\n for child_unique_id in manifest["child_map"][unique_id]\n if child_unique_id.startswith("test")\n ]\n for test_unique_id in test_unique_ids:\n test_resource_props = manifest["nodes"][test_unique_id]\n check_spec = default_asset_check_fn(\n asset_key, unique_id, dagster_dbt_translator.settings, test_resource_props\n )\n\n if check_spec:\n check_specs.append(check_spec)\n\n # Translate parent unique ids to internal asset deps and non argument dep\n output_internal_deps = 
internal_asset_deps.setdefault(output_name, set())\n for parent_unique_id in parent_unique_ids:\n parent_resource_props = dbt_nodes[parent_unique_id]\n parent_asset_key = dagster_dbt_translator.get_asset_key(parent_resource_props)\n\n # Add this parent as an internal dependency\n output_internal_deps.add(parent_asset_key)\n\n # Mark this parent as an input if it has no dependencies\n if parent_unique_id not in deps:\n non_argument_deps.add(parent_asset_key)\n\n return list(non_argument_deps), outs, internal_asset_deps, check_specs\n
", "current_page_name": "_modules/dagster_dbt/asset_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_decorator"}, "asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_defs

\nimport hashlib\nimport json\nimport os\nfrom pathlib import Path\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dateutil\nfrom dagster import (\n    AssetCheckResult,\n    AssetKey,\n    AssetsDefinition,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    In,\n    OpExecutionContext,\n    Out,\n    PartitionsDefinition,\n    PermissiveConfig,\n    _check as check,\n    get_dagster_logger,\n    op,\n)\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    CoercibleToAssetKeyPrefix,\n    Output,\n)\nfrom dagster._core.definitions.metadata import MetadataUserInput, RawMetadataValue\nfrom dagster._core.errors import DagsterInvalidSubsetError\nfrom dagster._utils.merger import deep_merge_dicts\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    normalize_renamed_param,\n)\n\nfrom dagster_dbt.asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    default_metadata_from_dbt_resource_props,\n    get_asset_deps,\n    get_deps,\n)\nfrom dagster_dbt.core.resources import DbtCliClient\nfrom dagster_dbt.core.resources_v2 import DbtCliResource\nfrom dagster_dbt.core.types import DbtCliOutput\nfrom dagster_dbt.core.utils import build_command_args_from_flags, execute_cli\nfrom dagster_dbt.dagster_dbt_translator import DagsterDbtTranslator\nfrom dagster_dbt.errors import DagsterDbtError\nfrom dagster_dbt.types import DbtOutput\nfrom dagster_dbt.utils import (\n    ASSET_RESOURCE_TYPES,\n    output_name_fn,\n    result_to_events,\n    select_unique_ids_from_manifest,\n)\n\n\ndef _load_manifest_for_project(\n    project_dir: str,\n    profiles_dir: str,\n    
target_dir: str,\n    select: str,\n    exclude: str,\n) -> Tuple[Mapping[str, Any], DbtCliOutput]:\n    # running "dbt ls" regenerates the manifest.json, which includes a superset of the actual\n    # "dbt ls" output\n    cli_output = execute_cli(\n        executable="dbt",\n        command="ls",\n        log=get_dagster_logger(),\n        flags_dict={\n            "project-dir": project_dir,\n            "profiles-dir": profiles_dir,\n            "select": select,\n            "exclude": exclude,\n            "output": "json",\n        },\n        warn_error=False,\n        ignore_handled_error=False,\n        target_path=target_dir,\n        json_log_format=True,\n        capture_logs=True,\n    )\n    manifest_path = os.path.join(target_dir, "manifest.json")\n    with open(manifest_path, "r", encoding="utf8") as f:\n        return json.load(f), cli_output\n\n\ndef _can_stream_events(dbt_resource: Union[DbtCliClient, DbtCliResource]) -> bool:\n    """Check if the installed dbt version supports streaming events."""\n    import dbt.version\n    from packaging import version\n\n    if version.parse(dbt.version.__version__) >= version.parse("1.4.0"):\n        # The json log format is required for streaming events. 
DbtCliResource always uses this format, but\n        # DbtCliClient has an option to disable it.\n        if isinstance(dbt_resource, DbtCliResource):\n            return True\n        else:\n            return dbt_resource._json_log_format  # noqa: SLF001\n    else:\n        return False\n\n\ndef _batch_event_iterator(\n    context: OpExecutionContext,\n    dbt_resource: DbtCliClient,\n    use_build_command: bool,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    kwargs: Dict[str, Any],\n) -> Iterator[Union[AssetObservation, AssetMaterialization, Output]]:\n    """Yields events for a dbt cli invocation. Waits until the entire command has completed before\n    emitting outputs.\n    """\n    # clean up any run results from the last run\n    dbt_resource.remove_run_results_json()\n\n    dbt_output: Optional[DbtOutput] = None\n    try:\n        if use_build_command:\n            dbt_output = dbt_resource.build(**kwargs)\n        else:\n            dbt_output = dbt_resource.run(**kwargs)\n    finally:\n        # in the case that the project only partially runs successfully, still attempt to generate\n        # events for the parts that were successful\n        if dbt_output is None:\n            dbt_output = DbtOutput(result=check.not_none(dbt_resource.get_run_results_json()))\n\n        manifest_json = check.not_none(dbt_resource.get_manifest_json())\n\n        dbt_output = check.not_none(dbt_output)\n        for result in dbt_output.result["results"]:\n            extra_metadata: Optional[Mapping[str, RawMetadataValue]] = None\n            if runtime_metadata_fn:\n                node_info = manifest_json["nodes"][result["unique_id"]]\n                extra_metadata = runtime_metadata_fn(context, node_info)\n            yield from result_to_events(\n                result=result,\n                
docs_url=dbt_output.docs_url,\n                node_info_to_asset_key=node_info_to_asset_key,\n                manifest_json=manifest_json,\n                extra_metadata=extra_metadata,\n                generate_asset_outputs=True,\n            )\n\n\ndef _events_for_structured_json_line(\n    json_line: Mapping[str, Any],\n    context: OpExecutionContext,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    manifest_json: Mapping[str, Any],\n) -> Iterator[Union[AssetObservation, Output]]:\n    """Parses a json line into a Dagster event. Attempts to replicate the behavior of result_to_events\n    as closely as possible.\n    """\n    runtime_node_info = json_line.get("data", {}).get("node_info", {})\n    if not runtime_node_info:\n        return\n\n    node_resource_type = runtime_node_info.get("resource_type")\n    node_status = runtime_node_info.get("node_status")\n    unique_id = runtime_node_info.get("unique_id")\n\n    if not node_resource_type or not unique_id:\n        return\n\n    compiled_node_info = manifest_json["nodes"][unique_id]\n\n    if node_resource_type in ASSET_RESOURCE_TYPES and node_status == "success":\n        metadata = dict(\n            runtime_metadata_fn(context, compiled_node_info) if runtime_metadata_fn else {}\n        )\n        started_at_str = runtime_node_info.get("node_started_at")\n        finished_at_str = runtime_node_info.get("node_finished_at")\n        if started_at_str is None or finished_at_str is None:\n            return\n\n        started_at = dateutil.parser.isoparse(started_at_str)  # type: ignore\n        completed_at = dateutil.parser.isoparse(finished_at_str)  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                "Execution Started At": started_at.isoformat(timespec="seconds"),\n               
 "Execution Completed At": completed_at.isoformat(timespec="seconds"),\n                "Execution Duration": duration.total_seconds(),\n            }\n        )\n        yield Output(\n            value=None,\n            output_name=output_name_fn(compiled_node_info),\n            metadata=metadata,\n        )\n    elif node_resource_type == "test" and runtime_node_info.get("node_finished_at"):\n        upstream_unique_ids = (\n            manifest_json["nodes"][unique_id].get("depends_on", {}).get("nodes", [])\n        )\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            # the upstream id can reference a node or a source\n            upstream_node_info = manifest_json["nodes"].get(upstream_id) or manifest_json[\n                "sources"\n            ].get(upstream_id)\n            if upstream_node_info is None:\n                continue\n            upstream_asset_key = node_info_to_asset_key(upstream_node_info)\n            yield AssetObservation(\n                asset_key=upstream_asset_key,\n                metadata={\n                    "Test ID": unique_id,\n                    "Test Status": node_status,\n                },\n            )\n\n\ndef _stream_event_iterator(\n    context: OpExecutionContext,\n    dbt_resource: Union[DbtCliResource, DbtCliClient],\n    use_build_command: bool,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    kwargs: Dict[str, Any],\n    manifest_json: Mapping[str, Any],\n) -> Iterator[Union[AssetObservation, Output, AssetCheckResult]]:\n    """Yields events for a dbt cli invocation. 
Emits outputs as soon as the relevant dbt logs are\n    emitted.\n    """\n    if isinstance(dbt_resource, DbtCliClient):\n        for parsed_json_line in dbt_resource.cli_stream_json(\n            command="build" if use_build_command else "run",\n            **kwargs,\n        ):\n            yield from _events_for_structured_json_line(\n                parsed_json_line,\n                context,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                manifest_json,\n            )\n    else:\n        if runtime_metadata_fn is not None:\n            raise DagsterDbtError(\n                "The runtime_metadata_fn argument on the load_assets_from_dbt_manifest and"\n                " load_assets_from_dbt_project functions is not supported when using the"\n                " DbtCliResource resource. Use the @dbt_assets decorator instead if you want"\n                " control over what metadata is yielded at runtime."\n            )\n\n        class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n            @classmethod\n            def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n                return node_info_to_asset_key(dbt_resource_props)\n\n        cli_output = dbt_resource.cli(\n            args=["build" if use_build_command else "run", *build_command_args_from_flags(kwargs)],\n            manifest=manifest_json,\n            dagster_dbt_translator=CustomDagsterDbtTranslator(),\n        )\n        yield from cli_output.stream()\n\n\nclass DbtOpConfig(PermissiveConfig):\n    """Keyword arguments to pass to the underlying dbt command. Additional arguments not listed in the schema will\n    be passed through as well, e.g. 
{'bool_flag': True, 'string_flag': 'hi'} will result in the flags\n    '--bool_flag --string_flag hi' being passed to the dbt command.\n    """\n\n    select: Optional[str] = None\n    exclude: Optional[str] = None\n    vars: Optional[Dict[str, Any]] = None\n    full_refresh: Optional[bool] = None\n\n\ndef _get_dbt_op(\n    op_name: str,\n    ins: Mapping[str, In],\n    outs: Mapping[str, Out],\n    select: str,\n    exclude: str,\n    use_build_command: bool,\n    fqns_by_output_name: Mapping[str, List[str]],\n    dbt_resource_key: str,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    manifest_json: Mapping[str, Any],\n):\n    @op(\n        name=op_name,\n        tags={"kind": "dbt"},\n        ins=ins,\n        out=outs,\n        required_resource_keys={dbt_resource_key},\n    )\n    def _dbt_op(context, config: DbtOpConfig):\n        dbt_resource: Union[DbtCliResource, DbtCliClient] = getattr(\n            context.resources, dbt_resource_key\n        )\n        check.inst(\n            dbt_resource,\n            (DbtCliResource, DbtCliClient),\n            "Resource with key 'dbt_resource_key' must be a DbtCliResource or DbtCliClient"\n            f" object, but is a {type(dbt_resource)}",\n        )\n\n        kwargs: Dict[str, Any] = {}\n        # in the case that we're running everything, opt for the cleaner selection string\n        if len(context.selected_output_names) == len(outs):\n            kwargs["select"] = select\n            kwargs["exclude"] = exclude\n        else:\n            # for each output that we want to emit, translate to a dbt select string by converting\n            # the out to its corresponding fqn\n            kwargs["select"] = [\n                ".".join(fqns_by_output_name[output_name])\n           
     for output_name in context.selected_output_names\n            ]\n        # variables to pass into the command\n        if partition_key_to_vars_fn:\n            kwargs["vars"] = partition_key_to_vars_fn(context.partition_key)\n        # merge in any additional kwargs from the config\n        kwargs = deep_merge_dicts(kwargs, context.op_config)\n\n        if _can_stream_events(dbt_resource):\n            yield from _stream_event_iterator(\n                context,\n                dbt_resource,\n                use_build_command,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                kwargs,\n                manifest_json=manifest_json,\n            )\n        else:\n            if not isinstance(dbt_resource, DbtCliClient):\n                check.failed(\n                    "Chose batch event iterator, but it only works with DbtCliClient, and"\n                    f" resource has type {type(dbt_resource)}"\n                )\n            yield from _batch_event_iterator(\n                context,\n                dbt_resource,\n                use_build_command,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                kwargs,\n            )\n\n    return _dbt_op\n\n\ndef _dbt_nodes_to_assets(\n    dbt_nodes: Mapping[str, Any],\n    select: str,\n    exclude: str,\n    selected_unique_ids: AbstractSet[str],\n    project_id: str,\n    dbt_resource_key: str,\n    manifest_json: Mapping[str, Any],\n    op_name: Optional[str],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    io_manager_key: Optional[str],\n    use_build_command: bool,\n    partitions_def: Optional[PartitionsDefinition],\n    partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n    dagster_dbt_translator: DagsterDbtTranslator,\n) -> AssetsDefinition:\n    if use_build_command:\n        deps = get_deps(\n        
    dbt_nodes,\n            selected_unique_ids,\n            asset_resource_types=["model", "seed", "snapshot"],\n        )\n    else:\n        deps = get_deps(dbt_nodes, selected_unique_ids, asset_resource_types=["model"])\n\n    (\n        asset_deps,\n        asset_ins,\n        asset_outs,\n        group_names_by_key,\n        freshness_policies_by_key,\n        auto_materialize_policies_by_key,\n        check_specs_by_output_name,\n        fqns_by_output_name,\n        _,\n    ) = get_asset_deps(\n        dbt_nodes=dbt_nodes,\n        deps=deps,\n        io_manager_key=io_manager_key,\n        manifest=manifest_json,\n        dagster_dbt_translator=dagster_dbt_translator,\n    )\n\n    # prevent op name collisions between multiple dbt multi-assets\n    if not op_name:\n        op_name = f"run_dbt_{project_id}"\n        if select != "fqn:*" or exclude:\n            op_name += "_" + hashlib.md5(select.encode() + exclude.encode()).hexdigest()[-5:]\n\n    check_outs_by_output_name: Mapping[str, Out] = {}\n    if check_specs_by_output_name:\n        check_outs_by_output_name = {\n            output_name: Out(dagster_type=None, is_required=False)\n            for output_name in check_specs_by_output_name.keys()\n        }\n\n    dbt_op = _get_dbt_op(\n        op_name=op_name,\n        ins=dict(asset_ins.values()),\n        outs={\n            **dict(asset_outs.values()),\n            **check_outs_by_output_name,\n        },\n        select=select,\n        exclude=exclude,\n        use_build_command=use_build_command,\n        fqns_by_output_name=fqns_by_output_name,\n        dbt_resource_key=dbt_resource_key,\n        node_info_to_asset_key=dagster_dbt_translator.get_asset_key,\n        partition_key_to_vars_fn=partition_key_to_vars_fn,\n        runtime_metadata_fn=runtime_metadata_fn,\n        manifest_json=manifest_json,\n    )\n\n    return AssetsDefinition(\n        keys_by_input_name={\n            input_name: asset_key for asset_key, (input_name, _) in 
asset_ins.items()\n        },\n        keys_by_output_name={\n            output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n        },\n        node_def=dbt_op,\n        can_subset=True,\n        asset_deps=asset_deps,\n        group_names_by_key=group_names_by_key,\n        freshness_policies_by_key=freshness_policies_by_key,\n        auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n        check_specs_by_output_name=check_specs_by_output_name,\n        partitions_def=partitions_def,\n    )\n\n\n
[docs]def load_assets_from_dbt_project(\n project_dir: str,\n profiles_dir: Optional[str] = None,\n *,\n select: Optional[str] = None,\n exclude: Optional[str] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n io_manager_key: Optional[str] = None,\n target_dir: Optional[str] = None,\n # All arguments below are deprecated\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n op_name: Optional[str] = None,\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n use_build_command: bool = True,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ] = default_metadata_from_dbt_resource_props,\n display_raw_sql: Optional[bool] = None,\n dbt_resource_key: str = "dbt",\n) -> Sequence[AssetsDefinition]:\n """Loads a set of dbt models from a dbt project into Dagster assets.\n\n Creates one Dagster asset for each dbt model. 
All assets will be re-materialized using a single\n `dbt run` or `dbt build` command.\n\n When searching for more flexibility in defining the computations that materialize your\n dbt assets, we recommend that you use :py:class:`~dagster_dbt.dbt_assets`.\n\n Args:\n project_dir (Optional[str]): The directory containing the dbt project to load.\n profiles_dir (Optional[str]): The profiles directory to use for loading the DBT project.\n Defaults to a directory called "config" inside the project_dir.\n target_dir (Optional[str]): The target directory where dbt will place compiled artifacts.\n Defaults to "target" underneath the project_dir.\n select (Optional[str]): A dbt selection string for the models in a project that you want\n to include. Defaults to `"fqn:*"`.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. to asset keys and asset metadata.\n key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all assets loaded\n from the dbt project. Does not apply to input assets. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=...) instead.\n source_key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all input\n assets for the set of assets loaded from the dbt project. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=...) instead.\n op_name (Optional[str]): [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\n Deprecated: use the `@dbt_assets` decorator if you need to customize the op name.\n dbt_resource_key (Optional[str]): [Deprecated] The resource key that the dbt resource will be specified at.\n Defaults to "dbt". 
Deprecated: use the `@dbt_assets` decorator if you need to customize\n the resource key.\n runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]): [Deprecated]\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n Deprecated: use the @dbt_assets decorator if you need to customize runtime metadata.\n manifest_json (Optional[Mapping[str, Any]]): [Deprecated] Use the manifest argument instead.\n selected_unique_ids (Optional[Set[str]]): [Deprecated] The set of dbt unique_ids that you want to load\n as assets. Deprecated: use the select argument instead.\n node_info_to_asset_key (Mapping[str, Any] -> AssetKey): [Deprecated] A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model. Deprecated: instead,\n provide a custom DagsterDbtTranslator that overrides node_info_to_asset_key.\n use_build_command (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset. Defaults to True. If set to False, then `dbt run` will be used, and\n seeds and snapshots won't be loaded as assets.\n partitions_def (Optional[PartitionsDefinition]): [Deprecated] Defines the set of partition keys that\n compose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\n dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): [Deprecated] A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. {"run_date": "2022-01-01"}). 
Deprecated: use the @dbt_assets decorator\n to define partitioned dbt assets.\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): [Deprecated] A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n Deprecated: instead, configure dagster groups on a dbt resource's meta field or assign\n dbt groups.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): [Deprecated] A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`. Deprecated:\n instead, configure auto-materialize policies on a dbt resource's meta field.\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`. Deprecated: instead, configure auto-materialize\n policies on a dbt resource's meta field.\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. 
This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n Deprecated: instead, provide a custom DagsterDbtTranslator that overrides\n node_info_to_metadata.\n display_raw_sql (Optional[bool]): [Deprecated] A flag to indicate if the raw sql associated\n with each model should be included in the asset description. For large projects, setting\n this flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\n instead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.\n """\n project_dir = check.str_param(project_dir, "project_dir")\n profiles_dir = check.opt_str_param(\n profiles_dir, "profiles_dir", os.path.join(project_dir, "config")\n )\n target_dir = check.opt_str_param(target_dir, "target_dir", os.path.join(project_dir, "target"))\n select = check.opt_str_param(select, "select", "fqn:*")\n exclude = check.opt_str_param(exclude, "exclude", "")\n\n _raise_warnings_for_deprecated_args(\n "load_assets_from_dbt_manifest",\n selected_unique_ids=None,\n dbt_resource_key=dbt_resource_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )\n\n manifest, cli_output = _load_manifest_for_project(\n project_dir, profiles_dir, target_dir, select, exclude\n )\n selected_unique_ids: Set[str] = set(\n filter(None, (line.get("unique_id") for line in cli_output.logs))\n )\n return _load_assets_from_dbt_manifest(\n manifest=manifest,\n select=select,\n exclude=exclude,\n key_prefix=key_prefix,\n source_key_prefix=source_key_prefix,\n 
dagster_dbt_translator=dagster_dbt_translator,\n op_name=op_name,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n selected_unique_ids=selected_unique_ids,\n node_info_to_asset_key=node_info_to_asset_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n display_raw_sql=display_raw_sql,\n dbt_resource_key=dbt_resource_key,\n )
\n\n\n
[docs]@deprecated_param(\n param="manifest_json", breaking_version="0.21", additional_warn_text="Use manifest instead"\n)\n@deprecated_param(\n param="selected_unique_ids",\n breaking_version="0.21",\n additional_warn_text="Use the select parameter instead.",\n)\n@deprecated_param(\n param="dbt_resource_key",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize your resource key."\n ),\n)\n@deprecated_param(\n param="use_build_command",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize the underlying dbt commands."\n ),\n)\n@deprecated_param(\n param="partitions_def",\n breaking_version="0.21",\n additional_warn_text="Use the `@dbt_assets` decorator to define partitioned dbt assets.",\n)\n@deprecated_param(\n param="partition_key_to_vars_fn",\n breaking_version="0.21",\n additional_warn_text="Use the `@dbt_assets` decorator to define partitioned dbt assets.",\n)\n@deprecated_param(\n param="runtime_metadata_fn",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize runtime metadata."\n ),\n)\ndef load_assets_from_dbt_manifest(\n manifest: Optional[Union[Path, Mapping[str, Any]]] = None,\n *,\n select: Optional[str] = None,\n exclude: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n # All arguments below are deprecated\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n selected_unique_ids: Optional[AbstractSet[str]] = None,\n display_raw_sql: Optional[bool] = None,\n dbt_resource_key: str = "dbt",\n op_name: Optional[str] = None,\n manifest_json: Optional[Mapping[str, Any]] = None,\n use_build_command: bool = True,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, 
Any]]] = None,\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ] = default_metadata_from_dbt_resource_props,\n) -> Sequence[AssetsDefinition]:\n """Loads a set of dbt models, described in a manifest.json, into Dagster assets.\n\n Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\n `dbt run` command.\n\n When searching for more flexibility in defining the computations that materialize your\n dbt assets, we recommend that you use :py:class:`~dagster_dbt.dbt_assets`.\n\n Args:\n manifest (Optional[Mapping[str, Any]]): The contents of a DBT manifest.json, which contains\n a set of models to load into assets.\n select (Optional[str]): A dbt selection string for the models in a project that you want\n to include. Defaults to `"fqn:*"`.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. 
to asset keys and asset metadata.\n key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all assets loaded\n from the dbt project. Does not apply to input assets. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=...) instead.\n source_key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all input\n assets for the set of assets loaded from the dbt project. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=...) instead.\n op_name (Optional[str]): [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\n Deprecated: use the `@dbt_assets` decorator if you need to customize the op name.\n dbt_resource_key (Optional[str]): [Deprecated] The resource key that the dbt resource will be specified at.\n Defaults to "dbt". Deprecated: use the `@dbt_assets` decorator if you need to customize\n the resource key.\n runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]): [Deprecated]\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n Deprecated: use the @dbt_assets decorator if you need to customize runtime metadata.\n selected_unique_ids (Optional[Set[str]]): [Deprecated] The set of dbt unique_ids that you want to load\n as assets. Deprecated: use the select argument instead.\n node_info_to_asset_key (Mapping[str, Any] -> AssetKey): [Deprecated] A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model.\n use_build_command (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset. Defaults to True. 
If set to False, then `dbt run` will be used, and\n seeds and snapshots won't be loaded as assets.\n partitions_def (Optional[PartitionsDefinition]): [Deprecated] Defines the set of partition keys that\n compose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\n dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): [Deprecated] A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. {"run_date": "2022-01-01"}). Deprecated: use the @dbt_assets decorator\n to define partitioned dbt assets.\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): [Deprecated] A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n Deprecated: instead, configure dagster groups on a dbt resource's meta field or assign\n dbt groups.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): [Deprecated] A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`. Deprecated:\n instead, configure auto-materialize policies on a dbt resource's meta field.\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`. 
Deprecated: instead, configure auto-materialize\n policies on a dbt resource's meta field.\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n Deprecated: instead, provide a custom DagsterDbtTranslator that overrides\n node_info_to_metadata.\n display_raw_sql (Optional[bool]): [Deprecated] A flag to indicate if the raw sql associated\n with each model should be included in the asset description. For large projects, setting\n this flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\n instead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.\n """\n manifest = normalize_renamed_param(\n manifest,\n "manifest",\n manifest_json,\n "manifest_json",\n )\n manifest = cast(\n Union[Mapping[str, Any], Path], check.inst_param(manifest, "manifest", (Path, dict))\n )\n if isinstance(manifest, Path):\n manifest = cast(Mapping[str, Any], json.loads(manifest.read_bytes()))\n\n _raise_warnings_for_deprecated_args(\n "load_assets_from_dbt_manifest",\n selected_unique_ids=selected_unique_ids,\n dbt_resource_key=dbt_resource_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )\n\n return _load_assets_from_dbt_manifest(\n manifest=manifest,\n select=select,\n 
exclude=exclude,\n io_manager_key=io_manager_key,\n dagster_dbt_translator=dagster_dbt_translator,\n key_prefix=key_prefix,\n source_key_prefix=source_key_prefix,\n selected_unique_ids=selected_unique_ids,\n display_raw_sql=display_raw_sql,\n dbt_resource_key=dbt_resource_key,\n op_name=op_name,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )
\n\n\ndef _load_assets_from_dbt_manifest(\n manifest: Mapping[str, Any],\n select: Optional[str],\n exclude: Optional[str],\n io_manager_key: Optional[str],\n dagster_dbt_translator: Optional[DagsterDbtTranslator],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix],\n selected_unique_ids: Optional[AbstractSet[str]],\n display_raw_sql: Optional[bool],\n dbt_resource_key: str,\n op_name: Optional[str],\n use_build_command: bool,\n partitions_def: Optional[PartitionsDefinition],\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ],\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ],\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ],\n) -> Sequence[AssetsDefinition]:\n if partition_key_to_vars_fn:\n check.invariant(\n partitions_def is not None,\n "Cannot supply a `partition_key_to_vars_fn` without a `partitions_def`.",\n )\n\n dbt_resource_key = check.str_param(dbt_resource_key, "dbt_resource_key")\n\n dbt_nodes = {\n **manifest["nodes"],\n **manifest["sources"],\n **manifest["metrics"],\n **manifest["exposures"],\n }\n\n if selected_unique_ids:\n select = (\n " ".join(".".join(dbt_nodes[uid]["fqn"]) for uid in selected_unique_ids)\n if select is None\n else select\n )\n exclude = "" if exclude is None else exclude\n else:\n select = select if select is not None else "fqn:*"\n exclude = exclude if exclude is not None else ""\n\n selected_unique_ids = select_unique_ids_from_manifest(\n select=select, exclude=exclude, manifest_json=manifest\n 
)\n if len(selected_unique_ids) == 0:\n raise DagsterInvalidSubsetError(f"No dbt models match the selection string '{select}'.")\n\n if dagster_dbt_translator is not None:\n check.invariant(\n node_info_to_asset_key == default_asset_key_fn,\n "Can't specify both dagster_dbt_translator and node_info_to_asset_key",\n )\n check.invariant(\n key_prefix is None,\n "Can't specify both dagster_dbt_translator and key_prefix",\n )\n check.invariant(\n source_key_prefix is None,\n "Can't specify both dagster_dbt_translator and source_key_prefix",\n )\n check.invariant(\n node_info_to_group_fn == default_group_from_dbt_resource_props,\n "Can't specify both dagster_dbt_translator and node_info_to_group_fn",\n )\n check.invariant(\n display_raw_sql is None,\n "Can't specify both dagster_dbt_translator and display_raw_sql",\n )\n check.invariant(\n node_info_to_definition_metadata_fn is default_metadata_from_dbt_resource_props,\n "Can't specify both dagster_dbt_translator and node_info_to_definition_metadata_fn",\n )\n else:\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props):\n base_key = node_info_to_asset_key(dbt_resource_props)\n if dbt_resource_props["resource_type"] == "source":\n return base_key.with_prefix(source_key_prefix or [])\n else:\n return base_key.with_prefix(key_prefix or [])\n\n @classmethod\n def get_metadata(cls, dbt_resource_props):\n return node_info_to_definition_metadata_fn(dbt_resource_props)\n\n @classmethod\n def get_description(cls, dbt_resource_props):\n return default_description_fn(\n dbt_resource_props,\n display_raw_sql=display_raw_sql if display_raw_sql is not None else True,\n )\n\n @classmethod\n def get_group_name(cls, dbt_resource_props):\n return node_info_to_group_fn(dbt_resource_props)\n\n @classmethod\n def get_freshness_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[FreshnessPolicy]:\n return 
node_info_to_freshness_policy_fn(dbt_resource_props)\n\n @classmethod\n def get_auto_materialize_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[AutoMaterializePolicy]:\n return node_info_to_auto_materialize_policy_fn(dbt_resource_props)\n\n dagster_dbt_translator = CustomDagsterDbtTranslator()\n\n dbt_assets_def = _dbt_nodes_to_assets(\n dbt_nodes,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n select=select,\n exclude=exclude,\n selected_unique_ids=selected_unique_ids,\n dbt_resource_key=dbt_resource_key,\n op_name=op_name,\n project_id=manifest["metadata"]["project_id"][:5],\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n dagster_dbt_translator=dagster_dbt_translator,\n manifest_json=manifest,\n )\n\n return [dbt_assets_def]\n\n\ndef _raise_warnings_for_deprecated_args(\n public_fn_name: str,\n selected_unique_ids: Optional[AbstractSet[str]],\n dbt_resource_key: Optional[str],\n use_build_command: Optional[bool],\n partitions_def: Optional[PartitionsDefinition],\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ],\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ],\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ],\n):\n if node_info_to_asset_key != default_asset_key_fn:\n deprecation_warning(\n f"The node_info_to_asset_key_fn arg of {public_fn_name}",\n "0.21",\n "Instead, provide a custom DagsterDbtTranslator that overrides get_asset_key.",\n stacklevel=4,\n 
)\n\n if node_info_to_group_fn != default_group_from_dbt_resource_props:\n deprecation_warning(\n f"The node_info_to_group_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure dagster groups on a dbt resource's meta field or assign dbt"\n " groups or provide a custom DagsterDbtTranslator that overrides get_group_name.",\n stacklevel=4,\n )\n\n if node_info_to_auto_materialize_policy_fn != default_auto_materialize_policy_fn:\n deprecation_warning(\n f"The node_info_to_auto_materialize_policy_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure Dagster auto-materialize policies on a dbt resource's meta field.",\n stacklevel=4,\n )\n\n if node_info_to_freshness_policy_fn != default_freshness_policy_fn:\n deprecation_warning(\n f"The node_info_to_freshness_policy_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure Dagster freshness policies on a dbt resource's meta field.",\n stacklevel=4,\n )\n\n if node_info_to_definition_metadata_fn != default_metadata_from_dbt_resource_props:\n deprecation_warning(\n f"The node_info_to_definition_metadata_fn arg of {public_fn_name}",\n "0.21",\n "Instead, provide a custom DagsterDbtTranslator that overrides get_metadata.",\n stacklevel=4,\n )\n
", "current_page_name": "_modules/dagster_dbt/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_defs"}, "asset_utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_utils

\nimport hashlib\nimport textwrap\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    cast,\n)\n\nfrom dagster import (\n    AssetCheckSpec,\n    AssetKey,\n    AssetsDefinition,\n    AssetSelection,\n    AutoMaterializePolicy,\n    DagsterInvariantViolationError,\n    FreshnessPolicy,\n    In,\n    MetadataValue,\n    Nothing,\n    Out,\n    RunConfig,\n    ScheduleDefinition,\n    TableColumn,\n    TableSchema,\n    _check as check,\n    define_asset_job,\n)\nfrom dagster._core.definitions.decorators.asset_decorator import (\n    _validate_and_assign_output_names_to_check_specs,\n)\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import deprecation_warning\n\nfrom .utils import input_name_fn, output_name_fn\n\nif TYPE_CHECKING:\n    from .dagster_dbt_translator import (\n        DagsterDbtTranslator,\n        DagsterDbtTranslatorSettings,\n        DbtManifestWrapper,\n    )\n\nMANIFEST_METADATA_KEY = "dagster_dbt/manifest"\nDAGSTER_DBT_TRANSLATOR_METADATA_KEY = "dagster_dbt/dagster_dbt_translator"\n\n\n
[docs]def get_asset_key_for_model(dbt_assets: Sequence[AssetsDefinition], model_name: str) -> AssetKey:\n """Return the corresponding Dagster asset key for a dbt model.\n\n Args:\n dbt_assets (AssetsDefinition): An AssetsDefinition object produced by\n load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets.\n model_name (str): The name of the dbt model.\n\n Returns:\n AssetKey: The corresponding Dagster asset key.\n\n Examples:\n .. code-block:: python\n\n from dagster import asset\n from dagster_dbt import dbt_assets, get_asset_key_for_model\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n\n @asset(deps={get_asset_key_for_model([all_dbt_assets], "customers")})\n def cleaned_customers():\n ...\n """\n check.sequence_param(dbt_assets, "dbt_assets", of_type=AssetsDefinition)\n check.str_param(model_name, "model_name")\n\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n\n matching_models = [\n value\n for value in manifest["nodes"].values()\n if value["name"] == model_name and value["resource_type"] == "model"\n ]\n\n if len(matching_models) == 0:\n raise KeyError(f"Could not find a dbt model with name: {model_name}")\n\n return dagster_dbt_translator.get_asset_key(next(iter(matching_models)))
\n\n\n
[docs]def get_asset_keys_by_output_name_for_source(\n dbt_assets: Sequence[AssetsDefinition], source_name: str\n) -> Mapping[str, AssetKey]:\n """Returns the corresponding Dagster asset keys for all tables in a dbt source.\n\n This is a convenience method that makes it easy to define a multi-asset that generates\n all the tables for a given dbt source.\n\n Args:\n source_name (str): The name of the dbt source.\n\n Returns:\n Mapping[str, AssetKey]: A mapping of the table name to corresponding Dagster asset key\n for all tables in the given dbt source.\n\n Examples:\n .. code-block:: python\n\n from dagster import AssetOut, multi_asset\n from dagster_dbt import dbt_assets, get_asset_keys_by_output_name_for_source\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n @multi_asset(\n outs={\n name: AssetOut(key=asset_key)\n for name, asset_key in get_asset_keys_by_output_name_for_source(\n [all_dbt_assets], "raw_data"\n ).items()\n },\n )\n def upstream_python_asset():\n ...\n\n """\n check.sequence_param(dbt_assets, "dbt_assets", of_type=AssetsDefinition)\n check.str_param(source_name, "source_name")\n\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n\n matching_nodes = [\n value for value in manifest["sources"].values() if value["source_name"] == source_name\n ]\n\n if len(matching_nodes) == 0:\n raise KeyError(f"Could not find a dbt source with name: {source_name}")\n\n return {\n output_name_fn(value): dagster_dbt_translator.get_asset_key(value)\n for value in matching_nodes\n }
\n\n\n
[docs]def get_asset_key_for_source(dbt_assets: Sequence[AssetsDefinition], source_name: str) -> AssetKey:\n """Returns the corresponding Dagster asset key for a dbt source with a singular table.\n\n Args:\n source_name (str): The name of the dbt source.\n\n Raises:\n DagsterInvalidInvocationError: If the source has more than one table.\n\n Returns:\n AssetKey: The corresponding Dagster asset key.\n\n Examples:\n .. code-block:: python\n\n from dagster import asset\n from dagster_dbt import dbt_assets, get_asset_key_for_source\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n @asset(key=get_asset_key_for_source([all_dbt_assets], "my_source"))\n def upstream_python_asset():\n ...\n """\n asset_keys_by_output_name = get_asset_keys_by_output_name_for_source(dbt_assets, source_name)\n\n if len(asset_keys_by_output_name) > 1:\n raise KeyError(\n f"Source {source_name} has more than one table:"\n f" {asset_keys_by_output_name.values()}. Use"\n " `get_asset_keys_by_output_name_for_source` instead to get all tables for a"\n " source."\n )\n\n return next(iter(asset_keys_by_output_name.values()))
\n\n\n
[docs]def build_dbt_asset_selection(\n dbt_assets: Sequence[AssetsDefinition],\n dbt_select: str = "fqn:*",\n dbt_exclude: Optional[str] = None,\n) -> AssetSelection:\n """Build an asset selection for a dbt selection string.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\n more information.\n\n Args:\n dbt_select (str): A dbt selection string to specify a set of dbt resources.\n dbt_exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n\n Returns:\n AssetSelection: An asset selection for the selected dbt nodes.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import dbt_assets, build_dbt_asset_selection\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n # Select the dbt assets that have the tag "foo".\n foo_selection = build_dbt_asset_selection([dbt_assets], dbt_select="tag:foo")\n\n # Select the dbt assets that have the tag "foo" and all Dagster assets downstream\n # of them (dbt-related or otherwise)\n foo_and_downstream_selection = foo_selection.downstream()\n\n """\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n from .dbt_manifest_asset_selection import DbtManifestAssetSelection\n\n return DbtManifestAssetSelection(\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n select=dbt_select,\n exclude=dbt_exclude,\n )
\n\n\n
[docs]def build_schedule_from_dbt_selection(\n dbt_assets: Sequence[AssetsDefinition],\n job_name: str,\n cron_schedule: str,\n dbt_select: str = "fqn:*",\n dbt_exclude: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n config: Optional[RunConfig] = None,\n execution_timezone: Optional[str] = None,\n) -> ScheduleDefinition:\n """Build a schedule to materialize a specified set of dbt resources from a dbt selection string.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\n more information.\n\n Args:\n job_name (str): The name of the job to materialize the dbt resources.\n cron_schedule (str): The cron schedule to define the schedule.\n dbt_select (str): A dbt selection string to specify a set of dbt resources.\n dbt_exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n tags (Optional[Mapping[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n config (Optional[RunConfig]): The config that parameterizes the execution of this schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n\n Returns:\n ScheduleDefinition: A definition to materialize the selected dbt resources on a cron schedule.\n\n Examples:\n .. 
code-block:: python\n\n from dagster_dbt import dbt_assets, build_schedule_from_dbt_selection\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n daily_dbt_assets_schedule = build_schedule_from_dbt_selection(\n [all_dbt_assets],\n job_name="all_dbt_assets",\n cron_schedule="0 0 * * *",\n dbt_select="fqn:*",\n )\n """\n return ScheduleDefinition(\n cron_schedule=cron_schedule,\n job=define_asset_job(\n name=job_name,\n selection=build_dbt_asset_selection(\n dbt_assets,\n dbt_select=dbt_select,\n dbt_exclude=dbt_exclude,\n ),\n config=config,\n tags=tags,\n ),\n execution_timezone=execution_timezone,\n )
\n\n\ndef get_manifest_and_translator_from_dbt_assets(\n dbt_assets: Sequence[AssetsDefinition],\n) -> Tuple[Mapping[str, Any], "DagsterDbtTranslator"]:\n check.invariant(len(dbt_assets) == 1, "Exactly one dbt AssetsDefinition is required")\n dbt_assets_def = dbt_assets[0]\n metadata_by_key = dbt_assets_def.metadata_by_key or {}\n first_asset_key = next(iter(dbt_assets_def.metadata_by_key.keys()))\n first_metadata = metadata_by_key.get(first_asset_key, {})\n manifest_wrapper: Optional["DbtManifestWrapper"] = first_metadata.get(MANIFEST_METADATA_KEY)\n if manifest_wrapper is None:\n raise DagsterInvariantViolationError(\n f"Expected to find dbt manifest metadata on asset {first_asset_key.to_user_string()},"\n " but did not. Did you pass in assets that weren't generated by"\n " load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets?"\n )\n\n dagster_dbt_translator = first_metadata.get(DAGSTER_DBT_TRANSLATOR_METADATA_KEY)\n if dagster_dbt_translator is None:\n raise DagsterInvariantViolationError(\n f"Expected to find dbt translator metadata on asset {first_asset_key.to_user_string()},"\n " but did not. 
Did you pass in assets that weren't generated by"\n " load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets?"\n )\n\n return manifest_wrapper.manifest, dagster_dbt_translator\n\n\n###################\n# DEFAULT FUNCTIONS\n###################\n\n\ndef default_asset_key_fn(dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n """Get the asset key for a dbt node.\n\n By default, if the dbt node has a Dagster asset key configured in its metadata, then that is\n parsed and used.\n\n Otherwise:\n dbt sources: a dbt source's key is the union of its source name and its table name\n dbt models: a dbt model's key is the union of its model name and any schema configured on\n the model itself.\n """\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n asset_key_config = dagster_metadata.get("asset_key", [])\n if asset_key_config:\n return AssetKey(asset_key_config)\n\n if dbt_resource_props["resource_type"] == "source":\n components = [dbt_resource_props["source_name"], dbt_resource_props["name"]]\n else:\n configured_schema = dbt_resource_props["config"].get("schema")\n if configured_schema is not None:\n components = [configured_schema, dbt_resource_props["name"]]\n else:\n components = [dbt_resource_props["name"]]\n\n return AssetKey(components)\n\n\n
[docs]def default_metadata_from_dbt_resource_props(\n dbt_resource_props: Mapping[str, Any]\n) -> Mapping[str, Any]:\n metadata: Dict[str, Any] = {}\n columns = dbt_resource_props.get("columns", {})\n if len(columns) > 0:\n metadata["table_schema"] = MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(\n name=column_name,\n type=column_info.get("data_type") or "?",\n description=column_info.get("description"),\n )\n for column_name, column_info in columns.items()\n ]\n )\n )\n return metadata
\n\n\n
[docs]def default_group_from_dbt_resource_props(dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n """Get the group name for a dbt node.\n\n If a Dagster group is configured in the metadata for the node, use that.\n\n Otherwise, if a dbt group is configured for the node, use that.\n """\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n\n dagster_group = dagster_metadata.get("group")\n if dagster_group:\n return dagster_group\n\n dbt_group = dbt_resource_props.get("config", {}).get("group")\n if dbt_group:\n return dbt_group\n\n return None
\n\n\n
[docs]def group_from_dbt_resource_props_fallback_to_directory(\n dbt_resource_props: Mapping[str, Any]\n) -> Optional[str]:\n """Get the group name for a dbt node.\n\n Has the same behavior as the default_group_from_dbt_resource_props, except for that, if no group can be determined\n from config or metadata, falls back to using the subdirectory of the models directory that the\n source file is in.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import group_from_dbt_resource_props_fallback_to_directory\n\n dbt_assets = load_assets_from_dbt_manifest(\n manifest=manifest,\n node_info_to_group_fn=group_from_dbt_resource_props_fallback_to_directory,\n )\n """\n group_name = default_group_from_dbt_resource_props(dbt_resource_props)\n if group_name is not None:\n return group_name\n\n fqn = dbt_resource_props.get("fqn", [])\n # the first component is the package name, and the last component is the model name\n if len(fqn) < 3:\n return None\n return fqn[1]
\n\n\ndef default_freshness_policy_fn(dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n freshness_policy_config = dagster_metadata.get("freshness_policy", {})\n\n freshness_policy = _legacy_freshness_policy_fn(freshness_policy_config)\n if freshness_policy:\n return freshness_policy\n\n legacy_freshness_policy_config = dbt_resource_props["config"].get(\n "dagster_freshness_policy", {}\n )\n legacy_freshness_policy = _legacy_freshness_policy_fn(legacy_freshness_policy_config)\n\n if legacy_freshness_policy:\n deprecation_warning(\n "dagster_freshness_policy",\n "0.21.0",\n "Instead, configure a Dagster freshness policy on a dbt model using"\n " +meta.dagster.freshness_policy.",\n )\n\n return legacy_freshness_policy\n\n\ndef _legacy_freshness_policy_fn(\n freshness_policy_config: Mapping[str, Any]\n) -> Optional[FreshnessPolicy]:\n if freshness_policy_config:\n return FreshnessPolicy(\n maximum_lag_minutes=float(freshness_policy_config["maximum_lag_minutes"]),\n cron_schedule=freshness_policy_config.get("cron_schedule"),\n cron_schedule_timezone=freshness_policy_config.get("cron_schedule_timezone"),\n )\n return None\n\n\ndef default_auto_materialize_policy_fn(\n dbt_resource_props: Mapping[str, Any]\n) -> Optional[AutoMaterializePolicy]:\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n auto_materialize_policy_config = dagster_metadata.get("auto_materialize_policy", {})\n\n auto_materialize_policy = _auto_materialize_policy_fn(auto_materialize_policy_config)\n if auto_materialize_policy:\n return auto_materialize_policy\n\n legacy_auto_materialize_policy_config = dbt_resource_props["config"].get(\n "dagster_auto_materialize_policy", {}\n )\n legacy_auto_materialize_policy = _auto_materialize_policy_fn(\n legacy_auto_materialize_policy_config\n )\n\n if legacy_auto_materialize_policy:\n deprecation_warning(\n "dagster_auto_materialize_policy",\n 
"0.21.0",\n "Instead, configure a Dagster auto-materialize policy on a dbt model using"\n " +meta.dagster.auto_materialize_policy.",\n )\n\n return legacy_auto_materialize_policy\n\n\ndef _auto_materialize_policy_fn(\n auto_materialize_policy_config: Mapping[str, Any]\n) -> Optional[AutoMaterializePolicy]:\n if auto_materialize_policy_config.get("type") == "eager":\n return AutoMaterializePolicy.eager()\n elif auto_materialize_policy_config.get("type") == "lazy":\n return AutoMaterializePolicy.lazy()\n return None\n\n\ndef default_description_fn(dbt_resource_props: Mapping[str, Any], display_raw_sql: bool = True):\n code_block = textwrap.indent(\n dbt_resource_props.get("raw_sql") or dbt_resource_props.get("raw_code", ""), " "\n )\n description_sections = [\n dbt_resource_props["description"]\n or f"dbt {dbt_resource_props['resource_type']} {dbt_resource_props['name']}",\n ]\n if display_raw_sql:\n description_sections.append(f"#### Raw SQL:\\n```\\n{code_block}\\n```")\n return "\\n\\n".join(filter(None, description_sections))\n\n\ndef is_generic_test_on_attached_node_from_dbt_resource_props(\n unique_id: str, dbt_resource_props: Mapping[str, Any]\n) -> bool:\n attached_node_unique_id = dbt_resource_props.get("attached_node")\n is_generic_test = bool(attached_node_unique_id)\n\n return is_generic_test and attached_node_unique_id == unique_id\n\n\ndef default_asset_check_fn(\n asset_key: AssetKey,\n unique_id: str,\n dagster_dbt_translator_settings: "DagsterDbtTranslatorSettings",\n dbt_resource_props: Mapping[str, Any],\n) -> Optional[AssetCheckSpec]:\n is_generic_test_on_attached_node = is_generic_test_on_attached_node_from_dbt_resource_props(\n unique_id, dbt_resource_props\n )\n\n if not all(\n [\n dagster_dbt_translator_settings.enable_asset_checks,\n is_generic_test_on_attached_node,\n ]\n ):\n return None\n\n return AssetCheckSpec(\n name=dbt_resource_props["name"],\n asset=asset_key,\n description=dbt_resource_props["description"],\n )\n\n\ndef 
default_code_version_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n return hashlib.sha1(\n (dbt_resource_props.get("raw_sql") or dbt_resource_props.get("raw_code", "")).encode(\n "utf-8"\n )\n ).hexdigest()\n\n\n###################\n# DEPENDENCIES\n###################\n\n\ndef is_non_asset_node(dbt_resource_props: Mapping[str, Any]):\n # some nodes exist inside the dbt graph but are not assets\n resource_type = dbt_resource_props["resource_type"]\n if resource_type == "metric":\n return True\n if (\n resource_type == "model"\n and dbt_resource_props.get("config", {}).get("materialized") == "ephemeral"\n ):\n return True\n return False\n\n\ndef get_deps(\n dbt_nodes: Mapping[str, Any],\n selected_unique_ids: AbstractSet[str],\n asset_resource_types: List[str],\n) -> Mapping[str, FrozenSet[str]]:\n def _valid_parent_node(dbt_resource_props):\n # sources are valid parents, but not assets\n return dbt_resource_props["resource_type"] in asset_resource_types + ["source"]\n\n asset_deps: Dict[str, Set[str]] = {}\n for unique_id in selected_unique_ids:\n dbt_resource_props = dbt_nodes[unique_id]\n node_resource_type = dbt_resource_props["resource_type"]\n\n # skip non-assets, such as metrics, tests, and ephemeral models\n if is_non_asset_node(dbt_resource_props) or node_resource_type not in asset_resource_types:\n continue\n\n asset_deps[unique_id] = set()\n for parent_unique_id in dbt_resource_props.get("depends_on", {}).get("nodes", []):\n parent_node_info = dbt_nodes[parent_unique_id]\n # for metrics or ephemeral dbt models, BFS to find valid parents\n if is_non_asset_node(parent_node_info):\n visited = set()\n replaced_parent_ids = set()\n # make a copy to avoid mutating the actual dictionary\n queue = list(parent_node_info.get("depends_on", {}).get("nodes", []))\n while queue:\n candidate_parent_id = queue.pop()\n if candidate_parent_id in visited:\n continue\n visited.add(candidate_parent_id)\n\n candidate_parent_info = dbt_nodes[candidate_parent_id]\n if 
is_non_asset_node(candidate_parent_info):\n queue.extend(candidate_parent_info.get("depends_on", {}).get("nodes", []))\n elif _valid_parent_node(candidate_parent_info):\n replaced_parent_ids.add(candidate_parent_id)\n\n asset_deps[unique_id] |= replaced_parent_ids\n # ignore nodes which are not assets / sources\n elif _valid_parent_node(parent_node_info):\n asset_deps[unique_id].add(parent_unique_id)\n\n frozen_asset_deps = {\n unique_id: frozenset(parent_ids) for unique_id, parent_ids in asset_deps.items()\n }\n\n return frozen_asset_deps\n\n\ndef get_asset_deps(\n dbt_nodes,\n deps,\n io_manager_key,\n manifest: Optional[Mapping[str, Any]],\n dagster_dbt_translator: "DagsterDbtTranslator",\n) -> Tuple[\n Dict[AssetKey, Set[AssetKey]],\n Dict[AssetKey, Tuple[str, In]],\n Dict[AssetKey, Tuple[str, Out]],\n Dict[AssetKey, str],\n Dict[AssetKey, FreshnessPolicy],\n Dict[AssetKey, AutoMaterializePolicy],\n Dict[str, AssetCheckSpec],\n Dict[str, List[str]],\n Dict[str, Dict[str, Any]],\n]:\n from .dagster_dbt_translator import DbtManifestWrapper\n\n asset_deps: Dict[AssetKey, Set[AssetKey]] = {}\n asset_ins: Dict[AssetKey, Tuple[str, In]] = {}\n asset_outs: Dict[AssetKey, Tuple[str, Out]] = {}\n\n # These dicts could be refactored as a single dict, mapping from output name to arbitrary\n # metadata that we need to store for reference.\n group_names_by_key: Dict[AssetKey, str] = {}\n freshness_policies_by_key: Dict[AssetKey, FreshnessPolicy] = {}\n auto_materialize_policies_by_key: Dict[AssetKey, AutoMaterializePolicy] = {}\n check_specs: List[AssetCheckSpec] = []\n fqns_by_output_name: Dict[str, List[str]] = {}\n metadata_by_output_name: Dict[str, Dict[str, Any]] = {}\n\n for unique_id, parent_unique_ids in deps.items():\n dbt_resource_props = dbt_nodes[unique_id]\n\n output_name = output_name_fn(dbt_resource_props)\n fqns_by_output_name[output_name] = dbt_resource_props["fqn"]\n\n metadata_by_output_name[output_name] = {\n key: dbt_resource_props[key] for key in 
["unique_id", "resource_type"]\n }\n\n asset_key = dagster_dbt_translator.get_asset_key(dbt_resource_props)\n\n asset_deps[asset_key] = set()\n\n metadata = merge_dicts(\n dagster_dbt_translator.get_metadata(dbt_resource_props),\n {\n MANIFEST_METADATA_KEY: DbtManifestWrapper(manifest=manifest) if manifest else None,\n DAGSTER_DBT_TRANSLATOR_METADATA_KEY: dagster_dbt_translator,\n },\n )\n asset_outs[asset_key] = (\n output_name,\n Out(\n io_manager_key=io_manager_key,\n description=dagster_dbt_translator.get_description(dbt_resource_props),\n metadata=metadata,\n is_required=False,\n dagster_type=Nothing,\n code_version=default_code_version_fn(dbt_resource_props),\n ),\n )\n\n group_name = dagster_dbt_translator.get_group_name(dbt_resource_props)\n if group_name is not None:\n group_names_by_key[asset_key] = group_name\n\n freshness_policy = dagster_dbt_translator.get_freshness_policy(dbt_resource_props)\n if freshness_policy is not None:\n freshness_policies_by_key[asset_key] = freshness_policy\n\n auto_materialize_policy = dagster_dbt_translator.get_auto_materialize_policy(\n dbt_resource_props\n )\n if auto_materialize_policy is not None:\n auto_materialize_policies_by_key[asset_key] = auto_materialize_policy\n\n test_unique_ids = []\n if manifest:\n test_unique_ids = [\n child_unique_id\n for child_unique_id in manifest["child_map"][unique_id]\n if child_unique_id.startswith("test")\n ]\n\n for test_unique_id in test_unique_ids:\n test_resource_props = manifest["nodes"][test_unique_id]\n check_spec = default_asset_check_fn(\n asset_key, unique_id, dagster_dbt_translator.settings, test_resource_props\n )\n\n if check_spec:\n check_specs.append(check_spec)\n\n for parent_unique_id in parent_unique_ids:\n parent_node_info = dbt_nodes[parent_unique_id]\n parent_asset_key = dagster_dbt_translator.get_asset_key(parent_node_info)\n\n asset_deps[asset_key].add(parent_asset_key)\n\n # if this parent is not one of the selected nodes, it's an input\n if parent_unique_id 
not in deps:\n input_name = input_name_fn(parent_node_info)\n asset_ins[parent_asset_key] = (input_name, In(Nothing))\n\n check_specs_by_output_name = cast(\n Dict[str, AssetCheckSpec],\n _validate_and_assign_output_names_to_check_specs(check_specs, list(asset_outs.keys())),\n )\n\n return (\n asset_deps,\n asset_ins,\n asset_outs,\n group_names_by_key,\n freshness_policies_by_key,\n auto_materialize_policies_by_key,\n check_specs_by_output_name,\n fqns_by_output_name,\n metadata_by_output_name,\n )\n
", "current_page_name": "_modules/dagster_dbt/asset_utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_utils"}, "cloud": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.asset_defs

\nimport json\nimport shlex\nfrom argparse import Namespace\nfrom contextlib import suppress\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    AssetExecutionContext,\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    MetadataValue,\n    PartitionsDefinition,\n    ResourceDefinition,\n    multi_asset,\n    with_resources,\n)\nfrom dagster._annotations import experimental, experimental_param\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.execution.context.init import build_init_resource_context\n\nfrom dagster_dbt.asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    get_asset_deps,\n    get_deps,\n)\nfrom dagster_dbt.dagster_dbt_translator import DagsterDbtTranslator\n\nfrom ..errors import DagsterDbtCloudJobInvariantViolationError\nfrom ..utils import ASSET_RESOURCE_TYPES, result_to_events\nfrom .resources import DbtCloudClient, DbtCloudClientResource, DbtCloudRunStatus\n\nDAGSTER_DBT_COMPILE_RUN_ID_ENV_VAR = "DBT_DAGSTER_COMPILE_RUN_ID"\n\n\nclass DbtCloudCacheableAssetsDefinition(CacheableAssetsDefinition):\n    def __init__(\n        self,\n        dbt_cloud_resource_def: Union[DbtCloudClientResource, ResourceDefinition],\n        job_id: int,\n        node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n        node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n        node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n        
node_info_to_auto_materialize_policy_fn: Callable[\n            [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n        ],\n        partitions_def: Optional[PartitionsDefinition] = None,\n        partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n    ):\n        self._dbt_cloud_resource_def: ResourceDefinition = (\n            dbt_cloud_resource_def.get_resource_definition()\n            if isinstance(dbt_cloud_resource_def, DbtCloudClientResource)\n            else dbt_cloud_resource_def\n        )\n\n        self._dbt_cloud: DbtCloudClient = (\n            dbt_cloud_resource_def.process_config_and_initialize().get_dbt_client()\n            if isinstance(dbt_cloud_resource_def, DbtCloudClientResource)\n            else dbt_cloud_resource_def(build_init_resource_context())\n        )\n        self._job_id = job_id\n        self._project_id: int\n        self._has_generate_docs: bool\n        self._job_commands: List[str]\n        self._job_materialization_command_step: int\n        self._node_info_to_asset_key = node_info_to_asset_key\n        self._node_info_to_group_fn = node_info_to_group_fn\n        self._node_info_to_freshness_policy_fn = node_info_to_freshness_policy_fn\n        self._node_info_to_auto_materialize_policy_fn = node_info_to_auto_materialize_policy_fn\n        self._partitions_def = partitions_def\n        self._partition_key_to_vars_fn = partition_key_to_vars_fn\n\n        super().__init__(unique_id=f"dbt-cloud-{job_id}")\n\n    def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:\n        dbt_nodes, dbt_dependencies = self._get_dbt_nodes_and_dependencies()\n        return [self._build_dbt_cloud_assets_cacheable_data(dbt_nodes, dbt_dependencies)]\n\n    def build_definitions(\n        self, data: Sequence[AssetsDefinitionCacheableData]\n    ) -> Sequence[AssetsDefinition]:\n        return with_resources(\n            [\n                
self._build_dbt_cloud_assets_from_cacheable_data(assets_definition_metadata)\n                for assets_definition_metadata in data\n            ],\n            {"dbt_cloud": self._dbt_cloud_resource_def},\n        )\n\n    @staticmethod\n    def parse_dbt_command(dbt_command: str) -> Namespace:\n        args = shlex.split(dbt_command)[1:]\n        try:\n            from dbt.cli.flags import (\n                Flags,\n                args_to_context,\n            )\n\n            # nasty hack to get dbt to parse the args\n            # dbt >= 1.5.0 requires that profiles-dir is set to an existing directory\n            return Namespace(**vars(Flags(args_to_context(args + ["--profiles-dir", "."]))))\n        except ImportError:\n            # dbt < 1.5.0 compat\n            from dbt.main import parse_args  # type: ignore\n\n            return parse_args(args=args)\n\n    @staticmethod\n    def get_job_materialization_command_step(execute_steps: List[str]) -> int:\n        materialization_command_filter = [\n            DbtCloudCacheableAssetsDefinition.parse_dbt_command(command).which in ["run", "build"]\n            for command in execute_steps\n        ]\n\n        if sum(materialization_command_filter) != 1:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                "The dbt Cloud job must have a single `dbt run` or `dbt build` in its commands. 
"\n                f"Received commands: {execute_steps}."\n            )\n\n        return materialization_command_filter.index(True)\n\n    @staticmethod\n    def get_compile_filters(parsed_args: Namespace) -> List[str]:\n        dbt_compile_options: List[str] = []\n\n        selected_models = parsed_args.select or []\n        if selected_models:\n            dbt_compile_options.append(f"--select {' '.join(selected_models)}")\n\n        excluded_models = parsed_args.exclude or []\n        if excluded_models:\n            dbt_compile_options.append(f"--exclude {' '.join(excluded_models)}")\n\n        selector = getattr(parsed_args, "selector_name", None) or getattr(\n            parsed_args, "selector", None\n        )\n        if selector:\n            dbt_compile_options.append(f"--selector {selector}")\n\n        return dbt_compile_options\n\n    def _get_cached_compile_dbt_cloud_job_run(self, compile_run_id: int) -> Tuple[int, int]:\n        # If the compile run is ongoing, allow it a grace period of 10 minutes to finish.\n        with suppress(Exception):\n            self._dbt_cloud.poll_run(run_id=compile_run_id, poll_timeout=600)\n\n        compile_run = self._dbt_cloud.get_run(\n            run_id=compile_run_id, include_related=["trigger", "run_steps"]\n        )\n\n        compile_run_status: str = compile_run["status_humanized"]\n        if compile_run_status != DbtCloudRunStatus.SUCCESS:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The cached dbt Cloud job run `{compile_run_id}` must have a status of"\n                f" `{DbtCloudRunStatus.SUCCESS}`. Received status: `{compile_run_status}. You can"\n                f" view the full status of your dbt Cloud run at {compile_run['href']}. Once it has"\n                " successfully completed, reload your Dagster definitions. 
If your run has failed,"\n                " you must manually refresh the cache using the `dagster-dbt"\n                " cache-compile-references` CLI."\n            )\n\n        compile_run_has_generate_docs = compile_run["trigger"]["generate_docs_override"]\n\n        compile_job_materialization_command_step = len(compile_run["run_steps"])\n        if compile_run_has_generate_docs:\n            compile_job_materialization_command_step -= 1\n\n        return compile_run_id, compile_job_materialization_command_step\n\n    def _compile_dbt_cloud_job(self, dbt_cloud_job: Mapping[str, Any]) -> Tuple[int, int]:\n        # Retrieve the filters options from the dbt Cloud job's materialization command.\n        #\n        # There are three filters: `--select`, `--exclude`, and `--selector`.\n        materialization_command = self._job_commands[self._job_materialization_command_step]\n        parsed_args = DbtCloudCacheableAssetsDefinition.parse_dbt_command(materialization_command)\n        dbt_compile_options = DbtCloudCacheableAssetsDefinition.get_compile_filters(\n            parsed_args=parsed_args\n        )\n\n        # Add the partition variable as a variable to the dbt Cloud job command.\n        #\n        # If existing variables passed through the dbt Cloud job's command, an error will be\n        # raised. Since these are static variables anyways, they can be moved to the\n        # `dbt_project.yml` without loss of functionality.\n        #\n        # Since we're only doing this to generate the dependency structure, just use an arbitrary\n        # partition key (e.g. 
the last one) to retrieve the partition variable.\n        if parsed_args.vars and parsed_args.vars != "{}":\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The dbt Cloud job '{dbt_cloud_job['name']}' ({dbt_cloud_job['id']}) must not have"\n                " variables defined from `--vars` in its `dbt run` or `dbt build` command."\n                " Instead, declare the variables in the `dbt_project.yml` file. Received commands:"\n                f" {self._job_commands}."\n            )\n\n        if self._partitions_def and self._partition_key_to_vars_fn:\n            last_partition_key = self._partitions_def.get_last_partition_key()\n            if last_partition_key is None:\n                check.failed("PartitionsDefinition has no partitions")\n            partition_var = self._partition_key_to_vars_fn(last_partition_key)\n\n            dbt_compile_options.append(f"--vars '{json.dumps(partition_var)}'")\n\n        # We need to retrieve the dependency structure for the assets in the dbt Cloud project.\n        # However, we can't just use the dependency structure from the latest run, because\n        # this historical structure may not be up-to-date with the current state of the project.\n        #\n        # By always doing a compile step, we can always get the latest dependency structure.\n        # This incurs some latency, but at least it doesn't run through the entire materialization\n        # process.\n        dbt_compile_command = f"dbt compile {' '.join(dbt_compile_options)}"\n        compile_run_dbt_output = self._dbt_cloud.run_job_and_poll(\n            job_id=self._job_id,\n            cause="Generating software-defined assets for Dagster.",\n            steps_override=[dbt_compile_command],\n        )\n\n        # Target the compile execution step when retrieving run artifacts, rather than assuming\n        # that the last step is the correct target.\n        #\n        # Here, we ignore the `dbt docs generate` 
step.\n        compile_job_materialization_command_step = len(\n            compile_run_dbt_output.run_details.get("run_steps", [])\n        )\n        if self._has_generate_docs:\n            compile_job_materialization_command_step -= 1\n\n        return compile_run_dbt_output.run_id, compile_job_materialization_command_step\n\n    def _get_dbt_nodes_and_dependencies(\n        self,\n    ) -> Tuple[Mapping[str, Any], Mapping[str, FrozenSet[str]]]:\n        """For a given dbt Cloud job, fetch the latest run's dependency structure of executed nodes."""\n        # Fetch information about the job.\n        job = self._dbt_cloud.get_job(job_id=self._job_id)\n        self._project_id = job["project_id"]\n        self._has_generate_docs = job["generate_docs"]\n\n        # We constraint the kinds of dbt Cloud jobs that we support running.\n        #\n        # A simple constraint is that we only support jobs that run multiple steps,\n        # but it must contain one of either `dbt run` or `dbt build`.\n        #\n        # As a reminder, `dbt deps` is automatically run before the job's configured commands.\n        # And if the settings are enabled, `dbt docs generate` and `dbt source freshness` can\n        # automatically run after the job's configured commands.\n        #\n        # These commands that execute before and after the job's configured commands do not count\n        # towards the single command constraint.\n        self._job_commands = job["execute_steps"]\n        self._job_materialization_command_step = (\n            DbtCloudCacheableAssetsDefinition.get_job_materialization_command_step(\n                execute_steps=self._job_commands\n            )\n        )\n\n        # Determine whether to use a cached compile run. 
This should only be set up if the user is\n        # using a GitHub action along with their dbt project.\n        dbt_cloud_job_env_vars = self._dbt_cloud.get_job_environment_variables(\n            project_id=self._project_id, job_id=self._job_id\n        )\n        compile_run_id = (\n            dbt_cloud_job_env_vars.get(DAGSTER_DBT_COMPILE_RUN_ID_ENV_VAR, {})\n            .get("job", {})\n            .get("value")\n        )\n\n        compile_run_id, compile_job_materialization_command_step = (\n            # If a compile run is cached, then use it.\n            self._get_cached_compile_dbt_cloud_job_run(compile_run_id=int(compile_run_id))\n            if compile_run_id\n            # Otherwise, compile the dbt Cloud project in an ad-hoc manner.\n            else self._compile_dbt_cloud_job(dbt_cloud_job=job)\n        )\n\n        manifest_json = self._dbt_cloud.get_manifest(\n            run_id=compile_run_id, step=compile_job_materialization_command_step\n        )\n        run_results_json = self._dbt_cloud.get_run_results(\n            run_id=compile_run_id, step=compile_job_materialization_command_step\n        )\n\n        # Filter the manifest to only include the nodes that were executed.\n        dbt_nodes: Dict[str, Any] = {\n            **manifest_json.get("nodes", {}),\n            **manifest_json.get("sources", {}),\n            **manifest_json.get("metrics", {}),\n        }\n        executed_node_ids: Set[str] = set(\n            result["unique_id"] for result in run_results_json["results"]\n        )\n\n        # If there are no executed nodes, then there are no assets to generate.\n        # Inform the user to inspect their dbt Cloud job's command.\n        if not executed_node_ids:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The dbt Cloud job '{job['name']}' ({job['id']}) does not generate any "\n                "software-defined assets. 
Ensure that your dbt project has nodes to execute, "\n                "and that your dbt Cloud job's materialization command has the proper filter "\n                f"options applied. Received commands: {self._job_commands}."\n            )\n\n        # Generate the dependency structure for the executed nodes.\n        dbt_dependencies = get_deps(\n            dbt_nodes=dbt_nodes,\n            selected_unique_ids=executed_node_ids,\n            asset_resource_types=ASSET_RESOURCE_TYPES,\n        )\n\n        return dbt_nodes, dbt_dependencies\n\n    def _build_dbt_cloud_assets_cacheable_data(\n        self, dbt_nodes: Mapping[str, Any], dbt_dependencies: Mapping[str, FrozenSet[str]]\n    ) -> AssetsDefinitionCacheableData:\n        """Given all of the nodes and dependencies for a dbt Cloud job, build the cacheable\n        representation that generate the asset definition for the job.\n        """\n\n        class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n            @classmethod\n            def get_asset_key(cls, dbt_resource_props):\n                return self._node_info_to_asset_key(dbt_resource_props)\n\n            @classmethod\n            def get_description(cls, dbt_resource_props):\n                # We shouldn't display the raw sql. 
Instead, inspect if dbt docs were generated,\n                # and attach metadata to link to the docs.\n                return default_description_fn(dbt_resource_props, display_raw_sql=False)\n\n            @classmethod\n            def get_group_name(cls, dbt_resource_props):\n                return self._node_info_to_group_fn(dbt_resource_props)\n\n            @classmethod\n            def get_freshness_policy(cls, dbt_resource_props):\n                return self._node_info_to_freshness_policy_fn(dbt_resource_props)\n\n            @classmethod\n            def get_auto_materialize_policy(cls, dbt_resource_props):\n                return self._node_info_to_auto_materialize_policy_fn(dbt_resource_props)\n\n        (\n            asset_deps,\n            asset_ins,\n            asset_outs,\n            group_names_by_key,\n            freshness_policies_by_key,\n            auto_materialize_policies_by_key,\n            _,\n            fqns_by_output_name,\n            metadata_by_output_name,\n        ) = get_asset_deps(\n            dbt_nodes=dbt_nodes,\n            deps=dbt_dependencies,\n            # TODO: In the future, allow the IO manager to be specified.\n            io_manager_key=None,\n            dagster_dbt_translator=CustomDagsterDbtTranslator(),\n            manifest=None,\n        )\n\n        return AssetsDefinitionCacheableData(\n            # TODO: In the future, we should allow additional upstream assets to be specified.\n            keys_by_input_name={\n                input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n            },\n            keys_by_output_name={\n                output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n            },\n            internal_asset_deps={\n                asset_outs[asset_key][0]: asset_deps for asset_key, asset_deps in asset_deps.items()\n            },\n            # We don't rely on a static group name. 
Instead, we map over the dbt metadata to\n            # determine the group name for each asset.\n            group_name=None,\n            metadata_by_output_name={\n                output_name: self._build_dbt_cloud_assets_metadata(dbt_metadata)\n                for output_name, dbt_metadata in metadata_by_output_name.items()\n            },\n            # TODO: In the future, we should allow the key prefix to be specified.\n            key_prefix=None,\n            can_subset=True,\n            extra_metadata={\n                "job_id": self._job_id,\n                "job_commands": self._job_commands,\n                "job_materialization_command_step": self._job_materialization_command_step,\n                "group_names_by_output_name": {\n                    asset_outs[asset_key][0]: group_name\n                    for asset_key, group_name in group_names_by_key.items()\n                },\n                "fqns_by_output_name": fqns_by_output_name,\n            },\n            freshness_policies_by_output_name={\n                asset_outs[asset_key][0]: freshness_policy\n                for asset_key, freshness_policy in freshness_policies_by_key.items()\n            },\n            auto_materialize_policies_by_output_name={\n                asset_outs[asset_key][0]: auto_materialize_policy\n                for asset_key, auto_materialize_policy in auto_materialize_policies_by_key.items()\n            },\n        )\n\n    def _build_dbt_cloud_assets_metadata(self, dbt_metadata: Dict[str, Any]) -> MetadataUserInput:\n        metadata = {\n            "dbt Cloud Job": MetadataValue.url(\n                self._dbt_cloud.build_url_for_job(\n                    project_id=self._project_id,\n                    job_id=self._job_id,\n                )\n            ),\n        }\n\n        if self._has_generate_docs:\n            metadata["dbt Cloud Documentation"] = MetadataValue.url(\n                self._dbt_cloud.build_url_for_cloud_docs(\n                   
 job_id=self._job_id,\n                    resource_type=dbt_metadata["resource_type"],\n                    unique_id=dbt_metadata["unique_id"],\n                )\n            )\n\n        return metadata\n\n    def _build_dbt_cloud_assets_from_cacheable_data(\n        self, assets_definition_cacheable_data: AssetsDefinitionCacheableData\n    ) -> AssetsDefinition:\n        metadata = cast(Mapping[str, Any], assets_definition_cacheable_data.extra_metadata)\n        job_id = cast(int, metadata["job_id"])\n        job_commands = cast(List[str], list(metadata["job_commands"]))\n        job_materialization_command_step = cast(int, metadata["job_materialization_command_step"])\n        group_names_by_output_name = cast(Mapping[str, str], metadata["group_names_by_output_name"])\n        fqns_by_output_name = cast(Mapping[str, List[str]], metadata["fqns_by_output_name"])\n\n        @multi_asset(\n            name=f"dbt_cloud_job_{job_id}",\n            deps=list((assets_definition_cacheable_data.keys_by_input_name or {}).values()),\n            outs={\n                output_name: AssetOut(\n                    key=asset_key,\n                    group_name=group_names_by_output_name.get(output_name),\n                    freshness_policy=(\n                        assets_definition_cacheable_data.freshness_policies_by_output_name or {}\n                    ).get(\n                        output_name,\n                    ),\n                    auto_materialize_policy=(\n                        assets_definition_cacheable_data.auto_materialize_policies_by_output_name\n                        or {}\n                    ).get(\n                        output_name,\n                    ),\n                    metadata=(assets_definition_cacheable_data.metadata_by_output_name or {}).get(\n                        output_name\n                    ),\n                    is_required=False,\n                )\n                for output_name, asset_key in (\n                   
 assets_definition_cacheable_data.keys_by_output_name or {}\n                ).items()\n            },\n            internal_asset_deps={\n                output_name: set(asset_deps)\n                for output_name, asset_deps in (\n                    assets_definition_cacheable_data.internal_asset_deps or {}\n                ).items()\n            },\n            partitions_def=self._partitions_def,\n            can_subset=assets_definition_cacheable_data.can_subset,\n            required_resource_keys={"dbt_cloud"},\n            compute_kind="dbt",\n        )\n        def _assets(context: AssetExecutionContext):\n            dbt_cloud = cast(DbtCloudClient, context.resources.dbt_cloud)\n\n            # Add the partition variable as a variable to the dbt Cloud job command.\n            dbt_options: List[str] = []\n            if context.has_partition_key and self._partition_key_to_vars_fn:\n                partition_var = self._partition_key_to_vars_fn(context.partition_key)\n\n                dbt_options.append(f"--vars '{json.dumps(partition_var)}'")\n\n            # Prepare the materialization step to be overriden with the selection filter\n            materialization_command = job_commands[job_materialization_command_step]\n\n            # Map the selected outputs to dbt models that should be materialized.\n            #\n            # HACK: This selection filter works even if an existing `--select` is specified in the\n            # dbt Cloud job. 
We take advantage of the fact that the last `--select` will be used.\n            #\n            # This is not ideal, as the triggered run for the dbt Cloud job will still have both\n            # `--select` options when displayed in the UI, but parsing the command line argument\n            # to remove the initial select using argparse.\n            if len(context.selected_output_names) != len(\n                assets_definition_cacheable_data.keys_by_output_name or {}\n            ):\n                selected_models = [\n                    ".".join(fqns_by_output_name[output_name])\n                    for output_name in context.selected_output_names\n                ]\n\n                dbt_options.append(f"--select {' '.join(sorted(selected_models))}")\n\n                # If the `--selector` option is used, we need to remove it from the command, since\n                # it disables other selection options from being functional.\n                #\n                # See https://docs.getdbt.com/reference/node-selection/syntax for details.\n                split_materialization_command = shlex.split(materialization_command)\n                if "--selector" in split_materialization_command:\n                    idx = split_materialization_command.index("--selector")\n\n                    materialization_command = " ".join(\n                        split_materialization_command[:idx]\n                        + split_materialization_command[idx + 2 :]\n                    )\n\n            job_commands[job_materialization_command_step] = (\n                f"{materialization_command} {' '.join(dbt_options)}".strip()\n            )\n\n            # Run the dbt Cloud job to rematerialize the assets.\n            dbt_cloud_output = dbt_cloud.run_job_and_poll(\n                job_id=job_id,\n                cause=f"Materializing software-defined assets in Dagster run {context.run_id[:8]}",\n                steps_override=job_commands,\n            )\n\n            # 
Target the materialization step when retrieving run artifacts, rather than assuming\n            # that the last step is the correct target.\n            #\n            # We ignore the commands in front of the materialization command. And again, we ignore\n            # the `dbt docs generate` step.\n            materialization_command_step = len(dbt_cloud_output.run_details.get("run_steps", []))\n            materialization_command_step -= len(job_commands) - job_materialization_command_step - 1\n            if dbt_cloud_output.run_details.get("job", {}).get("generate_docs"):\n                materialization_command_step -= 1\n\n            # TODO: Assume the run completely fails or completely succeeds.\n            # In the future, we can relax this assumption.\n            manifest_json = dbt_cloud.get_manifest(\n                run_id=dbt_cloud_output.run_id, step=materialization_command_step\n            )\n            run_results_json = self._dbt_cloud.get_run_results(\n                run_id=dbt_cloud_output.run_id, step=materialization_command_step\n            )\n\n            for result in run_results_json.get("results", []):\n                yield from result_to_events(\n                    result=result,\n                    docs_url=dbt_cloud_output.docs_url,\n                    node_info_to_asset_key=self._node_info_to_asset_key,\n                    manifest_json=manifest_json,\n                    # TODO: In the future, allow arbitrary mappings to Dagster output metadata from\n                    # the dbt metadata.\n                    extra_metadata=None,\n                    generate_asset_outputs=True,\n                )\n\n        return _assets\n\n\n
[docs]@experimental\n@experimental_param(param="partitions_def")\n@experimental_param(param="partition_key_to_vars_fn")\ndef load_assets_from_dbt_cloud_job(\n dbt_cloud: ResourceDefinition,\n job_id: int,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n) -> CacheableAssetsDefinition:\n """Loads a set of dbt models, managed by a dbt Cloud job, into Dagster assets. In order to\n determine the set of dbt models, the project is compiled to generate the necessary artifacts\n that define the dbt models and their dependencies.\n\n One Dagster asset is created for each dbt model.\n\n Args:\n dbt_cloud (ResourceDefinition): The dbt Cloud resource to use to connect to the dbt Cloud API.\n job_id (int): The ID of the dbt Cloud job to load assets from.\n node_info_to_asset_key: (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\n of dbt metadata and returns the AssetKey that you want to represent a given model or\n source. 
By default: dbt model -> AssetKey([model_name]) and\n dbt source -> AssetKey([source_name, table_name])\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]):\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]):\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. 
{"run_date": "2022-01-01"})\n\n Returns:\n CacheableAssetsDefinition: A definition for the loaded assets.\n\n Examples:\n .. code-block:: python\n\n from dagster import repository\n from dagster_dbt import dbt_cloud_resource, load_assets_from_dbt_cloud_job\n\n DBT_CLOUD_JOB_ID = 1234\n\n dbt_cloud = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_API_TOKEN"},\n "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n }\n )\n\n dbt_cloud_assets = load_assets_from_dbt_cloud_job(\n dbt_cloud=dbt_cloud, job_id=DBT_CLOUD_JOB_ID\n )\n\n\n @repository\n def dbt_cloud_sandbox():\n return [dbt_cloud_assets]\n """\n if partition_key_to_vars_fn:\n check.invariant(\n partitions_def is not None,\n "Cannot supply a `partition_key_to_vars_fn` without a `partitions_def`.",\n )\n\n return DbtCloudCacheableAssetsDefinition(\n dbt_cloud_resource_def=dbt_cloud,\n job_id=job_id,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.asset_defs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.ops

\nfrom typing import List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom ..utils import generate_materializations\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import DbtCloudOutput\n\n\nclass DbtCloudRunOpConfig(Config):\n    job_id: int = Field(\n        description=(\n            "The integer ID of the relevant dbt Cloud job. You can find this value by going to the"\n            " details page of your job in the dbt Cloud UI. It will be the final number in the url,"\n            " e.g.:    "\n            " https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/"\n        )\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL,\n        description="The time (in seconds) that will be waited between successive polls.",\n    )\n    poll_timeout: Optional[float] = Field(\n        default=None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the dbt operation will "\n            "be yielded when the op executes."\n        ),\n    )\n\n    asset_key_prefix: List[str] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n required_resource_keys={"dbt_cloud"},\n ins={"start_after": In(Nothing)},\n out=Out(DbtCloudOutput, description="Parsed output from running the dbt Cloud job."),\n tags={"kind": "dbt_cloud"},\n)\ndef dbt_cloud_run_op(context, config: DbtCloudRunOpConfig):\n """Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\n fails or is otherwised stopped before succeeding, a `dagster.Failure` exception will be raised,\n and this op will fail.\n\n It requires the use of a 'dbt_cloud' resource, which is used to connect to the dbt Cloud API.\n\n **Config Options:**\n\n job_id (int)\n The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\n page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\n ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n poll_interval (float)\n The time (in seconds) that will be waited between successive polls. Defaults to ``10``.\n poll_timeout (float)\n The maximum time (in seconds) that will waited before this operation is timed out. By\n default, this will never time out.\n yield_materializations (bool)\n If True, materializations corresponding to the results of the dbt operation will be\n yielded when the solid executes. Defaults to ``True``.\n rasset_key_prefix (float)\n If provided and yield_materializations is True, these components will be used to "\n prefix the generated asset keys. Defaults to ["dbt"].\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n )\n run_dbt_nightly_sync = dbt_cloud_run_op.configured(\n {"job_id": 54321}, name="run_dbt_nightly_sync"\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def dbt_cloud():\n run_dbt_nightly_sync()\n\n\n """\n dbt_output = context.resources.dbt_cloud.run_job_and_poll(\n config.job_id, poll_interval=config.poll_interval, poll_timeout=config.poll_timeout\n )\n if config.yield_materializations and "results" in dbt_output.result:\n yield from generate_materializations(dbt_output, asset_key_prefix=config.asset_key_prefix)\n yield Output(\n dbt_output,\n metadata={\n "created_at": dbt_output.run_details["created_at"],\n "started_at": dbt_output.run_details["started_at"],\n "finished_at": dbt_output.run_details["finished_at"],\n "total_duration": dbt_output.run_details["duration"],\n "run_duration": dbt_output.run_details["run_duration"],\n },\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom enum import Enum\nfrom typing import Any, Mapping, Optional, Sequence, cast\nfrom urllib.parse import urlencode, urljoin\n\nimport requests\nfrom dagster import (\n    ConfigurableResource,\n    Failure,\n    IAttachDifferentObjectToOpContext,\n    MetadataValue,\n    __version__,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.merger import deep_merge_dicts\nfrom pydantic import Field\nfrom requests.exceptions import RequestException\n\nfrom .types import DbtCloudOutput\n\nDBT_DEFAULT_HOST = "https://cloud.getdbt.com/"\nDBT_API_V2_PATH = "api/v2/accounts/"\nDBT_API_V3_PATH = "api/v3/accounts/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\nclass DbtCloudRunStatus(str, Enum):\n    QUEUED = "Queued"\n    STARTING = "Starting"\n    RUNNING = "Running"\n    SUCCESS = "Success"\n    ERROR = "Error"\n    CANCELLED = "Cancelled"\n\n\n# TODO: This resource should be a wrapper over an existing client for a accessing dbt Cloud,\n# rather than using requests to the API directly.\nclass DbtCloudClient:\n    """This class exposes methods on top of the dbt Cloud REST API v2.\n\n    For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n    response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n    """\n\n    def __init__(\n        self,\n        auth_token: str,\n        account_id: int,\n        disable_schedule_on_trigger: bool = True,\n        request_max_retries: int = 3,\n        request_retry_delay: float = 0.25,\n        dbt_cloud_host: str = DBT_DEFAULT_HOST,\n        log: logging.Logger = get_dagster_logger(),\n        log_requests: bool = False,\n    ):\n        self._auth_token = auth_token\n        self._account_id = account_id\n        
self._disable_schedule_on_trigger = disable_schedule_on_trigger\n\n        self._request_max_retries = request_max_retries\n        self._request_retry_delay = request_retry_delay\n\n        self._dbt_cloud_host = dbt_cloud_host\n        self._log = log\n        self._log_requests = log_requests\n\n    @property\n    def api_v2_base_url(self) -> str:\n        return urljoin(self._dbt_cloud_host, DBT_API_V2_PATH)\n\n    @property\n    def api_v3_base_url(self) -> str:\n        return urljoin(self._dbt_cloud_host, DBT_API_V3_PATH)\n\n    def build_url_for_job(self, project_id: int, job_id: int) -> str:\n        return urljoin(\n            self._dbt_cloud_host,\n            f"next/deploy/{self._account_id}/projects/{project_id}/jobs/{job_id}/",\n        )\n\n    def build_url_for_cloud_docs(self, job_id: int, resource_type: str, unique_id: str) -> str:\n        return urljoin(\n            self._dbt_cloud_host,\n            f"/accounts/{self._account_id}/jobs/{job_id}/docs/#!/{resource_type}/{unique_id}",\n        )\n\n    def make_request(\n        self,\n        method: str,\n        endpoint: str,\n        data: Optional[Mapping[str, Any]] = None,\n        params: Optional[Mapping[str, Any]] = None,\n        return_text: bool = False,\n        base_url: Optional[str] = None,\n    ) -> Any:\n        """Creates and sends a request to the desired dbt Cloud API endpoint.\n\n        Args:\n            method (str): The http method to use for this request (e.g. 
"POST", "GET", "PATCH").\n            endpoint (str): The dbt Cloud API endpoint to send this request to.\n            data (Optional[Mapping[str, Any]]): JSON-formatable data string to be included in the request.\n            params (Optional[Mapping[str, Any]]): Payload to add to query string of the request.\n            return_text (bool): Override default behavior and return unparsed {"text": response.text}\n                blob instead of json.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        headers = {\n            "User-Agent": f"dagster-dbt/{__version__}",\n            "Content-Type": "application/json",\n            "Authorization": f"Bearer {self._auth_token}",\n        }\n        base_url = base_url or self.api_v2_base_url\n        url = urljoin(base_url, endpoint)\n\n        if self._log_requests:\n            self._log.debug(f"Making Request: method={method} url={url} data={data}")\n\n        num_retries = 0\n        while True:\n            try:\n                response = requests.request(\n                    method=method,\n                    url=url,\n                    headers=headers,\n                    data=json.dumps(data),\n                    params=params,\n                )\n                response.raise_for_status()\n                return {"text": response.text} if return_text else response.json()["data"]\n            except RequestException as e:\n                self._log.error("Request to dbt Cloud API failed: %s", e)\n                if num_retries == self._request_max_retries:\n                    break\n                num_retries += 1\n                time.sleep(self._request_retry_delay)\n\n        raise Failure(f"Max retries ({self._request_max_retries}) exceeded with url: {url}.")\n\n    def list_jobs(\n        self, project_id: int, order_by: Optional[str] = "-id"\n    ) -> Sequence[Mapping[str, Any]]:\n        """List all dbt jobs in a dbt Cloud 
project.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n                results before returning them. Useful when combined with offset and limit to load\n                runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n                the key to filter on.\n\n        Returns:\n            List[Dict[str, Any]]: Parsed json data from the response to this request\n        """\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/jobs",\n            params={"project_id": project_id, "order_by": order_by},\n        )\n\n    def get_job(self, job_id: int) -> Mapping[str, Any]:\n        """Gets details about a given dbt job from the dbt Cloud API.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        return self.make_request("GET", f"{self._account_id}/jobs/{job_id}/")\n\n    def update_job(self, job_id: int, **kwargs) -> Mapping[str, Any]:\n        """Updates specific properties of a dbt job.\n\n        Documentation on the full set of potential parameters can be found here:\n        https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            kwargs: Passed in as the properties to be changed.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n\n        Examples:\n        .. code-block:: python\n\n            # disable schedule for job with id=12345\n            my_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})\n        """\n        # API requires you to supply a bunch of values, so we can just use the current state\n        # as the defaults\n        job_data = self.get_job(job_id)\n        return self.make_request(\n            "POST", f"{self._account_id}/jobs/{job_id}/", data=deep_merge_dicts(job_data, kwargs)\n        )\n\n    def run_job(self, job_id: int, **kwargs) -> Mapping[str, Any]:\n        """Initializes a run for a job.\n\n        Overrides for specific properties can be set by passing in values to the kwargs. 
A full list\n        of overridable properties can be found here:\n        https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            kwargs: Passed in as the properties to be overridden.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        self._log.info(f"Initializing run for job with job_id={job_id}")\n        if "cause" not in kwargs:\n            kwargs["cause"] = "Triggered via Dagster"\n        resp = self.make_request("POST", f"{self._account_id}/jobs/{job_id}/run/", data=kwargs)\n\n        has_schedule: bool = resp.get("job", {}).get("triggers", {}).get("schedule", False)\n        if has_schedule and self._disable_schedule_on_trigger:\n            self._log.info("Disabling dbt Cloud job schedule.")\n            self.update_job(job_id, triggers={"schedule": False})\n\n        self._log.info(\n            f"Run initialized with run_id={resp['id']}. View this run in "\n            f"the dbt Cloud UI: {resp['href']}"\n        )\n        return resp\n\n    def get_runs(\n        self,\n        include_related: Optional[Sequence[str]] = None,\n        job_id: Optional[int] = None,\n        order_by: Optional[str] = "-id",\n        offset: int = 0,\n        limit: int = 100,\n    ) -> Sequence[Mapping[str, object]]:\n        """Returns a list of runs from dbt Cloud. This can be optionally filtered to a specific job\n        using the job_definition_id. 
It supports pagination using offset and limit as well and\n        can be configured to load a variety of related information about the runs.\n\n        Args:\n            include_related (Optional[List[str]]): A list of resources to include in the response\n                from dbt Cloud. This is technically a required field according to the API, but it\n                can be passed with an empty list where it will only load the default run\n                information. Valid values are "trigger", "job", "repository", and "environment".\n            job_definition_id (Optional[int]): This method can be optionally filtered to only\n                load runs for a specific job id if it is included here. If omitted it will pull\n                runs for every job.\n            order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n                results before returning them. Useful when combined with offset and limit to load\n                runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n                the key to filter on.\n            offset (int): An offset to apply when listing runs. Can be used to paginate results\n                when combined with order_by and limit. Defaults to 0.\n            limit (int): Limits the amount of rows returned by the API. 
Defaults to 100.\n\n        Returns:\n            List[Dict[str, Any]]: A list of dictionaries containing the runs and any included\n                related information.\n        """\n        query_dict = {\n            "include_related": include_related or [],\n            "order_by": order_by,\n            "offset": offset,\n            "limit": limit,\n        }\n        if job_id:\n            query_dict["job_definition_id"] = job_id\n        return self.make_request("GET", f"{self._account_id}/runs/?{urlencode(query_dict)}")\n\n    def get_run(\n        self, run_id: int, include_related: Optional[Sequence[str]] = None\n    ) -> Mapping[str, Any]:\n        """Gets details about a specific job run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            include_related (List[str]): List of related fields to pull with the run. Valid values\n                are "trigger", "job", and "debug_logs".\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        query_params = f"?include_related={','.join(include_related)}" if include_related else ""\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/runs/{run_id}/{query_params}",\n        )\n\n    def get_run_steps(self, run_id: int) -> Sequence[str]:\n        """Gets the steps of an initialized dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n        Returns:\n            List[str, Any]: List of commands for each step of the run.\n        """\n        run_details = self.get_run(run_id, include_related=["trigger", "job"])\n        steps = run_details["job"]["execute_steps"]\n        steps_override = run_details["trigger"]["steps_override"]\n        return steps_override or steps\n\n    def cancel_run(self, run_id: int) -> Mapping[str, Any]:\n        """Cancels a dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        self._log.info(f"Cancelling run with id '{run_id}'")\n        return self.make_request("POST", f"{self._account_id}/runs/{run_id}/cancel/")\n\n    def list_run_artifacts(self, run_id: int, step: Optional[int] = None) -> Sequence[str]:\n        """Lists the paths of the available run artifacts from a completed dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. 
If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run\n\n        Returns:\n            List[str]: List of the paths of the available run artifacts\n        """\n        query_params = f"?step={step}" if step else ""\n        return cast(\n            list,\n            self.make_request(\n                "GET",\n                f"{self._account_id}/runs/{run_id}/artifacts/{query_params}",\n                data={"step": step} if step else None,\n            ),\n        )\n\n    def get_run_artifact(self, run_id: int, path: str, step: Optional[int] = None) -> str:\n        """The string contents of a run artifact from a dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            path (str): The path to this run artifact (e.g. 'run/my_new_project/models/example/my_first_dbt_model.sql')\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. 
If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            List[str]: List of the names of the available run artifacts\n        """\n        query_params = f"?step={step}" if step else ""\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/runs/{run_id}/artifacts/{path}{query_params}",\n            data={"step": step} if step else None,\n            return_text=True,\n        )["text"]\n\n    def get_manifest(self, run_id: int, step: Optional[int] = None) -> Mapping[str, Any]:\n        """The parsed contents of a manifest.json file created by a completed run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            Dict[str, Any]: Parsed contents of the manifest.json file\n        """\n        return json.loads(self.get_run_artifact(run_id, "manifest.json", step=step))\n\n    def get_run_results(self, run_id: int, step: Optional[int] = None) -> Mapping[str, Any]:\n        """The parsed contents of a run_results.json file created by a completed run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            Dict[str, Any]: Parsed contents of the run_results.json file\n        """\n        return json.loads(self.get_run_artifact(run_id, "run_results.json", step=step))\n\n    def poll_run(\n        self,\n        run_id: int,\n        poll_interval: float = DEFAULT_POLL_INTERVAL,\n        poll_timeout: Optional[float] = None,\n        href: Optional[str] = None,\n    ) -> Mapping[str, Any]:\n        """Polls a dbt Cloud job run until it completes. Will raise a `dagster.Failure` exception if the\n        run does not complete successfully.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            poll_interval (float): The time (in seconds) that should be waited between successive\n                polls of the dbt Cloud API.\n            poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n                to complete. If this threshold is exceeded, the run will be cancelled and an\n                exception will be thrown. 
By default, this will poll forver.\n            href (str): For internal use, generally should not be set manually.\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        status: Optional[str] = None\n\n        if href is None:\n            href = self.get_run(run_id).get("href")\n        assert isinstance(href, str), "Run must have an href"\n\n        poll_start = datetime.datetime.now()\n        try:\n            while True:\n                run_details = self.get_run(run_id)\n                status = run_details["status_humanized"]\n                self._log.info(f"Polled run {run_id}. Status: [{status}]")\n\n                # completed successfully\n                if status == DbtCloudRunStatus.SUCCESS:\n                    return self.get_run(run_id, include_related=["job", "trigger", "run_steps"])\n                elif status in [DbtCloudRunStatus.ERROR, DbtCloudRunStatus.CANCELLED]:\n                    break\n                elif status not in [\n                    DbtCloudRunStatus.QUEUED,\n                    DbtCloudRunStatus.STARTING,\n                    DbtCloudRunStatus.RUNNING,\n                ]:\n                    check.failed(f"Received unexpected status '{status}'. This should never happen")\n\n                if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n                    seconds=poll_timeout\n                ):\n                    self.cancel_run(run_id)\n                    raise Failure(\n                        f"Run {run_id} timed out after "\n                        f"{datetime.datetime.now() - poll_start}. 
Attempted to cancel.",\n                        metadata={"run_page_url": MetadataValue.url(href)},\n                    )\n\n                # Sleep for the configured time interval before polling again.\n                time.sleep(poll_interval)\n        finally:\n            if status not in (\n                DbtCloudRunStatus.SUCCESS,\n                DbtCloudRunStatus.ERROR,\n                DbtCloudRunStatus.CANCELLED,\n            ):\n                self.cancel_run(run_id)\n\n        run_details = self.get_run(run_id, include_related=["trigger"])\n        raise Failure(\n            f"Run {run_id} failed. Status Message: {run_details['status_message']}",\n            metadata={\n                "run_details": MetadataValue.json(run_details),\n                "run_page_url": MetadataValue.url(href),\n            },\n        )\n\n    def run_job_and_poll(\n        self,\n        job_id: int,\n        poll_interval: float = DEFAULT_POLL_INTERVAL,\n        poll_timeout: Optional[float] = None,\n        **kwargs,\n    ) -> DbtCloudOutput:\n        """Runs a dbt Cloud job and polls until it completes. Will raise a `dagster.Failure` exception\n        if the run does not complete successfully.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            poll_interval (float): The time (in seconds) that should be waited between successive\n                polls of the dbt Cloud API.\n            poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n                to complete. If this threshold is exceeded, the run will be cancelled and an\n                exception will be thrown. 
By default, this will poll forver.\n\n        Returns:\n            :py:class:`~DbtCloudOutput`: Class containing details about the specific job run and the\n                parsed run results.\n        """\n        run_details = self.run_job(job_id, **kwargs)\n        run_id = run_details["id"]\n        href = run_details["href"]\n        final_run_details = self.poll_run(\n            run_id, poll_interval=poll_interval, poll_timeout=poll_timeout, href=href\n        )\n        try:\n            run_results = self.get_run_results(run_id)\n        # if you fail to get run_results for this job, just leave it empty\n        except Failure:\n            self._log.info(\n                "run_results.json not available for this run. Defaulting to empty value."\n            )\n            run_results = {}\n        output = DbtCloudOutput(run_details=final_run_details, result=run_results)\n        if output.docs_url:\n            self._log.info(f"Docs for this run can be viewed here: {output.docs_url}")\n        return output\n\n    def get_job_environment_variables(self, project_id: int, job_id: int) -> Mapping[str, Any]:\n        """Get the dbt Cloud environment variables for a specific job.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n        """\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/projects/{project_id}/environment-variables/job",\n            params={"job_definition_id": job_id},\n            base_url=self.api_v3_base_url,\n        )\n\n    def set_job_environment_variable(\n        self, project_id: int, job_id: int, environment_variable_id: int, name: str, value: str\n    ) -> Mapping[str, Any]:\n        """Set the dbt Cloud environment variables for a specific job.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            name (str): The name of the environment variable to set.\n            value (str): The raw value of the environment variable.\n        """\n        return self.make_request(\n            "POST",\n            f"{self._account_id}/projects/{project_id}/environment-variables/{environment_variable_id}",\n            data={\n                "id": environment_variable_id,\n                "account_id": self._account_id,\n                "project_id": project_id,\n                "job_definition_id": job_id,\n                "type": "job",\n                "name": name,\n                "raw_value": value,\n            },\n            base_url=self.api_v3_base_url,\n        )\n\n\nclass DbtCloudResource(DbtCloudClient):\n    pass\n\n\n
[docs]class DbtCloudClientResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """This resource helps interact with dbt Cloud connectors."""\n\n auth_token: str = Field(\n description=(\n "dbt Cloud API Token. User tokens can be found in the [dbt Cloud"\n " UI](https://cloud.getdbt.com/#/profile/api/), or see the [dbt Cloud"\n " Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) for"\n " instructions on creating a Service Account token."\n ),\n )\n account_id: int = Field(\n description=(\n "dbt Cloud Account ID. This value can be found in the url of a variety of views in"\n " the dbt Cloud UI, e.g."\n " https://cloud.getdbt.com/#/accounts/{account_id}/settings/."\n ),\n )\n disable_schedule_on_trigger: bool = Field(\n default=True,\n description=(\n "Specifies if you would like any job that is triggered using this "\n "resource to automatically disable its schedule."\n ),\n )\n request_max_retries: int = Field(\n default=3,\n description=(\n "The maximum number of times requests to the dbt Cloud API should be retried "\n "before failing."\n ),\n )\n request_retry_delay: float = Field(\n default=0.25,\n description="Time (in seconds) to wait between each request retry.",\n )\n dbt_cloud_host: str = Field(\n default=DBT_DEFAULT_HOST,\n description=(\n "The hostname where dbt cloud is being hosted (e.g. https://my_org.cloud.getdbt.com/)."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_dbt_client(self) -> DbtCloudClient:\n context = self.get_resource_context()\n assert context.log\n\n return DbtCloudClient(\n auth_token=self.auth_token,\n account_id=self.account_id,\n disable_schedule_on_trigger=self.disable_schedule_on_trigger,\n request_max_retries=self.request_max_retries,\n request_retry_delay=self.request_retry_delay,\n log=context.log,\n dbt_cloud_host=self.dbt_cloud_host,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_dbt_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=DbtCloudClientResource.to_config_schema(),\n description="This resource helps interact with dbt Cloud connectors",\n)\ndef dbt_cloud_resource(context) -> DbtCloudResource:\n """This resource allows users to programatically interface with the dbt Cloud Administrative REST\n API (v2) to launch jobs and monitor their progress. This currently implements only a subset of\n the functionality exposed by the API.\n\n For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n }\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def my_dbt_cloud_job():\n ...\n """\n return DbtCloudResource(\n auth_token=context.resource_config["auth_token"],\n account_id=context.resource_config["account_id"],\n disable_schedule_on_trigger=context.resource_config["disable_schedule_on_trigger"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n dbt_cloud_host=context.resource_config["dbt_cloud_host"],\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.resources"}}, "core": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.resources

\nfrom typing import Any, Iterator, Mapping, Optional, Sequence, Set\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster._annotations import deprecated, public\nfrom dagster._config.pythonic_config import ConfigurableResource, IAttachDifferentObjectToOpContext\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\n\nfrom ..dbt_resource import DbtClient\nfrom .types import DbtCliOutput\nfrom .utils import (\n    DEFAULT_DBT_TARGET_PATH,\n    execute_cli,\n    execute_cli_stream,\n    parse_manifest,\n    parse_run_results,\n    remove_run_results,\n)\n\nDEFAULT_DBT_EXECUTABLE = "dbt"\n\n# The set of dbt cli commands that result in the creation of a run_results.json output file\n# https://docs.getdbt.com/reference/artifacts/run-results-json\nDBT_RUN_RESULTS_COMMANDS = ["run", "test", "seed", "snapshot", "docs generate", "build"]\n\n# The following config fields correspond to flags that apply to all dbt CLI commands. For details\n# on dbt CLI flags, see\n# https://github.com/fishtown-analytics/dbt/blob/1f8e29276e910c697588c43f08bc881379fff178/core/dbt/main.py#L260-L329\n\nCOMMON_OPTION_KEYS = {\n    "warn_error",\n    "dbt_executable",\n    "ignore_handled_error",\n    "target_path",\n    "docs_url",\n    "json_log_format",\n    "capture_logs",\n    "debug",\n}\n\n\nclass ConfigurableResourceWithCliFlags(ConfigurableResource):\n    project_dir: str = Field(\n        default=".",\n        description=(\n            "Which directory to look in for the dbt_project.yml file. Default is the current "\n            "working directory and its parents."\n        ),\n    )\n    profiles_dir: Optional[str] = Field(\n        default=None,\n        description=(\n            "Which directory to look in for the profiles.yml file. 
Default = $DBT_PROFILES_DIR or "\n            "$HOME/.dbt"\n        ),\n    )\n    profile: Optional[str] = Field(\n        default=None, description="Which profile to load. Overrides setting in dbt_project.yml."\n    )\n    target: Optional[str] = Field(\n        default=None, description="Which target to load for the given profile."\n    )\n    vars: Optional[Mapping[str, Any]] = Field(\n        default=None,\n        description=(\n            "Supply variables to the project. This argument overrides variables defined in your "\n            "dbt_project.yml file. This argument should be a dictionary, eg. "\n            "{'my_variable': 'my_value'}"\n        ),\n    )\n    bypass_cache: bool = Field(\n        default=False, description="If set, bypass the adapter-level cache of database state"\n    )\n    warn_error: bool = Field(\n        default=False,\n        description=(\n            "If dbt would normally warn, instead raise an exception. Examples include --models "\n            "that selects nothing, deprecations, configurations with no associated models, "\n            "invalid test configurations, and missing sources/refs in tests."\n        ),\n    )\n    dbt_executable: str = Field(\n        default=DEFAULT_DBT_EXECUTABLE,\n        description=f"Path to the dbt executable. Default is {DEFAULT_DBT_EXECUTABLE}",\n    )\n    ignore_handled_error: bool = Field(\n        default=False,\n        description=(\n            "When True, will not raise an exception when the dbt CLI returns error code 1. 
"\n            "Default is False."\n        ),\n    )\n    target_path: str = Field(\n        default=DEFAULT_DBT_TARGET_PATH,\n        description=(\n            "The directory path for target if different from the default `target-path` in "\n            "your dbt project configuration file."\n        ),\n    )\n    docs_url: Optional[str] = Field(\n        default=None, description="The url for where dbt docs are being served for this project."\n    )\n    json_log_format: bool = Field(\n        default=True,\n        description=(\n            "When True, dbt will invoked with the `--log-format json` flag, allowing "\n            "Dagster to parse the log messages and emit simpler log messages to the event log."\n        ),\n    )\n    capture_logs: bool = Field(\n        default=True,\n        description=(\n            "When True, dbt will invoked with the `--capture-output` flag, allowing "\n            "Dagster to capture the logs and emit them to the event log."\n        ),\n    )\n    debug: bool = Field(\n        default=False,\n        description=(\n            "When True, dbt will invoked with the `--debug` flag, which will print "\n            "additional debug information to the console."\n        ),\n    )\n\n\nclass DbtCliClient(DbtClient):\n    """A resource that allows you to execute dbt cli commands.\n\n    For the most up-to-date documentation on the specific parameters available to you for each\n    command, check out the dbt docs:\n\n    https://docs.getdbt.com/reference/commands/run\n\n    To use this as a dagster resource, we recommend using\n    :func:`dbt_cli_resource <dagster_dbt.dbt_cli_resource>`.\n    """\n\n    def __init__(\n        self,\n        executable: str,\n        default_flags: Mapping[str, Any],\n        warn_error: bool,\n        ignore_handled_error: bool,\n        target_path: str,\n        logger: Optional[Any] = None,\n        docs_url: Optional[str] = None,\n        json_log_format: bool = True,\n        
capture_logs: bool = True,\n        debug: bool = False,\n    ):\n        self._default_flags = default_flags\n        self._executable = executable\n        self._warn_error = warn_error\n        self._ignore_handled_error = ignore_handled_error\n        self._target_path = target_path\n        self._docs_url = docs_url\n        self._json_log_format = json_log_format\n        self._capture_logs = capture_logs\n        self._debug = debug\n        super().__init__(logger)\n\n    @property\n    def default_flags(self) -> Mapping[str, Any]:\n        """A set of params populated from resource config that are passed as flags to each dbt CLI command."""\n        return self._format_params(self._default_flags, replace_underscores=True)\n\n    @property\n    def strict_flags(self) -> Set[str]:\n        """A set of flags that should not be auto-populated from the default flags unless they are\n        arguments to the associated function.\n        """\n        return {"models", "exclude", "select"}\n\n    def _get_flags_dict(self, kwargs) -> Mapping[str, Any]:\n        extra_flags = {} if kwargs is None else kwargs\n\n        # remove default flags that are declared as "strict" and not explicitly passed in\n        default_flags = {\n            k: v\n            for k, v in self.default_flags.items()\n            if not (k in self.strict_flags and k not in extra_flags)\n        }\n\n        return merge_dicts(\n            default_flags, self._format_params(extra_flags, replace_underscores=True)\n        )\n\n    @public\n    def cli(self, command: str, **kwargs) -> DbtCliOutput:\n        """Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n            default flags that were configured on resource initialization (if any) overriding the\n            default values if necessary.\n\n        Args:\n            command (str): The command you wish to run (e.g. 
'run', 'test', 'docs generate', etc.)\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        command = check.str_param(command, "command")\n        return execute_cli(\n            executable=self._executable,\n            command=command,\n            flags_dict=self._get_flags_dict(kwargs),\n            log=self.logger,\n            warn_error=self._warn_error,\n            ignore_handled_error=self._ignore_handled_error,\n            target_path=self._target_path,\n            docs_url=self._docs_url,\n            json_log_format=self._json_log_format,\n            capture_logs=self._capture_logs,\n            debug=self._debug,\n        )\n\n    def cli_stream_json(self, command: str, **kwargs) -> Iterator[Mapping[str, Any]]:\n        """Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n            default flags that were configured on resource initialization (if any) overriding the\n            default values if necessary.\n\n        Args:\n            command (str): The command you wish to run (e.g. 
'run', 'test', 'docs generate', etc.)\n        """\n        check.invariant(self._json_log_format, "Cannot stream JSON if json_log_format is False.")\n        for event in execute_cli_stream(\n            executable=self._executable,\n            command=command,\n            flags_dict=self._get_flags_dict(kwargs),\n            log=self.logger,\n            warn_error=self._warn_error,\n            ignore_handled_error=self._ignore_handled_error,\n            json_log_format=self._json_log_format,\n            capture_logs=self._capture_logs,\n            debug=self._debug,\n        ):\n            if event.parsed_json_line is not None:\n                yield event.parsed_json_line\n\n    @public\n    def compile(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``compile`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in compilation.\n            exclude (List[str]), optional): the models to exclude from compilation.\n            select (List[str], optional): the models to include in compilation.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("compile", models=models, exclude=exclude, select=select, **kwargs)\n\n    @public\n    def run(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``run`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in the run.\n            exclude (List[str]), optional): the models to exclude from the run.\n            select (List[str], optional): the models to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("run", models=models, exclude=exclude, select=select, **kwargs)\n\n    @public\n    def snapshot(\n        self,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("snapshot", select=select, exclude=exclude, **kwargs)\n\n    @public\n    def test(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        data: bool = True,\n        schema: bool = True,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``test`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in testing.\n            exclude (List[str], optional): the models to exclude from testing.\n            data (bool, optional): If ``True`` (default), then run data tests.\n            schema (bool, optional): If ``True`` (default), then run schema tests.\n            select (List[str], optional): the models to include in testing.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        if data and schema:\n            # do not include these arguments if both are True, as these are deprecated in later\n            # versions of dbt, and for older versions the functionality is the same regardless of\n            # if both are set or neither are set.\n            return self.cli("test", models=models, exclude=exclude, select=select, **kwargs)\n        return self.cli(\n            "test",\n            models=models,\n            exclude=exclude,\n            data=data,\n            schema=schema,\n            select=select,\n            **kwargs,\n        )\n\n    @public\n    def seed(\n        self,\n        show: bool = False,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            show (bool, optional): If ``True``, then show a sample of the seeded data in the\n                response. 
Defaults to ``False``.\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("seed", show=show, select=select, exclude=exclude, **kwargs)\n\n    @public\n    def ls(\n        self,\n        select: Optional[Sequence[str]] = None,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``ls`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the resources to include in the output.\n            models (List[str], optional): the models to include in the output.\n            exclude (List[str], optional): the resources to exclude from the output.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("ls", select=select, models=models, exclude=exclude, **kwargs)\n\n    @public\n    def build(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtCliOutput:\n        """Run the ``build`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the models/resources to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("build", select=select, **kwargs)\n\n    @public\n    def freshness(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtCliOutput:\n        """Run the ``source snapshot-freshness`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the sources to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("source snapshot-freshness", select=select, **kwargs)\n\n    @public\n    def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtCliOutput:\n        """Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("docs generate", compile=compile_project, **kwargs)\n\n    @public\n    def run_operation(\n        self, macro: str, args: Optional[Mapping[str, Any]] = None, **kwargs\n    ) -> DbtCliOutput:\n        """Run the ``run-operation`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            macro (str): the dbt macro to invoke.\n            args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli(f"run-operation {macro}", args=args, **kwargs)\n\n    @public\n    def get_run_results_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the run_results.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        return parse_run_results(project_dir, target_path)\n\n    @public\n    def remove_run_results_json(self, **kwargs):\n        """Remove the run_results.json file from previous runs (if it exists)."""\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        remove_run_results(project_dir, target_path)\n\n    @public\n    def get_manifest_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the manifest.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        return parse_manifest(project_dir, 
target_path)\n\n\nclass DbtCliClientResource(ConfigurableResourceWithCliFlags, IAttachDifferentObjectToOpContext):\n    """Resource which issues dbt CLI commands against a configured dbt project."""\n\n    class Config:\n        extra = "allow"\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    def get_dbt_client(self) -> DbtCliClient:\n        context = self.get_resource_context()\n        default_flags = {\n            k: v\n            for k, v in self._get_non_none_public_field_values().items()\n            if k not in COMMON_OPTION_KEYS\n        }\n\n        return DbtCliClient(\n            executable=self.dbt_executable,\n            default_flags=default_flags,\n            warn_error=self.warn_error,\n            ignore_handled_error=self.ignore_handled_error,\n            target_path=self.target_path,\n            docs_url=self.docs_url,\n            logger=context.log,\n            json_log_format=self.json_log_format,\n            capture_logs=self.capture_logs,\n            debug=self.debug,\n        )\n\n    def get_object_to_set_on_execution_context(self) -> Any:\n        return self.get_dbt_client()\n\n\n
[docs]@deprecated(breaking_version="0.21", additional_warn_text="Use DbtCliResource instead.")\n@dagster_maintained_resource\n@resource(config_schema=DbtCliClientResource.to_config_schema())\ndef dbt_cli_resource(context) -> DbtCliClient:\n """This resource issues dbt CLI commands against a configured dbt project. It is deprecated\n in favor of :py:class:`~dagster_dbt.DbtCliResource`.\n """\n # all config options that are intended to be used as flags for dbt commands\n\n default_flags = {\n k: v for k, v in context.resource_config.items() if k not in COMMON_OPTION_KEYS\n }\n return DbtCliClient(\n executable=context.resource_config["dbt_executable"],\n default_flags=default_flags,\n warn_error=context.resource_config["warn_error"],\n ignore_handled_error=context.resource_config["ignore_handled_error"],\n target_path=context.resource_config["target_path"],\n logger=context.log,\n docs_url=context.resource_config.get("docs_url"),\n capture_logs=context.resource_config["capture_logs"],\n json_log_format=context.resource_config["json_log_format"],\n debug=context.resource_config["debug"],\n )
\n
", "current_page_name": "_modules/dagster_dbt/core/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.resources"}, "resources_v2": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.resources_v2

\nimport atexit\nimport contextlib\nimport os\nimport shutil\nimport subprocess\nimport sys\nimport uuid\nfrom contextlib import suppress\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import (\n    Any,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Union,\n)\n\nimport dateutil.parser\nimport orjson\nfrom dagster import (\n    AssetCheckResult,\n    AssetCheckSeverity,\n    AssetObservation,\n    AssetsDefinition,\n    ConfigurableResource,\n    Output,\n    get_dagster_logger,\n)\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidPropertyError\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dbt.contracts.results import NodeStatus, TestStatus\nfrom dbt.node_types import NodeType\nfrom dbt.version import __version__ as dbt_version\nfrom packaging import version\nfrom pydantic import Field, root_validator, validator\nfrom typing_extensions import Literal\n\nfrom ..asset_utils import (\n    get_manifest_and_translator_from_dbt_assets,\n    output_name_fn,\n)\nfrom ..dagster_dbt_translator import DagsterDbtTranslator\nfrom ..dbt_manifest import DbtManifestParam, validate_manifest\nfrom ..errors import DagsterDbtCliRuntimeError\nfrom ..utils import ASSET_RESOURCE_TYPES, get_dbt_resource_props_by_dbt_unique_id_from_manifest\n\nlogger = get_dagster_logger()\n\n\nDBT_PROJECT_YML_NAME = "dbt_project.yml"\nDBT_PROFILES_YML_NAME = "profiles.yml"\nPARTIAL_PARSE_FILE_NAME = "partial_parse.msgpack"\n\n\ndef _get_dbt_target_path() -> Path:\n    return Path(os.getenv("DBT_TARGET_PATH", "target"))\n\n\n
[docs]@dataclass\nclass DbtCliEventMessage:\n """The representation of a dbt CLI event.\n\n Args:\n raw_event (Dict[str, Any]): The raw event dictionary.\n See https://docs.getdbt.com/reference/events-logging#structured-logging for more\n information.\n """\n\n raw_event: Dict[str, Any]\n\n @classmethod\n def from_log(cls, log: str) -> "DbtCliEventMessage":\n """Parse an event according to https://docs.getdbt.com/reference/events-logging#structured-logging.\n\n We assume that the log format is json.\n """\n raw_event: Dict[str, Any] = orjson.loads(log)\n\n return cls(raw_event=raw_event)\n\n def __str__(self) -> str:\n return self.raw_event["info"]["msg"]\n\n
[docs] @public\n def to_default_asset_events(\n self,\n manifest: DbtManifestParam,\n dagster_dbt_translator: DagsterDbtTranslator = DagsterDbtTranslator(),\n ) -> Iterator[Union[Output, AssetObservation, AssetCheckResult]]:\n """Convert a dbt CLI event to a set of corresponding Dagster events.\n\n Args:\n manifest (Union[Mapping[str, Any], str, Path]): The dbt manifest blob.\n dagster_dbt_translator (DagsterDbtTranslator): Optionally, a custom translator for\n linking dbt nodes to Dagster assets.\n\n Returns:\n Iterator[Union[Output, AssetObservation, AssetCheckResult]]: A set of corresponding Dagster events.\n - Output for refables (e.g. models, seeds, snapshots.)\n - AssetObservation for dbt test results that are not enabled as asset checks.\n - AssetCheckResult for dbt test results that are enabled as asset checks.\n """\n if self.raw_event["info"]["level"] == "debug":\n return\n\n event_node_info: Dict[str, Any] = self.raw_event["data"].get("node_info")\n if not event_node_info:\n return\n\n manifest = validate_manifest(manifest)\n\n if not manifest:\n logger.info(\n "No dbt manifest was provided. 
Dagster events for dbt tests will not be created."\n )\n\n invocation_id: str = self.raw_event["info"]["invocation_id"]\n unique_id: str = event_node_info["unique_id"]\n node_resource_type: str = event_node_info["resource_type"]\n node_status: str = event_node_info["node_status"]\n\n is_node_successful = node_status == NodeStatus.Success\n is_node_finished = bool(event_node_info.get("node_finished_at"))\n if node_resource_type in NodeType.refable() and is_node_successful:\n started_at = dateutil.parser.isoparse(event_node_info["node_started_at"])\n finished_at = dateutil.parser.isoparse(event_node_info["node_finished_at"])\n duration_seconds = (finished_at - started_at).total_seconds()\n\n yield Output(\n value=None,\n output_name=output_name_fn(event_node_info),\n metadata={\n "unique_id": unique_id,\n "invocation_id": invocation_id,\n "Execution Duration": duration_seconds,\n },\n )\n elif manifest and node_resource_type == NodeType.Test and is_node_finished:\n upstream_unique_ids: List[str] = manifest["parent_map"][unique_id]\n test_resource_props = manifest["nodes"][unique_id]\n metadata = {\n "unique_id": unique_id,\n "invocation_id": invocation_id,\n "status": node_status,\n }\n\n is_asset_check = dagster_dbt_translator.settings.enable_asset_checks\n attached_node_unique_id = test_resource_props.get("attached_node")\n is_generic_test = bool(attached_node_unique_id)\n\n if is_asset_check and is_generic_test:\n is_test_successful = node_status == TestStatus.Pass\n severity = AssetCheckSeverity(test_resource_props["config"]["severity"].upper())\n\n attached_node_resource_props: Dict[str, Any] = manifest["nodes"].get(\n attached_node_unique_id\n ) or manifest["sources"].get(attached_node_unique_id)\n attached_node_asset_key = dagster_dbt_translator.get_asset_key(\n attached_node_resource_props\n )\n\n yield AssetCheckResult(\n passed=is_test_successful,\n asset_key=attached_node_asset_key,\n check_name=event_node_info["node_name"],\n metadata=metadata,\n 
severity=severity,\n )\n else:\n for upstream_unique_id in upstream_unique_ids:\n upstream_resource_props: Dict[str, Any] = manifest["nodes"].get(\n upstream_unique_id\n ) or manifest["sources"].get(upstream_unique_id)\n upstream_asset_key = dagster_dbt_translator.get_asset_key(\n upstream_resource_props\n )\n\n yield AssetObservation(\n asset_key=upstream_asset_key,\n metadata=metadata,\n )
\n\n\n
[docs]@dataclass\nclass DbtCliInvocation:\n """The representation of an invoked dbt command.\n\n Args:\n process (subprocess.Popen): The process running the dbt command.\n manifest (Mapping[str, Any]): The dbt manifest blob.\n project_dir (Path): The path to the dbt project.\n target_path (Path): The path to the dbt target folder.\n raise_on_error (bool): Whether to raise an exception if the dbt command fails.\n """\n\n process: subprocess.Popen\n manifest: Mapping[str, Any]\n dagster_dbt_translator: DagsterDbtTranslator\n project_dir: Path\n target_path: Path\n raise_on_error: bool\n\n @classmethod\n def run(\n cls,\n args: List[str],\n env: Dict[str, str],\n manifest: Mapping[str, Any],\n dagster_dbt_translator: DagsterDbtTranslator,\n project_dir: Path,\n target_path: Path,\n raise_on_error: bool,\n ) -> "DbtCliInvocation":\n # Attempt to take advantage of partial parsing. If there is a `partial_parse.msgpack` in\n # in the target folder, then copy it to the dynamic target path.\n #\n # This effectively allows us to skip the parsing of the manifest, which can be expensive.\n # See https://docs.getdbt.com/reference/programmatic-invocations#reusing-objects for more\n # details.\n current_target_path = _get_dbt_target_path()\n partial_parse_file_path = (\n current_target_path.joinpath(PARTIAL_PARSE_FILE_NAME)\n if current_target_path.is_absolute()\n else project_dir.joinpath(current_target_path, PARTIAL_PARSE_FILE_NAME)\n )\n partial_parse_destination_target_path = target_path.joinpath(PARTIAL_PARSE_FILE_NAME)\n\n if partial_parse_file_path.exists():\n logger.info(\n f"Copying `{partial_parse_file_path}` to `{partial_parse_destination_target_path}`"\n " to take advantage of partial parsing."\n )\n\n partial_parse_destination_target_path.parent.mkdir(parents=True, exist_ok=True)\n shutil.copy(partial_parse_file_path, partial_parse_destination_target_path)\n\n # Create a subprocess that runs the dbt CLI command.\n logger.info(f"Running dbt command: `{' 
'.join(args)}`.")\n process = subprocess.Popen(\n args=args,\n stdout=subprocess.PIPE,\n stderr=subprocess.STDOUT,\n env=env,\n cwd=project_dir,\n )\n\n # Add handler to terminate child process if running.\n # See https://stackoverflow.com/a/18258391 for more details.\n def cleanup_dbt_subprocess(process: subprocess.Popen) -> None:\n if process.returncode is None:\n logger.info(\n "The main process is being terminated, but the dbt command has not yet"\n " completed. Terminating the execution of dbt command."\n )\n process.terminate()\n process.wait()\n\n atexit.register(cleanup_dbt_subprocess, process)\n\n return cls(\n process=process,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n project_dir=project_dir,\n target_path=target_path,\n raise_on_error=raise_on_error,\n )\n\n
[docs] @public\n def wait(self) -> "DbtCliInvocation":\n """Wait for the dbt CLI process to complete.\n\n Returns:\n DbtCliInvocation: The current representation of the dbt CLI invocation.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"]).wait()\n """\n list(self.stream_raw_events())\n\n return self
\n\n
[docs] @public\n def is_successful(self) -> bool:\n """Return whether the dbt CLI process completed successfully.\n\n Returns:\n bool: True, if the dbt CLI process returns with a zero exit code, and False otherwise.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"], raise_on_error=False)\n\n if dbt_cli_invocation.is_successful():\n ...\n """\n return self.process.wait() == 0
\n\n
[docs] @public\n def stream(self) -> Iterator[Union[Output, AssetObservation, AssetCheckResult]]:\n """Stream the events from the dbt CLI process and convert them to Dagster events.\n\n Returns:\n Iterator[Union[Output, AssetObservation, AssetCheckResult]]: A set of corresponding Dagster events.\n - Output for refables (e.g. models, seeds, snapshots.)\n - AssetObservation for dbt test results that are not enabled as asset checks.\n - AssetCheckResult for dbt test results that are enabled as asset checks.\n\n Examples:\n .. code-block:: python\n\n from pathlib import Path\n from dagster_dbt import DbtCliResource, dbt_assets\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n """\n for event in self.stream_raw_events():\n yield from event.to_default_asset_events(\n manifest=self.manifest, dagster_dbt_translator=self.dagster_dbt_translator\n )
\n\n
[docs] @public\n def stream_raw_events(self) -> Iterator[DbtCliEventMessage]:\n """Stream the events from the dbt CLI process.\n\n Returns:\n Iterator[DbtCliEventMessage]: An iterator of events from the dbt CLI process.\n """\n with self.process.stdout or contextlib.nullcontext():\n for raw_line in self.process.stdout or []:\n log: str = raw_line.decode().strip()\n try:\n event = DbtCliEventMessage.from_log(log=log)\n\n # Re-emit the logs from dbt CLI process into stdout.\n sys.stdout.write(str(event) + "\\n")\n sys.stdout.flush()\n\n yield event\n except:\n # If we can't parse the log, then just emit it as a raw log.\n sys.stdout.write(log + "\\n")\n sys.stdout.flush()\n\n # Ensure that the dbt CLI process has completed.\n self._raise_on_error()
\n\n
[docs] @public\n def get_artifact(\n self,\n artifact: Union[\n Literal["manifest.json"],\n Literal["catalog.json"],\n Literal["run_results.json"],\n Literal["sources.json"],\n ],\n ) -> Dict[str, Any]:\n """Retrieve a dbt artifact from the target path.\n\n See https://docs.getdbt.com/reference/artifacts/dbt-artifacts for more information.\n\n Args:\n artifact (Union[Literal["manifest.json"], Literal["catalog.json"], Literal["run_results.json"], Literal["sources.json"]]): The name of the artifact to retrieve.\n\n Returns:\n Dict[str, Any]: The artifact as a dictionary.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"]).wait()\n\n # Retrieve the run_results.json artifact.\n run_results = dbt_cli_invocation.get_artifact("run_results.json")\n """\n artifact_path = self.target_path.joinpath(artifact)\n\n return orjson.loads(artifact_path.read_bytes())
\n\n def _raise_on_error(self) -> None:\n """Ensure that the dbt CLI process has completed. If the process has not successfully\n completed, then optionally raise an error.\n """\n if not self.is_successful() and self.raise_on_error:\n raise DagsterDbtCliRuntimeError(\n description=(\n f"The dbt CLI process failed with exit code {self.process.returncode}. Check"\n " the Dagster compute logs for the full information about the error, or view"\n f" the dbt debug log file: {self.target_path.joinpath('dbt.log')}."\n )\n )
\n\n\n
[docs]class DbtCliResource(ConfigurableResource):\n """A resource used to execute dbt CLI commands.\n\n Attributes:\n project_dir (str): The path to the dbt project directory. This directory should contain a\n `dbt_project.yml`. See https://docs.getdbt.com/reference/dbt_project.yml for more\n information.\n global_config_flags (List[str]): A list of global flags configuration to pass to the dbt CLI\n invocation. See https://docs.getdbt.com/reference/global-configs for a full list of\n configuration.\n profiles_dir (Optional[str]): The path to the directory containing your dbt `profiles.yml`.\n By default, the current working directory is used, which is the dbt project directory.\n See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n profile (Optional[str]): The profile from your dbt `profiles.yml` to use for execution. See\n https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n target (Optional[str]): The target from your dbt `profiles.yml` to use for execution. See\n https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n\n Examples:\n Creating a dbt resource with only a reference to ``project_dir``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n Creating a dbt resource with a custom ``profiles_dir``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n profiles_dir="/path/to/dbt/project/profiles",\n )\n\n Creating a dbt resource with a custom ``profile`` and ``target``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n profiles_dir="/path/to/dbt/project/profiles",\n profile="jaffle_shop",\n target="dev",\n )\n\n Creating a dbt resource with global configs, e.g. 
disabling colored logs with ``--no-use-color``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n global_config_flags=["--no-use-color"],\n )\n """\n\n project_dir: str = Field(\n ...,\n description=(\n "The path to your dbt project directory. This directory should contain a"\n " `dbt_project.yml`. See https://docs.getdbt.com/reference/dbt_project.yml for more"\n " information."\n ),\n )\n global_config_flags: List[str] = Field(\n default=[],\n description=(\n "A list of global flags configuration to pass to the dbt CLI invocation. See"\n " https://docs.getdbt.com/reference/global-configs for a full list of configuration."\n ),\n )\n profiles_dir: Optional[str] = Field(\n default=None,\n description=(\n "The path to the directory containing your dbt `profiles.yml`. By default, the current"\n " working directory is used, which is the dbt project directory."\n " See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for "\n " more information."\n ),\n )\n profile: Optional[str] = Field(\n default=None,\n description=(\n "The profile from your dbt `profiles.yml` to use for execution. See"\n " https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more"\n " information."\n ),\n )\n target: Optional[str] = Field(\n default=None,\n description=(\n "The target from your dbt `profiles.yml` to use for execution. 
See"\n " https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more"\n " information."\n ),\n )\n\n @classmethod\n def _validate_absolute_path_exists(cls, path: Union[str, Path]) -> Path:\n absolute_path = Path(path).absolute()\n try:\n resolved_path = absolute_path.resolve(strict=True)\n except FileNotFoundError:\n raise ValueError(f"The absolute path of '{path}' ('{absolute_path}') does not exist")\n\n return resolved_path\n\n @classmethod\n def _validate_path_contains_file(cls, path: Path, file_name: str, error_message: str):\n if not path.joinpath(file_name).exists():\n raise ValueError(error_message)\n\n @validator("project_dir", "profiles_dir", pre=True)\n def convert_path_to_str(cls, v: Any) -> Any:\n """Validate that the path is converted to a string."""\n if isinstance(v, Path):\n resolved_path = cls._validate_absolute_path_exists(v)\n\n absolute_path = Path(v).absolute()\n try:\n resolved_path = absolute_path.resolve(strict=True)\n except FileNotFoundError:\n raise ValueError(f"The absolute path of '{v}' ('{absolute_path}') does not exist")\n return os.fspath(resolved_path)\n\n return v\n\n @validator("project_dir")\n def validate_project_dir(cls, project_dir: str) -> str:\n resolved_project_dir = cls._validate_absolute_path_exists(project_dir)\n\n cls._validate_path_contains_file(\n path=resolved_project_dir,\n file_name=DBT_PROJECT_YML_NAME,\n error_message=(\n f"{resolved_project_dir} does not contain a {DBT_PROJECT_YML_NAME} file. Please"\n " specify a valid path to a dbt project."\n ),\n )\n\n return os.fspath(resolved_project_dir)\n\n @validator("profiles_dir")\n def validate_profiles_dir(cls, profiles_dir: str) -> str:\n resolved_project_dir = cls._validate_absolute_path_exists(profiles_dir)\n\n cls._validate_path_contains_file(\n path=resolved_project_dir,\n file_name=DBT_PROFILES_YML_NAME,\n error_message=(\n f"{resolved_project_dir} does not contain a {DBT_PROFILES_YML_NAME} file. 
Please"\n " specify a valid path to a dbt profile directory."\n ),\n )\n\n return os.fspath(resolved_project_dir)\n\n @root_validator(pre=True)\n def validate_dbt_version(cls, values: Dict[str, Any]) -> Dict[str, Any]:\n """Validate that the dbt version is supported."""\n if version.parse(dbt_version) < version.parse("1.4.0"):\n raise ValueError(\n "To use `dagster_dbt.DbtCliResource`, you must use `dbt-core>=1.4.0`. Currently,"\n f" you are using `dbt-core=={dbt_version}`. Please install a compatible dbt-core"\n " version."\n )\n\n return values\n\n def _get_unique_target_path(self, *, context: Optional[OpExecutionContext]) -> Path:\n """Get a unique target path for the dbt CLI invocation.\n\n Args:\n context (Optional[OpExecutionContext]): The execution context.\n\n Returns:\n str: A unique target path for the dbt CLI invocation.\n """\n unique_id = str(uuid.uuid4())[:7]\n path = unique_id\n if context:\n path = f"{context.op.name}-{context.run_id[:7]}-{unique_id}"\n\n current_target_path = _get_dbt_target_path()\n\n return current_target_path.joinpath(path)\n\n
[docs] @public\n def cli(\n self,\n args: List[str],\n *,\n raise_on_error: bool = True,\n manifest: Optional[DbtManifestParam] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n context: Optional[OpExecutionContext] = None,\n target_path: Optional[Path] = None,\n ) -> DbtCliInvocation:\n """Create a subprocess to execute a dbt CLI command.\n\n Args:\n args (List[str]): The dbt CLI command to execute.\n raise_on_error (bool): Whether to raise an exception if the dbt CLI command fails.\n manifest (Optional[Union[Mapping[str, Any], str, Path]]): The dbt manifest blob. If an\n execution context from within `@dbt_assets` is provided to the context argument,\n then the manifest provided to `@dbt_assets` will be used.\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): The translator to link dbt\n nodes to Dagster assets. If an execution context from within `@dbt_assets` is\n provided to the context argument, then the dagster_dbt_translator provided to\n `@dbt_assets` will be used.\n context (Optional[OpExecutionContext]): The execution context from within `@dbt_assets`.\n target_path (Optional[Path]): An explicit path to a target folder to use to store and\n retrieve dbt artifacts when running a dbt CLI command. If not provided, a unique\n target path will be generated.\n\n Returns:\n DbtCliInvocation: A invocation instance that can be used to retrieve the output of the\n dbt CLI command.\n\n Examples:\n Streaming Dagster events for dbt asset materializations and observations:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n\n Retrieving a dbt artifact after streaming the Dagster events:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_run_invocation = dbt.cli(["run"], context=context)\n\n yield from dbt_run_invocation.stream()\n\n # Retrieve the `run_results.json` dbt artifact as a dictionary:\n run_results_json = dbt_run_invocation.get_artifact("run_results.json")\n\n # Retrieve the `run_results.json` dbt artifact as a file path:\n run_results_path = dbt_run_invocation.target_path.joinpath("run_results.json")\n\n Customizing the asset materialization metadata when streaming the Dagster events:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_cli_invocation = dbt.cli(["run"], context=context)\n\n for dbt_event in dbt_cli_invocation.stream_raw_events():\n for dagster_event in dbt_event.to_default_asset_events(manifest=dbt_cli_invocation.manifest):\n if isinstance(dagster_event, Output):\n context.add_output_metadata(\n metadata={\n "my_custom_metadata": "my_custom_metadata_value",\n },\n output_name=dagster_event.output_name,\n )\n\n yield dagster_event\n\n Suppressing exceptions from a dbt CLI command when a non-zero exit code is returned:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_run_invocation = dbt.cli(["run"], context=context, raise_on_error=False)\n\n if dbt_run_invocation.is_successful():\n yield from dbt_run_invocation.stream()\n else:\n ...\n\n Invoking a dbt CLI command in a custom asset or op:\n\n .. code-block:: python\n\n import json\n\n from dagster import asset, op\n from dagster_dbt import DbtCliResource\n\n\n @asset\n def my_dbt_asset(dbt: DbtCliResource):\n dbt_macro_args = {"key": "value"}\n dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n\n\n @op\n def my_dbt_op(dbt: DbtCliResource):\n dbt_macro_args = {"key": "value"}\n dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n """\n target_path = target_path or self._get_unique_target_path(context=context)\n env = {\n **os.environ.copy(),\n # Run dbt with unbuffered output.\n "PYTHONUNBUFFERED": "1",\n # Disable anonymous usage statistics for performance.\n "DBT_SEND_ANONYMOUS_USAGE_STATS": "false",\n # The DBT_LOG_FORMAT environment variable must be set to `json`. 
We use this\n # environment variable to ensure that the dbt CLI outputs structured logs.\n "DBT_LOG_FORMAT": "json",\n # The DBT_TARGET_PATH environment variable is set to a unique value for each dbt\n # invocation so that artifact paths are separated.\n # See https://discourse.getdbt.com/t/multiple-run-results-json-and-manifest-json-files/7555\n # for more information.\n "DBT_TARGET_PATH": os.fspath(target_path),\n # The DBT_LOG_PATH environment variable is set to the same value as DBT_TARGET_PATH\n # so that logs for each dbt invocation has separate log files.\n "DBT_LOG_PATH": os.fspath(target_path),\n # The DBT_PROFILES_DIR environment variable is set to the path containing the dbt\n # profiles.yml file.\n # See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles#advanced-customizing-a-profile-directory\n # for more information.\n **({"DBT_PROFILES_DIR": self.profiles_dir} if self.profiles_dir else {}),\n }\n\n assets_def: Optional[AssetsDefinition] = None\n with suppress(DagsterInvalidPropertyError):\n assets_def = context.assets_def if context else None\n\n selection_args: List[str] = []\n dagster_dbt_translator = dagster_dbt_translator or DagsterDbtTranslator()\n if context and assets_def is not None:\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(\n [assets_def]\n )\n\n # When dbt is enabled with asset checks, we turn off any indirection with dbt selection.\n # This way, the Dagster context completely determines what is executed in a dbt\n # invocation with a subsetted selection.\n if (\n version.parse(dbt_version) >= version.parse("1.5.0")\n and dagster_dbt_translator.settings.enable_asset_checks\n ):\n env["DBT_INDIRECT_SELECTION"] = "empty"\n\n selection_args = get_subset_selection_for_context(\n context=context,\n manifest=manifest,\n select=context.op.tags.get("dagster-dbt/select"),\n exclude=context.op.tags.get("dagster-dbt/exclude"),\n )\n else:\n manifest = validate_manifest(manifest) if 
manifest else {}\n\n # TODO: verify that args does not have any selection flags if the context and manifest\n # are passed to this function.\n profile_args: List[str] = []\n if self.profile:\n profile_args = ["--profile", self.profile]\n\n if self.target:\n profile_args += ["--target", self.target]\n\n args = ["dbt"] + self.global_config_flags + args + profile_args + selection_args\n project_dir = Path(self.project_dir)\n\n if not target_path.is_absolute():\n target_path = project_dir.joinpath(target_path)\n\n return DbtCliInvocation.run(\n args=args,\n env=env,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n project_dir=project_dir,\n target_path=target_path,\n raise_on_error=raise_on_error,\n )
\n\n\ndef get_subset_selection_for_context(\n context: OpExecutionContext,\n manifest: Mapping[str, Any],\n select: Optional[str],\n exclude: Optional[str],\n) -> List[str]:\n """Generate a dbt selection string to materialize the selected resources in a subsetted execution context.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work.\n\n Args:\n context (OpExecutionContext): The execution context for the current execution step.\n select (Optional[str]): A dbt selection string to select resources to materialize.\n exclude (Optional[str]): A dbt selection string to exclude resources from materializing.\n\n Returns:\n List[str]: dbt CLI arguments to materialize the selected resources in a\n subsetted execution context.\n\n If the current execution context is not performing a subsetted execution,\n return CLI arguments composed of the inputed selection and exclusion arguments.\n """\n default_dbt_selection = []\n if select:\n default_dbt_selection += ["--select", select]\n if exclude:\n default_dbt_selection += ["--exclude", exclude]\n\n dbt_resource_props_by_output_name = get_dbt_resource_props_by_output_name(manifest)\n dbt_resource_props_by_test_name = get_dbt_resource_props_by_test_name(manifest)\n\n # TODO: this should be a property on the context if this is a permanent indicator for\n # determining whether the current execution context is performing a subsetted execution.\n is_subsetted_execution = len(context.selected_output_names) != len(\n context.assets_def.node_keys_by_output_name\n )\n if not is_subsetted_execution:\n logger.info(\n "A dbt subsetted execution is not being performed. 
Using the default dbt selection"\n f" arguments `{default_dbt_selection}`."\n )\n return default_dbt_selection\n\n selected_dbt_resources = []\n for output_name in context.selected_output_names:\n dbt_resource_props = dbt_resource_props_by_output_name[output_name]\n\n # Explicitly select a dbt resource by its fully qualified name (FQN).\n # https://docs.getdbt.com/reference/node-selection/methods#the-file-or-fqn-method\n fqn_selector = f"fqn:{'.'.join(dbt_resource_props['fqn'])}"\n\n selected_dbt_resources.append(fqn_selector)\n\n for _, check_name in context.selected_asset_check_keys:\n test_resource_props = dbt_resource_props_by_test_name[check_name]\n\n # Explicitly select a dbt resource by its fully qualified name (FQN).\n # https://docs.getdbt.com/reference/node-selection/methods#the-file-or-fqn-method\n fqn_selector = f"fqn:{'.'.join(test_resource_props['fqn'])}"\n\n selected_dbt_resources.append(fqn_selector)\n\n # Take the union of all the selected resources.\n # https://docs.getdbt.com/reference/node-selection/set-operators#unions\n union_selected_dbt_resources = ["--select"] + [" ".join(selected_dbt_resources)]\n\n logger.info(\n "A dbt subsetted execution is being performed. 
Overriding default dbt selection"\n f" arguments `{default_dbt_selection}` with arguments: `{union_selected_dbt_resources}`"\n )\n\n return union_selected_dbt_resources\n\n\ndef get_dbt_resource_props_by_output_name(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n node_info_by_dbt_unique_id = get_dbt_resource_props_by_dbt_unique_id_from_manifest(manifest)\n\n return {\n output_name_fn(node): node\n for node in node_info_by_dbt_unique_id.values()\n if node["resource_type"] in ASSET_RESOURCE_TYPES\n }\n\n\ndef get_dbt_resource_props_by_test_name(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n return {\n dbt_resource_props["name"]: dbt_resource_props\n for unique_id, dbt_resource_props in manifest["nodes"].items()\n if unique_id.startswith("test")\n }\n
", "current_page_name": "_modules/dagster_dbt/core/resources_v2", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.resources_v2"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.types

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._check as check\n\nfrom ..types import DbtOutput\n\n\n
[docs]class DbtCliOutput(DbtOutput):\n """The results of executing a dbt command, along with additional metadata about the dbt CLI\n process that was run.\n\n This class is deprecated, because it's only produced by methods of the DbtCliClientResource class,\n which is deprecated in favor of DbtCliResource.\n\n Note that users should not construct instances of this class directly. This class is intended\n to be constructed from the JSON output of dbt commands.\n\n Attributes:\n command (str): The full shell command that was executed.\n return_code (int): The return code of the dbt CLI process.\n raw_output (str): The raw output (``stdout``) of the dbt CLI process.\n logs (List[Dict[str, Any]]): List of parsed JSON logs produced by the dbt command.\n result (Optional[Dict[str, Any]]): Dictionary containing dbt-reported result information\n contained in run_results.json. Some dbt commands do not produce results, and will\n therefore have result = None.\n docs_url (Optional[str]): Hostname where dbt docs are being served for this project.\n """\n\n def __init__(\n self,\n command: str,\n return_code: int,\n raw_output: str,\n logs: Sequence[Mapping[str, Any]],\n result: Mapping[str, Any],\n docs_url: Optional[str] = None,\n ):\n self._command = check.str_param(command, "command")\n self._return_code = check.int_param(return_code, "return_code")\n self._raw_output = check.str_param(raw_output, "raw_output")\n self._logs = check.sequence_param(logs, "logs", of_type=dict)\n self._docs_url = check.opt_str_param(docs_url, "docs_url")\n super().__init__(result)\n\n @property\n def command(self) -> str:\n return self._command\n\n @property\n def return_code(self) -> int:\n return self._return_code\n\n @property\n def raw_output(self) -> str:\n return self._raw_output\n\n @property\n def logs(self) -> Sequence[Mapping[str, Any]]:\n return self._logs\n\n @property\n def docs_url(self) -> Optional[str]:\n return self._docs_url
\n
", "current_page_name": "_modules/dagster_dbt/core/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.types"}}, "dagster_dbt_translator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dagster_dbt_translator

\nfrom dataclasses import dataclass\nfrom typing import Any, Mapping, Optional\n\nfrom dagster import AssetKey, AutoMaterializePolicy, FreshnessPolicy\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import (\n    CoercibleToAssetKeyPrefix,\n    check_opt_coercible_to_asset_key_prefix_param,\n)\n\nfrom .asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    default_metadata_from_dbt_resource_props,\n)\n\n\n
[docs]@dataclass(frozen=True)\nclass DagsterDbtTranslatorSettings:\n """Settings to enable Dagster features for your dbt project.\n\n Args:\n enable_asset_checks (bool): Whether to load dbt tests as Dagster asset checks.\n Defaults to False.\n """\n\n enable_asset_checks: bool = False
\n\n\n
[docs]class DagsterDbtTranslator:\n """Holds a set of methods that derive Dagster asset definition metadata given a representation\n of a dbt resource (models, tests, sources, etc).\n\n This class is exposed so that methods can be overriden to customize how Dagster asset metadata\n is derived.\n """\n\n def __init__(self, settings: Optional[DagsterDbtTranslatorSettings] = None):\n """Initialize the translator.\n\n Args:\n settings (Optional[DagsterDbtTranslatorSettings]): Settings for the translator.\n """\n self._settings = settings or DagsterDbtTranslatorSettings()\n\n @property\n def settings(self) -> DagsterDbtTranslatorSettings:\n if not hasattr(self, "_settings"):\n self._settings = DagsterDbtTranslatorSettings()\n\n return self._settings\n\n
[docs] @classmethod\n @public\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster asset key that represents that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom asset key for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n AssetKey: The Dagster asset key for the dbt resource.\n\n Examples:\n Adding a prefix to the default asset key generated for each dbt resource:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster import AssetKey\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n return super().get_asset_key(dbt_resource_props).with_prefix("prefix")\n\n Adding a prefix to the default asset key generated for each dbt resource, but only for dbt sources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster import AssetKey\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n asset_key = super().get_asset_key(dbt_resource_props)\n\n if dbt_resource_props["resource_type"] == "source":\n asset_key = asset_key.with_prefix("my_prefix")\n\n return asset_key\n """\n return default_asset_key_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster description for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom description for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n str: The description for the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n return "custom description"\n """\n return default_description_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster metadata for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom metadata for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Mapping[str, Any]: A dictionary representing the Dagster metadata for the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n return {"custom": "metadata"}\n """\n return default_metadata_from_dbt_resource_props(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster group name for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom group name for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[str]: A Dagster group name.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n return "custom_group_prefix" + dbt_resource_props.get("config", {}).get("group")\n """\n return default_group_from_dbt_resource_props(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_freshness_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[FreshnessPolicy]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster :py:class:`dagster.FreshnessPolicy` for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom freshness policy for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[FreshnessPolicy]: A Dagster freshness policy.\n\n Examples:\n Set a custom freshness policy for all dbt resources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n return FreshnessPolicy(maximum_lag_minutes=60)\n\n Set a custom freshness policy for dbt resources with a specific tag:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n freshness_policy = None\n if "my_custom_tag" in dbt_resource_props.get("tags", []):\n freshness_policy = FreshnessPolicy(maximum_lag_minutes=60)\n\n return freshness_policy\n """\n return default_freshness_policy_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_auto_materialize_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[AutoMaterializePolicy]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster :py:class:`dagster.AutoMaterializePolicy` for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom auto-materialize policy for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[AutoMaterializePolicy]: A Dagster auto-materialize policy.\n\n Examples:\n Set a custom auto-materialize policy for all dbt resources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n return AutoMaterializePolicy.eager()\n\n Set a custom auto-materialize policy for dbt resources with a specific tag:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n auto_materialize_policy = None\n if "my_custom_tag" in dbt_resource_props.get("tags", []):\n auto_materialize_policy = AutoMaterializePolicy.eager()\n\n return auto_materialize_policy\n\n """\n return default_auto_materialize_policy_fn(dbt_resource_props)
\n\n\nclass KeyPrefixDagsterDbtTranslator(DagsterDbtTranslator):\n """A DagsterDbtTranslator that applies prefixes to the asset keys generated from dbt resources.\n\n Attributes:\n asset_key_prefix (Optional[Union[str, Sequence[str]]]): A prefix to apply to all dbt models,\n seeds, snapshots, etc. This will *not* apply to dbt sources.\n source_asset_key_prefix (Optional[Union[str, Sequence[str]]]): A prefix to apply to all dbt\n sources.\n """\n\n def __init__(\n self,\n asset_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_asset_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *args,\n **kwargs,\n ):\n self._asset_key_prefix = (\n check_opt_coercible_to_asset_key_prefix_param(asset_key_prefix, "asset_key_prefix")\n or []\n )\n self._source_asset_key_prefix = (\n check_opt_coercible_to_asset_key_prefix_param(\n source_asset_key_prefix, "source_asset_key_prefix"\n )\n or []\n )\n\n super().__init__(*args, **kwargs)\n\n @public\n def get_asset_key(self, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n base_key = default_asset_key_fn(dbt_resource_props)\n if dbt_resource_props["resource_type"] == "source":\n return base_key.with_prefix(self._source_asset_key_prefix)\n else:\n return base_key.with_prefix(self._asset_key_prefix)\n\n\n@dataclass\nclass DbtManifestWrapper:\n manifest: Mapping[str, Any]\n
", "current_page_name": "_modules/dagster_dbt/dagster_dbt_translator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dagster_dbt_translator"}, "dbt_manifest_asset_selection": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dbt_manifest_asset_selection

\nfrom typing import AbstractSet, Optional\n\nfrom dagster import (\n    AssetKey,\n    AssetSelection,\n    _check as check,\n)\nfrom dagster._core.definitions.asset_graph import AssetGraph\n\nfrom .asset_utils import is_non_asset_node\nfrom .dagster_dbt_translator import DagsterDbtTranslator\nfrom .dbt_manifest import DbtManifestParam, validate_manifest\nfrom .utils import (\n    ASSET_RESOURCE_TYPES,\n    get_dbt_resource_props_by_dbt_unique_id_from_manifest,\n    select_unique_ids_from_manifest,\n)\n\n\n
[docs]class DbtManifestAssetSelection(AssetSelection):\n """Defines a selection of assets from a dbt manifest wrapper and a dbt selection string.\n\n Args:\n manifest (Mapping[str, Any]): The dbt manifest blob.\n select (str): A dbt selection string to specify a set of dbt resources.\n exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n\n Examples:\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster_dbt import DbtManifestAssetSelection\n\n manifest = json.loads(Path("path/to/manifest.json").read_text())\n\n # select the dbt assets that have the tag "foo".\n my_selection = DbtManifestAssetSelection(manifest=manifest, select="tag:foo")\n """\n\n def __init__(\n self,\n manifest: DbtManifestParam,\n select: str = "fqn:*",\n *,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n exclude: Optional[str] = None,\n ) -> None:\n self.manifest = validate_manifest(manifest)\n self.select = check.str_param(select, "select")\n self.exclude = check.opt_str_param(exclude, "exclude", default="")\n self.dagster_dbt_translator = check.opt_inst_param(\n dagster_dbt_translator,\n "dagster_dbt_translator",\n DagsterDbtTranslator,\n DagsterDbtTranslator(),\n )\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n dbt_nodes = get_dbt_resource_props_by_dbt_unique_id_from_manifest(self.manifest)\n\n keys = set()\n for unique_id in select_unique_ids_from_manifest(\n select=self.select,\n exclude=self.exclude,\n manifest_json=self.manifest,\n ):\n dbt_resource_props = dbt_nodes[unique_id]\n is_dbt_asset = dbt_resource_props["resource_type"] in ASSET_RESOURCE_TYPES\n if is_dbt_asset and not is_non_asset_node(dbt_resource_props):\n asset_key = self.dagster_dbt_translator.get_asset_key(dbt_resource_props)\n keys.add(asset_key)\n\n return keys
\n
", "current_page_name": "_modules/dagster_dbt/dbt_manifest_asset_selection", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dbt_manifest_asset_selection"}, "dbt_resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dbt_resource

\nimport logging\nfrom abc import abstractmethod\nfrom typing import Any, Mapping, Optional, Sequence\n\nfrom dagster import get_dagster_logger\n\nfrom .types import DbtOutput\n\n\nclass DbtClient:\n    """Base class for a client allowing users to interface with dbt."""\n\n    def __init__(\n        self,\n        logger: Optional[logging.Logger] = None,\n    ):\n        """Constructor.\n\n        Args:\n            logger (Optional[Any]): A property for injecting a logger dependency.\n                Default is ``None``.\n        """\n        self._logger = logger or get_dagster_logger()\n\n    def _format_params(\n        self, flags: Mapping[str, Any], replace_underscores: bool = False\n    ) -> Mapping[str, Any]:\n        """Reformats arguments that are easier to express as a list into the format that dbt expects,\n        and deletes and keys with no value.\n        """\n        # remove any keys with a value of None\n        if replace_underscores:\n            flags = {k.replace("_", "-"): v for k, v in flags.items() if v is not None}\n        else:\n            flags = {k: v for k, v in flags.items() if v is not None}\n\n        for param in ["select", "exclude", "models"]:\n            if param in flags:\n                if isinstance(flags[param], list):\n                    # if it's a list, format as space-separated\n                    flags[param] = " ".join(set(flags[param]))\n\n        return flags\n\n    @property\n    def logger(self) -> logging.Logger:\n        """logging.Logger: A property for injecting a logger dependency."""\n        return self._logger\n\n    @abstractmethod\n    def compile(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``compile`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in compilation.\n            exclude (List[str]), optional): the models to exclude from compilation.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def run(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``run`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in the run.\n            exclude (List[str]), optional): the models to exclude from the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def snapshot(\n        self,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def test(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        data: bool = True,\n        schema: bool = True,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``test`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in testing.\n            exclude (List[str], optional): the models to exclude from testing.\n            data (bool, optional): If ``True`` (default), then run data tests.\n            schema (bool, optional): If ``True`` (default), then run schema tests.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def seed(\n        self,\n        show: bool = False,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            show (bool, optional): If ``True``, then show a sample of the seeded data in the\n                response. Defaults to ``False``.\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def ls(\n        self,\n        select: Optional[Sequence[str]] = None,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``ls`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the resources to include in the output.\n            models (List[str], optional): the models to include in the output.\n            exclude (List[str], optional): the resources to exclude from the output.\n\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def build(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtOutput:\n        """Run the ``build`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the models/resources to include in the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n        raise NotImplementedError()\n\n    @abstractmethod\n    def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtOutput:\n        """Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def run_operation(\n        self, macro: str, args: Optional[Mapping[str, Any]] = None, **kwargs\n    ) -> DbtOutput:\n        """Run the ``run-operation`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            macro (str): the dbt macro to invoke.\n            args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def get_run_results_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the run_results.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the run_results json file\n                for this dbt project.\n        """\n\n    @abstractmethod\n    def get_manifest_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the manifest.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n\n\n
[docs]class DbtResource(DbtClient):\n pass
\n
", "current_page_name": "_modules/dagster_dbt/dbt_resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dbt_resource"}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.errors

\nimport warnings\nfrom abc import ABC\nfrom typing import Any, Mapping, Optional, Sequence\n\nfrom dagster import (\n    DagsterInvariantViolationError,\n    Failure,\n    MetadataValue,\n    _check as check,\n)\n\n\n
[docs]class DagsterDbtError(Failure, ABC):\n """The base exception of the ``dagster-dbt`` library."""
\n\n\n
[docs]class DagsterDbtCliUnexpectedOutputError(DagsterDbtError):\n """Represents an error when parsing the output of a dbt CLI command."""\n\n invalid_line_nos: Sequence[int]\n\n def __init__(self, invalid_line_nos: Sequence[int]):\n check.sequence_param(invalid_line_nos, "invalid_line_nos", int)\n line_nos_str = ", ".join(map(str, invalid_line_nos))\n description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"\n metadata = {\n "Invalid CLI Output Line Numbers": MetadataValue.json({"line_nos": invalid_line_nos})\n }\n super().__init__(description, metadata=metadata)\n self.invalid_line_nos = invalid_line_nos
\n\n\n
[docs]class DagsterDbtCliRuntimeError(DagsterDbtError, ABC):\n """Represents an error while executing a dbt CLI command."""\n\n def __init__(\n self,\n description: str,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n if logs is not None:\n warnings.warn(\n "`logs` is a deprecated argument to DagsterDbtCliRuntimeError and will be discarded"\n )\n if raw_output is not None:\n warnings.warn(\n "`raw_output` is a deprecated argument to DagsterDbtCliRuntimeError and will be"\n " discarded"\n )\n metadata = {"Parsed CLI Messages": "\\n".join(messages or [])}\n super().__init__(description, metadata=metadata)
\n\n\n
[docs]class DagsterDbtCliHandledRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a model error reported by the dbt CLI at runtime (return code 1)."""\n\n def __init__(\n self,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n super().__init__("Handled error in the dbt CLI (return code 1)", logs, raw_output, messages)
\n\n\n
[docs]class DagsterDbtCliFatalRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a fatal error in the dbt CLI (return code 2)."""\n\n def __init__(\n self,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n super().__init__(\n "Fatal error in the dbt CLI (return code 2): " + " ".join(messages or []),\n logs,\n raw_output,\n messages,\n )
\n\n\n
[docs]class DagsterDbtCliOutputsNotFoundError(DagsterDbtError):\n """Represents a problem in finding the ``target/run_results.json`` artifact when executing a dbt\n CLI command.\n\n For more details on ``target/run_results.json``, see\n https://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.\n """\n\n def __init__(self, path: str):\n super().__init__(f"Expected to find file at path {path}")
\n\n\nclass DagsterDbtCloudJobInvariantViolationError(DagsterDbtError, DagsterInvariantViolationError):\n """Represents an error when a dbt Cloud job is not supported by the ``dagster-dbt`` library."""\n
", "current_page_name": "_modules/dagster_dbt/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.errors"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.ops

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom .types import DbtOutput\nfrom .utils import generate_events, generate_materializations\n\n_DEFAULT_OP_PROPS: Dict[str, Any] = dict(\n    required_resource_keys={"dbt"},\n    ins={"start_after": In(Nothing)},\n    out=Out(DbtOutput, description="Parsed output from running the dbt command."),\n    tags={"kind": "dbt"},\n)\n\n\ndef _get_doc(op_name: str, dbt_command: str) -> str:\n    return f"""\nThis op executes a ``dbt {dbt_command}`` command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the :py:class:`~dagster_dbt.dbt_cli_resource`).\n\nExamples:\n\n.. code-block:: python\n\n    from dagster import job\n    from dagster_dbt import {op_name}, dbt_cli_resource\n\n    @job(resource_defs={{"dbt":dbt_cli_resource}})\n    def my_dbt_cli_job():\n        {op_name}()\n    """\n\n\n# NOTE: mypy fails to properly track the type of `_DEFAULT_OP_PROPS` items when they are\n# double-splatted, so we type-ignore the below op declarations.\n\n\nclass DbtBuildOpConfig(Config):\n    yield_asset_events: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations and asset observations corresponding to the results of "\n            "the dbt operation will be yielded when the op executes. 
Default: True"\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n@op(**_DEFAULT_OP_PROPS)\ndef dbt_build_op(context, config: DbtBuildOpConfig) -> Any:\n    dbt_output = context.resources.dbt.build()\n    if config.yield_asset_events and "results" in dbt_output.result:\n        yield from generate_events(\n            dbt_output,\n            node_info_to_asset_key=lambda info: config.asset_key_prefix\n            + info["unique_id"].split("."),\n            manifest_json=context.resources.dbt.get_manifest_json(),\n        )\n    yield Output(dbt_output)\n\n\nclass DbtRunOpConfig(Config):\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the dbt operation will "\n            "be yielded when the op executes. Default: True"\n        ),\n    )\n    asset_key_prefix: Optional[List[str]] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_run_op(context, config: DbtRunOpConfig):\n dbt_output = context.resources.dbt.run()\n if config.yield_materializations and "results" in dbt_output.result:\n yield from generate_materializations(dbt_output, asset_key_prefix=config.asset_key_prefix)\n yield Output(dbt_output)
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_compile_op(context):\n return context.resources.dbt.compile()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_ls_op(context):\n return context.resources.dbt.ls()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_test_op(context):\n return context.resources.dbt.test()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_snapshot_op(context):\n return context.resources.dbt.snapshot()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_seed_op(context):\n return context.resources.dbt.seed()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_docs_generate_op(context):\n return context.resources.dbt.generate_docs()
\n\n\nfor dbt_op, cmd in [\n (dbt_build_op, "build"),\n (dbt_run_op, "run"),\n (dbt_compile_op, "compile"),\n (dbt_ls_op, "ls"),\n (dbt_test_op, "test"),\n (dbt_snapshot_op, "snapshot"),\n (dbt_seed_op, "seed"),\n (dbt_docs_generate_op, "docs generate"),\n]:\n dbt_op.__doc__ = _get_doc(dbt_op.name, cmd)\n
", "current_page_name": "_modules/dagster_dbt/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.ops"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.types

\nfrom typing import Any, Mapping, Optional\n\nimport dagster._check as check\n\n\n
[docs]class DbtOutput:\n """Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, `result`, which\n represents the dbt-formatted result of the command that was run (if any).\n\n Used internally, should not be instantiated directly by the user.\n """\n\n def __init__(self, result: Mapping[str, Any]):\n self._result = check.mapping_param(result, "result", key_type=str)\n\n @property\n def result(self) -> Mapping[str, Any]:\n return self._result\n\n @property\n def docs_url(self) -> Optional[str]:\n return None
\n
", "current_page_name": "_modules/dagster_dbt/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.types"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.utils

\nfrom pathlib import Path\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dateutil\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    MetadataValue,\n    Output,\n    _check as check,\n)\nfrom dagster._core.definitions.metadata import RawMetadataValue\n\nfrom .types import DbtOutput\n\n# dbt resource types that may be considered assets\nASSET_RESOURCE_TYPES = ["model", "seed", "snapshot"]\n\n\ndef default_node_info_to_asset_key(node_info: Mapping[str, Any]) -> AssetKey:\n    return AssetKey(node_info["unique_id"].split("."))\n\n\ndef _resource_type(unique_id: str) -> str:\n    # returns the type of the node (e.g. model, test, snapshot)\n    return unique_id.split(".")[0]\n\n\ndef input_name_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n    # * can be present when sources are sharded tables\n    return dbt_resource_props["unique_id"].replace(".", "_").replace("*", "_star")\n\n\ndef output_name_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n    # hyphens are valid in dbt model names, but not in output names\n    return dbt_resource_props["unique_id"].split(".")[-1].replace("-", "_")\n\n\ndef _node_result_to_metadata(node_result: Mapping[str, Any]) -> Mapping[str, RawMetadataValue]:\n    return {\n        "Materialization Strategy": node_result["config"]["materialized"],\n        "Database": node_result["database"],\n        "Schema": node_result["schema"],\n        "Alias": node_result["alias"],\n        "Description": node_result["description"],\n    }\n\n\ndef _timing_to_metadata(timings: Sequence[Mapping[str, Any]]) -> Mapping[str, RawMetadataValue]:\n    metadata: Dict[str, RawMetadataValue] = {}\n    for timing in timings:\n        if timing["name"] == "execute":\n            desc = "Execution"\n        elif timing["name"] == "compile":\n            desc = "Compilation"\n        else:\n     
       continue\n\n        # dateutil does not properly expose its modules to static checkers\n        started_at = dateutil.parser.isoparse(timing["started_at"])  # type: ignore\n        completed_at = dateutil.parser.isoparse(timing["completed_at"])  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                f"{desc} Started At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Completed At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Duration": duration.total_seconds(),\n            }\n        )\n    return metadata\n\n\ndef result_to_events(\n    result: Mapping[str, Any],\n    docs_url: Optional[str] = None,\n    node_info_to_asset_key: Optional[Callable[[Mapping[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Mapping[str, Any]] = None,\n    extra_metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n    generate_asset_outputs: bool = False,\n) -> Iterator[Union[AssetMaterialization, AssetObservation, Output]]:\n    """This is a hacky solution that attempts to consolidate parsing many of the potential formats\n    that dbt can provide its results in. 
This is known to work for CLI Outputs for dbt versions 0.18+,\n    as well as RPC responses for a similar time period, but as the RPC response schema is not documented\n    nor enforced, this can become out of date easily.\n    """\n    node_info_to_asset_key = check.opt_callable_param(\n        node_info_to_asset_key, "node_info_to_asset_key", default=default_node_info_to_asset_key\n    )\n\n    # status comes from set of fields rather than "status"\n    if "fail" in result:\n        status = (\n            "fail"\n            if result.get("fail")\n            else "skip" if result.get("skip") else "error" if result.get("error") else "success"\n        )\n    else:\n        status = result["status"]\n\n    # all versions represent timing the same way\n    metadata = {"Status": status, "Execution Time (seconds)": result["execution_time"]}\n    metadata.update(_timing_to_metadata(result["timing"]))\n\n    # working with a response that contains the node block (RPC and CLI 0.18.x)\n    if "node" in result:\n        unique_id = result["node"]["unique_id"]\n        metadata.update(_node_result_to_metadata(result["node"]))\n    else:\n        unique_id = result["unique_id"]\n\n    if docs_url:\n        metadata["docs_url"] = MetadataValue.url(f"{docs_url}#!/model/{unique_id}")\n\n    if extra_metadata:\n        metadata.update(extra_metadata)\n\n    # if you have a manifest available, get the full node info, otherwise just populate unique_id\n    dbt_resource_props = (\n        manifest_json["nodes"][unique_id] if manifest_json else {"unique_id": unique_id}\n    )\n\n    node_resource_type = _resource_type(unique_id)\n\n    if node_resource_type in ASSET_RESOURCE_TYPES and status == "success":\n        if generate_asset_outputs:\n            yield Output(\n                value=None,\n                output_name=output_name_fn(dbt_resource_props),\n                metadata=metadata,\n            )\n        else:\n            yield AssetMaterialization(\n                
asset_key=node_info_to_asset_key(dbt_resource_props),\n                description=f"dbt node: {unique_id}",\n                metadata=metadata,\n            )\n    # can only associate tests with assets if we have manifest_json available\n    elif node_resource_type == "test" and manifest_json and status != "skipped":\n        upstream_unique_ids = manifest_json["nodes"][unique_id]["depends_on"]["nodes"]\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            # the upstream id can reference a node or a source\n            dbt_resource_props = manifest_json["nodes"].get(upstream_id) or manifest_json[\n                "sources"\n            ].get(upstream_id)\n            if dbt_resource_props is None:\n                continue\n            upstream_asset_key = node_info_to_asset_key(dbt_resource_props)\n            yield AssetObservation(\n                asset_key=upstream_asset_key,\n                metadata={\n                    "Test ID": result["unique_id"],\n                    "Test Status": status,\n                    "Test Message": result.get("message") or "",\n                },\n            )\n\n\ndef generate_events(\n    dbt_output: DbtOutput,\n    node_info_to_asset_key: Optional[Callable[[Mapping[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Mapping[str, Any]] = None,\n) -> Iterator[Union[AssetMaterialization, AssetObservation]]:\n    """This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n    a dbt command, and :py:class:`dagster.AssetObservation` events for each test run.\n\n    Information parsed from a :py:class:`~DbtOutput` object.\n    """\n    for result in dbt_output.result["results"]:\n        for event in result_to_events(\n            result,\n            docs_url=dbt_output.docs_url,\n            node_info_to_asset_key=node_info_to_asset_key,\n            manifest_json=manifest_json,\n        ):\n            yield 
check.inst(\n                cast(Union[AssetMaterialization, AssetObservation], event),\n                (AssetMaterialization, AssetObservation),\n            )\n\n\n
[docs]def generate_materializations(\n dbt_output: DbtOutput,\n asset_key_prefix: Optional[Sequence[str]] = None,\n) -> Iterator[AssetMaterialization]:\n """This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n a dbt command.\n\n Information parsed from a :py:class:`~DbtOutput` object.\n\n Examples:\n .. code-block:: python\n\n from dagster import op, Output\n from dagster_dbt.utils import generate_materializations\n from dagster_dbt import dbt_cli_resource\n\n @op(required_resource_keys={"dbt"})\n def my_custom_dbt_run(context):\n dbt_output = context.resources.dbt.run()\n for materialization in generate_materializations(dbt_output):\n # you can modify the materialization object to add extra metadata, if desired\n yield materialization\n yield Output(my_dbt_output)\n\n @job(resource_defs={{"dbt":dbt_cli_resource}})\n def my_dbt_cli_job():\n my_custom_dbt_run()\n """\n asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n for event in generate_events(\n dbt_output,\n node_info_to_asset_key=lambda info: AssetKey(\n asset_key_prefix + info["unique_id"].split(".")\n ),\n ):\n yield check.inst(cast(AssetMaterialization, event), AssetMaterialization)
\n\n\ndef select_unique_ids_from_manifest(\n select: str,\n exclude: str,\n state_path: Optional[str] = None,\n manifest_json_path: Optional[str] = None,\n manifest_json: Optional[Mapping[str, Any]] = None,\n manifest_parsed: Optional[Any] = None,\n) -> AbstractSet[str]:\n """Method to apply a selection string to an existing manifest.json file."""\n import dbt.graph.cli as graph_cli\n import dbt.graph.selector as graph_selector\n from dbt.contracts.graph.manifest import Manifest, WritableManifest\n from dbt.contracts.state import PreviousState\n from dbt.graph.selector_spec import IndirectSelection, SelectionSpec\n from networkx import DiGraph\n\n if state_path is not None:\n previous_state = PreviousState(\n path=Path(state_path), # type: ignore # (unused path, slated for deletion)\n current_path=( # type: ignore # (unused path, slated for deletion)\n Path("/tmp/null") if manifest_json_path is None else Path(manifest_json_path)\n ),\n )\n else:\n previous_state = None\n\n if manifest_json_path is not None:\n manifest = WritableManifest.read_and_check_versions(manifest_json_path)\n child_map = manifest.child_map\n elif manifest_json is not None:\n\n class _DictShim(dict):\n """Shim to enable hydrating a dictionary into a dot-accessible object."""\n\n def __getattr__(self, item):\n ret = super().get(item)\n # allow recursive access e.g. 
foo.bar.baz\n return _DictShim(ret) if isinstance(ret, dict) else ret\n\n manifest = Manifest(\n # dbt expects dataclasses that can be accessed with dot notation, not bare dictionaries\n nodes={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["nodes"].items() # type: ignore\n },\n sources={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["sources"].items() # type: ignore\n },\n metrics={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["metrics"].items() # type: ignore\n },\n exposures={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["exposures"].items() # type: ignore\n },\n )\n child_map = manifest_json["child_map"]\n elif manifest_parsed is not None:\n manifest = manifest_parsed\n child_map = manifest.child_map\n else:\n check.failed("Must provide either a manifest_json_path, manifest_json, or manifest_parsed.")\n graph = graph_selector.Graph(DiGraph(incoming_graph_data=child_map))\n\n # create a parsed selection from the select string\n try:\n from dbt.flags import GLOBAL_FLAGS\n except ImportError:\n # dbt < 1.5.0 compat\n import dbt.flags as GLOBAL_FLAGS\n setattr(GLOBAL_FLAGS, "INDIRECT_SELECTION", IndirectSelection.Eager)\n setattr(GLOBAL_FLAGS, "WARN_ERROR", True)\n parsed_spec: SelectionSpec = graph_cli.parse_union([select], True)\n\n if exclude:\n parsed_spec = graph_cli.SelectionDifference(\n components=[parsed_spec, graph_cli.parse_union([exclude], True)]\n )\n\n # execute this selection against the graph\n selector = graph_selector.NodeSelector(graph, manifest, previous_state=previous_state)\n selected, _ = selector.select_nodes(parsed_spec)\n return selected\n\n\ndef get_dbt_resource_props_by_dbt_unique_id_from_manifest(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n """A mapping of a dbt node's unique id to the node's dictionary representation in the manifest."""\n return {\n **manifest["nodes"],\n **manifest["sources"],\n **manifest["exposures"],\n 
**manifest["metrics"],\n }\n
", "current_page_name": "_modules/dagster_dbt/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.utils"}}, "dagster_docker": {"docker_executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.docker_executor

\nfrom typing import Iterator, Optional, cast\n\nimport dagster._check as check\nimport docker\nimport docker.errors\nfrom dagster import Field, IntSource, executor\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.executor.init import InitExecutorContext\nfrom dagster._core.executor.step_delegating import StepDelegatingExecutor\nfrom dagster._core.executor.step_delegating.step_handler.base import (\n    CheckStepHealthResult,\n    StepHandler,\n    StepHandlerContext,\n)\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.utils import parse_env_var\nfrom dagster._grpc.types import ExecuteStepArgs\nfrom dagster._serdes.utils import hash_str\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom .container_context import DockerContainerContext\n\n\n
[docs]@executor(\n name="docker",\n config_schema=merge_dicts(\n DOCKER_CONFIG_SCHEMA,\n {\n "retries": get_retries_config(),\n "max_concurrent": Field(\n IntSource,\n is_required=False,\n description=(\n "Limit on the number of containers that will run concurrently within the scope "\n "of a Dagster run. Note that this limit is per run, not global."\n ),\n ),\n "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n },\n ),\n requirements=multiple_process_executor_requirements(),\n)\n@experimental\ndef docker_executor(init_context: InitExecutorContext) -> Executor:\n """Executor which launches steps as Docker containers.\n\n To use the `docker_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_executor.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n registry: ...\n network: ...\n networks: ...\n container_kwargs: ...\n\n If you're using the DockerRunLauncher, configuration set on the containers created by the run\n launcher will also be set on the containers that are created for each step.\n """\n config = init_context.executor_config\n image = check.opt_str_elem(config, "image")\n registry = check.opt_dict_elem(config, "registry", key_type=str)\n env_vars = check.opt_list_elem(config, "env_vars", of_type=str)\n network = check.opt_str_elem(config, "network")\n networks = check.opt_list_elem(config, "networks", of_type=str)\n container_kwargs = check.opt_dict_elem(config, "container_kwargs", key_type=str)\n retries = check.dict_elem(config, "retries", key_type=str)\n max_concurrent = check.opt_int_elem(config, "max_concurrent")\n tag_concurrency_limits = check.opt_list_elem(config, "tag_concurrency_limits")\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network 
and not networks:\n networks = [network]\n\n container_context = DockerContainerContext(\n registry=registry,\n env_vars=env_vars or [],\n networks=networks or [],\n container_kwargs=container_kwargs,\n )\n\n return StepDelegatingExecutor(\n DockerStepHandler(image, container_context),\n retries=check.not_none(RetryMode.from_config(retries)),\n max_concurrent=max_concurrent,\n tag_concurrency_limits=tag_concurrency_limits,\n )
\n\n\nclass DockerStepHandler(StepHandler):\n def __init__(\n self,\n image: Optional[str],\n container_context: DockerContainerContext,\n ):\n super().__init__()\n\n self._image = check.opt_str_param(image, "image")\n self._container_context = check.inst_param(\n container_context, "container_context", DockerContainerContext\n )\n\n def _get_image(self, step_handler_context: StepHandlerContext):\n from . import DockerRunLauncher\n\n image = cast(\n JobPythonOrigin, step_handler_context.dagster_run.job_code_origin\n ).repository_origin.container_image\n if not image:\n image = self._image\n\n run_launcher = step_handler_context.instance.run_launcher\n\n if not image and isinstance(run_launcher, DockerRunLauncher):\n image = run_launcher.image\n\n if not image:\n raise Exception("No docker image specified by the executor config or repository")\n\n return image\n\n def _get_docker_container_context(self, step_handler_context: StepHandlerContext):\n # This doesn't vary per step: would be good to have a hook where it can be set once\n # for the whole StepHandler but we need access to the DagsterRun for that\n\n from .docker_run_launcher import DockerRunLauncher\n\n run_launcher = step_handler_context.instance.run_launcher\n run_target = DockerContainerContext.create_for_run(\n step_handler_context.dagster_run,\n run_launcher if isinstance(run_launcher, DockerRunLauncher) else None,\n )\n\n merged_container_context = run_target.merge(self._container_context)\n\n validate_docker_config(\n network=None,\n networks=merged_container_context.networks,\n container_kwargs=merged_container_context.container_kwargs,\n )\n\n return merged_container_context\n\n @property\n def name(self) -> str:\n return "DockerStepHandler"\n\n def _get_client(self, docker_container_context: DockerContainerContext):\n client = docker.client.from_env()\n if docker_container_context.registry:\n client.login(\n registry=docker_container_context.registry["url"],\n 
username=docker_container_context.registry["username"],\n password=docker_container_context.registry["password"],\n )\n return client\n\n def _get_container_name(self, execute_step_args: ExecuteStepArgs):\n run_id = execute_step_args.run_id\n step_keys_to_execute = check.not_none(execute_step_args.step_keys_to_execute)\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n step_name = f"dagster-step-{hash_str(run_id + step_key)}"\n\n if execute_step_args.known_state:\n retry_state = execute_step_args.known_state.get_retry_state()\n retry_number = retry_state.get_attempt_count(step_key)\n if retry_number:\n step_name = f"{step_name}-{retry_number}"\n\n return step_name\n\n def _create_step_container(\n self,\n client,\n container_context,\n step_image,\n step_handler_context: StepHandlerContext,\n ):\n execute_step_args = step_handler_context.execute_step_args\n step_keys_to_execute = check.not_none(execute_step_args.step_keys_to_execute)\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n env_vars["DAGSTER_RUN_JOB_NAME"] = step_handler_context.dagster_run.job_name\n env_vars["DAGSTER_RUN_STEP_KEY"] = step_key\n return client.containers.create(\n step_image,\n name=self._get_container_name(execute_step_args),\n detach=True,\n network=container_context.networks[0] if len(container_context.networks) else None,\n command=execute_step_args.get_command_args(),\n environment=env_vars,\n **container_context.container_kwargs,\n )\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n step_image = self._get_image(step_handler_context)\n 
validate_docker_image(step_image)\n\n try:\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context\n )\n except docker.errors.ImageNotFound:\n client.images.pull(step_image)\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(step_container)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n yield DagsterEvent.step_worker_starting(\n step_handler_context.get_step_context(step_key),\n message="Launching step in Docker container.",\n metadata={\n "Docker container id": step_container.id,\n },\n )\n step_container.start()\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> CheckStepHealthResult:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n container_name = self._get_container_name(step_handler_context.execute_step_args)\n\n container = client.containers.get(container_name)\n\n if container.status == "running":\n return CheckStepHealthResult.healthy()\n\n try:\n container_info = container.wait(timeout=0.1)\n except Exception as e:\n raise Exception(\n f"Container status is {container.status}. Raised exception attempting to get its"\n " return code."\n ) from e\n\n ret_code = container_info.get("StatusCode")\n if ret_code == 0:\n return CheckStepHealthResult.healthy()\n\n return CheckStepHealthResult.unhealthy(\n reason=f"Container status is {container.status}. 
Return code is {ret_code}."\n )\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert (\n len(step_keys_to_execute) == 1\n ), "Terminating multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n container_name = self._get_container_name(step_handler_context.execute_step_args)\n\n yield DagsterEvent.engine_event(\n step_handler_context.get_step_context(step_key),\n message=f"Stopping Docker container {container_name} for step.",\n event_specific_data=EngineEventData(),\n )\n\n client = self._get_client(container_context)\n\n container = client.containers.get(container_name)\n\n container.stop()\n
", "current_page_name": "_modules/dagster_docker/docker_executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.docker_executor"}, "docker_run_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.docker_run_launcher

\nfrom typing import Any, Mapping, Optional\n\nimport dagster._check as check\nimport docker\nfrom dagster._core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    ResumeRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._core.utils import parse_env_var\nfrom dagster._grpc.types import ExecuteRunArgs, ResumeRunArgs\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom typing_extensions import Self\n\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom .container_context import DockerContainerContext\n\nDOCKER_CONTAINER_ID_TAG = "docker/container_id"\n\n\n
[docs]class DockerRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs in a Docker container."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n image=None,\n registry=None,\n env_vars=None,\n network=None,\n networks=None,\n container_kwargs=None,\n ):\n self._inst_data = inst_data\n self.image = image\n self.registry = registry\n self.env_vars = env_vars\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network:\n self.networks = [network]\n elif networks:\n self.networks = networks\n else:\n self.networks = []\n\n self.container_kwargs = check.opt_dict_param(\n container_kwargs, "container_kwargs", key_type=str\n )\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return DOCKER_CONFIG_SCHEMA\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DockerRunLauncher(inst_data=inst_data, **config_value)\n\n def get_container_context(self, dagster_run: DagsterRun) -> DockerContainerContext:\n return DockerContainerContext.create_for_run(dagster_run, self)\n\n def _get_client(self, container_context: DockerContainerContext):\n client = docker.client.from_env()\n if container_context.registry:\n client.login(\n registry=container_context.registry["url"],\n username=container_context.registry["username"],\n password=container_context.registry["password"],\n )\n return client\n\n def _get_docker_image(self, job_code_origin):\n docker_image = job_code_origin.repository_origin.container_image\n\n if not docker_image:\n docker_image = self.image\n\n if not docker_image:\n raise Exception("No docker image specified by the instance config or repository")\n\n validate_docker_image(docker_image)\n return docker_image\n\n def _launch_container_with_command(self, run, docker_image, command):\n container_context = self.get_container_context(run)\n docker_env = 
dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n docker_env["DAGSTER_RUN_JOB_NAME"] = run.job_name\n\n client = self._get_client(container_context)\n\n try:\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n except docker.errors.ImageNotFound:\n client.images.pull(docker_image)\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n self._instance.report_engine_event(\n message=f"Launching run in a new container {container.id} with image {docker_image}",\n dagster_run=run,\n cls=self.__class__,\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_CONTAINER_ID_TAG: container.id, DOCKER_IMAGE_TAG: docker_image},\n )\n\n container.start()\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n job_code_origin = check.not_none(context.job_code_origin)\n docker_image = self._get_docker_image(job_code_origin)\n\n command = ExecuteRunArgs(\n job_origin=job_code_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.dagster_run\n job_code_origin = check.not_none(context.job_code_origin)\n docker_image = self._get_docker_image(job_code_origin)\n\n command = ResumeRunArgs(\n 
job_origin=job_code_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n def _get_container(self, run):\n if not run or run.is_finished:\n return None\n\n container_id = run.tags.get(DOCKER_CONTAINER_ID_TAG)\n\n if not container_id:\n return None\n\n container_context = self.get_container_context(run)\n\n try:\n return self._get_client(container_context).containers.get(container_id)\n except Exception:\n return None\n\n def terminate(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n container = self._get_container(run)\n\n if not container:\n self._instance.report_engine_event(\n message="Unable to get docker container to send termination request to.",\n dagster_run=run,\n cls=self.__class__,\n )\n return False\n\n container.stop()\n\n return True\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: DagsterRun):\n container = self._get_container(run)\n if container is None:\n return CheckRunHealthResult(WorkerStatus.NOT_FOUND)\n if container.status == "running":\n return CheckRunHealthResult(WorkerStatus.RUNNING)\n return CheckRunHealthResult(\n WorkerStatus.FAILED, msg=f"Container status is {container.status}"\n )
\n
", "current_page_name": "_modules/dagster_docker/docker_run_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.docker_run_launcher"}, "ops": {"docker_container_op": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.ops.docker_container_op

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport docker\nfrom dagster import Field, In, Nothing, OpExecutionContext, StringSource, op\nfrom dagster._annotations import experimental\nfrom dagster._core.utils import parse_env_var\nfrom dagster._serdes.utils import hash_str\n\nfrom ..container_context import DockerContainerContext\nfrom ..docker_run_launcher import DockerRunLauncher\nfrom ..utils import DOCKER_CONFIG_SCHEMA, validate_docker_image\n\nDOCKER_CONTAINER_OP_CONFIG = {\n    **DOCKER_CONFIG_SCHEMA,\n    "image": Field(\n        StringSource,\n        is_required=True,\n        description="The image in which to run the Docker container.",\n    ),\n    "entrypoint": Field(\n        [str],\n        is_required=False,\n        description="The ENTRYPOINT for the Docker container",\n    ),\n    "command": Field(\n        [str],\n        is_required=False,\n        description="The command to run in the container within the launched Docker container.",\n    ),\n}\n\n\ndef _get_client(docker_container_context: DockerContainerContext):\n    client = docker.client.from_env()\n    if docker_container_context.registry:\n        client.login(\n            registry=docker_container_context.registry["url"],\n            username=docker_container_context.registry["username"],\n            password=docker_container_context.registry["password"],\n        )\n    return client\n\n\ndef _get_container_name(run_id, op_name, retry_number):\n    container_name = hash_str(run_id + op_name)\n\n    if retry_number > 0:\n        container_name = f"{container_name}-{retry_number}"\n\n    return container_name\n\n\ndef _create_container(\n    op_context: OpExecutionContext,\n    client,\n    container_context: DockerContainerContext,\n    image: str,\n    entrypoint: Optional[Sequence[str]],\n    command: Optional[Sequence[str]],\n):\n    env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n    return client.containers.create(\n        
image,\n        name=_get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),\n        detach=True,\n        network=container_context.networks[0] if len(container_context.networks) else None,\n        entrypoint=entrypoint,\n        command=command,\n        environment=env_vars,\n        **container_context.container_kwargs,\n    )\n\n\n
[docs]@experimental\ndef execute_docker_container(\n context: OpExecutionContext,\n image: str,\n entrypoint: Optional[Sequence[str]] = None,\n command: Optional[Sequence[str]] = None,\n networks: Optional[Sequence[str]] = None,\n registry: Optional[Mapping[str, str]] = None,\n env_vars: Optional[Sequence[str]] = None,\n container_kwargs: Optional[Mapping[str, Any]] = None,\n):\n """This function is a utility for executing a Docker container from within a Dagster op.\n\n Args:\n image (str): The image to use for the launched Docker container.\n entrypoint (Optional[Sequence[str]]): The ENTRYPOINT to run in the launched Docker\n container. Default: None.\n command (Optional[Sequence[str]]): The CMD to run in the launched Docker container.\n Default: None.\n networks (Optional[Sequence[str]]): Names of the Docker networks to which to connect the\n launched container. Default: None.\n registry: (Optional[Mapping[str, str]]): Information for using a non local/public Docker\n registry. Can have "url", "username", or "password" keys.\n env_vars (Optional[Sequence[str]]): List of environemnt variables to include in the launched\n container. ach can be of the form KEY=VALUE or just KEY (in which case the value will be\n pulled from the calling environment.\n container_kwargs (Optional[Dict[str[Any]]]): key-value pairs that can be passed into\n containers.create in the Docker Python API. 
See\n https://docker-py.readthedocs.io/en/stable/containers.html for the full list\n of available options.\n """\n run_container_context = DockerContainerContext.create_for_run(\n context.dagster_run,\n (\n context.instance.run_launcher\n if isinstance(context.instance.run_launcher, DockerRunLauncher)\n else None\n ),\n )\n\n validate_docker_image(image)\n\n op_container_context = DockerContainerContext(\n registry=registry, env_vars=env_vars, networks=networks, container_kwargs=container_kwargs\n )\n\n container_context = run_container_context.merge(op_container_context)\n\n client = _get_client(container_context)\n\n try:\n container = _create_container(\n context, client, container_context, image, entrypoint, command\n )\n except docker.errors.ImageNotFound:\n client.images.pull(image)\n container = _create_container(\n context, client, container_context, image, entrypoint, command\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n container.start()\n\n for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):\n print(line) # noqa: T201\n\n exit_status = container.wait()["StatusCode"]\n\n if exit_status != 0:\n raise Exception(f"Docker container returned exit code {exit_status}")
\n\n\n
[docs]@op(ins={"start_after": In(Nothing)}, config_schema=DOCKER_CONTAINER_OP_CONFIG)\n@experimental\ndef docker_container_op(context):\n """An op that runs a Docker container using the docker Python API.\n\n Contrast with the `docker_executor`, which runs each Dagster op in a Dagster job in its\n own Docker container.\n\n This op may be useful when:\n - You need to orchestrate a command that isn't a Dagster op (or isn't written in Python)\n - You want to run the rest of a Dagster job using a specific executor, and only a single\n op in docker.\n\n For example:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_docker_container_op.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n You can create your own op with the same implementation by calling the `execute_docker_container` function\n inside your own op.\n """\n execute_docker_container(context, **context.op_config)
\n
", "current_page_name": "_modules/dagster_docker/ops/docker_container_op", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.ops.docker_container_op"}}, "pipes": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.pipes

\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Mapping, Optional, Sequence, Union\n\nimport docker\nfrom dagster import (\n    OpExecutionContext,\n    ResourceParam,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom dagster._core.pipes.client import (\n    PipesClient,\n    PipesClientCompletedInvocation,\n    PipesContextInjector,\n    PipesMessageReader,\n)\nfrom dagster._core.pipes.context import (\n    PipesMessageHandler,\n)\nfrom dagster._core.pipes.utils import (\n    PipesEnvContextInjector,\n    extract_message_or_forward_to_stdout,\n    open_pipes_session,\n)\nfrom dagster_pipes import (\n    DagsterPipesError,\n    PipesDefaultMessageWriter,\n    PipesExtras,\n    PipesParams,\n)\n\n\n
[docs]@experimental\nclass PipesDockerLogsMessageReader(PipesMessageReader):\n @contextmanager\n def read_messages(\n self,\n handler: PipesMessageHandler,\n ) -> Iterator[PipesParams]:\n self._handler = handler\n try:\n yield {PipesDefaultMessageWriter.STDIO_KEY: PipesDefaultMessageWriter.STDERR}\n finally:\n self._handler = None\n\n def consume_docker_logs(self, container) -> None:\n handler = check.not_none(\n self._handler, "Can only consume logs within context manager scope."\n )\n for log_line in container.logs(stdout=True, stderr=True, stream=True, follow=True):\n if isinstance(log_line, bytes):\n log_entry = log_line.decode("utf-8")\n elif isinstance(log_line, str):\n log_entry = log_line\n else:\n continue\n\n extract_message_or_forward_to_stdout(handler, log_entry)\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to read messages by extracting them from docker logs directly."
\n\n\n@experimental\nclass _PipesDockerClient(PipesClient):\n """A pipes client that runs external processes in docker containers.\n\n By default context is injected via environment variables and messages are parsed out of the\n log stream, with other logs forwarded to stdout of the orchestration process.\n\n Args:\n env (Optional[Mapping[str, str]]): An optional dict of environment variables to pass to the\n container.\n register (Optional[Mapping[str, str]]): An optional dict of registry credentials to login to\n the docker client.\n context_injector (Optional[PipesContextInjector]): A context injector to use to inject\n context into the docker container process. Defaults to :py:class:`PipesEnvContextInjector`.\n message_reader (Optional[PipesContextInjector]): A message reader to use to read messages\n from the docker container process. Defaults to :py:class:`DockerLogsMessageReader`.\n """\n\n def __init__(\n self,\n env: Optional[Mapping[str, str]] = None,\n registry: Optional[Mapping[str, str]] = None,\n context_injector: Optional[PipesContextInjector] = None,\n message_reader: Optional[PipesMessageReader] = None,\n ):\n self.env = check.opt_mapping_param(env, "env", key_type=str, value_type=str)\n self.registry = check.opt_mapping_param(registry, "registry", key_type=str, value_type=str)\n self.context_injector = (\n check.opt_inst_param(\n context_injector,\n "context_injector",\n PipesContextInjector,\n )\n or PipesEnvContextInjector()\n )\n\n self.message_reader = (\n check.opt_inst_param(message_reader, "message_reader", PipesMessageReader)\n or PipesDockerLogsMessageReader()\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def run(\n self,\n *,\n context: OpExecutionContext,\n image: str,\n extras: Optional[PipesExtras] = None,\n command: Optional[Union[str, Sequence[str]]] = None,\n env: Optional[Mapping[str, str]] = None,\n registry: Optional[Mapping[str, str]] = None,\n container_kwargs: Optional[Mapping[str, Any]] = 
None,\n ) -> PipesClientCompletedInvocation:\n """Create a docker container and run it to completion, enriched with the pipes protocol.\n\n Args:\n image (str):\n The image for the container to use.\n command (Optional[Union[str, Sequence[str]]]):\n The command for the container use.\n env (Optional[Mapping[str,str]]):\n A mapping of environment variable names to values to set on the first\n container in the pod spec, on top of those configured on resource.\n registry (Optional[Mapping[str, str]]:\n A mapping containing url, username, and password to be used\n with docker client login.\n container_kwargs (Optional[Mapping[str, Any]]:\n Arguments to be forwarded to docker client containers.create.\n extras (Optional[PipesExtras]):\n Extra values to pass along as part of the ext protocol.\n context_injector (Optional[PipesContextInjector]):\n Override the default ext protocol context injection.\n message_reader (Optional[PipesMessageReader]):\n Override the default ext protocol message reader.\n\n Returns:\n PipesClientCompletedInvocation: Wrapper containing results reported by the external\n process.\n """\n with open_pipes_session(\n context=context,\n context_injector=self.context_injector,\n message_reader=self.message_reader,\n extras=extras,\n ) as pipes_session:\n client = docker.client.from_env()\n registry = registry or self.registry\n if registry:\n client.login(\n registry=registry["url"],\n username=registry["username"],\n password=registry["password"],\n )\n\n try:\n container = self._create_container(\n client=client,\n image=image,\n command=command,\n env=env,\n open_pipes_session_env=pipes_session.get_bootstrap_env_vars(),\n container_kwargs=container_kwargs,\n )\n except docker.errors.ImageNotFound:\n client.images.pull(image)\n container = self._create_container(\n client=client,\n image=image,\n command=command,\n env=env,\n open_pipes_session_env=pipes_session.get_bootstrap_env_vars(),\n container_kwargs=container_kwargs,\n )\n\n result = 
container.start()\n try:\n if isinstance(self.message_reader, PipesDockerLogsMessageReader):\n self.message_reader.consume_docker_logs(container)\n\n result = container.wait()\n if result["StatusCode"] != 0:\n raise DagsterPipesError(f"Container exited with non-zero status code: {result}")\n finally:\n container.stop()\n return PipesClientCompletedInvocation(tuple(pipes_session.get_results()))\n\n def _create_container(\n self,\n client,\n image: str,\n command: Optional[Union[str, Sequence[str]]],\n env: Optional[Mapping[str, str]],\n container_kwargs: Optional[Mapping[str, Any]],\n open_pipes_session_env: Mapping[str, str],\n ):\n kwargs = dict(container_kwargs or {})\n kwargs_env = kwargs.pop("environment", {})\n return client.containers.create(\n image=image,\n command=command,\n detach=True,\n environment={\n **open_pipes_session_env,\n **(self.env or {}),\n **(env or {}),\n **kwargs_env,\n },\n **kwargs,\n )\n\n\nPipesDockerClient = ResourceParam[_PipesDockerClient]\n
", "current_page_name": "_modules/dagster_docker/pipes", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.pipes"}}, "dagster_duckdb": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb.io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Optional, Sequence, Type, cast\n\nimport duckdb\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._config.pythonic_config import ConfigurableIOManagerFactory\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._utils.backoff import backoff\nfrom pydantic import Field\n\nDUCKDB_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_duckdb_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an IO manager definition that reads inputs from and writes outputs to DuckDB.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n DuckDB tables and an in-memory type - e.g. a Pandas DataFrame. If only\n one DbTypeHandler is provided, it will be used as teh default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb import build_duckdb_io_manager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n duckdb_io_manager = build_duckdb_io_manager([DuckDBPandasTypeHandler()])\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key. For ops, the schema can be\n specified by including a "schema" entry in output metadata. If none of these is provided, the schema will\n default to "public".\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame):\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=DuckDBIOManager.to_config_schema())\n def duckdb_io_manager(init_context):\n """IO Manager for storing outputs in a DuckDB database.\n\n Assets will be stored in the schema and table name specified by their AssetKey.\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n Op outputs will be stored in the schema specified by output metadata (defaults to public) in a\n table of the name of the output.\n """\n return DbIOManager(\n type_handlers=type_handlers,\n db_client=DuckDbClient(),\n io_manager_name="DuckDBIOManager",\n database=init_context.resource_config["database"],\n schema=init_context.resource_config.get("schema"),\n default_load_type=default_load_type,\n )\n\n return duckdb_io_manager
\n\n\n
[docs]class DuckDBIOManager(ConfigurableIOManagerFactory):\n """Base class for an IO manager definition that reads inputs from and writes outputs to DuckDB.\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If none\n of these is provided, the schema will default to "public".\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame):\n # my_table will just contain the data from column "a"\n ...\n\n Set DuckDB configuration options using the config field. See\n https://duckdb.org/docs/sql/configuration.html for all available settings.\n\n .. 
code-block:: python\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb",\n config={"arrow_large_buffer_size": True})}\n )\n\n """\n\n database: str = Field(description="Path to the DuckDB database.")\n config: Dict[str, Any] = Field(description="DuckDB configuration options.", default={})\n schema_: Optional[str] = Field(\n default=None, alias="schema", description="Name of the schema to use."\n ) # schema is a reserved word for pydantic\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]: ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return None\n\n def create_io_manager(self, context) -> DbIOManager:\n return DbIOManager(\n db_client=DuckDbClient(),\n database=self.database,\n schema=self.schema_,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n io_manager_name="DuckDBIOManager",\n )
\n\n\nclass DuckDbClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.execute(_get_cleanup_statement(table_slice))\n except duckdb.CatalogException:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n connection.execute(f"create schema if not exists {table_slice.schema};")\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = f"SELECT {col_str} FROM {table_slice.schema}.{table_slice.table} WHERE\\n"\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM {table_slice.schema}.{table_slice.table}"""\n\n @staticmethod\n @contextmanager\n def connect(context, _):\n conn = backoff(\n fn=duckdb.connect,\n retry_on=(RuntimeError, duckdb.IOException),\n kwargs={\n "database": context.resource_config["database"],\n "read_only": False,\n "config": context.resource_config["config"],\n },\n max_retries=10,\n )\n\n yield conn\n\n conn.close()\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = f"DELETE FROM {table_slice.schema}.{table_slice.table} WHERE\\n"\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"DELETE FROM {table_slice.schema}.{table_slice.table}"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if 
isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(DUCKDB_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(DUCKDB_DATETIME_FORMAT)\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_duckdb/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb.io_manager"}, "resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb.resource

\nfrom contextlib import contextmanager\nfrom typing import Any, Dict\n\nimport duckdb\nfrom dagster import ConfigurableResource\nfrom dagster._utils.backoff import backoff\nfrom pydantic import Field\n\n\n
[docs]class DuckDBResource(ConfigurableResource):\n """Resource for interacting with a DuckDB database.\n\n Examples:\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_duckdb import DuckDBResource\n\n @asset\n def my_table(duckdb: DuckDBResource):\n with duckdb.get_connection() as conn:\n conn.execute("SELECT * from MY_SCHEMA.MY_TABLE")\n\n defs = Definitions(\n assets=[my_table],\n resources={"duckdb": DuckDBResource(database="path/to/db.duckdb")}\n )\n\n """\n\n database: str = Field(\n description=(\n "Path to the DuckDB database. Setting database=':memory:' will use an in-memory"\n " database "\n )\n )\n config: Dict[str, Any] = Field(description="DuckDB configuration options.", default={})\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def get_connection(self):\n conn = backoff(\n fn=duckdb.connect,\n retry_on=(RuntimeError, duckdb.IOException),\n kwargs={"database": self.database, "read_only": False, "config": self.config},\n max_retries=10,\n )\n\n yield conn\n\n conn.close()
\n
", "current_page_name": "_modules/dagster_duckdb/resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb.resource"}}, "dagster_duckdb_pandas": {"duckdb_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_pandas.duckdb_pandas_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pandas as pd\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import (\n    DuckDbClient,\n    DuckDBIOManager,\n    build_duckdb_io_manager,\n)\n\n\n
[docs]class DuckDBPandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Stores and loads Pandas DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ):\n """Stores the pandas DataFrame in duckdb."""\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " obj;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from obj"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) # type: ignore # (bad stubs)\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n """Loads the input as a Pandas DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n return connection.execute(DuckDbClient.get_select_statement(table_slice)).fetchdf()\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nduckdb_pandas_io_manager = build_duckdb_io_manager(\n [DuckDBPandasTypeHandler()], default_load_type=pd.DataFrame\n)\nduckdb_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\nusing the duckdb_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_pandas import duckdb_pandas_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_pandas_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPandasIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\n using the DuckDBPandasIOManager, any inputs and outputs without type annotations will be loaded\n as Pandas DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_pandas import DuckDBPandasIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPandasIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_pandas/duckdb_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_pandas.duckdb_pandas_type_handler"}}, "dagster_duckdb_polars": {"duckdb_polars_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_polars.duckdb_polars_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport polars as pl\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import DuckDbClient, DuckDBIOManager, build_duckdb_io_manager\n\n\n
[docs]class DuckDBPolarsTypeHandler(DbTypeHandler[pl.DataFrame]):\n """Stores and loads Polars DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_polars import DuckDBPolarsTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPolarsTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pl.DataFrame, connection\n ):\n """Stores the polars DataFrame in duckdb."""\n obj_arrow = obj.to_arrow() # noqa: F841 # need obj_arrow symbol to exist for duckdb query\n connection.execute(f"create schema if not exists {table_slice.schema};")\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " obj_arrow;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from obj_arrow"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype))\n for name, dtype in zip(obj.columns, obj.dtypes)\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pl.DataFrame:\n """Loads the input as a Polars DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pl.DataFrame()\n select_statement = connection.execute(\n DuckDbClient.get_select_statement(table_slice=table_slice)\n )\n duckdb_to_arrow = select_statement.arrow()\n return pl.DataFrame(duckdb_to_arrow)\n\n @property\n def supported_types(self):\n return [pl.DataFrame]
\n\n\nduckdb_polars_io_manager = build_duckdb_io_manager(\n [DuckDBPolarsTypeHandler()], default_load_type=pl.DataFrame\n)\nduckdb_polars_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes polars dataframes to DuckDB. When\nusing the duckdb_polars_io_manager, any inputs and outputs without type annotations will be loaded\nas Polars DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_polars import duckdb_polars_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_polars_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pl.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPolarsIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes Polars DataFrames to DuckDB. When\n using the DuckDBPolarsIOManager, any inputs and outputs without type annotations will be loaded\n as Polars DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_polars import DuckDBPolarsIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPolarsIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pl.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPolarsTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pl.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_polars/duckdb_polars_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_polars.duckdb_polars_type_handler"}}, "dagster_duckdb_pyspark": {"duckdb_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_pyspark.duckdb_pyspark_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pyarrow as pa\nimport pyspark\nimport pyspark.sql\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import (\n    DuckDbClient,\n    DuckDBIOManager,\n    build_duckdb_io_manager,\n)\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.types import StructType\n\n\ndef pyspark_df_to_arrow_table(df: pyspark.sql.DataFrame) -> pa.Table:\n    """Converts a PySpark DataFrame to a PyArrow Table."""\n    # `_collect_as_arrow` API call sourced from:\n    #   https://stackoverflow.com/questions/73203318/how-to-transform-spark-dataframe-to-polars-dataframe\n    return pa.Table.from_batches(df._collect_as_arrow())  # noqa: SLF001\n\n\n
[docs]class DuckDBPySparkTypeHandler(DbTypeHandler[pyspark.sql.DataFrame]):\n """Stores PySpark DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pyspark import DuckDBPySparkTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n """\n\n def handle_output(\n self,\n context: OutputContext,\n table_slice: TableSlice,\n obj: pyspark.sql.DataFrame,\n connection,\n ):\n """Stores the given object at the provided filepath."""\n pa_df = pyspark_df_to_arrow_table(obj) # noqa: F841\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " pa_df;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from pa_df;"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.count(),\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) for name, dtype in obj.dtypes\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pyspark.sql.DataFrame:\n """Loads the return of the query as the correct type."""\n spark = SparkSession.builder.getOrCreate() # type: ignore\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n pd_df = connection.execute(DuckDbClient.get_select_statement(table_slice)).fetchdf()\n return spark.createDataFrame(pd_df)\n\n @property\n def supported_types(self):\n return [pyspark.sql.DataFrame]
\n\n\nduckdb_pyspark_io_manager = build_duckdb_io_manager(\n [DuckDBPySparkTypeHandler()], default_load_type=pyspark.sql.DataFrame\n)\nduckdb_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\nusing the duckdb_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_pyspark import duckdb_pyspark_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pyspark.sql.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPySparkIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\n using the DuckDBPySparkIOManager, any inputs and outputs without type annotations will be loaded\n as PySpark DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_pyspark import DuckDBPySparkIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pyspark.sql.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pyspark.sql.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_pyspark/duckdb_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_pyspark.duckdb_pyspark_type_handler"}}, "dagster_embedded_elt": {"sling": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_embedded_elt.sling.asset_defs

\nimport re\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom dagster import (\n    AssetExecutionContext,\n    AssetsDefinition,\n    AssetSpec,\n    MaterializeResult,\n    multi_asset,\n)\nfrom dagster._annotations import experimental\n\nfrom dagster_embedded_elt.sling.resources import SlingMode, SlingResource\n\n\n
[docs]@experimental\ndef build_sling_asset(\n asset_spec: AssetSpec,\n source_stream: str,\n target_object: str,\n mode: SlingMode = SlingMode.FULL_REFRESH,\n primary_key: Optional[Union[str, List[str]]] = None,\n update_key: Optional[Union[str, List[str]]] = None,\n source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n sling_resource_key: str = "sling",\n) -> AssetsDefinition:\n """Asset Factory for using Sling to sync data from a source stream to a target object.\n\n Args:\n asset_spec (AssetSpec): The AssetSpec to use to materialize this asset.\n source_stream (str): The source stream to sync from. This can be a table, a query, or a path.\n target_object (str): The target object to sync to. This can be a table, or a path.\n mode (SlingMode, optional): The sync mode to use when syncing. Defaults to SlingMode.FULL_REFRESH.\n primary_key (Optional[Union[str, List[str]]], optional): The optional primary key to use when syncing.\n update_key (Optional[Union[str, List[str]]], optional): The optional update key to use when syncing.\n source_options (Optional[Dict[str, Any]], optional): Any optional Sling source options to use when syncing.\n target_options (Optional[Dict[str, Any]], optional): Any optional target options to use when syncing.\n sling_resource_key (str, optional): The resource key for the SlingResource. Defaults to "sling".\n\n Examples:\n Creating a Sling asset that syncs from a file to a table:\n\n .. code-block:: python\n\n asset_spec = AssetSpec(key=["main", "dest_tbl"])\n asset_def = build_sling_asset(\n asset_spec=asset_spec,\n source_stream="file:///tmp/test.csv",\n target_object="main.dest_table",\n mode=SlingMode.INCREMENTAL,\n primary_key="id"\n )\n\n Creating a Sling asset that syncs from a table to a file with a full refresh:\n\n .. 
code-block:: python\n\n asset_spec = AssetSpec(key="test.csv")\n asset_def = build_sling_asset(\n asset_spec=asset_spec,\n source_stream="main.dest_table",\n target_object="file:///tmp/test.csv",\n mode=SlingMode.FULL_REFRESH\n )\n\n\n """\n if primary_key is not None and not isinstance(primary_key, list):\n primary_key = [primary_key]\n\n if update_key is not None and not isinstance(update_key, list):\n update_key = [update_key]\n\n @multi_asset(\n compute_kind="sling", specs=[asset_spec], required_resource_keys={sling_resource_key}\n )\n def sync(context: AssetExecutionContext) -> MaterializeResult:\n sling: SlingResource = getattr(context.resources, sling_resource_key)\n last_row_count_observed = None\n for stdout_line in sling.sync(\n source_stream=source_stream,\n target_object=target_object,\n mode=mode,\n primary_key=primary_key,\n update_key=update_key,\n source_options=source_options,\n target_options=target_options,\n ):\n match = re.search(r"(\\d+) rows", stdout_line)\n if match:\n last_row_count_observed = int(match.group(1))\n context.log.info(stdout_line)\n\n return MaterializeResult(\n metadata=(\n {} if last_row_count_observed is None else {"row_count": last_row_count_observed}\n )\n )\n\n return sync
\n
", "current_page_name": "_modules/dagster_embedded_elt/sling/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_embedded_elt.sling.asset_defs"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_embedded_elt.sling.resources

\nimport contextlib\nimport json\nimport re\nfrom enum import Enum\nfrom subprocess import PIPE, STDOUT, Popen\nfrom typing import Any, Dict, Generator, List, Optional\n\nfrom dagster import ConfigurableResource, PermissiveConfig, get_dagster_logger\nfrom dagster._annotations import experimental\nfrom dagster._utils.env import environ\nfrom pydantic import Field\nfrom sling import Sling\n\nlogger = get_dagster_logger()\n\n\nclass SlingMode(str, Enum):\n    """The mode to use when syncing.\n\n    See the Sling docs for more information: https://docs.slingdata.io/sling-cli/running-tasks#modes.\n    """\n\n    INCREMENTAL = "incremental"\n    TRUNCATE = "truncate"\n    FULL_REFRESH = "full-refresh"\n    SNAPSHOT = "snapshot"\n\n\n
[docs]class SlingSourceConnection(PermissiveConfig):\n """A Sling Source Connection defines the source connection used by :py:class:`~dagster_elt.sling.SlingResource`.\n\n Examples:\n Creating a Sling Source for a file, such as CSV or JSON:\n\n .. code-block:: python\n\n source = SlingSourceConnection(type="file")\n\n Create a Sling Source for a Postgres database, using a connection string:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n source = SlingSourceConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema")\n\n Create a Sling Source for a Postgres database, using keyword arguments, as described here:\n https://docs.slingdata.io/connections/database-connections/postgres\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n\n """\n\n type: str = Field(description="Type of the source connection. Use 'file' for local storage.")\n connection_string: Optional[str] = Field(\n description="The connection string for the source database."\n )
\n\n\n
[docs]class SlingTargetConnection(PermissiveConfig):\n """A Sling Target Connection defines the target connection used by :py:class:`~dagster_elt.sling.SlingResource`.\n\n Examples:\n Creating a Sling Target for a file, such as CSV or JSON:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="file")\n\n Create a Sling Source for a Postgres database, using a connection string:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema"\n source = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n\n Create a Sling Source for a Postgres database, using keyword arguments, as described here:\n https://docs.slingdata.io/connections/database-connections/postgres\n\n .. code-block::python\n\n source = SlingTargetConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n\n\n """\n\n type: str = Field(\n description="Type of the destination connection. Use 'file' for local storage."\n )\n connection_string: Optional[str] = Field(\n description="The connection string for the target database."\n )
\n\n\n
[docs]@experimental\nclass SlingResource(ConfigurableResource):\n """Resource for interacting with the Sling package.\n\n Examples:\n .. code-block:: python\n\n from dagster_etl.sling import SlingResource\n sling_resource = SlingResource(\n source_connection=SlingSourceConnection(\n type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING")\n ),\n target_connection=SlingTargetConnection(\n type="snowflake",\n host="host",\n user="user",\n database="database",\n password="password",\n role="role",\n ),\n )\n\n """\n\n source_connection: SlingSourceConnection\n target_connection: SlingTargetConnection\n\n @contextlib.contextmanager\n def _setup_config(self) -> Generator[None, None, None]:\n """Uses environment variables to set the Sling source and target connections."""\n sling_source = self.source_connection.dict()\n sling_target = self.target_connection.dict()\n if self.source_connection.connection_string:\n sling_source["url"] = self.source_connection.connection_string\n if self.target_connection.connection_string:\n sling_target["url"] = self.target_connection.connection_string\n with environ(\n {\n "SLING_SOURCE": json.dumps(sling_source),\n "SLING_TARGET": json.dumps(sling_target),\n }\n ):\n yield\n\n @staticmethod\n def _exec_sling_cmd(cmd, stdin=None, stdout=PIPE, stderr=STDOUT) -> Generator[str, None, None]:\n ansi_escape = re.compile(r"\\x1B(?:[@-Z\\\\-_]|\\[[0-?]*[ -/]*[@-~])")\n with Popen(cmd, shell=True, stdin=stdin, stdout=stdout, stderr=stderr) as proc:\n assert proc.stdout\n\n for line in proc.stdout:\n fmt_line = str(line, "utf-8")\n clean_line = ansi_escape.sub("", fmt_line).replace("INF", "")\n yield clean_line\n\n proc.wait()\n if proc.returncode != 0:\n raise Exception("Sling command failed with error code %s", proc.returncode)\n\n def _sync(\n self,\n source_stream: str,\n target_object: str,\n mode: SlingMode = SlingMode.FULL_REFRESH,\n primary_key: Optional[List[str]] = None,\n update_key: Optional[List[str]] = None,\n 
source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n ) -> Generator[str, None, None]:\n """Runs a Sling sync from the given source table to the given destination table. Generates\n output lines from the Sling CLI.\n """\n if self.source_connection.type == "file" and not source_stream.startswith("file://"):\n source_stream = "file://" + source_stream\n\n if self.target_connection.type == "file" and not target_object.startswith("file://"):\n target_object = "file://" + target_object\n\n with self._setup_config():\n config = {\n "source": {\n "conn": "SLING_SOURCE",\n "stream": source_stream,\n "primary_key": primary_key,\n "update_key": update_key,\n "options": source_options,\n },\n "target": {\n "conn": "SLING_TARGET",\n "object": target_object,\n "options": target_options,\n },\n }\n config["source"] = {k: v for k, v in config["source"].items() if v is not None}\n config["target"] = {k: v for k, v in config["target"].items() if v is not None}\n\n sling_cli = Sling(**config)\n logger.info("Starting Sling sync with mode: %s", mode)\n cmd = sling_cli._prep_cmd() # noqa: SLF001\n\n yield from self._exec_sling_cmd(cmd)\n\n def sync(\n self,\n source_stream: str,\n target_object: str,\n mode: SlingMode,\n primary_key: Optional[List[str]] = None,\n update_key: Optional[List[str]] = None,\n source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n ) -> Generator[str, None, None]:\n """Initiate a Sling Sync between a source stream and a target object.\n\n Args:\n source_stream (str): The source stream to read from. For database sources, the source stream can be either\n a table name, a SQL statement or a path to a SQL file e.g. `TABLE1` or `SCHEMA1.TABLE2` or\n `SELECT * FROM TABLE`. For file sources, the source stream is a path or an url to a file.\n For file targets, the target object is a path or a url to a file, e.g. 
file:///tmp/file.csv or\n s3://my_bucket/my_folder/file.csv\n target_object (str): The target object to write into. For database targets, the target object is a table\n name, e.g. TABLE1, SCHEMA1.TABLE2. For file targets, the target object is a path or an url to a file.\n mode (SlingMode): The Sling mode to use when syncing, i.e. incremental, full-refresh\n See the Sling docs for more information: https://docs.slingdata.io/sling-cli/running-tasks#modes.\n primary_key (str): For incremental syncs, a primary key is used during merge statements to update\n existing rows.\n update_key (str): For incremental syncs, an update key is used to stream records after max(update_key)\n source_options (Dict[str, Any]): Other source options to pass to Sling,\n see https://docs.slingdata.io/sling-cli/running-tasks#source-options-src-options-flag-source.options-key\n for details\n target_options (Dict[str, Any[): Other target options to pass to Sling,\n see https://docs.slingdata.io/sling-cli/running-tasks#target-options-tgt-options-flag-target.options-key\n for details\n\n Examples:\n Sync from a source file to a sqlite database:\n\n .. 
code-block:: python\n\n sqllite_path = "/path/to/sqlite.db"\n csv_path = "/path/to/file.csv"\n\n @asset\n def run_sync(context, sling: SlingResource):\n res = sling.sync(\n source_stream=csv_path,\n target_object="events",\n mode=SlingMode.FULL_REFRESH,\n )\n for stdout in res:\n context.log.debug(stdout)\n counts = sqlite3.connect(sqllitepath).execute("SELECT count(1) FROM events").fetchone()\n assert counts[0] == 3\n\n source = SlingSourceConnection(\n type="file",\n )\n target = SlingTargetConnection(type="sqlite", instance=sqllitepath)\n\n materialize(\n [run_sync],\n resources={\n "sling": SlingResource(\n source_connection=source,\n target_connection=target,\n mode=SlingMode.TRUNCATE,\n )\n },\n )\n\n """\n yield from self._sync(\n source_stream=source_stream,\n target_object=target_object,\n mode=mode,\n primary_key=primary_key,\n update_key=update_key,\n source_options=source_options,\n target_options=target_options,\n )
\n
", "current_page_name": "_modules/dagster_embedded_elt/sling/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_embedded_elt.sling.resources"}}}, "dagster_fivetran": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.asset_defs

\nimport hashlib\nimport inspect\nimport re\nfrom functools import partial\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nfrom dagster import (\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    Nothing,\n    OpExecutionContext,\n    Output,\n    _check as check,\n    multi_asset,\n)\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.events import CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.errors import DagsterStepOutputNotFoundError\nfrom dagster._core.execution.context.init import build_init_resource_context\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL, FivetranResource\nfrom dagster_fivetran.utils import (\n    generate_materializations,\n    get_fivetran_connector_url,\n    metadata_for_table,\n)\n\n\ndef _build_fivetran_assets(\n    connector_id: str,\n    destination_tables: Sequence[str],\n    poll_interval: float = DEFAULT_POLL_INTERVAL,\n    poll_timeout: Optional[float] = None,\n    io_manager_key: Optional[str] = None,\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    metadata_by_table_name: Optional[Mapping[str, MetadataUserInput]] = None,\n    table_to_asset_key_map: Optional[Mapping[str, AssetKey]] = None,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n    group_name: Optional[str] = None,\n    infer_missing_tables: bool = False,\n    op_tags: Optional[Mapping[str, Any]] = None,\n) -> Sequence[AssetsDefinition]:\n    asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    tracked_asset_keys = {\n        table: AssetKey([*asset_key_prefix, *table.split(".")]) for table in 
destination_tables\n    }\n    user_facing_asset_keys = table_to_asset_key_map or tracked_asset_keys\n\n    _metadata_by_table_name = check.opt_mapping_param(\n        metadata_by_table_name, "metadata_by_table_name", key_type=str\n    )\n\n    @multi_asset(\n        name=f"fivetran_sync_{connector_id}",\n        outs={\n            "_".join(key.path): AssetOut(\n                io_manager_key=io_manager_key,\n                key=user_facing_asset_keys[table],\n                metadata=_metadata_by_table_name.get(table),\n                dagster_type=Nothing,\n            )\n            for table, key in tracked_asset_keys.items()\n        },\n        compute_kind="fivetran",\n        resource_defs=resource_defs,\n        group_name=group_name,\n        op_tags=op_tags,\n    )\n    def _assets(context: OpExecutionContext, fivetran: FivetranResource) -> Any:\n        fivetran_output = fivetran.sync_and_poll(\n            connector_id=connector_id,\n            poll_interval=poll_interval,\n            poll_timeout=poll_timeout,\n        )\n\n        materialized_asset_keys = set()\n        for materialization in generate_materializations(\n            fivetran_output, asset_key_prefix=asset_key_prefix\n        ):\n            # scan through all tables actually created, if it was expected then emit an Output.\n            # otherwise, emit a runtime AssetMaterialization\n            if materialization.asset_key in tracked_asset_keys.values():\n                yield Output(\n                    value=None,\n                    output_name="_".join(materialization.asset_key.path),\n                    metadata=materialization.metadata,\n                )\n                materialized_asset_keys.add(materialization.asset_key)\n\n            else:\n                yield materialization\n\n        unmaterialized_asset_keys = set(tracked_asset_keys.values()) - materialized_asset_keys\n        if infer_missing_tables:\n            for asset_key in 
unmaterialized_asset_keys:\n                yield Output(\n                    value=None,\n                    output_name="_".join(asset_key.path),\n                )\n\n        else:\n            if unmaterialized_asset_keys:\n                asset_key = next(iter(unmaterialized_asset_keys))\n                output_name = "_".join(asset_key.path)\n                raise DagsterStepOutputNotFoundError(\n                    f"Core compute for {context.op_def.name} did not return an output for"\n                    f' non-optional output "{output_name}".',\n                    step_key=context.get_step_execution_context().step.key,\n                    output_name=output_name,\n                )\n\n    return [_assets]\n\n\n
[docs]def build_fivetran_assets(\n connector_id: str,\n destination_tables: Sequence[str],\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n io_manager_key: Optional[str] = None,\n asset_key_prefix: Optional[Sequence[str]] = None,\n metadata_by_table_name: Optional[Mapping[str, MetadataUserInput]] = None,\n group_name: Optional[str] = None,\n infer_missing_tables: bool = False,\n op_tags: Optional[Mapping[str, Any]] = None,\n) -> Sequence[AssetsDefinition]:\n """Build a set of assets for a given Fivetran connector.\n\n Returns an AssetsDefinition which connects the specified ``asset_keys`` to the computation that\n will update them. Internally, executes a Fivetran sync for a given ``connector_id``, and\n polls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\n :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to communicate with the\n Fivetran API.\n\n Args:\n connector_id (str): The Fivetran Connector ID that this op will sync. You can retrieve this\n value from the "Setup" tab of a given connector in the Fivetran UI.\n destination_tables (List[str]): `schema_name.table_name` for each table that you want to be\n represented in the Dagster asset graph for this connection.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. 
By default, this will never time out.\n io_manager_key (Optional[str]): The io_manager to be used to handle each of these assets.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([schema_name, table_name])`.\n metadata_by_table_name (Optional[Mapping[str, MetadataUserInput]]): A mapping from destination\n table name to user-supplied metadata that should be associated with the asset for that table.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n infer_missing_tables (bool): If True, will create asset materializations for tables specified\n in destination_tables even if they are not present in the Fivetran sync output. This is useful\n in cases where Fivetran does not sync any data for a table and therefore does not include it\n in the sync output API response.\n op_tags (Optional[Dict[str, Any]]):\n A dictionary of tags for the op that computes the asset. Frameworks may expect and\n require certain metadata to be attached to a op. Values that are not strings will be\n json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.\n\n **Examples:**\n\n Basic example:\n\n .. code-block:: python\n\n from dagster import AssetKey, repository, with_resources\n\n from dagster_fivetran import fivetran_resource\n from dagster_fivetran.assets import build_fivetran_assets\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n Attaching metadata:\n\n .. 
code-block:: python\n\n fivetran_assets = build_fivetran_assets(\n connector_id="foobar",\n table_names=["schema1.table1", "schema2.table2"],\n metadata_by_table_name={\n "schema1.table1": {\n "description": "This is a table that contains foo and bar",\n },\n "schema2.table2": {\n "description": "This is a table that contains baz and quux",\n },\n },\n )\n """\n return _build_fivetran_assets(\n connector_id=connector_id,\n destination_tables=destination_tables,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n io_manager_key=io_manager_key,\n asset_key_prefix=asset_key_prefix,\n metadata_by_table_name=metadata_by_table_name,\n group_name=group_name,\n infer_missing_tables=infer_missing_tables,\n op_tags=op_tags,\n )
\n\n\nclass FivetranConnectionMetadata(\n NamedTuple(\n "_FivetranConnectionMetadata",\n [\n ("name", str),\n ("connector_id", str),\n ("connector_url", str),\n ("schemas", Mapping[str, Any]),\n ],\n )\n):\n def build_asset_defn_metadata(\n self,\n key_prefix: Sequence[str],\n group_name: Optional[str],\n table_to_asset_key_fn: Callable[[str], AssetKey],\n io_manager_key: Optional[str] = None,\n ) -> AssetsDefinitionCacheableData:\n schema_table_meta: Dict[str, MetadataUserInput] = {}\n if "schemas" in self.schemas:\n schemas_inner = cast(Dict[str, Any], self.schemas["schemas"])\n for schema in schemas_inner.values():\n if schema["enabled"]:\n schema_name = schema["name_in_destination"]\n schema_tables = cast(Dict[str, Dict[str, Any]], schema["tables"])\n for table in schema_tables.values():\n if table["enabled"]:\n table_name = table["name_in_destination"]\n schema_table_meta[f"{schema_name}.{table_name}"] = metadata_for_table(\n table, self.connector_url\n )\n else:\n schema_table_meta[self.name] = {}\n\n outputs = {\n table: AssetKey([*key_prefix, *list(table_to_asset_key_fn(table).path)])\n for table in schema_table_meta.keys()\n }\n\n internal_deps: Dict[str, Set[AssetKey]] = {}\n\n return AssetsDefinitionCacheableData(\n keys_by_input_name={},\n keys_by_output_name=outputs,\n internal_asset_deps=internal_deps,\n group_name=group_name,\n key_prefix=key_prefix,\n can_subset=False,\n metadata_by_output_name=schema_table_meta,\n extra_metadata={\n "connector_id": self.connector_id,\n "io_manager_key": io_manager_key,\n },\n )\n\n\ndef _build_fivetran_assets_from_metadata(\n assets_defn_meta: AssetsDefinitionCacheableData,\n resource_defs: Mapping[str, ResourceDefinition],\n poll_interval: float,\n poll_timeout: Optional[float] = None,\n) -> AssetsDefinition:\n metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)\n connector_id = cast(str, metadata["connector_id"])\n io_manager_key = cast(Optional[str], metadata["io_manager_key"])\n\n return 
_build_fivetran_assets(\n connector_id=connector_id,\n destination_tables=list(\n assets_defn_meta.keys_by_output_name.keys()\n if assets_defn_meta.keys_by_output_name\n else []\n ),\n asset_key_prefix=list(assets_defn_meta.key_prefix or []),\n metadata_by_table_name=cast(\n Dict[str, MetadataUserInput], assets_defn_meta.metadata_by_output_name\n ),\n io_manager_key=io_manager_key,\n table_to_asset_key_map=assets_defn_meta.keys_by_output_name,\n resource_defs=resource_defs,\n group_name=assets_defn_meta.group_name,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )[0]\n\n\nclass FivetranInstanceCacheableAssetsDefinition(CacheableAssetsDefinition):\n def __init__(\n self,\n fivetran_resource_def: Union[FivetranResource, ResourceDefinition],\n key_prefix: Sequence[str],\n connector_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connector_filter: Optional[Callable[[FivetranConnectionMetadata], bool]],\n connector_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connector_to_asset_key_fn: Optional[Callable[[FivetranConnectionMetadata, str], AssetKey]],\n poll_interval: float,\n poll_timeout: Optional[float],\n ):\n self._fivetran_resource_def = fivetran_resource_def\n self._fivetran_instance: FivetranResource = (\n fivetran_resource_def.process_config_and_initialize()\n if isinstance(fivetran_resource_def, FivetranResource)\n else fivetran_resource_def(build_init_resource_context())\n )\n\n self._key_prefix = key_prefix\n self._connector_to_group_fn = connector_to_group_fn\n self._connection_filter = connector_filter\n self._connector_to_io_manager_key_fn = connector_to_io_manager_key_fn\n self._connector_to_asset_key_fn: Callable[[FivetranConnectionMetadata, str], AssetKey] = (\n connector_to_asset_key_fn or (lambda _, table: AssetKey(path=table.split(".")))\n )\n self._poll_interval = poll_interval\n self._poll_timeout = poll_timeout\n\n contents = hashlib.sha1()\n contents.update(",".join(key_prefix).encode("utf-8"))\n if 
connector_filter:\n contents.update(inspect.getsource(connector_filter).encode("utf-8"))\n\n super().__init__(unique_id=f"fivetran-{contents.hexdigest()}")\n\n def _get_connectors(self) -> Sequence[FivetranConnectionMetadata]:\n output_connectors: List[FivetranConnectionMetadata] = []\n\n groups = self._fivetran_instance.make_request("GET", "groups")["items"]\n\n for group in groups:\n group_id = group["id"]\n\n connectors = self._fivetran_instance.make_request(\n "GET", f"groups/{group_id}/connectors"\n )["items"]\n for connector in connectors:\n connector_id = connector["id"]\n\n connector_name = connector["schema"]\n\n setup_state = connector.get("status", {}).get("setup_state")\n if setup_state and setup_state in ("incomplete", "broken"):\n continue\n\n connector_url = get_fivetran_connector_url(connector)\n\n schemas = self._fivetran_instance.make_request(\n "GET", f"connectors/{connector_id}/schemas"\n )\n\n output_connectors.append(\n FivetranConnectionMetadata(\n name=connector_name,\n connector_id=connector_id,\n connector_url=connector_url,\n schemas=schemas,\n )\n )\n\n return output_connectors\n\n def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:\n asset_defn_data: List[AssetsDefinitionCacheableData] = []\n for connector in self._get_connectors():\n if not self._connection_filter or self._connection_filter(connector):\n table_to_asset_key = partial(self._connector_to_asset_key_fn, connector)\n asset_defn_data.append(\n connector.build_asset_defn_metadata(\n key_prefix=self._key_prefix,\n group_name=(\n self._connector_to_group_fn(connector.name)\n if self._connector_to_group_fn\n else None\n ),\n io_manager_key=(\n self._connector_to_io_manager_key_fn(connector.name)\n if self._connector_to_io_manager_key_fn\n else None\n ),\n table_to_asset_key_fn=table_to_asset_key,\n )\n )\n\n return asset_defn_data\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return 
[\n _build_fivetran_assets_from_metadata(\n meta,\n {"fivetran": self._fivetran_instance.get_resource_definition()},\n poll_interval=self._poll_interval,\n poll_timeout=self._poll_timeout,\n )\n for meta in data\n ]\n\n\ndef _clean_name(name: str) -> str:\n """Cleans an input to be a valid Dagster asset name."""\n return re.sub(r"[^a-z0-9]+", "_", name.lower())\n\n\n
[docs]def load_assets_from_fivetran_instance(\n fivetran: Union[FivetranResource, ResourceDefinition],\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n connector_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connector_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connector_filter: Optional[Callable[[FivetranConnectionMetadata], bool]] = None,\n connector_to_asset_key_fn: Optional[\n Callable[[FivetranConnectionMetadata, str], AssetKey]\n ] = None,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n) -> CacheableAssetsDefinition:\n """Loads Fivetran connector assets from a configured FivetranResource instance. This fetches information\n about defined connectors at initialization time, and will error on workspace load if the Fivetran\n instance is not reachable.\n\n Args:\n fivetran (ResourceDefinition): A FivetranResource configured with the appropriate connection\n details.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n connector_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Fivetran connector name. If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The IO manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connector_to_io_manager_key_fn.\n connector_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n IO manager key for a given Fivetran connector name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. 
Defaults to "io_manager".\n connector_filter (Optional[Callable[[FivetranConnectorMetadata], bool]]): Optional function which takes\n in connector metadata and returns False if the connector should be excluded from the output assets.\n connector_to_asset_key_fn (Optional[Callable[[FivetranConnectorMetadata, str], AssetKey]]): Optional function\n which takes in connector metadata and a table name and returns an AssetKey for that table. Defaults to\n a function that generates an AssetKey matching the table name, split by ".".\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. By default, this will never time out.\n\n **Examples:**\n\n Loading all Fivetran connectors as assets:\n\n .. code-block:: python\n\n from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\n fivetran_instance = fivetran_resource.configured(\n {\n "api_key": "some_key",\n "api_secret": "some_secret",\n }\n )\n fivetran_assets = load_assets_from_fivetran_instance(fivetran_instance)\n\n Filtering the set of loaded connectors:\n\n .. 
code-block:: python\n\n from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\n fivetran_instance = fivetran_resource.configured(\n {\n "api_key": "some_key",\n "api_secret": "some_secret",\n }\n )\n fivetran_assets = load_assets_from_fivetran_instance(\n fivetran_instance,\n connector_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connector_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connector_to_io_manager_key_fn",\n )\n if not connector_to_io_manager_key_fn:\n connector_to_io_manager_key_fn = lambda _: io_manager_key\n\n return FivetranInstanceCacheableAssetsDefinition(\n fivetran_resource_def=fivetran,\n key_prefix=key_prefix,\n connector_to_group_fn=connector_to_group_fn,\n connector_to_io_manager_key_fn=connector_to_io_manager_key_fn,\n connector_filter=connector_filter,\n connector_to_asset_key_fn=connector_to_asset_key_fn,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )
\n
", "current_page_name": "_modules/dagster_fivetran/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.asset_defs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.ops

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import (\n    AssetKey,\n    Config,\n    In,\n    Nothing,\n    Out,\n    Output,\n    op,\n)\nfrom pydantic import Field\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL, FivetranResource\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import generate_materializations\n\n\nclass SyncConfig(Config):\n    connector_id: str = Field(\n        description=(\n            "The Fivetran Connector ID that this op will sync. You can retrieve this "\n            'value from the "Setup" tab of a given connector in the Fivetran UI.'\n        ),\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL,\n        description="The time (in seconds) that will be waited between successive polls.",\n    )\n    poll_timeout: Optional[float] = Field(\n        default=None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the Fivetran sync will "\n            "be yielded when the op executes."\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        default=["fivetran"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description=(\n "Parsed json dictionary representing the details of the Fivetran connector after the"\n " sync successfully completes. See the [Fivetran API"\n " Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) to see"\n " detailed information on this response."\n ),\n ),\n tags={"kind": "fivetran"},\n)\ndef fivetran_sync_op(config: SyncConfig, fivetran: FivetranResource) -> Any:\n """Executes a Fivetran sync for a given ``connector_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the sync successfully completes, as well as details\n about which tables the sync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_sync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n fivetran_output = fivetran.sync_and_poll(\n connector_id=config.connector_id,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n yield from generate_materializations(\n fivetran_output, asset_key_prefix=config.asset_key_prefix\n )\n yield Output(fivetran_output)
\n\n\nclass FivetranResyncConfig(SyncConfig):\n resync_parameters: Optional[Dict[str, Any]] = Field(\n None,\n description=(\n "Optional resync parameters to send in the payload to the Fivetran API. You can"\n " find an example resync payload here:"\n " https://fivetran.com/docs/rest-api/connectors#request_7"\n ),\n )\n\n\n@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description=(\n "Parsed json dictionary representing the details of the Fivetran connector after the"\n " resync successfully completes. See the [Fivetran API"\n " Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) to see"\n " detailed information on this response."\n ),\n ),\n tags={"kind": "fivetran"},\n)\ndef fivetran_resync_op(\n config: FivetranResyncConfig,\n fivetran: FivetranResource,\n) -> Any:\n """Executes a Fivetran historical resync for a given ``connector_id``, and polls until that resync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the resync successfully completes, as well as details\n about which tables the resync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_resync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_resync_op.configured(\n {\n "connector_id": "foobar",\n "resync_parameters": {\n "schema_a": ["table_a", "table_b"],\n "schema_b": ["table_c"]\n }\n },\n name="sync_foobar"\n )\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n fivetran_output = fivetran.resync_and_poll(\n connector_id=config.connector_id,\n resync_parameters=config.resync_parameters,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n asset_key_filter = (\n [\n AssetKey(config.asset_key_prefix + [schema, table])\n for schema, tables in config.resync_parameters.items()\n for table in tables\n ]\n if config.resync_parameters is not None\n else None\n )\n for mat in generate_materializations(\n fivetran_output, asset_key_prefix=config.asset_key_prefix\n ):\n if asset_key_filter is None or mat.asset_key in asset_key_filter:\n yield mat\n\n yield Output(fivetran_output)\n
", "current_page_name": "_modules/dagster_fivetran/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Mapping, Optional, Sequence, Tuple\nfrom urllib.parse import urljoin\n\nimport requests\nfrom dagster import (\n    Failure,\n    InitResourceContext,\n    MetadataValue,\n    __version__,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._config.pythonic_config import ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dateutil import parser\nfrom pydantic import Field\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import get_fivetran_connector_url, get_fivetran_logs_url\n\nFIVETRAN_API_BASE = "https://api.fivetran.com"\nFIVETRAN_API_VERSION_PATH = "v1/"\nFIVETRAN_CONNECTOR_PATH = "connectors/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\n
[docs]class FivetranResource(ConfigurableResource):\n """This class exposes methods on top of the Fivetran REST API."""\n\n api_key: str = Field(description="The Fivetran API key to use for this resource.")\n api_secret: str = Field(description="The Fivetran API secret to use for this resource.")\n disable_schedule_on_trigger: bool = Field(\n default=True,\n description=(\n "Specifies if you would like any connector that is sync'd using this "\n "resource to be automatically taken off its Fivetran schedule."\n ),\n )\n request_max_retries: int = Field(\n default=3,\n description=(\n "The maximum number of times requests to the Fivetran API should be retried "\n "before failing."\n ),\n )\n request_retry_delay: float = Field(\n default=0.25,\n description="Time (in seconds) to wait between each request retry.",\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n def _auth(self) -> HTTPBasicAuth:\n return HTTPBasicAuth(self.api_key, self.api_secret)\n\n @property\n @cached_method\n def _log(self) -> logging.Logger:\n return get_dagster_logger()\n\n @property\n def api_base_url(self) -> str:\n return urljoin(FIVETRAN_API_BASE, FIVETRAN_API_VERSION_PATH)\n\n @property\n def api_connector_url(self) -> str:\n return urljoin(self.api_base_url, FIVETRAN_CONNECTOR_PATH)\n\n def make_connector_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n return self.make_request(method, urljoin(FIVETRAN_CONNECTOR_PATH, endpoint), data)\n\n def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Creates and sends a request to the desired Fivetran Connector API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. 
"POST", "GET", "PATCH").\n endpoint (str): The Fivetran API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n url = urljoin(self.api_base_url, endpoint)\n headers = {\n "User-Agent": f"dagster-fivetran/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n auth=self._auth,\n data=data,\n )\n response.raise_for_status()\n resp_dict = response.json()\n return resp_dict["data"] if "data" in resp_dict else resp_dict\n except RequestException as e:\n self._log.error("Request to Fivetran API failed: %s", e)\n if num_retries == self.request_max_retries:\n break\n num_retries += 1\n time.sleep(self.request_retry_delay)\n\n raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n def get_connector_details(self, connector_id: str) -> Mapping[str, Any]:\n """Gets details about a given connector from the Fivetran Connector API.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n return self.make_connector_request(method="GET", endpoint=connector_id)\n\n def _assert_syncable_connector(self, connector_id: str):\n """Confirms that a given connector is eligible to sync. Will raise a Failure in the event that\n the connector is either paused or not fully setup.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n """\n connector_details = self.get_connector_details(connector_id)\n if connector_details["paused"]:\n raise Failure(f"Connector '{connector_id}' cannot be synced as it is currently paused.")\n if connector_details["status"]["setup_state"] != "connected":\n raise Failure(f"Connector '{connector_id}' cannot be synced as it has not been setup")\n\n def get_connector_sync_status(self, connector_id: str) -> Tuple[datetime.datetime, bool, str]:\n """Gets details about the status of the most recent Fivetran sync operation for a given\n connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Tuple[datetime.datetime, bool, str]:\n Tuple representing the timestamp of the last completeded sync, if it succeeded, and\n the currently reported sync status.\n """\n connector_details = self.get_connector_details(connector_id)\n\n min_time_str = "0001-01-01 00:00:00+00"\n succeeded_at = parser.parse(connector_details["succeeded_at"] or min_time_str)\n failed_at = parser.parse(connector_details["failed_at"] or min_time_str)\n\n return (\n max(succeeded_at, failed_at),\n succeeded_at > failed_at,\n connector_details["status"]["sync_state"],\n )\n\n def update_connector(\n self, connector_id: str, properties: Optional[Mapping[str, Any]] = None\n ) -> Mapping[str, Any]:\n """Updates properties of a Fivetran Connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n properties (Dict[str, Any]): The properties to be updated. 
For a comprehensive list of\n properties, see the [Fivetran docs](https://fivetran.com/docs/rest-api/connectors#modifyaconnector).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n return self.make_connector_request(\n method="PATCH", endpoint=connector_id, data=json.dumps(properties)\n )\n\n def update_schedule_type(\n self, connector_id: str, schedule_type: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Updates the schedule type property of the connector to either "auto" or "manual".\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n schedule_type (Optional[str]): Either "auto" (to turn the schedule on) or "manual" (to\n turn it off).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n if schedule_type not in ["auto", "manual"]:\n check.failed(f"schedule_type must be either 'auto' or 'manual': got '{schedule_type}'")\n return self.update_connector(connector_id, properties={"schedule_type": schedule_type})\n\n def get_connector_schema_config(self, connector_id: str) -> Mapping[str, Any]:\n return self.make_connector_request("GET", endpoint=f"{connector_id}/schemas")\n\n def start_sync(self, connector_id: str) -> Mapping[str, Any]:\n """Initiates a sync of a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the sync is started.\n """\n if self.disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_connector_request(method="POST", endpoint=f"{connector_id}/force")\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this sync in the Fivetran UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details\n\n def start_resync(\n self, connector_id: str, resync_parameters: Optional[Mapping[str, Sequence[str]]] = None\n ) -> Mapping[str, Any]:\n """Initiates a historical sync of all data for multiple schema tables within a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Optional[Dict[str, List[str]]]): Optional resync parameters to send to the Fivetran API.\n An example payload can be found here: https://fivetran.com/docs/rest-api/connectors#request_7\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the resync is started.\n """\n if self.disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_connector_request(\n method="POST",\n endpoint=(\n f"{connector_id}/schemas/tables/resync"\n if resync_parameters is not None\n else f"{connector_id}/resync"\n ),\n data=json.dumps(resync_parameters) if resync_parameters is not None else None,\n )\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this resync in the Fivetran"\n " UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details\n\n def poll_sync(\n self,\n connector_id: str,\n initial_last_sync_completion: datetime.datetime,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Mapping[str, Any]:\n """Given a Fivetran connector and the timestamp at which the previous sync completed, poll\n until the next sync completes.\n\n The previous sync completion time is necessary because the only way to tell when a sync\n completes is when this value changes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n initial_last_sync_completion (datetime.datetime): The timestamp of the last completed sync\n (successful or otherwise) for this connector, prior to running this method.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n poll_start = datetime.datetime.now()\n while True:\n (\n curr_last_sync_completion,\n curr_last_sync_succeeded,\n curr_sync_state,\n ) = self.get_connector_sync_status(connector_id)\n self._log.info(f"Polled '{connector_id}'. Status: [{curr_sync_state}]")\n\n if curr_last_sync_completion > initial_last_sync_completion:\n break\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for connector '{connector_id}' timed out after "\n f"{datetime.datetime.now() - poll_start}."\n )\n\n # Sleep for the configured time interval before polling again.\n time.sleep(poll_interval)\n\n connector_details = self.get_connector_details(connector_id)\n if not curr_last_sync_succeeded:\n raise Failure(\n f"Sync for connector '{connector_id}' failed!",\n metadata={\n "connector_details": MetadataValue.json(connector_details),\n "log_url": MetadataValue.url(get_fivetran_logs_url(connector_details)),\n },\n )\n return connector_details\n\n def sync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> FivetranOutput:\n """Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_sync(connector_id)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)\n\n def resync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n resync_parameters: Optional[Mapping[str, Sequence[str]]] = None,\n ) -> FivetranOutput:\n """Initializes a historical resync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Dict[str, List[str]]): The payload to send to the Fivetran API.\n This should be a dictionary with schema names as the keys and a list of tables\n to resync as the values.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. 
By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_resync(connector_id, resync_parameters)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=FivetranResource.to_config_schema())\ndef fivetran_resource(context: InitResourceContext) -> FivetranResource:\n """This resource allows users to programatically interface with the Fivetran REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Fivetran REST API, including expected response JSON\n schemae, see the `Fivetran API Docs <https://fivetran.com/docs/rest-api/connectors>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n @job(resource_defs={"fivetran":my_fivetran_resource})\n def my_fivetran_job():\n ...\n\n """\n return FivetranResource.from_resource_context(context)
\n
", "current_page_name": "_modules/dagster_fivetran/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.resources"}}, "dagster_gcp": {"bigquery": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Generator, Optional, Sequence, Type, cast\n\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._annotations import experimental\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n    TimeWindow,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom google.api_core.exceptions import NotFound\nfrom google.cloud import bigquery\nfrom pydantic import Field\n\nfrom .utils import setup_gcp_creds\n\nBIGQUERY_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]@experimental\ndef build_bigquery_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an I/O manager definition that reads inputs from and writes outputs to BigQuery.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of BigQuery tables and an in-memory type - e.g. a Pandas DataFrame.\n If only one DbTypeHandler is provided, it will be used as the default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import build_bigquery_io_manager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n bigquery_io_manager = build_bigquery_io_manager([BigQueryPandasTypeHandler()])\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the ``dataset`` configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset ``my_table`` had the key prefix ``["gcp", "bigquery", "my_dataset"]``, the dataset ``my_dataset`` will be\n used. For ops, the dataset can be specified by including a `schema` entry in output metadata. 
If ``schema`` is\n not provided via config or on the asset/op, ``public`` will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n :py:class:`~dagster.In` or :py:class:`~dagster.AssetIn`.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the ``gcp_credentials`` configuration.\n Dagster willstore this key in a temporary file and set ``GOOGLE_APPLICATION_CREDENTIALS`` to point to the file.\n After the run completes, the file will be deleted, and ``GOOGLE_APPLICATION_CREDENTIALS`` will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. 
You can retrieve\n the base64 encoded with this shell command: ``cat $GOOGLE_APPLICATION_CREDENTIALS | base64``\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=BigQueryIOManager.to_config_schema())\n def bigquery_io_manager(init_context):\n """I/O Manager for storing outputs in a BigQuery database.\n\n Assets will be stored in the dataset and table name specified by their AssetKey.\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n Op outputs will be stored in the dataset specified by output metadata (defaults to public) in a\n table of the name of the output.\n\n Note that the BigQuery config is mapped to the DB IO manager table hierarchy as follows:\n BigQuery DB IO\n * project -> database\n * dataset -> schema\n * table -> table\n """\n mgr = DbIOManager(\n type_handlers=type_handlers,\n db_client=BigQueryClient(),\n io_manager_name="BigQueryIOManager",\n database=init_context.resource_config["project"],\n schema=init_context.resource_config.get("dataset"),\n default_load_type=default_load_type,\n )\n if init_context.resource_config.get("gcp_credentials"):\n with setup_gcp_creds(init_context.resource_config.get("gcp_credentials")):\n yield mgr\n else:\n yield mgr\n\n return bigquery_io_manager
\n\n\n
[docs]class BigQueryIOManager(ConfigurableIOManagerFactory):\n """Base class for an I/O manager definition that reads inputs from and writes outputs to BigQuery.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the ``dataset`` configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset ``my_table`` had the key prefix ``["gcp", "bigquery", "my_dataset"]``, the dataset ``my_dataset`` will be\n used. For ops, the dataset can be specified by including a ``schema`` entry in output metadata. If ``schema`` is\n not provided via config or on the asset/op, ``public`` will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n :py:class:`~dagster.In` or :py:class:`~dagster.AssetIn`.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the ``gcp_credentials`` configuration.\n Dagster will store this key in a temporary file and set ``GOOGLE_APPLICATION_CREDENTIALS`` to point to the file.\n After the run completes, the file will be deleted, and ``GOOGLE_APPLICATION_CREDENTIALS`` will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded with this shell command: ``cat $GOOGLE_APPLICATION_CREDENTIALS | base64``\n """\n\n project: str = Field(description="The GCP project to use.")\n dataset: Optional[str] = Field(\n default=None,\n description=(\n "Name of the BigQuery dataset to use. If not provided, the last prefix before"\n " the asset name will be used."\n ),\n )\n location: Optional[str] = Field(\n default=None,\n description=(\n "The GCP location. Note: When using PySpark DataFrames, the default"\n " location of the project will be used. A custom location can be specified in"\n " your SparkSession configuration."\n ),\n )\n gcp_credentials: Optional[str] = Field(\n default=None,\n description=(\n "GCP authentication credentials. If provided, a temporary file will be created"\n " with the credentials and ``GOOGLE_APPLICATION_CREDENTIALS`` will be set to the"\n " temporary file. To avoid issues with newlines in the keys, you must base64"\n " encode the key. 
You can retrieve the base64 encoded key with this shell"\n " command: ``cat $GOOGLE_AUTH_CREDENTIALS | base64``"\n ),\n )\n temporary_gcs_bucket: Optional[str] = Field(\n default=None,\n description=(\n "When using PySpark DataFrames, optionally specify a temporary GCS bucket to"\n " store data. If not provided, data will be directly written to BigQuery."\n ),\n )\n timeout: Optional[float] = Field(\n default=None,\n description=(\n "When using Pandas DataFrames, optionally specify a timeout for the BigQuery"\n " queries (loading and reading from tables)."\n ),\n )\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]: ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return None\n\n def create_io_manager(self, context) -> Generator:\n mgr = DbIOManager(\n db_client=BigQueryClient(),\n io_manager_name="BigQueryIOManager",\n database=self.project,\n schema=self.dataset,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n )\n if self.gcp_credentials:\n with setup_gcp_creds(self.gcp_credentials):\n yield mgr\n else:\n yield mgr
\n\n\nclass BigQueryClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.query(_get_cleanup_statement(table_slice)).result()\n except NotFound:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"SELECT {col_str} FROM"\n f" `{table_slice.database}.{table_slice.schema}.{table_slice.table}` WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM `{table_slice.database}.{table_slice.schema}.{table_slice.table}`"""\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n connection.query(f"CREATE SCHEMA IF NOT EXISTS {table_slice.schema}").result()\n\n @staticmethod\n @contextmanager\n def connect(context, _):\n conn = bigquery.Client(\n project=context.resource_config.get("project"),\n location=context.resource_config.get("location"),\n )\n\n yield conn\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"DELETE FROM `{table_slice.database}.{table_slice.schema}.{table_slice.table}` WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"TRUNCATE TABLE `{table_slice.database}.{table_slice.schema}.{table_slice.table}`"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if 
isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(BIGQUERY_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(BIGQUERY_DATETIME_FORMAT)\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_gcp/bigquery/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.io_manager"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.ops

\nimport hashlib\n\nfrom dagster import (\n    In,\n    List,\n    Nothing,\n    Out,\n    _check as check,\n    op,\n)\nfrom dagster_pandas import DataFrame\nfrom google.cloud.bigquery.encryption_configuration import EncryptionConfiguration\nfrom google.cloud.bigquery.job import LoadJobConfig, QueryJobConfig\nfrom google.cloud.bigquery.table import TimePartitioning\n\nfrom .configs import (\n    define_bigquery_create_dataset_config,\n    define_bigquery_delete_dataset_config,\n    define_bigquery_load_config,\n    define_bigquery_query_config,\n)\nfrom .types import BigQueryLoadSource\n\n_START = "start"\n\n\ndef _preprocess_config(cfg):\n    destination_encryption_configuration = cfg.get("destination_encryption_configuration")\n    time_partitioning = cfg.get("time_partitioning")\n\n    if destination_encryption_configuration is not None:\n        cfg["destination_encryption_configuration"] = EncryptionConfiguration(\n            kms_key_name=destination_encryption_configuration\n        )\n\n    if time_partitioning is not None:\n        cfg["time_partitioning"] = TimePartitioning(**time_partitioning)\n\n    return cfg\n\n\n
[docs]def bq_op_for_queries(sql_queries):\n """Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n sql_queries = check.list_param(sql_queries, "sql queries", of_type=str)\n m = hashlib.sha1()\n for query in sql_queries:\n m.update(query.encode("utf-8"))\n hash_str = m.hexdigest()[:10]\n name = f"bq_op_{hash_str}"\n\n @op(\n name=name,\n ins={_START: In(Nothing)},\n out=Out(List[DataFrame]),\n config_schema=define_bigquery_query_config(),\n required_resource_keys={"bigquery"},\n tags={"kind": "sql", "sql": "\\n".join(sql_queries)},\n )\n def _bq_fn(context):\n query_job_config = _preprocess_config(context.op_config.get("query_job_config", {}))\n\n # Retrieve results as pandas DataFrames\n results = []\n for sql_query in sql_queries:\n # We need to construct a new QueryJobConfig for each query.\n # See: https://bit.ly/2VjD6sl\n cfg = QueryJobConfig(**query_job_config) if query_job_config else None\n context.log.info(\n "executing query %s with config: %s"\n % (sql_query, cfg.to_api_repr() if cfg else "(no config provided)")\n )\n results.append(\n context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()\n )\n\n return results\n\n return _bq_fn
\n\n\nBIGQUERY_LOAD_CONFIG = define_bigquery_load_config()\n\n\n
[docs]@op(\n ins={"paths": In(List[str])},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_gcs_paths_to_bq(context, paths):\n return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)
\n\n\n
[docs]@op(\n ins={"df": In(DataFrame)},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_df_to_bq(context, df):\n return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)
\n\n\n
[docs]@op(\n ins={"path": In(str)},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_file_to_bq(context, path):\n return _execute_load_in_source(context, path, BigQueryLoadSource.File)
\n\n\ndef _execute_load_in_source(context, source, source_name):\n destination = context.op_config.get("destination")\n load_job_config = _preprocess_config(context.op_config.get("load_job_config", {}))\n cfg = LoadJobConfig(**load_job_config) if load_job_config else None\n\n context.log.info(\n "executing BQ load with config: %s for source %s"\n % (cfg.to_api_repr() if cfg else "(no config provided)", source)\n )\n\n if source_name == BigQueryLoadSource.DataFrame:\n context.resources.bigquery.load_table_from_dataframe(\n source, destination, job_config=cfg\n ).result()\n\n # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local\n elif source_name == BigQueryLoadSource.File:\n with open(source, "rb") as file_obj:\n context.resources.bigquery.load_table_from_file(\n file_obj, destination, job_config=cfg\n ).result()\n\n # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage\n elif source_name == BigQueryLoadSource.GCS:\n context.resources.bigquery.load_table_from_uri(source, destination, job_config=cfg).result()\n\n\n
[docs]@op(\n ins={_START: In(Nothing)},\n config_schema=define_bigquery_create_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_create_dataset(context):\n """BigQuery Create Dataset.\n\n This op encapsulates creating a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, exists_ok) = [context.op_config.get(k) for k in ("dataset", "exists_ok")]\n context.log.info("executing BQ create_dataset for dataset %s" % (dataset))\n context.resources.bigquery.create_dataset(dataset, exists_ok)
\n\n\n
[docs]@op(\n ins={_START: In(Nothing)},\n config_schema=define_bigquery_delete_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_delete_dataset(context):\n """BigQuery Delete Dataset.\n\n This op encapsulates deleting a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, delete_contents, not_found_ok) = [\n context.op_config.get(k) for k in ("dataset", "delete_contents", "not_found_ok")\n ]\n\n context.log.info("executing BQ delete_dataset for dataset %s" % dataset)\n\n context.resources.bigquery.delete_dataset(\n dataset, delete_contents=delete_contents, not_found_ok=not_found_ok\n )
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Optional\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom google.cloud import bigquery\nfrom pydantic import Field\n\nfrom .utils import setup_gcp_creds\n\n\n
[docs]class BigQueryResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for interacting with Google BigQuery.\n\n Examples:\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_gcp import BigQueryResource\n\n @asset\n def my_table(bigquery: BigQueryResource):\n with bigquery.get_client() as client:\n client.query("SELECT * FROM my_dataset.my_table")\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "bigquery": BigQueryResource(project="my-project")\n }\n )\n """\n\n project: Optional[str] = Field(\n default=None,\n description=(\n "Project ID for the project which the client acts on behalf of. Will be passed when"\n " creating a dataset / job. If not passed, falls back to the default inferred from the"\n " environment."\n ),\n )\n\n location: Optional[str] = Field(\n default=None,\n description="Default location for jobs / datasets / tables.",\n )\n\n gcp_credentials: Optional[str] = Field(\n default=None,\n description=(\n "GCP authentication credentials. If provided, a temporary file will be created"\n " with the credentials and ``GOOGLE_APPLICATION_CREDENTIALS`` will be set to the"\n " temporary file. To avoid issues with newlines in the keys, you must base64"\n " encode the key. You can retrieve the base64 encoded key with this shell"\n " command: ``cat $GOOGLE_AUTH_CREDENTIALS | base64``"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def get_client(self) -> Iterator[bigquery.Client]:\n """Context manager to create a BigQuery Client.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import asset\n from dagster_gcp import BigQueryResource\n\n @asset\n def my_table(bigquery: BigQueryResource):\n with bigquery.get_client() as client:\n client.query("SELECT * FROM my_dataset.my_table")\n """\n if self.gcp_credentials:\n with setup_gcp_creds(self.gcp_credentials):\n yield bigquery.Client(project=self.project, location=self.location)\n\n else:\n yield bigquery.Client(project=self.project, location=self.location)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n with self.get_client() as client:\n yield client
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=BigQueryResource.to_config_schema(),\n description="Dagster resource for connecting to BigQuery",\n)\ndef bigquery_resource(context):\n bq_resource = BigQueryResource.from_resource_context(context)\n with bq_resource.get_client() as client:\n yield client
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.types

\nimport re\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\nfrom dagster._config import ConfigScalar, ConfigScalarKind, PostProcessingError\nfrom google.cloud.bigquery.job import (\n    CreateDisposition,\n    Encoding,\n    QueryPriority,\n    SchemaUpdateOption,\n    SourceFormat,\n    WriteDisposition,\n)\n\n\nclass BigQueryLoadSource(PyEnum):\n    DataFrame = "DATA_FRAME"\n    GCS = "GCS"\n    File = "FILE"\n\n\nBQCreateDisposition = Enum(\n    name="BQCreateDisposition",\n    enum_values=[\n        EnumValue(CreateDisposition.CREATE_IF_NEEDED),\n        EnumValue(CreateDisposition.CREATE_NEVER),\n    ],\n)\n\nBQPriority = Enum(\n    name="BQPriority",\n    enum_values=[EnumValue(QueryPriority.BATCH), EnumValue(QueryPriority.INTERACTIVE)],\n)\n\nBQSchemaUpdateOption = Enum(\n    name="BQSchemaUpdateOption",\n    enum_values=[\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_ADDITION,\n            description="Allow adding a nullable field to the schema.",\n        ),\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_RELAXATION,\n            description="Allow relaxing a required field in the original schema to nullable.",\n        ),\n    ],\n)\n\nBQWriteDisposition = Enum(\n    name="BQWriteDisposition",\n    enum_values=[\n        EnumValue(WriteDisposition.WRITE_APPEND),\n        EnumValue(WriteDisposition.WRITE_EMPTY),\n        EnumValue(WriteDisposition.WRITE_TRUNCATE),\n    ],\n)\n\nBQEncoding = Enum(\n    name="BQEncoding", enum_values=[EnumValue(Encoding.ISO_8859_1), EnumValue(Encoding.UTF_8)]\n)\n\nBQSourceFormat = Enum(\n    name="BQSourceFormat",\n    enum_values=[\n        EnumValue(SourceFormat.AVRO),\n        EnumValue(SourceFormat.CSV),\n        EnumValue(SourceFormat.DATASTORE_BACKUP),\n        EnumValue(SourceFormat.NEWLINE_DELIMITED_JSON),\n        EnumValue(SourceFormat.ORC),\n        EnumValue(SourceFormat.PARQUET),\n    ],\n)\n\n\n# Project names are permitted to have alphanumeric, 
dashes and underscores, up to 1024 characters.\nRE_PROJECT = r"[\\w\\d\\-\\_]{1,1024}"\n\n# Datasets and tables are permitted to have alphanumeric or underscores, no dashes allowed, up to\n# 1024 characters\nRE_DS_TABLE = r"[\\w\\d\\_]{1,1024}"\n\n# BigQuery supports writes directly to date partitions with the syntax foo.bar$20190101\nRE_PARTITION_SUFFIX = r"(\\$\\d{8})?"\n\n\ndef _is_valid_dataset(config_value):\n    """Datasets must be of form "project.dataset" or "dataset"."""\n    return re.match(\n        # regex matches: project.dataset -- OR -- dataset\n        r"^" + RE_PROJECT + r"\\." + RE_DS_TABLE + r"$|^" + RE_DS_TABLE + r"$",\n        config_value,\n    )\n\n\ndef _is_valid_table(config_value):\n    """Tables must be of form "project.dataset.table" or "dataset.table" with optional\n    date-partition suffix.\n    """\n    return re.match(\n        r"^"\n        + RE_PROJECT  #          project\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$|^"  #              -- OR --\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  
#               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$",\n        config_value,\n    )\n\n\nclass _Dataset(ConfigScalar):\n    def __init__(self):\n        super(_Dataset, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_dataset(value):\n            raise PostProcessingError('Datasets must be of the form "project.dataset" or "dataset"')\n        return value\n\n\nclass _Table(ConfigScalar):\n    def __init__(self):\n        super(_Table, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_table(value):\n            raise PostProcessingError(\n                'Tables must be of the form "project.dataset.table" or "dataset.table" '\n                "with optional date-partition suffix"\n            )\n\n        return value\n\n\n# https://github.com/dagster-io/dagster/issues/1971\nTable = _Table()\nDataset = _Dataset()\n\n\n
[docs]class BigQueryError(Exception):\n pass
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.types"}}, "dataproc": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.dataproc.ops

\nfrom typing import Any, Dict\n\nfrom dagster import (\n    Bool,\n    Config,\n    Field as DagsterField,\n    Int,\n    op,\n)\nfrom dagster._seven import json\nfrom pydantic import Field\n\nfrom .configs import define_dataproc_submit_job_config\nfrom .resources import TWENTY_MINUTES, DataprocResource\n\n# maintain the old config schema because of the nested job_config schema\nDATAPROC_CONFIG_SCHEMA = {\n    "job_timeout_in_seconds": DagsterField(\n        Int,\n        description="""Optional. Maximum time in seconds to wait for the job being\n                    completed. Default is set to 1200 seconds (20 minutes).\n                    """,\n        is_required=False,\n        default_value=TWENTY_MINUTES,\n    ),\n    "job_config": define_dataproc_submit_job_config(),\n    "job_scoped_cluster": DagsterField(\n        Bool,\n        description="whether to create a cluster or use an existing cluster",\n        is_required=False,\n        default_value=True,\n    ),\n}\n\n\nclass DataprocOpConfig(Config):\n    job_timeout_in_seconds: int = Field(\n        default=TWENTY_MINUTES,\n        description=(\n            "Maximum time in seconds to wait for the job being completed. Default is set to 1200"\n            " seconds (20 minutes)."\n        ),\n    )\n    job_scoped_cluster: bool = Field(\n        default=True,\n        description="Whether to create a cluster or use an existing cluster. Defaults to True.",\n    )\n    project_id: str = Field(\n        description=(\n            "Required. Project ID for the project which the client acts on behalf of. 
Will be"\n            " passed when creating a dataset/job."\n        )\n    )\n    region: str = Field(description="The GCP region.")\n    job_config: Dict[str, Any] = Field(\n        description="Python dictionary containing configuration for the Dataproc Job."\n    )\n\n\ndef _dataproc_compute(context):\n    job_config = context.op_config["job_config"]\n    job_timeout = context.op_config["job_timeout_in_seconds"]\n\n    context.log.info(\n        "submitting job with config: %s and timeout of: %d seconds"\n        % (str(json.dumps(job_config)), job_timeout)\n    )\n\n    if context.op_config["job_scoped_cluster"]:\n        # Cluster context manager, creates and then deletes cluster\n        with context.resources.dataproc.cluster_context_manager() as cluster:\n            # Submit the job specified by this solid to the cluster defined by the associated resource\n            result = cluster.submit_job(job_config)\n\n            job_id = result["reference"]["jobId"]\n            context.log.info(f"Submitted job ID {job_id}")\n            cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n    else:\n        # Submit to an existing cluster\n        # Submit the job specified by this solid to the cluster defined by the associated resource\n        result = context.resources.dataproc.submit_job(job_config)\n\n        job_id = result["reference"]["jobId"]\n        context.log.info(f"Submitted job ID {job_id}")\n        context.resources.dataproc.wait_for_job(job_id, wait_timeout=job_timeout)\n\n\n@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_solid(context):\n    return _dataproc_compute(context)\n\n\n
[docs]@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_op(context):\n return _dataproc_compute(context)
\n\n\n@op\ndef configurable_dataproc_op(context, dataproc: DataprocResource, config: DataprocOpConfig):\n job_config = {"projectId": config.project_id, "region": config.region, "job": config.job_config}\n job_timeout = config.job_timeout_in_seconds\n\n context.log.info(\n "submitting job with config: %s and timeout of: %d seconds"\n % (str(json.dumps(job_config)), job_timeout)\n )\n\n dataproc_client = dataproc.get_client()\n\n if config.job_scoped_cluster:\n # Cluster context manager, creates and then deletes cluster\n with dataproc_client.cluster_context_manager() as cluster:\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = cluster.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info(f"Submitted job ID {job_id}")\n cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n else:\n # Submit to an existing cluster\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = dataproc_client.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info(f"Submitted job ID {job_id}")\n dataproc_client.wait_for_job(job_id, wait_timeout=job_timeout)\n
", "current_page_name": "_modules/dagster_gcp/dataproc/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.dataproc.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.dataproc.resources

\nimport json\nimport time\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Mapping, Optional\n\nimport dagster._check as check\nimport yaml\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom googleapiclient.discovery import build\nfrom oauth2client.client import GoogleCredentials\nfrom pydantic import Field\n\nfrom .configs import define_dataproc_create_cluster_config\nfrom .types import DataprocError\n\nTWENTY_MINUTES = 20 * 60\nDEFAULT_ITER_TIME_SEC = 5\n\n\nclass DataprocClient:\n    """Builds a client to the dataproc API."""\n\n    def __init__(self, config):\n        # Use Application Default Credentials to check the\n        # GOOGLE_APPLICATION_CREDENTIALS environment variable\n        # for the location of the service account key file.\n        credentials = GoogleCredentials.get_application_default()\n\n        # See https://github.com/googleapis/google-api-python-client/issues/299 for the\n        # cache_discovery=False configuration below\n        self.dataproc = build("dataproc", "v1", credentials=credentials, cache_discovery=False)\n\n        self.config = config\n\n        (self.project_id, self.region, self.cluster_name, self.cluster_config) = (\n            self.config.get(k) for k in ("projectId", "region", "clusterName", "cluster_config")\n        )\n\n    @property\n    def dataproc_clusters(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            self.dataproc.projects()\n            .regions()\n            .clusters()\n        )\n\n    @property\n    def dataproc_jobs(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            self.dataproc.projects()\n            .regions()\n            .jobs()\n        )\n\n    def create_cluster(self):\n        (\n            self.dataproc_clusters.create(\n                
projectId=self.project_id,\n                region=self.region,\n                body={\n                    "projectId": self.project_id,\n                    "clusterName": self.cluster_name,\n                    "config": self.cluster_config,\n                },\n            ).execute()\n        )\n\n        def iter_fn():\n            # TODO: Add logging\n            # See: https://bit.ly/2UW5JaN\n            cluster = self.get_cluster()\n            return cluster["status"]["state"] in {"RUNNING", "UPDATING"}\n\n        done = DataprocClient._iter_and_sleep_until_ready(iter_fn)\n        if not done:\n            cluster = self.get_cluster()\n            raise DataprocError(\n                "Could not provision cluster -- status: %s" % str(cluster["status"])\n            )\n\n    def get_cluster(self):\n        return self.dataproc_clusters.get(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def delete_cluster(self):\n        return self.dataproc_clusters.delete(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def submit_job(self, job_details):\n        return self.dataproc_jobs.submit(\n            projectId=self.project_id, region=self.region, body=job_details\n        ).execute()\n\n    def get_job(self, job_id):\n        return self.dataproc_jobs.get(\n            projectId=self.project_id, region=self.region, jobId=job_id\n        ).execute()\n\n    def wait_for_job(self, job_id, wait_timeout=TWENTY_MINUTES):\n        """This method polls job status every 5 seconds."""\n\n        # TODO: Add logging here print('Waiting for job ID {} to finish...'.format(job_id))\n        def iter_fn():\n            # See: https://bit.ly/2Lg2tHr\n            result = self.get_job(job_id)\n\n            # Handle exceptions\n            if result["status"]["state"] in {"CANCELLED", "ERROR"}:\n                raise DataprocError("Job 
error: %s" % str(result["status"]))\n\n            if result["status"]["state"] == "DONE":\n                return True\n\n            return False\n\n        done = DataprocClient._iter_and_sleep_until_ready(iter_fn, max_wait_time_sec=wait_timeout)\n        if not done:\n            job = self.get_job(job_id)\n            raise DataprocError("Job run timed out: %s" % str(job["status"]))\n\n    @staticmethod\n    def _iter_and_sleep_until_ready(\n        callable_fn, max_wait_time_sec=TWENTY_MINUTES, iter_time=DEFAULT_ITER_TIME_SEC\n    ):\n        """Iterates and sleeps until callable_fn returns true."""\n        # Wait for cluster ready state\n        ready, curr_iter = False, 0\n        max_iter = max_wait_time_sec / iter_time\n        while not ready and curr_iter < max_iter:\n            ready = callable_fn()\n            time.sleep(iter_time)\n            curr_iter += 1\n\n        # Will return false if ran up to max_iter without success\n        return ready\n\n    @contextmanager\n    def cluster_context_manager(self):\n        """Context manager allowing execution with a dataproc cluster.\n\n        Example:\n        .. code-block::\n            with context.resources.dataproc.cluster as cluster:\n                # do stuff...\n        """\n        self.create_cluster()\n        try:\n            yield self\n        finally:\n            self.delete_cluster()\n\n\n
[docs]class DataprocResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for connecting to a Dataproc cluster.\n\n Example:\n .. code-block::\n\n @asset\n def my_asset(dataproc: DataprocResource):\n with dataproc.get_client() as client:\n # client is a dagster_gcp.DataprocClient\n ...\n """\n\n project_id: str = Field(\n description=(\n "Required. Project ID for the project which the client acts on behalf of. Will be"\n " passed when creating a dataset/job."\n )\n )\n region: str = Field(description="The GCP region.")\n cluster_name: str = Field(\n description=(\n "Required. The cluster name. Cluster names within a project must be unique. Names of"\n " deleted clusters can be reused."\n )\n )\n cluster_config_yaml_path: Optional[str] = Field(\n default=None,\n description=(\n "Full path to a YAML file containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n cluster_config_json_path: Optional[str] = Field(\n default=None,\n description=(\n "Full path to a JSON file containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n cluster_config_dict: Optional[Dict[str, Any]] = Field(\n default=None,\n description=(\n "Python dictionary containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. 
Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def _read_yaml_config(self, path: str) -> Mapping[str, Any]:\n with open(path, "r", encoding="utf8") as f:\n return yaml.safe_load(f)\n\n def _read_json_config(self, path: str) -> Mapping[str, Any]:\n with open(path, "r", encoding="utf8") as f:\n return json.load(f)\n\n def _get_cluster_config(self) -> Optional[Mapping[str, Any]]:\n methods = 0\n methods += 1 if self.cluster_config_dict is not None else 0\n methods += 1 if self.cluster_config_json_path is not None else 0\n methods += 1 if self.cluster_config_yaml_path is not None else 0\n\n # ensure that at most 1 method is provided\n check.invariant(\n methods <= 1,\n "Dataproc Resource: Incorrect config: Cannot provide cluster config multiple ways."\n " Choose one of cluster_config_dict, cluster_config_json_path, or"\n " cluster_config_yaml_path",\n )\n\n cluster_config = None\n if self.cluster_config_json_path:\n cluster_config = self._read_json_config(self.cluster_config_json_path)\n elif self.cluster_config_yaml_path:\n cluster_config = self._read_yaml_config(self.cluster_config_yaml_path)\n elif self.cluster_config_dict:\n cluster_config = self.cluster_config_dict\n\n return cluster_config\n\n def get_client(self) -> DataprocClient:\n cluster_config = self._get_cluster_config()\n\n client_config_dict = {\n "projectId": self.project_id,\n "region": self.region,\n "clusterName": self.cluster_name,\n "cluster_config": cluster_config,\n }\n\n return DataprocClient(config=client_config_dict)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=define_dataproc_create_cluster_config(),\n description="Manage a Dataproc cluster resource",\n)\ndef dataproc_resource(context):\n return DataprocClient(context.resource_config)
\n
", "current_page_name": "_modules/dagster_gcp/dataproc/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.dataproc.resources"}}, "gcs": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.compute_log_manager

\nimport datetime\nimport json\nimport os\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._seven as seven\nfrom dagster import (\n    Field,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_type import Noneable\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom google.cloud import storage\nfrom typing_extensions import Self\n\n\n
[docs]class GCSComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to GCS.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_gcp.gcs.compute_log_manager\n class: GCSComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n upload_interval: 30\n\n There are more configuration examples in the instance documentation guide: https://docs.dagster.io/deployment/dagster-instance#compute-log-storage\n\n Args:\n bucket (str): The name of the GCS bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n json_credentials_envvar (Optional[str]): Environment variable that contains the JSON with a private key\n and other credentials information. If this is set, ``GOOGLE_APPLICATION_CREDENTIALS`` will be ignored.\n Can be used when the private key cannot be used as a file.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files to GCS. 
By default, will only upload when the capture is complete.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when instantiated from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n json_credentials_envvar=None,\n upload_interval=None,\n ):\n self._bucket_name = check.str_param(bucket, "bucket")\n self._prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n\n if json_credentials_envvar:\n json_info_str = os.environ.get(json_credentials_envvar)\n credentials_info = json.loads(json_info_str) # type: ignore # (possible none)\n self._bucket = (\n storage.Client()\n .from_service_account_info(credentials_info)\n .bucket(self._bucket_name)\n )\n else:\n self._bucket = storage.Client().bucket(self._bucket_name)\n\n # Check if the bucket exists\n check.invariant(self._bucket.exists())\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "json_credentials_envvar": Field(StringSource, is_required=False),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return GCSComputeLogManager(inst_data=inst_data, 
**config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _gcs_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._prefix, "storage", *namespace, filename]\n return "/".join(paths)\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self._local_manager.delete_logs(log_key, prefix)\n if log_key:\n gcs_keys_to_remove = [\n self._gcs_key(log_key, ComputeIOType.STDOUT),\n self._gcs_key(log_key, ComputeIOType.STDERR),\n self._gcs_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._gcs_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n # if the blob doesn't exist, do nothing instead of raising a not found exception\n self._bucket.delete_blobs(gcs_keys_to_remove, on_error=lambda _: None)\n elif prefix:\n # add the trailing '/' to make sure that ['a'] does not match ['apple']\n delete_prefix = "/".join([self._prefix, "storage", *prefix, ""])\n to_delete = self._bucket.list_blobs(prefix=delete_prefix)\n self._bucket.delete_blobs(list(to_delete))\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n gcs_key = self._gcs_key(log_key, io_type)\n try:\n return self._bucket.blob(gcs_key).generate_signed_url(\n expiration=datetime.timedelta(minutes=60)\n )\n except:\n # fallback to the local download url 
if the current credentials are insufficient to create\n # signed urls\n return self.local_manager.get_captured_log_download_url(log_key, io_type)\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n gcs_key = self._gcs_key(log_key, io_type)\n return f"gs://{self._bucket_name}/{gcs_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n gcs_key = self._gcs_key(log_key, io_type, partial)\n return self._bucket.blob(gcs_key).exists()\n\n def upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n\n if partial and os.stat(path).st_size == 0:\n return\n\n gcs_key = self._gcs_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n self._bucket.blob(gcs_key).upload_from_file(data)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n\n gcs_key = self._gcs_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n self._bucket.blob(gcs_key).download_to_file(fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n self._local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_gcp/gcs/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.compute_log_manager"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import Optional\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\nfrom google.cloud import storage\n\n\n
[docs]class GCSFileHandle(FileHandle):\n """A reference to a file on GCS."""\n\n def __init__(self, gcs_bucket: str, gcs_key: str):\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_key = check.str_param(gcs_key, "gcs_key")\n\n @property\n def gcs_bucket(self) -> str:\n """str: The name of the GCS bucket."""\n return self._gcs_bucket\n\n @property\n def gcs_key(self) -> str:\n """str: The GCS key."""\n return self._gcs_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's GCS URL."""\n return self.gcs_path\n\n @property\n def gcs_path(self) -> str:\n """str: The file's GCS URL."""\n return f"gs://{self.gcs_bucket}/{self.gcs_key}"
\n\n\nclass GCSFileManager(FileManager):\n def __init__(self, client, gcs_bucket, gcs_base_key):\n self._client = check.inst_param(client, "client", storage.client.Client)\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_base_key = check.str_param(gcs_base_key, "gcs_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n bucket_obj = self._client.bucket(file_handle.gcs_bucket)\n bucket_obj.blob(file_handle.gcs_key).download_to_file(temp_file_obj)\n self._local_handle_cache[file_handle.gcs_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", GCSFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.gcs_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.gcs_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None, key: Optional[str] = None):\n key = check.opt_str_param(key, "key", default=str(uuid.uuid4()))\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", key=key, ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None, key: Optional[str] = None):\n 
key = check.opt_str_param(key, "key", default=str(uuid.uuid4()))\n check_file_like_obj(file_obj)\n gcs_key = self.get_full_key(key + (("." + ext) if ext is not None else ""))\n bucket_obj = self._client.bucket(self._gcs_bucket)\n bucket_obj.blob(gcs_key).upload_from_file(file_obj)\n return GCSFileHandle(self._gcs_bucket, gcs_key)\n\n def get_full_key(self, file_key):\n return f"{self._gcs_base_key}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_gcp/gcs/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.io_manager

\nimport pickle\nfrom typing import Any, Optional, Union\n\nfrom dagster import (\n    ConfigurableIOManager,\n    InputContext,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.backoff import backoff\nfrom dagster._utils.cached_method import cached_method\nfrom google.api_core.exceptions import Forbidden, ServiceUnavailable, TooManyRequests\nfrom google.cloud import storage\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom .resources import GCSResource\n\nDEFAULT_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectGCSIOManager(UPathIOManager):\n    def __init__(self, bucket: str, client: Optional[Any] = None, prefix: str = "dagster"):\n        self.bucket = check.str_param(bucket, "bucket")\n        self.client = client or storage.Client()\n        self.bucket_obj = self.client.bucket(bucket)\n        check.invariant(self.bucket_obj.exists())\n        self.prefix = check.str_param(prefix, "prefix")\n        super().__init__(base_path=UPath(self.prefix))\n\n    def unlink(self, path: UPath) -> None:\n        key = str(path)\n        if self.bucket_obj.blob(key).exists():\n            self.bucket_obj.blob(key).delete()\n\n    def path_exists(self, path: UPath) -> bool:\n        key = str(path)\n        blobs = self.client.list_blobs(self.bucket, prefix=key)\n        return len(list(blobs)) > 0\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return UPath("storage", run_id, "files", *output_parts)\n\n    def get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading GCS object 
from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing GCS object at: {self._uri_for_path(path)}"\n\n    def _uri_for_path(self, path: UPath) -> str:\n        return f"gs://{self.bucket}/{path}"\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in GCP\n        return None\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        bytes_obj = self.bucket_obj.blob(str(path)).download_as_bytes()\n        return pickle.loads(bytes_obj)\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing GCS key: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n        backoff(\n            self.bucket_obj.blob(str(path)).upload_from_string,\n            args=[pickled_obj],\n            retry_on=(TooManyRequests, Forbidden, ServiceUnavailable),\n        )\n\n\n
[docs]class GCSPickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at ``<base_dir>/<asset_key>``. If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of ``/my/base/path``, an asset with key\n ``AssetKey(["one", "two", "three"])`` would be stored in a file called ``three`` in a directory\n with path ``/my/base/path/one/two/``.\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import asset, Definitions\n from dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": GCSPickleIOManager(\n gcs_bucket="my-cool-bucket",\n gcs_prefix="my-cool-prefix"\n ),\n "gcs": GCSResource(project="my-cool-project")\n }\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n @job(\n resource_defs={\n "io_manager": GCSPickleIOManager(\n gcs=GCSResource(project="my-cool-project")\n gcs_bucket="my-cool-bucket",\n gcs_prefix="my-cool-prefix"\n ),\n }\n )\n def my_job():\n ...\n """\n\n gcs: ResourceDependency[GCSResource]\n gcs_bucket: str = Field(description="GCS bucket to store files")\n gcs_prefix: str = Field(default="dagster", description="Prefix to add to all file paths")\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _internal_io_manager(self) -> PickledObjectGCSIOManager:\n return PickledObjectGCSIOManager(\n bucket=self.gcs_bucket, client=self.gcs.get_client(), prefix=self.gcs_prefix\n )\n\n def load_input(self, context: InputContext) -> Any:\n return self._internal_io_manager.load_input(context)\n\n def handle_output(self, context: OutputContext, obj: Any) -> None:\n self._internal_io_manager.handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use GCSPickleIOManager instead.",\n)\nclass ConfigurablePickledObjectGCSIOManager(GCSPickleIOManager):\n """Renamed to GCSPickleIOManager. See GCSPickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=GCSPickleIOManager.to_config_schema(),\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_io_manager(init_context):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at ``<base_dir>/<asset_key>``. If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of ``/my/base/path``, an asset with key\n ``AssetKey(["one", "two", "three"])`` would be stored in a file called ``three`` in a directory\n with path ``/my/base/path/one/two/``.\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": gcs_pickle_io_manager.configured(\n {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n ),\n "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n @job(\n resource_defs={\n "io_manager": gcs_pickle_io_manager.configured(\n {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n ),\n "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n },\n )\n def my_job():\n ...\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSIOManager(\n bucket=init_context.resource_config["gcs_bucket"],\n client=client,\n prefix=init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_gcp/gcs/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.io_manager"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.resources

\nfrom typing import Any, Optional\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom google.cloud import storage\nfrom pydantic import Field\n\nfrom .file_manager import GCSFileManager\n\n\n
[docs]class GCSResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for interacting with Google Cloud Storage.\n\n Example:\n .. code-block::\n\n @asset\n def my_asset(gcs: GCSResource):\n with gcs.get_client() as client:\n # client is a google.cloud.storage.Client\n ...\n """\n\n project: Optional[str] = Field(default=None, description="Project name")\n\n def get_client(self) -> storage.Client:\n """Creates a GCS Client.\n\n Returns: google.cloud.storage.Client\n """\n return _gcs_client_from_config(project=self.project)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=GCSResource.to_config_schema(),\n description="This resource provides a GCS client",\n)\ndef gcs_resource(init_context) -> storage.Client:\n return GCSResource.from_resource_context(init_context).get_client()
\n\n\n
[docs]class GCSFileManagerResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """FileManager that provides abstract access to GCS."""\n\n project: Optional[str] = Field(default=None, description="Project name")\n gcs_bucket: str = Field(description="GCS bucket to store files")\n gcs_prefix: str = Field(default="dagster", description="Prefix to add to all file paths")\n\n def get_client(self) -> GCSFileManager:\n """Creates a :py:class:`~dagster_gcp.GCSFileManager` object that implements the\n :py:class:`~dagster._core.storage.file_manager.FileManager` API .\n\n Returns: GCSFileManager\n """\n gcs_client = _gcs_client_from_config(project=self.project)\n return GCSFileManager(\n client=gcs_client,\n gcs_bucket=self.gcs_bucket,\n gcs_base_key=self.gcs_prefix,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=GCSFileManagerResource.to_config_schema())\ndef gcs_file_manager(context):\n """FileManager that provides abstract access to GCS.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n return GCSFileManagerResource.from_resource_context(context).get_client()
\n\n\ndef _gcs_client_from_config(project: Optional[str]) -> storage.Client:\n """Creates a GCS Client.\n\n Args:\n project: The GCP project\n\n Returns: A GCS client.\n """\n return storage.client.Client(project=project)\n
", "current_page_name": "_modules/dagster_gcp/gcs/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.resources"}}}, "dagster_gcp_pandas": {"bigquery": {"bigquery_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp_pandas.bigquery.bigquery_pandas_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pandas as pd\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_gcp.bigquery.io_manager import (\n    BigQueryClient,\n    BigQueryIOManager,\n    build_bigquery_io_manager,\n)\n\n\n
[docs]class BigQueryPandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Plugin for the BigQuery I/O Manager that can store and load Pandas DataFrames as BigQuery tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ):\n """Stores the pandas DataFrame in BigQuery."""\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n\n job = connection.load_table_from_dataframe(\n dataframe=with_uppercase_cols,\n destination=f"{table_slice.schema}.{table_slice.table}",\n project=table_slice.database,\n location=context.resource_config.get("location") if context.resource_config else None,\n timeout=context.resource_config.get("timeout") if context.resource_config else None,\n )\n job.result()\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) # type: ignore # (bad stubs)\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n """Loads the input as a Pandas DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n result = 
connection.query(\n query=BigQueryClient.get_select_statement(table_slice),\n project=table_slice.database,\n location=context.resource_config.get("location") if context.resource_config else None,\n timeout=context.resource_config.get("timeout") if context.resource_config else None,\n ).to_dataframe()\n\n result.columns = map(str.lower, result.columns)\n return result\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nbigquery_pandas_io_manager = build_bigquery_io_manager(\n [BigQueryPandasTypeHandler()], default_load_type=pd.DataFrame\n)\nbigquery_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_gcp_pandas import bigquery_pandas_io_manager\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_pandas_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n"""\n\n\n
[docs]class BigQueryPandasIOManager(BigQueryIOManager):\n """An I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp_pandas import BigQueryPandasIOManager\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": BigQueryPandasIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_gcp_pandas/bigquery/bigquery_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp_pandas.bigquery.bigquery_pandas_type_handler"}}}, "dagster_gcp_pyspark": {"bigquery": {"bigquery_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp_pyspark.bigquery.bigquery_pyspark_type_handler

\nfrom typing import Any, Mapping, Optional, Sequence, Type\n\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_gcp import BigQueryIOManager, build_bigquery_io_manager\nfrom dagster_gcp.bigquery.io_manager import BigQueryClient\nfrom pyspark.sql import DataFrame, SparkSession\nfrom pyspark.sql.types import StructType\n\n\ndef _get_bigquery_write_options(\n    config: Optional[Mapping[str, Any]], table_slice: TableSlice\n) -> Mapping[str, str]:\n    conf = {\n        "table": f"{table_slice.database}.{table_slice.schema}.{table_slice.table}",\n    }\n    if config and config.get("temporary_gcs_bucket") is not None:\n        conf["temporaryGcsBucket"] = config["temporary_gcs_bucket"]\n    else:\n        conf["writeMethod"] = "direct"\n    return conf\n\n\ndef _get_bigquery_read_options(table_slice: TableSlice) -> Mapping[str, str]:\n    conf = {"viewsEnabled": "true", "materializationDataset": table_slice.schema}\n    return conf\n\n\n
[docs]class BigQueryPySparkTypeHandler(DbTypeHandler[DataFrame]):\n """Plugin for the BigQuery I/O Manager that can store and load PySpark DataFrames as BigQuery tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _\n ) -> Mapping[str, RawMetadataValue]:\n options = _get_bigquery_write_options(context.resource_config, table_slice)\n\n with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])\n\n with_uppercase_cols.write.format("bigquery").options(**options).mode("append").save()\n\n return {\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=field.name, type=field.dataType.typeName())\n for field in obj.schema.fields\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame:\n options = _get_bigquery_read_options(table_slice)\n spark = SparkSession.builder.getOrCreate() # type: ignore\n\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n df = (\n spark.read.format("bigquery")\n .options(**options)\n .load(BigQueryClient.get_select_statement(table_slice))\n )\n\n return df.toDF(*[c.lower() for c in df.columns])\n\n @property\n def supported_types(self):\n return 
[DataFrame]
\n\n\nbigquery_pyspark_io_manager = build_bigquery_io_manager(\n [BigQueryPySparkTypeHandler()], default_load_type=DataFrame\n)\nbigquery_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_gcp_pyspark import bigquery_pyspark_io_manager\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_pyspark_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n"""\n\n\n
[docs]class BigQueryPySparkIOManager(BigQueryIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp_pyspark import BigQueryPySparkIOManager\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": BigQueryPySparkIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return DataFrame
\n
", "current_page_name": "_modules/dagster_gcp_pyspark/bigquery/bigquery_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp_pyspark.bigquery.bigquery_pyspark_type_handler"}}}, "dagster_ge": {"factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_ge.factory

\nimport datetime\nfrom typing import Any, Dict\n\nimport great_expectations as ge\nfrom dagster import (\n    ConfigurableResource,\n    ExpectationResult,\n    IAttachDifferentObjectToOpContext,\n    In,\n    MetadataValue,\n    OpExecutionContext,\n    Out,\n    Output,\n    _check as check,\n    op,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster_pandas import DataFrame\nfrom great_expectations.render.renderer import ValidationResultsPageRenderer\nfrom great_expectations.render.view import DefaultMarkdownPageView\nfrom pydantic import Field\n\ntry:\n    # ge < v0.13.0\n    from great_expectations.core import convert_to_json_serializable\nexcept ImportError:\n    # ge >= v0.13.0\n    from great_expectations.core.util import convert_to_json_serializable\n\n\nclass GEContextResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n    ge_root_dir: str = Field(\n        default=None,\n        description="The root directory for your Great Expectations project.",\n    )\n\n    def get_data_context(self):\n        if self.ge_root_dir is None:\n            return ge.data_context.DataContext()\n        return ge.data_context.DataContext(context_root_dir=self.ge_root_dir)\n\n    def get_object_to_set_on_execution_context(self):\n        return self.get_data_context()\n\n\n@dagster_maintained_resource\n@resource(config_schema=GEContextResource.to_config_schema())\ndef ge_data_context(context):\n    return GEContextResource.from_resource_context(context).get_data_context()\n\n\n
[docs]def ge_validation_op_factory(\n name,\n datasource_name,\n suite_name,\n validation_operator_name=None,\n input_dagster_type=DataFrame,\n batch_kwargs=None,\n):\n """Generates ops for interacting with GE.\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n validation_operator_name (Optional[str]): what validation operator to run -- defaults to\n None, which generates an ephemeral validator. If you want to save data docs, use\n 'action_list_operator'.\n See https://legacy.docs.greatexpectations.io/en/0.12.1/reference/core_concepts/validation_operators_and_actions.html#\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n `ge_data_context`'s `get_batch` method. 
Defaults to `{"dataset": dataset}`, where\n `dataset` is the input to the generated op.\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n """\n check.str_param(datasource_name, "datasource_name")\n check.str_param(suite_name, "suite_name")\n check.opt_str_param(validation_operator_name, "validation_operator_name")\n batch_kwargs = check.opt_dict_param(batch_kwargs, "batch_kwargs")\n\n @op(\n name=name,\n ins={"dataset": In(input_dagster_type)},\n out=Out(\n dict,\n description="""\n This op yields an expectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n ),\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context: OpExecutionContext, dataset):\n data_context = context.resources.ge_data_context\n\n if validation_operator_name is not None:\n validation_operator = validation_operator_name\n else:\n data_context.add_validation_operator(\n "ephemeral_validation",\n {"class_name": "ActionListValidationOperator", "action_list": []},\n )\n validation_operator = "ephemeral_validation"\n suite = data_context.get_expectation_suite(suite_name)\n final_batch_kwargs = batch_kwargs or {"dataset": dataset}\n if "datasource" in final_batch_kwargs:\n context.log.warning(\n "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "\n "parameter of the op factory instead."\n )\n final_batch_kwargs["datasource"] = datasource_name\n batch = data_context.get_batch(final_batch_kwargs, suite)\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = data_context.run_validation_operator(\n validation_operator, 
assets_to_validate=[batch], run_id=run_id\n )\n res = convert_to_json_serializable(results.list_validation_results())[0]\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = (\n validation_results_page_renderer.render_validation_operator_result(results)\n )\n md_str = " ".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n yield ExpectationResult(\n success=res["success"],\n metadata={"Expectation Results": MetadataValue.md(md_str)},\n )\n yield Output(res)\n\n return _ge_validation_fn
\n\n\ndef ge_validation_op_factory_v3(\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n """Generates ops for interacting with GE (v3 API).\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n data_connector_name (str): the name of the data connector for this datasource. This should\n point to a RuntimeDataConnector. For information on how to set this up, see:\n https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/how_to_create_a_batch_of_data_from_an_in_memory_spark_or_pandas_dataframe\n data_asset_name (str): the name of the data asset that this op will be validating.\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n batch_identifier_fn (dict): A dicitonary of batch identifiers to uniquely identify this\n batch of data. To learn more about batch identifiers, see:\n https://docs.greatexpectations.io/docs/reference/datasources#batches.\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n runtime_method_type (str): how GE should interperet the op input. One of ("batch_data",\n "path", "query"). Defaults to "batch_data", which will interperet the input as an\n in-memory object.\n extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s\n `get_validator` method. 
If not set, input will be:\n {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": data_asset_name,\n "runtime_parameters": {\n "<runtime_method_type>": <op input>\n },\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n }\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata and\n an output with all the metadata (for user processing)\n\n """\n check.str_param(datasource_name, "datasource_name")\n check.str_param(data_connector_name, "data_connector_name")\n check.str_param(suite_name, "suite_name")\n\n _extra_kwargs: Dict[Any, Any] = check.opt_dict_param(extra_kwargs, "extra_kwargs")\n\n @op(\n name=name,\n ins={"dataset": In(input_dagster_type)},\n out=Out(\n dict,\n description="""\n This op yields an ExpectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n ),\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context: OpExecutionContext, dataset):\n data_context = context.resources.ge_data_context\n\n validator_kwargs = {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": datasource_name or data_asset_name,\n "runtime_parameters": {runtime_method_type: dataset},\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n **_extra_kwargs,\n }\n validator = data_context.get_validator(**validator_kwargs)\n\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = validator.validate(run_id=run_id)\n\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = 
validation_results_page_renderer.render(\n validation_results=results\n )\n md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n yield ExpectationResult(\n success=bool(results["success"]),\n metadata={"Expectation Results": MetadataValue.md(md_str)},\n )\n yield Output(results.to_json_dict())\n\n return _ge_validation_fn\n
", "current_page_name": "_modules/dagster_ge/factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_ge.factory"}}, "dagster_github": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_github.resources

\nimport time\nfrom datetime import datetime\nfrom typing import Optional\n\nimport jwt\nimport requests\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\n\ndef to_seconds(dt):\n    return (dt - datetime(1970, 1, 1)).total_seconds()\n\n\nclass GithubClient:\n    def __init__(\n        self, client, app_id, app_private_rsa_key, default_installation_id, hostname=None\n    ) -> None:\n        self.client = client\n        self.app_private_rsa_key = app_private_rsa_key\n        self.app_id = app_id\n        self.default_installation_id = default_installation_id\n        self.installation_tokens = {}\n        self.app_token = {}\n        self.hostname = hostname\n\n    def __set_app_token(self):\n        # from https://developer.github.com/apps/building-github-apps/authenticating-with-github-apps/\n        # needing to self-sign a JWT\n        now = int(time.time())\n        # JWT expiration time (10 minute maximum)\n        expires = now + (10 * 60)\n        encoded_token = jwt.encode(\n            {\n                # issued at time\n                "iat": now,\n                # JWT expiration time\n                "exp": expires,\n                # GitHub App's identifier\n                "iss": self.app_id,\n            },\n            self.app_private_rsa_key,\n            algorithm="RS256",\n        )\n        self.app_token = {\n            "value": encoded_token,\n            "expires": expires,\n        }\n\n    def __check_app_token(self):\n        if ("expires" not in self.app_token) or (\n            self.app_token["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_app_token()\n\n    def get_installations(self, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] 
= "application/vnd.github.machine-man-preview+json"\n        request = self.client.get(\n            (\n                "https://api.github.com/app/installations"\n                if self.hostname is None\n                else f"https://{self.hostname}/api/v3/app/installations"\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def __set_installation_token(self, installation_id, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n        request = requests.post(\n            (\n                f"https://api.github.com/app/installations/{installation_id}/access_tokens"\n                if self.hostname is None\n                else "https://{}/api/v3/app/installations/{}/access_tokens".format(\n                    self.hostname, installation_id\n                )\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        auth = request.json()\n        self.installation_tokens[installation_id] = {\n            "value": auth["token"],\n            "expires": to_seconds(datetime.strptime(auth["expires_at"], "%Y-%m-%dT%H:%M:%SZ")),\n        }\n\n    def __check_installation_tokens(self, installation_id):\n        if (installation_id not in self.installation_tokens) or (\n            self.installation_tokens[installation_id]["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_installation_token(installation_id)\n\n    def execute(self, query, variables, headers=None, installation_id=None):\n        if headers is None:\n            headers = {}\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        self.__check_installation_tokens(installation_id)\n        
headers["Authorization"] = "token {}".format(\n            self.installation_tokens[installation_id]["value"]\n        )\n        request = requests.post(\n            (\n                "https://api.github.com/graphql"\n                if self.hostname is None\n                else f"https://{self.hostname}/api/graphql"\n            ),\n            json={"query": query, "variables": variables},\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def create_issue(self, repo_name, repo_owner, title, body, installation_id=None):\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        res = self.execute(\n            query="""\n            query get_repo_id($repo_name: String!, $repo_owner: String!) {\n                repository(name: $repo_name, owner: $repo_owner) {\n                    id\n                }\n            }\n            """,\n            variables={"repo_name": repo_name, "repo_owner": repo_owner},\n            installation_id=installation_id,\n        )\n\n        return self.execute(\n            query="""\n                mutation CreateIssue($id: ID!, $title: String!, $body: String!) {\n                createIssue(input: {\n                    repositoryId: $id,\n                    title: $title,\n                    body: $body\n                }) {\n                    clientMutationId,\n                    issue {\n                        body\n                        title\n                        url\n                    }\n                }\n                }\n            """,\n            variables={\n                "id": res["data"]["repository"]["id"],\n                "title": title,\n                "body": body,\n            },\n            installation_id=installation_id,\n        )\n\n\n
[docs]class GithubResource(ConfigurableResource):\n github_app_id: int = Field(\n description="Github Application ID, for more info see https://developer.github.com/apps/",\n )\n github_app_private_rsa_key: str = Field(\n description=(\n "Github Application Private RSA key text, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n github_installation_id: Optional[int] = Field(\n default=None,\n description=(\n "Github Application Installation ID, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n github_hostname: Optional[str] = Field(\n default=None,\n description=(\n "Github hostname. Defaults to `api.github.com`, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> GithubClient:\n return GithubClient(\n client=requests.Session(),\n app_id=self.github_app_id,\n app_private_rsa_key=self.github_app_private_rsa_key,\n default_installation_id=self.github_installation_id,\n hostname=self.github_hostname,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=GithubResource.to_config_schema(),\n description="This resource is for connecting to Github",\n)\ndef github_resource(context) -> GithubClient:\n return GithubResource(**context.resource_config).get_client()
\n
", "current_page_name": "_modules/dagster_github/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_github.resources"}}, "dagster_graphql": {"client": {"client": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_graphql.client.client

\nfrom itertools import chain\nfrom typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Union\n\nimport dagster._check as check\nimport requests.exceptions\nfrom dagster import DagsterRunStatus\nfrom dagster._annotations import deprecated, public\nfrom dagster._core.definitions.run_config import RunConfig, convert_config_input\nfrom dagster._core.definitions.utils import validate_tags\nfrom gql import Client, gql\nfrom gql.transport import Transport\nfrom gql.transport.requests import RequestsHTTPTransport\n\nfrom .client_queries import (\n    CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY,\n    CLIENT_SUBMIT_PIPELINE_RUN_MUTATION,\n    GET_PIPELINE_RUN_STATUS_QUERY,\n    RELOAD_REPOSITORY_LOCATION_MUTATION,\n    SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n    TERMINATE_RUN_JOB_MUTATION,\n)\nfrom .utils import (\n    DagsterGraphQLClientError,\n    InvalidOutputErrorInfo,\n    JobInfo,\n    ReloadRepositoryLocationInfo,\n    ReloadRepositoryLocationStatus,\n    ShutdownRepositoryLocationInfo,\n    ShutdownRepositoryLocationStatus,\n)\n\n\n
[docs]class DagsterGraphQLClient:\n """Official Dagster Python Client for GraphQL.\n\n Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server\n\n As of now, all operations on this client are synchronous.\n\n Intended usage:\n\n .. code-block:: python\n\n client = DagsterGraphQLClient("localhost", port_number=3000)\n status = client.get_run_status(**SOME_RUN_ID**)\n\n Args:\n hostname (str): Hostname for the Dagster GraphQL API, like `localhost` or\n `dagster.YOUR_ORG_HERE`.\n port_number (Optional[int]): Port number to connect to on the host.\n Defaults to None.\n transport (Optional[Transport], optional): A custom transport to use to connect to the\n GraphQL API with (e.g. for custom auth). Defaults to None.\n use_https (bool, optional): Whether to use https in the URL connection string for the\n GraphQL API. Defaults to False.\n timeout (int): Number of seconds before requests should time out. Defaults to 60.\n headers (Optional[Dict[str, str]]): Additional headers to include in the request. 
To use\n this client in Dagster Cloud, set the "Dagster-Cloud-Api-Token" header to a user token\n generated in the Dagster Cloud UI.\n\n Raises:\n :py:class:`~requests.exceptions.ConnectionError`: if the client cannot connect to the host.\n """\n\n def __init__(\n self,\n hostname: str,\n port_number: Optional[int] = None,\n transport: Optional[Transport] = None,\n use_https: bool = False,\n timeout: int = 300,\n headers: Optional[Dict[str, str]] = None,\n ):\n self._hostname = check.str_param(hostname, "hostname")\n self._port_number = check.opt_int_param(port_number, "port_number")\n self._use_https = check.bool_param(use_https, "use_https")\n\n self._url = (\n ("https://" if self._use_https else "http://")\n + (f"{self._hostname}:{self._port_number}" if self._port_number else self._hostname)\n + "/graphql"\n )\n\n self._transport = check.opt_inst_param(\n transport,\n "transport",\n Transport,\n default=RequestsHTTPTransport(\n url=self._url, use_json=True, timeout=timeout, headers=headers\n ),\n )\n try:\n self._client = Client(transport=self._transport, fetch_schema_from_transport=True)\n except requests.exceptions.ConnectionError as exc:\n raise DagsterGraphQLClientError(\n f"Error when connecting to url {self._url}. 
"\n + f"Did you specify hostname: {self._hostname} "\n + (f"and port_number: {self._port_number} " if self._port_number else "")\n + "correctly?"\n ) from exc\n\n def _execute(self, query: str, variables: Optional[Dict[str, Any]] = None):\n try:\n return self._client.execute(gql(query), variable_values=variables)\n except Exception as exc: # catch generic Exception from the gql client\n raise DagsterGraphQLClientError(\n f"Exception occured during execution of query \\n{query}\\n with variables"\n f" \\n{variables}\\n"\n ) from exc\n\n def _get_repo_locations_and_names_with_pipeline(self, job_name: str) -> List[JobInfo]:\n res_data = self._execute(CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY)\n query_res = res_data["repositoriesOrError"]\n repo_connection_status = query_res["__typename"]\n if repo_connection_status == "RepositoryConnection":\n valid_nodes: Iterable[JobInfo] = chain(*map(JobInfo.from_node, query_res["nodes"]))\n return [info for info in valid_nodes if info.job_name == job_name]\n else:\n raise DagsterGraphQLClientError(repo_connection_status, query_res["message"])\n\n def _core_submit_execution(\n self,\n pipeline_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Union[RunConfig, Mapping[str, Any]]] = None,\n mode: str = "default",\n preset: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n op_selection: Optional[Sequence[str]] = None,\n is_using_job_op_graph_apis: Optional[bool] = False,\n ):\n check.opt_str_param(repository_location_name, "repository_location_name")\n check.opt_str_param(repository_name, "repository_name")\n check.str_param(pipeline_name, "pipeline_name")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n run_config = check.opt_mapping_param(convert_config_input(run_config), "run_config")\n\n # The following invariant will never fail when a job is executed\n check.invariant(\n (mode is not None and 
run_config is not None) or preset is not None,\n "Either a mode and run_config or a preset must be specified in order to "\n f"submit the pipeline {pipeline_name} for execution",\n )\n tags = validate_tags(tags)\n\n pipeline_or_job = "Job" if is_using_job_op_graph_apis else "Pipeline"\n\n if not repository_location_name or not repository_name:\n job_info_lst = self._get_repo_locations_and_names_with_pipeline(pipeline_name)\n if len(job_info_lst) == 0:\n raise DagsterGraphQLClientError(\n f"{pipeline_or_job}NotFoundError",\n f"No {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name"\n f" `{pipeline_name}` exist",\n )\n elif len(job_info_lst) == 1:\n job_info = job_info_lst[0]\n repository_location_name = job_info.repository_location_name\n repository_name = job_info.repository_name\n else:\n raise DagsterGraphQLClientError(\n "Must specify repository_location_name and repository_name since there are"\n f" multiple {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the"\n f" name {pipeline_name}.\\n\\tchoose one of: {job_info_lst}"\n )\n\n variables: Dict[str, Any] = {\n "executionParams": {\n "selector": {\n "repositoryLocationName": repository_location_name,\n "repositoryName": repository_name,\n "pipelineName": pipeline_name,\n "solidSelection": op_selection,\n }\n }\n }\n if preset is not None:\n variables["executionParams"]["preset"] = preset\n if mode is not None and run_config is not None:\n variables["executionParams"] = {\n **variables["executionParams"],\n "runConfigData": run_config,\n "mode": mode,\n "executionMetadata": (\n {"tags": [{"key": k, "value": v} for k, v in tags.items()]} if tags else {}\n ),\n }\n\n res_data: Dict[str, Any] = self._execute(CLIENT_SUBMIT_PIPELINE_RUN_MUTATION, variables)\n query_result = res_data["launchPipelineExecution"]\n query_result_type = query_result["__typename"]\n if (\n query_result_type == "LaunchRunSuccess"\n or query_result_type == "LaunchPipelineRunSuccess"\n ):\n return 
query_result["run"]["runId"]\n elif query_result_type == "InvalidStepError":\n raise DagsterGraphQLClientError(query_result_type, query_result["invalidStepKey"])\n elif query_result_type == "InvalidOutputError":\n error_info = InvalidOutputErrorInfo(\n step_key=query_result["stepKey"],\n invalid_output_name=query_result["invalidOutputName"],\n )\n raise DagsterGraphQLClientError(query_result_type, body=error_info)\n elif (\n query_result_type == "RunConfigValidationInvalid"\n or query_result_type == "PipelineConfigValidationInvalid"\n ):\n raise DagsterGraphQLClientError(query_result_type, query_result["errors"])\n else:\n # query_result_type is a ConflictingExecutionParamsError, a PresetNotFoundError\n # a PipelineNotFoundError, a RunConflict, or a PythonError\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])\n\n
[docs] @public\n def submit_job_execution(\n self,\n job_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Dict[str, Any]] = None,\n tags: Optional[Dict[str, Any]] = None,\n op_selection: Optional[Sequence[str]] = None,\n ) -> str:\n """Submits a job with attached configuration for execution.\n\n Args:\n job_name (str): The job's name\n repository_location_name (Optional[str]): The name of the repository location where\n the job is located. If omitted, the client will try to infer the repository location\n from the available options on the Dagster deployment. Defaults to None.\n repository_name (Optional[str]): The name of the repository where the job is located.\n If omitted, the client will try to infer the repository from the available options\n on the Dagster deployment. Defaults to None.\n run_config (Optional[Dict[str, Any]]): This is the run config to execute the job with.\n Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\n an arbitrary object for run config. However, it must conform to the constraints of the config\n schema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\n JobConfigValidationInvalid. 
Defaults to None.\n tags (Optional[Dict[str, Any]]): A set of tags to add to the job execution.\n\n Raises:\n DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the job has an invalid step\n DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.\n DagsterGraphQLClientError("RunConflict", message): a `DagsterRunConflict` occured during execution.\n This indicates that a conflicting job run already exists in run storage.\n DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format\n for the job\n DagsterGraphQLClientError("JobNotFoundError", message): the requested job does not exist\n DagsterGraphQLClientError("PythonError", message): an internal framework error occurred\n\n Returns:\n str: run id of the submitted pipeline run\n """\n return self._core_submit_execution(\n pipeline_name=job_name,\n repository_location_name=repository_location_name,\n repository_name=repository_name,\n run_config=run_config,\n mode="default",\n preset=None,\n tags=tags,\n op_selection=op_selection,\n is_using_job_op_graph_apis=True,\n )
\n\n
[docs] @public\n def get_run_status(self, run_id: str) -> DagsterRunStatus:\n """Get the status of a given Pipeline Run.\n\n Args:\n run_id (str): run id of the requested pipeline run.\n\n Raises:\n DagsterGraphQLClientError("PipelineNotFoundError", message): if the requested run id is not found\n DagsterGraphQLClientError("PythonError", message): on internal framework errors\n\n Returns:\n DagsterRunStatus: returns a status Enum describing the state of the requested pipeline run\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n GET_PIPELINE_RUN_STATUS_QUERY, {"runId": run_id}\n )\n query_result: Dict[str, Any] = res_data["pipelineRunOrError"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "PipelineRun" or query_result_type == "Run":\n return DagsterRunStatus(query_result["status"])\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n\n
[docs] @public\n def reload_repository_location(\n self, repository_location_name: str\n ) -> ReloadRepositoryLocationInfo:\n """Reloads a Dagster Repository Location, which reloads all repositories in that repository location.\n\n This is useful in a variety of contexts, including refreshing the Dagster UI without restarting\n the server.\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ReloadRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n RELOAD_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["reloadRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "WorkspaceLocationEntry":\n location_or_error_type = query_result["locationOrLoadError"]["__typename"]\n if location_or_error_type == "RepositoryLocation":\n return ReloadRepositoryLocationInfo(status=ReloadRepositoryLocationStatus.SUCCESS)\n else:\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type="PythonError",\n message=query_result["locationOrLoadError"]["message"],\n )\n else:\n # query_result_type is either ReloadNotSupported or RepositoryLocationNotFound\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type=query_result_type,\n message=query_result["message"],\n )
\n\n
[docs] @deprecated(breaking_version="2.0")\n @public\n def shutdown_repository_location(\n self, repository_location_name: str\n ) -> ShutdownRepositoryLocationInfo:\n """Shuts down the server that is serving metadata for the provided repository location.\n\n This is primarily useful when you want the server to be restarted by the compute environment\n in which it is running (for example, in Kubernetes, the pod in which the server is running\n will automatically restart when the server is shut down, and the repository metadata will\n be reloaded)\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ShutdownRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["shutdownRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "ShutdownRepositoryLocationSuccess":\n return ShutdownRepositoryLocationInfo(status=ShutdownRepositoryLocationStatus.SUCCESS)\n elif (\n query_result_type == "RepositoryLocationNotFound" or query_result_type == "PythonError"\n ):\n return ShutdownRepositoryLocationInfo(\n status=ShutdownRepositoryLocationStatus.FAILURE,\n message=query_result["message"],\n )\n else:\n raise Exception(f"Unexpected query result type {query_result_type}")
\n\n def terminate_run(self, run_id: str):\n """Terminates a pipeline run. This method it is useful when you would like to stop a pipeline run\n based on a external event.\n\n Args:\n run_id (str): The run id of the pipeline run to terminate\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n TERMINATE_RUN_JOB_MUTATION, {"runId": run_id}\n )\n\n query_result: Dict[str, Any] = res_data["terminateRun"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "TerminateRunSuccess":\n return\n\n elif query_result_type == "RunNotFoundError":\n raise DagsterGraphQLClientError("RunNotFoundError", f"Run Id {run_id} not found")\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n
", "current_page_name": "_modules/dagster_graphql/client/client", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_graphql.client.client"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_graphql.client.utils

\nfrom enum import Enum\nfrom typing import Any, Dict, List, NamedTuple, Optional\n\n\n
[docs]class DagsterGraphQLClientError(Exception):\n def __init__(self, *args, body=None):\n super().__init__(*args)\n self.body = body
\n\n\n
[docs]class ReloadRepositoryLocationStatus(Enum):\n """This enum describes the status of a GraphQL mutation to reload a Dagster repository location.\n\n Args:\n Enum (str): can be either `ReloadRepositoryLocationStatus.SUCCESS`\n or `ReloadRepositoryLocationStatus.FAILURE`.\n """\n\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"
\n\n\nclass ShutdownRepositoryLocationStatus(Enum):\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n\n\n
[docs]class ReloadRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of reloading\n a Dagster repository location with a GraphQL mutation.\n\n Args:\n status (ReloadRepositoryLocationStatus): The status of the reload repository location mutation\n failure_type: (Optional[str], optional): the failure type if `status == ReloadRepositoryLocationStatus.FAILURE`.\n Can be one of `ReloadNotSupported`, `RepositoryLocationNotFound`, or `RepositoryLocationLoadFailure`. Defaults to None.\n message (Optional[str], optional): the failure message/reason if\n `status == ReloadRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ReloadRepositoryLocationStatus\n failure_type: Optional[str] = None\n message: Optional[str] = None
\n\n\nclass ShutdownRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of shutting down the server for\n a Dagster repository location using a GraphQL mutation.\n\n Args:\n status (ShutdownRepositoryLocationStatus) Whether the shutdown succeeded or failed.\n message (Optional[str], optional): the failure message/reason if\n `status == ShutdownRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ShutdownRepositoryLocationStatus\n message: Optional[str] = None\n\n\nclass JobInfo(NamedTuple):\n repository_location_name: str\n repository_name: str\n job_name: str\n\n @staticmethod\n def from_node(node: Dict[str, Any]) -> List["JobInfo"]:\n repo_name = node["name"]\n repo_location_name = node["location"]["name"]\n return [\n JobInfo(\n repository_location_name=repo_location_name,\n repository_name=repo_name,\n job_name=job["name"],\n )\n for job in node["pipelines"]\n ]\n\n\n
[docs]class InvalidOutputErrorInfo(NamedTuple):\n """This class gives information about an InvalidOutputError from submitting a pipeline for execution\n from GraphQL.\n\n Args:\n step_key (str): key of the step that failed\n invalid_output_name (str): the name of the invalid output from the given step\n """\n\n step_key: str\n invalid_output_name: str
\n
", "current_page_name": "_modules/dagster_graphql/client/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_graphql.client.utils"}}}, "dagster_k8s": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.executor

\nfrom typing import Iterator, List, Optional, cast\n\nimport kubernetes.config\nfrom dagster import (\n    Field,\n    IntSource,\n    Noneable,\n    StringSource,\n    _check as check,\n    executor,\n)\nfrom dagster._core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.executor.init import InitExecutorContext\nfrom dagster._core.executor.step_delegating import (\n    CheckStepHealthResult,\n    StepDelegatingExecutor,\n    StepHandler,\n    StepHandlerContext,\n)\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_k8s.launcher import K8sRunLauncher\n\nfrom .client import DagsterKubernetesClient\nfrom .container_context import K8sContainerContext\nfrom .job import (\n    USER_DEFINED_K8S_CONFIG_SCHEMA,\n    DagsterK8sJobConfig,\n    UserDefinedDagsterK8sConfig,\n    construct_dagster_k8s_job,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\n\n_K8S_EXECUTOR_CONFIG_SCHEMA = merge_dicts(\n    DagsterK8sJobConfig.config_type_job(),\n    {\n        "load_incluster_config": Field(\n            bool,\n            is_required=False,\n            description="""Whether or not the executor is running within a k8s cluster already. If\n            the job is using the `K8sRunLauncher`, the default value of this parameter will be\n            the same as the corresponding value on the run launcher.\n            If ``True``, we assume the executor is running within the target cluster and load config\n            using ``kubernetes.config.load_incluster_config``. 
Otherwise, we will use the k8s config\n            specified in ``kubeconfig_file`` (using ``kubernetes.config.load_kube_config``) or fall\n            back to the default kubeconfig.""",\n        ),\n        "kubeconfig_file": Field(\n            Noneable(str),\n            is_required=False,\n            description="""Path to a kubeconfig file to use, if not using default kubeconfig. If\n            the job is using the `K8sRunLauncher`, the default value of this parameter will be\n            the same as the corresponding value on the run launcher.""",\n        ),\n        "job_namespace": Field(StringSource, is_required=False),\n        "retries": get_retries_config(),\n        "max_concurrent": Field(\n            IntSource,\n            is_required=False,\n            description=(\n                "Limit on the number of pods that will run concurrently within the scope "\n                "of a Dagster run. Note that this limit is per run, not global."\n            ),\n        ),\n        "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n        "step_k8s_config": Field(\n            USER_DEFINED_K8S_CONFIG_SCHEMA,\n            is_required=False,\n            description="Raw Kubernetes configuration for each step launched by the executor.",\n        ),\n    },\n)\n\n\n
[docs]@executor(\n name="k8s",\n config_schema=_K8S_EXECUTOR_CONFIG_SCHEMA,\n requirements=multiple_process_executor_requirements(),\n)\ndef k8s_job_executor(init_context: InitExecutorContext) -> Executor:\n """Executor which launches steps as Kubernetes Jobs.\n\n To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_namespace: 'some-namespace'\n image_pull_policy: ...\n image_pull_secrets: ...\n service_account_name: ...\n env_config_maps: ...\n env_secrets: ...\n env_vars: ...\n job_image: ... # leave out if using userDeployments\n max_concurrent: ...\n\n `max_concurrent` limits the number of pods that will execute concurrently for one run. By default\n there is no limit- it will maximally parallel as allowed by the DAG. Note that this is not a\n global limit.\n\n Configuration set on the Kubernetes Jobs and Pods created by the `K8sRunLauncher` will also be\n set on Kubernetes Jobs and Pods created by the `k8s_job_executor`.\n\n Configuration set using `tags` on a `@job` will only apply to the `run` level. 
For configuration\n to apply at each `step` it must be set using `tags` for each `@op`.\n """\n run_launcher = (\n init_context.instance.run_launcher\n if isinstance(init_context.instance.run_launcher, K8sRunLauncher)\n else None\n )\n\n exc_cfg = init_context.executor_config\n\n k8s_container_context = K8sContainerContext(\n image_pull_policy=exc_cfg.get("image_pull_policy"), # type: ignore\n image_pull_secrets=exc_cfg.get("image_pull_secrets"), # type: ignore\n service_account_name=exc_cfg.get("service_account_name"), # type: ignore\n env_config_maps=exc_cfg.get("env_config_maps"), # type: ignore\n env_secrets=exc_cfg.get("env_secrets"), # type: ignore\n env_vars=exc_cfg.get("env_vars"), # type: ignore\n volume_mounts=exc_cfg.get("volume_mounts"), # type: ignore\n volumes=exc_cfg.get("volumes"), # type: ignore\n labels=exc_cfg.get("labels"), # type: ignore\n namespace=exc_cfg.get("job_namespace"), # type: ignore\n resources=exc_cfg.get("resources"), # type: ignore\n scheduler_name=exc_cfg.get("scheduler_name"), # type: ignore\n # step_k8s_config feeds into the run_k8s_config field because it is merged\n # with any configuration for the run that was set on the run launcher or code location\n run_k8s_config=UserDefinedDagsterK8sConfig.from_dict(exc_cfg.get("step_k8s_config", {})),\n )\n\n if "load_incluster_config" in exc_cfg:\n load_incluster_config = cast(bool, exc_cfg["load_incluster_config"])\n else:\n load_incluster_config = run_launcher.load_incluster_config if run_launcher else True\n\n if "kubeconfig_file" in exc_cfg:\n kubeconfig_file = cast(Optional[str], exc_cfg["kubeconfig_file"])\n else:\n kubeconfig_file = run_launcher.kubeconfig_file if run_launcher else None\n\n return StepDelegatingExecutor(\n K8sStepHandler(\n image=exc_cfg.get("job_image"), # type: ignore\n container_context=k8s_container_context,\n load_incluster_config=load_incluster_config,\n kubeconfig_file=kubeconfig_file,\n ),\n retries=RetryMode.from_config(exc_cfg["retries"]), # type: 
ignore\n max_concurrent=check.opt_int_elem(exc_cfg, "max_concurrent"),\n tag_concurrency_limits=check.opt_list_elem(exc_cfg, "tag_concurrency_limits"),\n should_verify_step=True,\n )
\n\n\nclass K8sStepHandler(StepHandler):\n @property\n def name(self):\n return "K8sStepHandler"\n\n def __init__(\n self,\n image: Optional[str],\n container_context: K8sContainerContext,\n load_incluster_config: bool,\n kubeconfig_file: Optional[str],\n k8s_client_batch_api=None,\n ):\n super().__init__()\n\n self._executor_image = check.opt_str_param(image, "image")\n self._executor_container_context = check.inst_param(\n container_context, "container_context", K8sContainerContext\n )\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n def _get_step_key(self, step_handler_context: StepHandlerContext) -> str:\n step_keys_to_execute = cast(\n List[str], step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n return step_keys_to_execute[0]\n\n def _get_container_context(\n self, step_handler_context: StepHandlerContext\n ) -> K8sContainerContext:\n step_key = self._get_step_key(step_handler_context)\n\n context = K8sContainerContext.create_for_run(\n step_handler_context.dagster_run,\n cast(K8sRunLauncher, step_handler_context.instance.run_launcher),\n include_run_tags=False, # For now don't include job-level dagster-k8s/config tags in step pods\n )\n context = context.merge(self._executor_container_context)\n\n user_defined_k8s_config = get_user_defined_k8s_config(\n step_handler_context.step_tags[step_key]\n )\n return context.merge(K8sContainerContext(run_k8s_config=user_defined_k8s_config))\n\n def _get_k8s_step_job_name(self, step_handler_context: StepHandlerContext):\n step_key = 
self._get_step_key(step_handler_context)\n\n name_key = get_k8s_job_name(\n step_handler_context.execute_step_args.run_id,\n step_key,\n )\n\n if step_handler_context.execute_step_args.known_state:\n retry_state = step_handler_context.execute_step_args.known_state.get_retry_state()\n if retry_state.get_attempt_count(step_key):\n return "dagster-step-%s-%d" % (name_key, retry_state.get_attempt_count(step_key))\n\n return "dagster-step-%s" % (name_key)\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n pod_name = job_name\n\n container_context = self._get_container_context(step_handler_context)\n\n job_config = container_context.get_k8s_job_config(\n self._executor_image, step_handler_context.instance.run_launcher\n )\n\n args = step_handler_context.execute_step_args.get_command_args(\n skip_serialized_namedtuple=True\n )\n\n if not job_config.job_image:\n job_config = job_config.with_image(\n step_handler_context.execute_step_args.job_origin.repository_origin.container_image\n )\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the job")\n\n run = step_handler_context.dagster_run\n labels = {\n "dagster/job": run.job_name,\n "dagster/op": step_key,\n "dagster/run-id": step_handler_context.execute_step_args.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="step_worker",\n user_defined_k8s_config=container_context.run_k8s_config,\n labels=labels,\n env_vars=[\n *step_handler_context.execute_step_args.get_command_env(),\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": run.job_name,\n },\n {"name": 
"DAGSTER_RUN_STEP_KEY", "value": step_key},\n *container_context.env,\n ],\n )\n\n yield DagsterEvent.step_worker_starting(\n step_handler_context.get_step_context(step_key),\n message=f'Executing step "{step_key}" in Kubernetes job {job_name}.',\n metadata={\n "Kubernetes Job name": MetadataValue.text(job_name),\n },\n )\n\n namespace = check.not_none(container_context.namespace)\n self._api_client.create_namespaced_job_with_retries(body=job, namespace=namespace)\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> CheckStepHealthResult:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n\n container_context = self._get_container_context(step_handler_context)\n\n status = self._api_client.get_job_status(\n namespace=container_context.namespace,\n job_name=job_name,\n )\n if status.failed:\n return CheckStepHealthResult.unhealthy(\n reason=f"Discovered failed Kubernetes job {job_name} for step {step_key}.",\n )\n\n return CheckStepHealthResult.healthy()\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n container_context = self._get_container_context(step_handler_context)\n\n yield DagsterEvent.engine_event(\n step_handler_context.get_step_context(step_key),\n message=f"Deleting Kubernetes job {job_name} for step",\n event_specific_data=EngineEventData(),\n )\n\n self._api_client.delete_job(job_name=job_name, namespace=container_context.namespace)\n
", "current_page_name": "_modules/dagster_k8s/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.executor"}, "launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.launcher

\nimport logging\nimport sys\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport kubernetes\nfrom dagster import (\n    _check as check,\n)\nfrom dagster._cli.api import ExecuteRunArgs\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.launcher import LaunchRunContext, ResumeRunContext, RunLauncher\nfrom dagster._core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._grpc.types import ResumeRunArgs\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .client import DagsterKubernetesClient\nfrom .container_context import K8sContainerContext\nfrom .job import DagsterK8sJobConfig, construct_dagster_k8s_job, get_job_name_from_run_id\n\n\n
[docs]class K8sRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a Kubernetes Job for each Dagster job run.\n\n Encapsulates each run in a separate, isolated invocation of ``dagster-graphql``.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n service_account_name: your_service_account\n job_image: my_project/dagster_image:latest\n instance_config_map: dagster-instance\n postgres_password_secret: dagster-postgresql-secret\n\n """\n\n def __init__(\n self,\n service_account_name,\n instance_config_map,\n postgres_password_secret=None,\n dagster_home=None,\n job_image=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n load_incluster_config=True,\n kubeconfig_file=None,\n inst_data: Optional[ConfigurableClassData] = None,\n job_namespace="default",\n env_config_maps=None,\n env_secrets=None,\n env_vars=None,\n k8s_client_batch_api=None,\n volume_mounts=None,\n volumes=None,\n labels=None,\n fail_pod_on_run_failure=None,\n resources=None,\n scheduler_name=None,\n security_context=None,\n run_k8s_config=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.job_namespace = check.str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = load_incluster_config\n self.kubeconfig_file = kubeconfig_file\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n self._job_config = None\n self._job_image = 
check.opt_str_param(job_image, "job_image")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._service_account_name = check.str_param(service_account_name, "service_account_name")\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.postgres_password_secret = check.opt_str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n self._env_vars = check.opt_list_param(env_vars, "env_vars", of_type=str)\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n self._labels: Mapping[str, str] = check.opt_mapping_param(\n labels, "labels", key_type=str, value_type=str\n )\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self._resources: Mapping[str, Any] = check.opt_mapping_param(resources, "resources")\n self._scheduler_name = check.opt_str_param(scheduler_name, "scheduler_name")\n self._security_context = check.opt_dict_param(security_context, "security_context")\n self._run_k8s_config = check.opt_dict_param(run_k8s_config, "run_k8s_config")\n super().__init__()\n\n @property\n def job_image(self):\n return self._job_image\n\n @property\n def image_pull_policy(self) -> str:\n return self._image_pull_policy\n\n @property\n def image_pull_secrets(self) -> Sequence[Mapping]:\n return self._image_pull_secrets\n\n @property\n def service_account_name(self) -> str:\n return self._service_account_name\n\n @property\n def env_config_maps(self) -> 
Sequence[str]:\n return self._env_config_maps\n\n @property\n def env_secrets(self) -> Sequence[str]:\n return self._env_secrets\n\n @property\n def volume_mounts(self) -> Sequence:\n return self._volume_mounts\n\n @property\n def volumes(self) -> Sequence:\n return self._volumes\n\n @property\n def resources(self) -> Mapping:\n return self._resources\n\n @property\n def scheduler_name(self) -> Optional[str]:\n return self._scheduler_name\n\n @property\n def security_context(self) -> Mapping[str, Any]:\n return self._security_context\n\n @property\n def env_vars(self) -> Sequence[str]:\n return self._env_vars\n\n @property\n def labels(self) -> Mapping[str, str]:\n return self._labels\n\n @property\n def run_k8s_config(self) -> Mapping[str, str]:\n return self._run_k8s_config\n\n @property\n def fail_pod_on_run_failure(self) -> Optional[bool]:\n return self._fail_pod_on_run_failure\n\n @classmethod\n def config_type(cls):\n """Include all arguments required for DagsterK8sJobConfig along with additional arguments\n needed for the RunLauncher itself.\n """\n return DagsterK8sJobConfig.config_type_run_launcher()\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n def get_container_context_for_run(self, dagster_run: DagsterRun) -> K8sContainerContext:\n return K8sContainerContext.create_for_run(dagster_run, self, include_run_tags=True)\n\n def _launch_k8s_job_with_args(\n self, job_name: str, args: Optional[Sequence[str]], run: DagsterRun\n ) -> None:\n container_context = self.get_container_context_for_run(run)\n\n pod_name = job_name\n\n job_origin = check.not_none(run.job_code_origin)\n user_defined_k8s_config = container_context.run_k8s_config\n repository_origin = job_origin.repository_origin\n\n job_config = container_context.get_k8s_job_config(\n job_image=repository_origin.container_image, 
run_launcher=self\n )\n job_image = job_config.job_image\n if job_image: # expected to be set\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_image},\n )\n\n labels = {\n "dagster/job": job_origin.job_name,\n "dagster/run-id": run.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n env_vars=[\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": job_origin.job_name,\n },\n *container_context.env,\n ],\n )\n\n namespace = check.not_none(container_context.namespace)\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n self._api_client.create_namespaced_job_with_retries(body=job, namespace=namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n cls=self.__class__,\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n job_name = get_job_name_from_run_id(run.run_id)\n job_origin = check.not_none(run.job_code_origin)\n\n args = ExecuteRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.dagster_run\n job_name = get_job_name_from_run_id(\n run.run_id, resume_attempt_number=context.resume_attempt_number\n )\n job_origin = 
check.not_none(run.job_code_origin)\n\n args = ResumeRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n container_context = self.get_container_context_for_run(run)\n\n job_name = get_job_name_from_run_id(\n run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)\n )\n\n try:\n termination_result = self._api_client.delete_job(\n job_name=job_name, namespace=container_context.namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Run was terminated successfully.",\n dagster_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n dagster_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; encountered error in delete_job",\n dagster_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n @property\n def supports_run_worker_crash_recovery(self):\n return True\n\n def get_run_worker_debug_info(self, run: DagsterRun) -> Optional[str]:\n container_context = self.get_container_context_for_run(run)\n if self.supports_run_worker_crash_recovery:\n resume_attempt_number = self._instance.count_resume_run_attempts(run.run_id)\n else:\n resume_attempt_number = None\n\n job_name = get_job_name_from_run_id(run.run_id, 
resume_attempt_number=resume_attempt_number)\n namespace = container_context.namespace\n user_defined_k8s_config = container_context.run_k8s_config\n container_name = user_defined_k8s_config.container_config.get("name", "dagster")\n pod_names = self._api_client.get_pod_names_in_job(job_name, namespace=namespace)\n full_msg = ""\n try:\n pod_debug_info = [\n self._api_client.get_pod_debug_info(\n pod_name, namespace, container_name=container_name\n )\n for pod_name in pod_names\n ]\n full_msg = "\\n".join(pod_debug_info)\n except Exception:\n logging.exception(\n f"Error trying to get debug information for failed k8s job {job_name}"\n )\n if pod_names:\n full_msg = (\n full_msg\n + "\\nFor more information about the failure, try running `kubectl describe pod"\n f" {pod_names[0]}`, `kubectl logs {pod_names[0]}`, or `kubectl describe job"\n f" {job_name}` in your cluster."\n )\n\n else:\n full_msg = (\n full_msg\n + "\\nFor more information about the failure, try running `kubectl describe job"\n f" {job_name}` in your cluster."\n )\n\n return full_msg\n\n def check_run_worker_health(self, run: DagsterRun):\n container_context = self.get_container_context_for_run(run)\n\n if self.supports_run_worker_crash_recovery:\n resume_attempt_number = self._instance.count_resume_run_attempts(run.run_id)\n else:\n resume_attempt_number = None\n\n job_name = get_job_name_from_run_id(run.run_id, resume_attempt_number=resume_attempt_number)\n try:\n status = self._api_client.get_job_status(\n namespace=container_context.namespace,\n job_name=job_name,\n )\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n\n inactive_job_with_finished_pods = bool(\n (not status.active) and (status.failed or status.succeeded)\n )\n\n # If the run is in a non-terminal (and non-STARTING) state but the k8s job is not active,\n # something went wrong\n if (\n run.status in (DagsterRunStatus.STARTED, 
DagsterRunStatus.CANCELING)\n and inactive_job_with_finished_pods\n ):\n return CheckRunHealthResult(\n WorkerStatus.FAILED, "Run has not completed but K8s job has no active pods"\n )\n\n if status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n if status.succeeded:\n return CheckRunHealthResult(WorkerStatus.SUCCESS)\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n
", "current_page_name": "_modules/dagster_k8s/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.launcher"}, "ops": {"k8s_job_op": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.ops.k8s_job_op

\nimport time\nfrom typing import Any, Dict, List, Optional\n\nimport kubernetes.config\nimport kubernetes.watch\nfrom dagster import (\n    Enum as DagsterEnum,\n    Field,\n    In,\n    Noneable,\n    Nothing,\n    OpExecutionContext,\n    Permissive,\n    StringSource,\n    op,\n)\nfrom dagster._annotations import experimental\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..client import DEFAULT_JOB_POD_COUNT, DagsterKubernetesClient\nfrom ..container_context import K8sContainerContext\nfrom ..job import (\n    DagsterK8sJobConfig,\n    K8sConfigMergeBehavior,\n    UserDefinedDagsterK8sConfig,\n    construct_dagster_k8s_job,\n    get_k8s_job_name,\n)\nfrom ..launcher import K8sRunLauncher\n\nK8S_JOB_OP_CONFIG = merge_dicts(\n    DagsterK8sJobConfig.config_type_container(),\n    {\n        "image": Field(\n            StringSource,\n            is_required=True,\n            description="The image in which to launch the k8s job.",\n        ),\n        "command": Field(\n            [str],\n            is_required=False,\n            description="The command to run in the container within the launched k8s job.",\n        ),\n        "args": Field(\n            [str],\n            is_required=False,\n            description="The args for the command for the container.",\n        ),\n        "namespace": Field(StringSource, is_required=False),\n        "load_incluster_config": Field(\n            bool,\n            is_required=False,\n            default_value=True,\n            description="""Set this value if you are running the launcher\n            within a k8s cluster. If ``True``, we assume the launcher is running within the target\n            cluster and load config using ``kubernetes.config.load_incluster_config``. 
Otherwise,\n            we will use the k8s config specified in ``kubeconfig_file`` (using\n            ``kubernetes.config.load_kube_config``) or fall back to the default kubeconfig.""",\n        ),\n        "kubeconfig_file": Field(\n            Noneable(str),\n            is_required=False,\n            default_value=None,\n            description=(\n                "The kubeconfig file from which to load config. Defaults to using the default"\n                " kubeconfig."\n            ),\n        ),\n        "timeout": Field(\n            int,\n            is_required=False,\n            description="How long to wait for the job to succeed before raising an exception",\n        ),\n        "container_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's main container"\n                " (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "pod_template_spec_metadata": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's metadata"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "pod_spec_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's pod spec"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "job_metadata": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                
"Raw k8s config for the k8s job's metadata"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "job_spec_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s job's job spec"\n                " (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "merge_behavior": Field(\n            DagsterEnum.from_python_enum(K8sConfigMergeBehavior),\n            is_required=False,\n            default_value=K8sConfigMergeBehavior.SHALLOW.value,\n            description=(\n                "How raw k8s config set on this op should be merged with any raw k8s config set on"\n                " the code location that launched the op. By default, the value is SHALLOW, meaning"\n                " that the two dictionaries are shallowly merged - any shared values in the "\n                " dictionaries will be replaced by the values set on this op. Setting it to DEEP"\n                " will recursively merge the two dictionaries, appending list fields together and"\n                " merging dictionary fields."\n            ),\n        ),\n    },\n)\n\n\n
[docs]@experimental\ndef execute_k8s_job(\n context: OpExecutionContext,\n image: str,\n command: Optional[List[str]] = None,\n args: Optional[List[str]] = None,\n namespace: Optional[str] = None,\n image_pull_policy: Optional[str] = None,\n image_pull_secrets: Optional[List[Dict[str, str]]] = None,\n service_account_name: Optional[str] = None,\n env_config_maps: Optional[List[str]] = None,\n env_secrets: Optional[List[str]] = None,\n env_vars: Optional[List[str]] = None,\n volume_mounts: Optional[List[Dict[str, Any]]] = None,\n volumes: Optional[List[Dict[str, Any]]] = None,\n labels: Optional[Dict[str, str]] = None,\n resources: Optional[Dict[str, Any]] = None,\n scheduler_name: Optional[str] = None,\n load_incluster_config: bool = True,\n kubeconfig_file: Optional[str] = None,\n timeout: Optional[int] = None,\n container_config: Optional[Dict[str, Any]] = None,\n pod_template_spec_metadata: Optional[Dict[str, Any]] = None,\n pod_spec_config: Optional[Dict[str, Any]] = None,\n job_metadata: Optional[Dict[str, Any]] = None,\n job_spec_config: Optional[Dict[str, Any]] = None,\n k8s_job_name: Optional[str] = None,\n merge_behavior: K8sConfigMergeBehavior = K8sConfigMergeBehavior.SHALLOW,\n):\n """This function is a utility for executing a Kubernetes job from within a Dagster op.\n\n Args:\n image (str): The image in which to launch the k8s job.\n command (Optional[List[str]]): The command to run in the container within the launched\n k8s job. Default: None.\n args (Optional[List[str]]): The args for the command for the container. Default: None.\n namespace (Optional[str]): Override the kubernetes namespace in which to run the k8s job.\n Default: None.\n image_pull_policy (Optional[str]): Allows the image pull policy to be overridden, e.g. to\n facilitate local testing with `kind <https://kind.sigs.k8s.io/>`_. Default:\n ``"Always"``. 
See:\n https://kubernetes.io/docs/concepts/containers/images/#updating-images.\n image_pull_secrets (Optional[List[Dict[str, str]]]): Optionally, a list of dicts, each of\n which corresponds to a Kubernetes ``LocalObjectReference`` (e.g.,\n ``{'name': 'myRegistryName'}``). This allows you to specify the ```imagePullSecrets`` on\n a pod basis. Typically, these will be provided through the service account, when needed,\n and you will not need to pass this argument. See:\n https://kubernetes.io/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod\n and https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#podspec-v1-core\n service_account_name (Optional[str]): The name of the Kubernetes service account under which\n to run the Job. Defaults to "default" env_config_maps (Optional[List[str]]): A list of custom ConfigMapEnvSource names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container\n env_secrets (Optional[List[str]]): A list of custom Secret names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n env_vars (Optional[List[str]]): A list of environment variables to inject into the Job.\n Default: ``[]``. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n volume_mounts (Optional[List[Permissive]]): A list of volume mounts to include in the job's\n container. Default: ``[]``. 
See:\n https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core\n volumes (Optional[List[Permissive]]): A list of volumes to include in the Job's Pod. Default: ``[]``. See:\n https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core\n labels (Optional[Dict[str, str]]): Additional labels that should be included in the Job's Pod. See:\n https://kubernetes.io/docs/concepts/overview/working-with-objects/labels\n resources (Optional[Dict[str, Any]]) Compute resource requirements for the container. See:\n https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/\n scheduler_name (Optional[str]): Use a custom Kubernetes scheduler for launched Pods. See:\n https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/\n load_incluster_config (bool): Whether the op is running within a k8s cluster. If ``True``,\n we assume the launcher is running within the target cluster and load config using\n ``kubernetes.config.load_incluster_config``. Otherwise, we will use the k8s config\n specified in ``kubeconfig_file`` (using ``kubernetes.config.load_kube_config``) or fall\n back to the default kubeconfig. Default: True,\n kubeconfig_file (Optional[str]): The kubeconfig file from which to load config. Defaults to\n using the default kubeconfig. Default: None.\n timeout (Optional[int]): Raise an exception if the op takes longer than this timeout in\n seconds to execute. Default: None.\n container_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's main container\n (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core).\n Keys can either snake_case or camelCase.Default: None.\n pod_template_spec_metadata (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's\n metadata (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta).\n Keys can either snake_case or camelCase. 
Default: None.\n pod_spec_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's pod spec\n (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec).\n Keys can either snake_case or camelCase. Default: None.\n job_metadata (Optional[Dict[str, Any]]): Raw k8s config for the k8s job's metadata\n (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta).\n Keys can either snake_case or camelCase. Default: None.\n job_spec_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s job's job spec\n (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch).\n Keys can either snake_case or camelCase.Default: None.\n k8s_job_name (Optional[str]): Overrides the name of the the k8s job. If not set, will be set\n to a unique name based on the current run ID and the name of the calling op. If set,\n make sure that the passed in name is a valid Kubernetes job name that does not\n already exist in the cluster.\n merge_behavior (Optional[K8sConfigMergeBehavior]): How raw k8s config set on this op should\n be merged with any raw k8s config set on the code location that launched the op. By\n default, the value is K8sConfigMergeBehavior.SHALLOW, meaning that the two dictionaries\n are shallowly merged - any shared values in the dictionaries will be replaced by the\n values set on this op. 
Setting it to DEEP will recursively merge the two dictionaries,\n appending list fields together andmerging dictionary fields.\n """\n run_container_context = K8sContainerContext.create_for_run(\n context.dagster_run,\n (\n context.instance.run_launcher\n if isinstance(context.instance.run_launcher, K8sRunLauncher)\n else None\n ),\n include_run_tags=False,\n )\n\n container_config = container_config.copy() if container_config else {}\n if command:\n container_config["command"] = command\n\n op_container_context = K8sContainerContext(\n image_pull_policy=image_pull_policy,\n image_pull_secrets=image_pull_secrets,\n service_account_name=service_account_name,\n env_config_maps=env_config_maps,\n env_secrets=env_secrets,\n env_vars=env_vars,\n volume_mounts=volume_mounts,\n volumes=volumes,\n labels=labels,\n namespace=namespace,\n resources=resources,\n scheduler_name=scheduler_name,\n run_k8s_config=UserDefinedDagsterK8sConfig.from_dict(\n {\n "container_config": container_config,\n "pod_template_spec_metadata": pod_template_spec_metadata,\n "pod_spec_config": pod_spec_config,\n "job_metadata": job_metadata,\n "job_spec_config": job_spec_config,\n "merge_behavior": merge_behavior.value,\n }\n ),\n )\n\n container_context = run_container_context.merge(op_container_context)\n\n namespace = container_context.namespace\n\n user_defined_k8s_config = container_context.run_k8s_config\n\n k8s_job_config = DagsterK8sJobConfig(\n job_image=image,\n dagster_home=None,\n image_pull_policy=container_context.image_pull_policy,\n image_pull_secrets=container_context.image_pull_secrets,\n service_account_name=container_context.service_account_name,\n instance_config_map=None,\n postgres_password_secret=None,\n env_config_maps=container_context.env_config_maps,\n env_secrets=container_context.env_secrets,\n env_vars=container_context.env_vars,\n volume_mounts=container_context.volume_mounts,\n volumes=container_context.volumes,\n labels=container_context.labels,\n 
resources=container_context.resources,\n )\n\n job_name = k8s_job_name or get_k8s_job_name(\n context.run_id, context.get_step_execution_context().step.key\n )\n\n retry_number = context.retry_number\n if retry_number > 0:\n job_name = f"{job_name}-{retry_number}"\n\n labels = {\n "dagster/job": context.dagster_run.job_name,\n "dagster/op": context.op.name,\n "dagster/run-id": context.dagster_run.run_id,\n }\n if context.dagster_run.external_job_origin:\n labels["dagster/code-location"] = (\n context.dagster_run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config=k8s_job_config,\n args=args,\n job_name=job_name,\n pod_name=job_name,\n component="k8s_job_op",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n )\n\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n # changing this to be able to be passed in will allow for unit testing\n api_client = DagsterKubernetesClient.production_client()\n\n context.log.info(f"Creating Kubernetes job {job_name} in namespace {namespace}...")\n\n start_time = time.time()\n\n api_client.batch_api.create_namespaced_job(namespace, job)\n\n context.log.info("Waiting for Kubernetes job to finish...")\n\n timeout = timeout or 0\n\n api_client.wait_for_job(\n job_name=job_name,\n namespace=namespace,\n wait_timeout=timeout,\n start_time=start_time,\n )\n\n restart_policy = user_defined_k8s_config.pod_spec_config.get("restart_policy", "Never")\n\n if restart_policy == "Never":\n container_name = container_config.get("name", "dagster")\n\n pods = api_client.wait_for_job_to_have_pods(\n job_name,\n namespace,\n wait_timeout=timeout,\n start_time=start_time,\n )\n\n pod_names = [p.metadata.name for p in pods]\n\n if not pod_names:\n raise Exception("No pod names in job after it started")\n\n pod_to_watch = pod_names[0]\n watch = kubernetes.watch.Watch() # 
consider moving in to api_client\n\n api_client.wait_for_pod(\n pod_to_watch, namespace, wait_timeout=timeout, start_time=start_time\n )\n\n log_stream = watch.stream(\n api_client.core_api.read_namespaced_pod_log,\n name=pod_to_watch,\n namespace=namespace,\n container=container_name,\n )\n\n while True:\n if timeout and time.time() - start_time > timeout:\n watch.stop()\n raise Exception("Timed out waiting for pod to finish")\n\n try:\n log_entry = next(log_stream)\n print(log_entry) # noqa: T201\n except StopIteration:\n break\n else:\n context.log.info("Pod logs are disabled, because restart_policy is not Never")\n\n if job_spec_config and job_spec_config.get("parallelism"):\n num_pods_to_wait_for = job_spec_config["parallelism"]\n else:\n num_pods_to_wait_for = DEFAULT_JOB_POD_COUNT\n api_client.wait_for_running_job_to_succeed(\n job_name=job_name,\n namespace=namespace,\n wait_timeout=timeout,\n start_time=start_time,\n num_pods_to_wait_for=num_pods_to_wait_for,\n )
\n\n\n
[docs]@op(ins={"start_after": In(Nothing)}, config_schema=K8S_JOB_OP_CONFIG)\n@experimental\ndef k8s_job_op(context):\n """An op that runs a Kubernetes job using the k8s API.\n\n Contrast with the `k8s_job_executor`, which runs each Dagster op in a Dagster job in its\n own k8s job.\n\n This op may be useful when:\n - You need to orchestrate a command that isn't a Dagster op (or isn't written in Python)\n - You want to run the rest of a Dagster job using a specific executor, and only a single\n op in k8s.\n\n For example:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_k8s_job_op.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n You can create your own op with the same implementation by calling the `execute_k8s_job` function\n inside your own op.\n\n The service account that is used to run this job should have the following RBAC permissions:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/kubernetes/k8s_job_op_rbac.yaml\n :language: YAML\n """\n if "merge_behavior" in context.op_config:\n merge_behavior = K8sConfigMergeBehavior(context.op_config.pop("merge_behavior"))\n else:\n merge_behavior = K8sConfigMergeBehavior.SHALLOW\n\n execute_k8s_job(context, merge_behavior=merge_behavior, **context.op_config)
\n
", "current_page_name": "_modules/dagster_k8s/ops/k8s_job_op", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.ops.k8s_job_op"}}, "pipes": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.pipes

\nimport random\nimport string\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Mapping, Optional, Sequence, Union\n\nimport kubernetes\nfrom dagster import (\n    OpExecutionContext,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.resource_annotation import ResourceParam\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.pipes.client import (\n    PipesClient,\n    PipesClientCompletedInvocation,\n    PipesContextInjector,\n    PipesMessageReader,\n    PipesParams,\n)\nfrom dagster._core.pipes.context import (\n    PipesMessageHandler,\n)\nfrom dagster._core.pipes.utils import (\n    PipesEnvContextInjector,\n    extract_message_or_forward_to_stdout,\n    open_pipes_session,\n)\nfrom dagster_pipes import (\n    PipesDefaultMessageWriter,\n    PipesExtras,\n)\n\nfrom dagster_k8s.utils import get_common_labels\n\nfrom .client import DagsterKubernetesClient, WaitForPodState\nfrom .models import k8s_model_from_dict, k8s_snake_case_dict\n\n\ndef get_pod_name(run_id: str, op_name: str):\n    clean_op_name = op_name.replace("_", "-")\n    suffix = "".join(random.choice(string.digits) for i in range(10))\n    return f"dagster-{run_id[:18]}-{clean_op_name[:20]}-{suffix}"\n\n\nDEFAULT_CONTAINER_NAME = "dagster-pipes-execution"\n\n\n
[docs]@experimental\nclass PipesK8sPodLogsMessageReader(PipesMessageReader):\n """Message reader that reads messages from kubernetes pod logs."""\n\n @contextmanager\n def read_messages(\n self,\n handler: PipesMessageHandler,\n ) -> Iterator[PipesParams]:\n self._handler = handler\n try:\n yield {PipesDefaultMessageWriter.STDIO_KEY: PipesDefaultMessageWriter.STDERR}\n finally:\n self._handler = None\n\n def consume_pod_logs(\n self,\n core_api: kubernetes.client.CoreV1Api,\n pod_name: str,\n namespace: str,\n ):\n handler = check.not_none(\n self._handler, "can only consume logs within scope of context manager"\n )\n for line in core_api.read_namespaced_pod_log(\n pod_name,\n namespace,\n follow=True,\n _preload_content=False, # avoid JSON processing\n ).stream():\n log_chunk = line.decode("utf-8")\n for log_line in log_chunk.split("\\n"):\n extract_message_or_forward_to_stdout(handler, log_line)\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to read messages by extracting them from kubernetes pod logs directly."
\n\n\n@experimental\nclass _PipesK8sClient(PipesClient):\n """A pipes client for launching kubernetes pods.\n\n By default context is injected via environment variables and messages are parsed out of\n the pod logs, with other logs forwarded to stdout of the orchestration process.\n\n The first container within the containers list of the pod spec is expected (or set) to be\n the container prepared for pipes protocol communication.\n\n Args:\n env (Optional[Mapping[str, str]]): An optional dict of environment variables to pass to the\n subprocess.\n context_injector (Optional[PipesContextInjector]): A context injector to use to inject\n context into the k8s container process. Defaults to :py:class:`PipesEnvContextInjector`.\n message_reader (Optional[PipesMessageReader]): A message reader to use to read messages\n from the k8s container process. Defaults to :py:class:`PipesK8sPodLogsMessageReader`.\n """\n\n def __init__(\n self,\n env: Optional[Mapping[str, str]] = None,\n context_injector: Optional[PipesContextInjector] = None,\n message_reader: Optional[PipesMessageReader] = None,\n ):\n self.env = check.opt_mapping_param(env, "env", key_type=str, value_type=str)\n self.context_injector = (\n check.opt_inst_param(\n context_injector,\n "context_injector",\n PipesContextInjector,\n )\n or PipesEnvContextInjector()\n )\n\n self.message_reader = (\n check.opt_inst_param(message_reader, "message_reader", PipesMessageReader)\n or PipesK8sPodLogsMessageReader()\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def run(\n self,\n *,\n context: OpExecutionContext,\n extras: Optional[PipesExtras] = None,\n image: Optional[str] = None,\n command: Optional[Union[str, Sequence[str]]] = None,\n namespace: Optional[str] = None,\n env: Optional[Mapping[str, str]] = None,\n base_pod_meta: Optional[Mapping[str, Any]] = None,\n base_pod_spec: Optional[Mapping[str, Any]] = None,\n ) -> PipesClientCompletedInvocation:\n """Publish a kubernetes pod and 
wait for it to complete, enriched with the pipes protocol.\n\n Args:\n image (Optional[str]):\n The image to set the first container in the pod spec to use.\n command (Optional[Union[str, Sequence[str]]]):\n The command to set the first container in the pod spec to use.\n namespace (Optional[str]):\n Which kubernetes namespace to use, defaults to "default"\n env (Optional[Mapping[str,str]]):\n A mapping of environment variable names to values to set on the first\n container in the pod spec, on top of those configured on resource.\n base_pod_meta (Optional[Mapping[str, Any]]:\n Raw k8s config for the k8s pod's metadata\n (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)\n Keys can either snake_case or camelCase. The name value will be overridden.\n base_pod_spec (Optional[Mapping[str, Any]]:\n Raw k8s config for the k8s pod's pod spec\n (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec).\n Keys can either snake_case or camelCase.\n extras (Optional[PipesExtras]):\n Extra values to pass along as part of the ext protocol.\n context_injector (Optional[PipesContextInjector]):\n Override the default ext protocol context injection.\n message_reader (Optional[PipesMessageReader]):\n Override the default ext protocol message reader.\n\n Returns:\n PipesClientCompletedInvocation: Wrapper containing results reported by the external\n process.\n """\n client = DagsterKubernetesClient.production_client()\n\n with open_pipes_session(\n context=context,\n extras=extras,\n context_injector=self.context_injector,\n message_reader=self.message_reader,\n ) as pipes_session:\n namespace = namespace or "default"\n pod_name = get_pod_name(context.run_id, context.op.name)\n pod_body = build_pod_body(\n pod_name=pod_name,\n image=image,\n command=command,\n env_vars={\n **pipes_session.get_bootstrap_env_vars(),\n **(self.env or {}),\n **(env or {}),\n },\n base_pod_meta=base_pod_meta,\n 
base_pod_spec=base_pod_spec,\n )\n client.core_api.create_namespaced_pod(namespace, pod_body)\n try:\n # if were doing direct pod reading, wait for pod to start and then stream logs out\n if isinstance(self.message_reader, PipesK8sPodLogsMessageReader):\n client.wait_for_pod(\n pod_name,\n namespace,\n wait_for_state=WaitForPodState.Ready,\n )\n self.message_reader.consume_pod_logs(\n core_api=client.core_api,\n pod_name=pod_name,\n namespace=namespace,\n )\n else:\n # if were not doing direct log reading, just wait for pod to finish\n client.wait_for_pod(\n pod_name,\n namespace,\n wait_for_state=WaitForPodState.Terminated,\n )\n finally:\n client.core_api.delete_namespaced_pod(pod_name, namespace)\n return PipesClientCompletedInvocation(tuple(pipes_session.get_results()))\n\n\ndef build_pod_body(\n pod_name: str,\n image: Optional[str],\n command: Optional[Union[str, Sequence[str]]],\n env_vars: Mapping[str, str],\n base_pod_meta: Optional[Mapping[str, Any]],\n base_pod_spec: Optional[Mapping[str, Any]],\n):\n meta = {\n **(k8s_snake_case_dict(kubernetes.client.V1ObjectMeta, base_pod_meta or {})),\n "name": pod_name,\n }\n if "labels" in meta:\n meta["labels"] = {**get_common_labels(), **meta["labels"]}\n else:\n meta["labels"] = get_common_labels()\n\n spec = {**k8s_snake_case_dict(kubernetes.client.V1PodSpec, base_pod_spec or {})}\n if "containers" not in spec:\n spec["containers"] = [{}]\n\n if "restart_policy" not in spec:\n spec["restart_policy"] = "Never"\n elif spec["restart_policy"] == "Always":\n raise DagsterInvariantViolationError(\n "A restart policy of Always is not allowed, computations are expected to complete."\n )\n\n if "image" not in spec["containers"][0] and not image:\n raise DagsterInvariantViolationError(\n "Must specify image property or provide base_pod_spec with one set."\n )\n\n if "name" not in spec["containers"][0]:\n spec["containers"][0]["name"] = DEFAULT_CONTAINER_NAME\n\n if image:\n spec["containers"][0]["image"] = image\n\n if 
command:\n spec["containers"][0]["command"] = command\n\n if "env" not in spec["containers"][0]:\n spec["containers"][0]["env"] = []\n\n spec["containers"][0]["env"].extend({"name": k, "value": v} for k, v in env_vars.items())\n\n return k8s_model_from_dict(\n kubernetes.client.V1Pod,\n {\n "metadata": meta,\n "spec": spec,\n },\n )\n\n\nPipesK8sClient = ResourceParam[_PipesK8sClient]\n
", "current_page_name": "_modules/dagster_k8s/pipes", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.pipes"}}, "dagster_mlflow": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mlflow.hooks

\nfrom dagster._core.definitions.decorators.hook_decorator import event_list_hook\nfrom dagster._core.definitions.events import HookExecutionResult\nfrom mlflow.entities.run_status import RunStatus\n\n\ndef _create_mlflow_run_hook(name):\n    @event_list_hook(name=name, required_resource_keys={"mlflow"})\n    def _hook(context, event_list):\n        for event in event_list:\n            if event.is_step_success:\n                _cleanup_on_success(context)\n            elif event.is_step_failure:\n                mlf = context.resources.mlflow\n                mlf.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n        return HookExecutionResult(hook_name=name, is_skipped=False)\n\n    return _hook\n\n\ndef _cleanup_on_success(context):\n    """Checks if the current solid in the context is the last solid in the job\n    and ends the mlflow run with a successful status when this is the case.\n    """\n    last_solid_name = context._step_execution_context.job_def.nodes_in_topological_order[  # noqa: SLF001  # fmt: skip\n        -1\n    ].name\n\n    if context.op.name == last_solid_name:\n        context.resources.mlflow.end_run()\n\n\nend_mlflow_on_run_finished = _create_mlflow_run_hook("end_mlflow_on_run_finished")\n
", "current_page_name": "_modules/dagster_mlflow/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mlflow.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mlflow.resources

\n"""This module contains the mlflow resource provided by the MlFlow\nclass. This resource provides an easy way to configure mlflow for logging various\nthings from dagster runs.\n"""\nimport atexit\nimport sys\nfrom itertools import islice\nfrom os import environ\nfrom typing import Any, Optional\n\nimport mlflow\nfrom dagster import Field, Noneable, Permissive, StringSource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom mlflow.entities.run_status import RunStatus\n\nCONFIG_SCHEMA = {\n    "experiment_name": Field(StringSource, is_required=True, description="MlFlow experiment name."),\n    "mlflow_tracking_uri": Field(\n        Noneable(StringSource),\n        default_value=None,\n        is_required=False,\n        description="MlFlow tracking server uri.",\n    ),\n    "parent_run_id": Field(\n        Noneable(str),\n        default_value=None,\n        is_required=False,\n        description="Mlflow run ID of parent run if this is a nested run.",\n    ),\n    "env": Field(Permissive(), description="Environment variables for mlflow setup."),\n    "env_to_tag": Field(\n        Noneable(list),\n        default_value=None,\n        is_required=False,\n        description="List of environment variables to log as tags in mlflow.",\n    ),\n    "extra_tags": Field(Permissive(), description="Any extra key-value tags to log to mlflow."),\n}\n\n\nclass MlflowMeta(type):\n    """Mlflow Metaclass to create methods that "inherit" all of Mlflow's\n    methods. 
If the class has a method defined it is excluded from the\n    attribute setting from mlflow.\n    """\n\n    def __new__(cls, name, bases, attrs):\n        class_cls = super(MlflowMeta, cls).__new__(cls, name, bases, attrs)\n        for attr in (attr for attr in dir(mlflow) if attr not in dir(class_cls)):\n            mlflow_attribute = getattr(mlflow, attr)\n            if callable(mlflow_attribute):\n                setattr(class_cls, attr, staticmethod(mlflow_attribute))\n            else:\n                setattr(class_cls, attr, mlflow_attribute)\n        return class_cls\n\n\nclass MlFlow(metaclass=MlflowMeta):\n    """Class for setting up an mlflow resource for dagster runs.\n    This takes care of all the configuration required to use mlflow tracking and the complexities of\n    mlflow tracking dagster parallel runs.\n    """\n\n    def __init__(self, context):\n        # Context associated attributes\n        self.log = context.log\n        self.run_name = context.dagster_run.job_name\n        self.dagster_run_id = context.run_id\n\n        # resource config attributes\n        resource_config = context.resource_config\n        self.tracking_uri = resource_config.get("mlflow_tracking_uri")\n        if self.tracking_uri:\n            mlflow.set_tracking_uri(self.tracking_uri)\n        self.parent_run_id = resource_config.get("parent_run_id")\n        self.experiment_name = resource_config["experiment_name"]\n        self.env_tags_to_log = resource_config.get("env_to_tag") or []\n        self.extra_tags = resource_config.get("extra_tags")\n\n        # Update env variables if any are given\n        self.env_vars = resource_config.get("env", {})\n        if self.env_vars:\n            environ.update(self.env_vars)\n\n        # If the experiment exists then the set won't do anything\n        mlflow.set_experiment(self.experiment_name)\n        self.experiment = mlflow.get_experiment_by_name(self.experiment_name)\n\n        # Get the client object\n        
self.tracking_client = mlflow.tracking.MlflowClient()\n\n        # Set up the active run and tags\n        self._setup()\n\n    def _setup(self):\n        """Sets the active run and tags. If an Mlflow run_id exists then the\n        active run is set to it. This way a single Dagster run outputs data\n        to the same Mlflow run, even when multiprocess executors are used.\n        """\n        # Get the run id\n        run_id = self._get_current_run_id()\n        self._set_active_run(run_id=run_id)\n        self._set_all_tags()\n\n        # hack needed to stop mlflow from marking run as finished when\n        # a process exits in parallel runs\n        atexit.unregister(mlflow.end_run)\n\n    def _get_current_run_id(\n        self, experiment: Optional[Any] = None, dagster_run_id: Optional[str] = None\n    ):\n        """Gets the run id of a specific dagster run and experiment id.\n        If it doesn't exist then it returns a None.\n\n        Args:\n            experiment (optional): Mlflow experiment.\n            When none is passed it fetches the experiment object set in\n            the constructor.  Defaults to None.\n            dagster_run_id (optional): The Dagster run id.\n            When none is passed it fetches the dagster_run_id object set in\n            the constructor.  
Defaults to None.\n\n        Returns:\n            run_id (str or None): run_id if it is found else None\n        """\n        experiment = experiment or self.experiment\n        dagster_run_id = dagster_run_id or self.dagster_run_id\n        if experiment:\n            # Check if a run with this dagster run id has already been started\n            # in mlflow, will get an empty dataframe if not\n            current_run_df = mlflow.search_runs(\n                experiment_ids=[experiment.experiment_id],\n                filter_string=f"tags.dagster_run_id='{dagster_run_id}'",\n            )\n            if not current_run_df.empty:\n                return current_run_df.run_id.values[0]\n\n    def _set_active_run(self, run_id=None):\n        """This method sets the active run to be that of the specified\n        run_id. If None is passed then a new run is started. The new run also\n        takes care of nested runs.\n\n        Args:\n            run_id (str, optional): Mlflow run_id. Defaults to None.\n        """\n        nested_run = False\n        if self.parent_run_id is not None:\n            self._start_run(run_id=self.parent_run_id, run_name=self.run_name)\n            nested_run = True\n        self._start_run(run_id=run_id, run_name=self.run_name, nested=nested_run)\n\n    def _start_run(self, **kwargs):\n        """Catches the Mlflow exception if a run is already active."""\n        try:\n            run = mlflow.start_run(**kwargs)\n            self.log.info(\n                f"Starting a new mlflow run with id {run.info.run_id} "\n                f"in experiment {self.experiment_name}"\n            )\n        except Exception as ex:\n            run = mlflow.active_run()\n            if "is already active" not in str(ex):\n                raise (ex)\n            self.log.info(f"Run with id {run.info.run_id} is already active.")\n\n    def _set_all_tags(self):\n        """Method collects dagster_run_id plus all env variables/tags that have been\n         
   specified by the user in the config_schema and logs them as tags in mlflow.\n\n        Returns:\n            tags [dict]: Dictionary of all the tags\n        """\n        tags = {tag: environ.get(tag) for tag in self.env_tags_to_log}\n        tags["dagster_run_id"] = self.dagster_run_id\n        if self.extra_tags:\n            tags.update(self.extra_tags)\n\n        mlflow.set_tags(tags)\n\n    def cleanup_on_error(self):\n        """Method ends mlflow run with correct exit status for failed runs. Note that\n        this method does not work when a job running in the webserver fails, it seems\n        that in this case a different process runs the job and when it fails\n        the stack trace is therefore not available. For this case we can use the\n        cleanup_on_failure hook defined below.\n        """\n        any_error = sys.exc_info()\n\n        if any_error[1]:\n            if isinstance(any_error[1], KeyboardInterrupt):\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))\n            else:\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n    @staticmethod\n    def log_params(params: dict):\n        """Overload of the mlflow.log_params. If len(params) >100 then\n        params is sent to mlflow in chunks.\n\n        Args:\n            params (dict): Parameters to be logged\n        """\n        for param_chunk in MlFlow.chunks(params, 100):\n            mlflow.log_params(param_chunk)\n\n    @staticmethod\n    def chunks(params: dict, size: int = 100):\n        """Method that chunks a dictionary into batches of size.\n\n        Args:\n            params (dict): Dictionary set to be batched\n            size (int, optional): Number of batches. Defaults to 100.\n\n        Yields:\n            (dict): Batch of dictionary\n        """\n        it = iter(params)\n        for _ in range(0, len(params), size):\n            yield {k: params[k] for k in islice(it, size)}\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=CONFIG_SCHEMA)\ndef mlflow_tracking(context):\n """This resource initializes an MLflow run that's used for all steps within a Dagster run.\n\n This resource provides access to all of mlflow's methods as well as the mlflow tracking client's\n methods.\n\n Usage:\n\n 1. Add the mlflow resource to any ops in which you want to invoke mlflow tracking APIs.\n 2. Add the `end_mlflow_on_run_finished` hook to your job to end the MLflow run\n when the Dagster run is finished.\n\n Examples:\n .. code-block:: python\n\n from dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n @op(required_resource_keys={"mlflow"})\n def mlflow_op(context):\n mlflow.log_params(some_params)\n mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n @end_mlflow_on_run_finished\n @job(resource_defs={"mlflow": mlflow_tracking})\n def mlf_example():\n mlflow_op()\n\n # example using an mlflow instance with s3 storage\n mlf_example.execute_in_process(run_config={\n "resources": {\n "mlflow": {\n "config": {\n "experiment_name": my_experiment,\n "mlflow_tracking_uri": "http://localhost:5000",\n\n # if want to run a nested run, provide parent_run_id\n "parent_run_id": an_existing_mlflow_run_id,\n\n # env variables to pass to mlflow\n "env": {\n "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n "AWS_ACCESS_KEY_ID": my_aws_key_id,\n "AWS_SECRET_ACCESS_KEY": my_secret,\n },\n\n # env variables you want to log as mlflow tags\n "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n # key-value tags to add to your experiment\n "extra_tags": {"super": "experiment"},\n }\n }\n }\n })\n """\n mlf = MlFlow(context)\n yield mlf\n mlf.cleanup_on_error()
\n
", "current_page_name": "_modules/dagster_mlflow/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mlflow.resources"}}, "dagster_msteams": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import failure_hook, success_hook\nfrom dagster._core.execution.context.hook import HookContext\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom dagster_msteams.card import Card\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return f"Op {context.op.name} on job {context.job_name} {status}!\\nRun ID: {context.run_id}"\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef teams_on_failure(\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_failure(webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op.name} failed!"\n\n @op\n def a_op(context):\n pass\n\n @job(...)\n def my_job():\n a_op.with_hooks(hook_defs={teams_on_failure("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @failure_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"<a href='{webserver_base_url}/runs/{context.run_id}'>View in Dagster UI</a>"\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef teams_on_success(\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_success(webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op.name} failed!"\n\n @op\n def a_op(context):\n pass\n\n @job(...)\n def my_job():\n a_op.with_hooks(hook_defs={teams_on_success("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @success_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"<a href='{webserver_base_url}/runs/{context.run_id}'>View in webserver</a>"\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_msteams/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\nfrom dagster_msteams.client import TeamsClient\n\n\n
[docs]class MSTeamsResource(ConfigurableResource):\n """This resource is for connecting to Microsoft Teams.\n\n Provides a `dagster_msteams.TeamsClient` which can be used to\n interface with the MS Teams API.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster op,\n asset, schedule, or sensor:\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import op, job, Definitions, EnvVar\n from dagster_msteams import Card, MSTeamsResource\n\n\n @op\n def teams_op(msteams: MSTeamsResource):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n msteams.get_client().post_message(payload=card.payload)\n\n\n @job\n def teams_job():\n teams_op()\n\n defs = Definitions(\n jobs=[teams_job],\n resources={\n "msteams": MSTeamsResource(\n hook_url=EnvVar("TEAMS_WEBHOOK_URL")\n )\n }\n )\n """\n\n hook_url: str = Field(\n default=None,\n description=(\n "To send messages to MS Teams channel, an incoming webhook has to be created. The"\n " incoming webhook url must be given as a part of the resource config to the"\n " MSTeamsResource in Dagster. For more information on how to create an incoming"\n " webhook, see"\n " https://docs.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook"\n ),\n )\n http_proxy: str = Field(default=None, description="HTTP proxy URL")\n https_proxy: str = Field(default=None, description="HTTPS proxy URL")\n timeout: float = Field(default=60, description="Timeout for requests to MS Teams")\n verify: bool = Field(\n default=True, description="Whether to verify SSL certificates, defaults to True"\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> TeamsClient:\n return TeamsClient(\n hook_url=self.hook_url,\n http_proxy=self.http_proxy,\n https_proxy=self.https_proxy,\n timeout=self.timeout,\n verify=self.verify,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=MSTeamsResource.to_config_schema(),\n description="This resource is for connecting to MS Teams",\n)\ndef msteams_resource(context) -> TeamsClient:\n """This resource is for connecting to Microsoft Teams.\n\n The resource object is a `dagster_msteams.TeamsClient`.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster solid:\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import op, job\n from dagster_msteams import Card, msteams_resource\n\n\n @op(required_resource_keys={"msteams"})\n def teams_op(context):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n context.resources.msteams.post_message(payload=card.payload)\n\n\n @job(resource_defs={"msteams": msteams_resource})\n def teams_job():\n teams_op()\n\n\n teams_job.execute_in_process(\n {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}}\n )\n """\n return MSTeamsResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_msteams/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.resources"}, "sensors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.sensors

\nfrom typing import TYPE_CHECKING, Callable, Optional, Sequence, Union\n\nfrom dagster import DefaultSensorStatus\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import GraphDefinition, JobDefinition\nfrom dagster._core.definitions.run_status_sensor_definition import (\n    RunFailureSensorContext,\n    run_failure_sensor,\n)\nfrom dagster._core.definitions.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom dagster_msteams.card import Card\nfrom dagster_msteams.client import TeamsClient\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.selector import JobSelector, RepositorySelector\n\n\ndef _default_failure_message(context: RunFailureSensorContext) -> str:\n    return "\\n".join(\n        [\n            f"Job {context.dagster_run.job_name} failed!",\n            f"Run ID: {context.dagster_run.run_id}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef make_teams_on_run_failure_sensor(\n hook_url: str,\n message_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message,\n http_proxy: Optional[str] = None,\n https_proxy: Optional[str] = None,\n timeout: Optional[float] = 60,\n verify: Optional[bool] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor on run failures that will message the given MS Teams webhook URL.\n\n Args:\n hook_url (str): MS Teams incoming webhook URL.\n message_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n http_proxy : (Optional[str]): Proxy for requests using http protocol.\n https_proxy : (Optional[str]): Proxy for requests using https protocol.\n timeout: (Optional[float]): Connection timeout in seconds. Defaults to 60.\n verify: (Optional[bool]): Whether to verify the servers TLS certificate.\n name: (Optional[str]): The name of the sensor. Defaults to "teams_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector]]]):\n Jobs in the current repository that will be monitored by this sensor. Defaults to None,\n which means the alert will be sent when any job in the repository matches the requested\n run_status. To monitor jobs in external repositories, use RepositorySelector and JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n\n Examples:\n .. code-block:: python\n\n teams_on_run_failure = make_teams_on_run_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n )\n\n @repository\n def my_repo():\n return [my_job + teams_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return "Job {job_name} failed! 
Error: {error}".format(\n job_name=context.dagster_run.job_name,\n error=context.failure_event.message,\n )\n\n teams_on_run_failure = make_teams_on_run_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n message_fn=my_message_fn,\n webserver_base_url="http://localhost:3000",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n teams_client = TeamsClient(\n hook_url=hook_url,\n http_proxy=http_proxy,\n https_proxy=https_proxy,\n timeout=timeout,\n verify=verify,\n )\n\n @run_failure_sensor(\n name=name,\n default_status=default_status,\n monitored_jobs=monitored_jobs,\n monitor_all_repositories=monitor_all_repositories,\n )\n def teams_on_run_failure(context: RunFailureSensorContext):\n text = message_fn(context)\n if webserver_base_url:\n text += "<a href='{base_url}/runs/{run_id}'>View in Dagit</a>".format(\n base_url=webserver_base_url,\n run_id=context.dagster_run.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n teams_client.post_message(payload=card.payload)\n\n return teams_on_run_failure
\n
", "current_page_name": "_modules/dagster_msteams/sensors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.sensors"}}, "dagster_mysql": {"event_log": {"event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.event_log.event_log

\nfrom typing import ContextManager, Optional, cast\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.exc as db_exc\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.event_log import (\n    AssetKeyTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlPollingEventWatcher,\n)\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_INTERSECT_VERSION = "8.0.31"\n\n\n
[docs]class MySQLEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """MySQL-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_event_log\n :end-before: end_marker_event_log\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = check.str_param(mysql_url, "mysql_url")\n self._disposed = False\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n self._secondary_index_cache = {}\n\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "event_logs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n # mark all secondary indexes to be used\n self.reindex_events()\n self.reindex_assets()\n\n self._mysql_version = self.get_server_version()\n super().__init__()\n\n def _init_db(self) -> None:\n with self._connect() as conn:\n 
SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n def upgrade(self) -> None:\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLEventLogStorage":\n return MySQLEventLogStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(conn_string: str) -> "MySQLEventLogStorage":\n MySQLEventLogStorage.wipe_storage(conn_string)\n return MySQLEventLogStorage(conn_string)\n\n def get_server_version(self) -> Optional[str]:\n with self.index_connection() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n def store_asset_event(self, event: EventLogEntry, event_id: int) -> None:\n # last_materialization_timestamp is updated upon observation, materialization, materialization_planned\n # See SqlEventLogStorage.store_asset_event method for more details\n\n values = self._get_asset_entry_values(\n 
event, event_id, self.has_secondary_index(ASSET_KEY_INDEX_COLS)\n )\n with self.index_connection() as conn:\n if values:\n conn.execute(\n db_dialects.mysql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(), # type: ignore # (possible none)\n **values,\n )\n .on_duplicate_key_update(\n **values,\n )\n )\n else:\n try:\n conn.execute(\n db_dialects.mysql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(), # type: ignore # (possible none)\n )\n )\n except db_exc.IntegrityError:\n pass\n\n def _connect(self) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "event log")\n\n def run_connection(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return self._connect()\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n with self._connect() as conn:\n return table_name in db.inspect(conn).get_table_names()\n\n def has_secondary_index(self, name: str) -> bool:\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n MySQLEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name: str) -> None:\n super(MySQLEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n self._event_watcher.unwatch_run(run_id, handler)\n\n @property\n def supports_intersect(self) -> bool:\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version( 
# type: ignore # (possible none)\n MINIMUM_MYSQL_INTERSECT_VERSION\n )\n\n @property\n def event_watcher(self) -> SqlPollingEventWatcher:\n return self._event_watcher\n\n def __del__(self) -> None:\n self.dispose()\n\n def dispose(self) -> None:\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/event_log/event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.run_storage.run_storage

\nfrom typing import ContextManager, Mapping, Optional, cast\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster._core.storage.runs.schema import KeyValueStoreTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_BUCKET_VERSION = "8.0.0"\nMINIMUM_MYSQL_INTERSECT_VERSION = "8.0.31"\n\n\n
[docs]class MySQLRunStorage(SqlRunStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_runs\n :end-before: end_marker_runs\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "runs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n self._mysql_version = self.get_server_version()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n RunStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, 
statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold 1 open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n def get_server_version(self) -> Optional[str]:\n with self.connect() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLRunStorage":\n return MySQLRunStorage(inst_data=inst_data, mysql_url=mysql_url_from_config(config_value))\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url: str) -> "MySQLRunStorage":\n MySQLRunStorage.wipe_storage(mysql_url)\n return MySQLRunStorage(mysql_url)\n\n def connect(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "run")\n\n def upgrade(self) -> None:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_built_index(self, migration_name: str) -> None:\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n MySQLRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name: str) -> None:\n 
super(MySQLRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n @property\n def supports_intersect(self) -> bool:\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version( # type: ignore\n MINIMUM_MYSQL_INTERSECT_VERSION\n )\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n conn.execute(\n db_dialects.mysql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n .on_duplicate_key_update(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n db_values = [{"key": k, "value": v} for k, v in pairs.items()]\n\n with self.connect() as conn:\n insert_stmt = db_dialects.mysql.insert(KeyValueStoreTable).values(db_values)\n conn.execute(\n insert_stmt.on_duplicate_key_update(\n value=insert_stmt.inserted.value,\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/run_storage/run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.schedule_storage.schedule_storage

\nfrom typing import ContextManager, Optional, cast\n\nimport dagster._check as check\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster._core.storage.schedules.schema import InstigatorsTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_BATCH_VERSION = "8.0.0"\n\n\n
[docs]class MySQLScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_schedules\n :end-before: end_marker_schedules\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n if "jobs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n\n self._mysql_version = self.get_server_version()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When 
running in dagster-webserver, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLScheduleStorage":\n return MySQLScheduleStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url: str) -> "MySQLScheduleStorage":\n MySQLScheduleStorage.wipe_storage(mysql_url)\n return MySQLScheduleStorage(mysql_url)\n\n def connect(self) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "schedule")\n\n @property\n def supports_batch_queries(self) -> bool:\n if not self._mysql_version:\n return False\n\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version(\n MINIMUM_MYSQL_BATCH_VERSION\n )\n\n def get_server_version(self) -> Optional[str]:\n with self.connect() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n def upgrade(self) -> None:\n with self.connect() as conn:\n alembic_config = mysql_alembic_config(__file__)\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn: Connection, state) -> None:\n selector_id = state.selector_id\n conn.execute(\n db_dialects.mysql.insert(InstigatorsTable)\n 
.values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n .on_duplicate_key_update(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/schedule_storage/schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.schedule_storage.schedule_storage"}}}, "dagster_pagerduty": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pagerduty.resources

\nfrom typing import Dict, Optional, cast\n\nimport pypd\nfrom dagster import ConfigurableResource, resource\nfrom dagster._config.pythonic_config import infer_schema_from_config_class\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.warnings import suppress_dagster_warnings\nfrom pydantic import Field as PyField\n\n\n
[docs]class PagerDutyService(ConfigurableResource):\n """This resource is for posting events to PagerDuty."""\n\n """Integrates with PagerDuty via the pypd library.\n\n See:\n https://v2.developer.pagerduty.com/docs/events-api-v2\n https://v2.developer.pagerduty.com/docs/send-an-event-events-api-v2\n https://support.pagerduty.com/docs/services-and-integrations#section-events-api-v2\n https://github.com/PagerDuty/pagerduty-api-python-client\n\n for documentation and more information.\n """\n\n routing_key: str = PyField(\n ...,\n description=(\n "The routing key provisions access to your PagerDuty service. You"\n "will need to include the integration key for your new integration, as a"\n "routing_key in the event payload."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def EventV2_create(\n self,\n summary: str,\n source: str,\n severity: str,\n event_action: str = "trigger",\n dedup_key: Optional[str] = None,\n timestamp: Optional[str] = None,\n component: Optional[str] = None,\n group: Optional[str] = None,\n event_class: Optional[str] = None,\n custom_details: Optional[object] = None,\n ) -> object:\n """Events API v2 enables you to add PagerDuty's advanced event and incident management\n functionality to any system that can make an outbound HTTP connection.\n\n Args:\n summary (str):\n A high-level, text summary message of the event. Will be used to construct an\n alert's description. Example:\n\n "PING OK - Packet loss = 0%, RTA = 1.41 ms" "Host\n 'acme-andromeda-sv1-c40 :: 179.21.24.50' is DOWN"\n\n source (str):\n Specific human-readable unique identifier, such as a hostname, for the system having\n the problem. Examples:\n\n "prod05.theseus.acme-widgets.com"\n "171.26.23.22"\n "aws:elasticache:us-east-1:852511987:cluster/api-stats-prod-003"\n "9c09acd49a25"\n\n severity (str):\n How impacted the affected system is. Displayed to users in lists and influences the\n priority of any created incidents. 
Must be one of {info, warning, error, critical}\n\n Keyword Args:\n event_action (str):\n There are three types of events that PagerDuty recognizes, and are used to represent\n different types of activity in your monitored systems. (default: 'trigger')\n\n * trigger: When PagerDuty receives a trigger event, it will either open a new alert,\n or add a new trigger log entry to an existing alert, depending on the\n provided dedup_key. Your monitoring tools should send PagerDuty a trigger\n when a new problem has been detected. You may send additional triggers\n when a previously detected problem has occurred again.\n\n * acknowledge: acknowledge events cause the referenced incident to enter the\n acknowledged state. While an incident is acknowledged, it won't\n generate any additional notifications, even if it receives new\n trigger events. Your monitoring tools should send PagerDuty an\n acknowledge event when they know someone is presently working on the\n problem.\n\n * resolve: resolve events cause the referenced incident to enter the resolved state.\n Once an incident is resolved, it won't generate any additional\n notifications. New trigger events with the same dedup_key as a resolved\n incident won't re-open the incident. Instead, a new incident will be\n created. Your monitoring tools should send PagerDuty a resolve event when\n the problem that caused the initial trigger event has been fixed.\n\n dedup_key (str):\n Deduplication key for correlating triggers and resolves. The maximum permitted\n length of this property is 255 characters.\n\n timestamp (str):\n Timestamp (ISO 8601). When the upstream system detected / created the event. This is\n useful if a system batches or holds events before sending them to PagerDuty. This\n will be auto-generated by PagerDuty if not provided. Example:\n\n 2015-07-17T08:42:58.315+0000\n\n component (str):\n The part or component of the affected system that is broken. 
Examples:\n\n "keepalive"\n "webping"\n "mysql"\n "wqueue"\n\n group (str):\n A cluster or grouping of sources. For example, sources "prod-datapipe-02" and\n "prod-datapipe-03" might both be part of "prod-datapipe". Examples:\n\n "prod-datapipe"\n "www"\n "web_stack"\n\n event_class (str):\n The class/type of the event. Examples:\n\n "High CPU"\n "Latency"\n "500 Error"\n\n custom_details (Dict[str, str]):\n Additional details about the event and affected system. Example:\n\n {"ping time": "1500ms", "load avg": 0.75 }\n """\n data = {\n "routing_key": self.routing_key,\n "event_action": event_action,\n "payload": {"summary": summary, "source": source, "severity": severity},\n }\n\n if dedup_key is not None:\n data["dedup_key"] = dedup_key\n\n payload: Dict[str, object] = cast(Dict[str, object], data["payload"])\n\n if timestamp is not None:\n payload["timestamp"] = timestamp\n\n if component is not None:\n payload["component"] = component\n\n if group is not None:\n payload["group"] = group\n\n if event_class is not None:\n payload["class"] = event_class\n\n if custom_details is not None:\n payload["custom_details"] = custom_details\n\n return pypd.EventV2.create(data=data)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=infer_schema_from_config_class(PagerDutyService),\n description="""This resource is for posting events to PagerDuty.""",\n)\n@suppress_dagster_warnings\ndef pagerduty_resource(context) -> PagerDutyService:\n """A resource for posting events (alerts) to PagerDuty.\n\n Example:\n .. code-block:: python\n\n @op\n def pagerduty_op(pagerduty: PagerDutyService):\n pagerduty.EventV2_create(\n summary='alert from dagster'\n source='localhost',\n severity='error',\n event_action='trigger',\n )\n\n @job(resource_defs={ 'pagerduty': pagerduty_resource })\n def pagerduty_test():\n pagerduty_op()\n\n pagerduty_test.execute_in_process(\n run_config={\n "resources": {\n 'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n }\n }\n )\n """\n return PagerDutyService(**context.resource_config)
\n
", "current_page_name": "_modules/dagster_pagerduty/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pagerduty.resources"}}, "dagster_pandas": {"constraints": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.constraints

\nimport sys\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom functools import wraps\n\nimport pandas as pd\nfrom dagster import (\n    DagsterType,\n    TypeCheck,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom pandas import DataFrame\nfrom typing_extensions import Final\n\nCONSTRAINT_METADATA_KEY: Final = "constraint_metadata"\n\n\nclass ConstraintViolationException(Exception):\n    """Indicates that a constraint has been violated."""\n\n\nclass ConstraintWithMetadataException(Exception):\n    """This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a\n    failed typecheck or an exception.\n\n    Args:\n        constraint_name (str):  the name of the violated constraint\n        constraint_description (Optional[str]): the description of the violated constraint\n        expectation (Optional[Union[dict,list, str, set]]): what result was expected -- typically a jsonlike, though it can be a string\n        offending (Optional[Union[dict,list, str, set]]):  which pieces of the dataframe violated the expectation, typically list or string\n        actual (Optional[Union[dict,list, str, set]]): what those pieces of the dataframe actually were -- typically a jsonlike\n    """\n\n    def __init__(\n        self,\n        constraint_name,\n        constraint_description="",\n        expectation=None,\n        offending=None,\n        actual=None,\n    ):\n        self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))\n        self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))\n        self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))\n        super(ConstraintWithMetadataException, self).__init__(\n            "Violated {} - {}, {} was/were expected, but 
we received {} which was/were {}".format(\n                constraint_name,\n                constraint_description,\n                expectation,\n                offending,\n                actual,\n            )\n        )\n\n    def normalize_metadata_json_value(self, val):\n        if isinstance(val, set):\n            return list(val)\n        else:\n            return val\n\n    def convert_to_metadata(self):\n        return {\n            CONSTRAINT_METADATA_KEY: {\n                "constraint_name": self.constraint_name,\n                "constraint_description": self.constraint_description,\n                "expected": self.normalize_metadata_json_value(self.expectation),\n                "offending": self.normalize_metadata_json_value(self.offending),\n                "actual": self.normalize_metadata_json_value(self.actual),\n            },\n        }\n\n    def return_as_typecheck(self):\n        return TypeCheck(\n            success=False, description=self.args[0], metadata=self.convert_to_metadata()\n        )\n\n\nclass DataFrameConstraintViolationException(ConstraintViolationException):\n    """Indicates a dataframe level constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description):\n        super(DataFrameConstraintViolationException, self).__init__(\n            f"Violated {constraint_name} - {constraint_description}"\n        )\n\n\nclass DataFrameWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, actual):\n        super(DataFrameWithMetadataException, self).__init__(\n            constraint_name, constraint_description, expectation, "a malformed dataframe", actual\n        )\n\n\nclass ColumnConstraintViolationException(ConstraintViolationException):\n    """Indicates that a column constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description, column_name, offending_rows=None):\n        
self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.column_name = column_name\n        self.offending_rows = offending_rows\n        super(ColumnConstraintViolationException, self).__init__(self.construct_message())\n\n    def construct_message(self):\n        base_message = (\n            'Violated "{constraint_name}" for column "{column_name}" - {constraint_description}'\n            .format(\n                constraint_name=self.constraint_name,\n                constraint_description=self.constraint_description,\n                column_name=self.column_name,\n            )\n        )\n        if self.offending_rows is not None:\n            base_message += "The offending (index, row values) are the following: {}".format(\n                self.offending_rows\n            )\n        return base_message\n\n\nclass ColumnWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, offending, actual):\n        super(ColumnWithMetadataException, self).__init__(\n            "the column constraint " + constraint_name,\n            constraint_description,\n            expectation,\n            offending,\n            actual,\n        )\n\n\nclass Constraint:\n    """Base constraint object that all constraints inherit from.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        self.name = self.__class__.__name__\n        self.markdown_description = check.str_param(markdown_description, "markdown_description")\n        self.error_description = check.str_param(error_description, 
"error_description")\n\n\n@experimental\nclass ConstraintWithMetadata:\n    """This class defines a base constraint over pandas DFs with organized metadata.\n\n    Args:\n        description (str): description of the constraint\n        validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n                    the validation function to run over inputted data\n                    This function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    # TODO:  validation_fn returning metadata is sorta broken.  
maybe have it yield typecheck events and grab metadata?\n\n    def __init__(\n        self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None\n    ):\n        if name is None:\n            self.name = self.__class__.__name__\n        else:\n            self.name = name\n        self.description = description\n        # should return a tuple of (bool, and either an empty dict or a dict of extra params)\n        self.validation_fn = validation_fn\n        self.resulting_exception = resulting_exception\n        self.raise_or_typecheck = raise_or_typecheck\n\n    def validate(self, data, *args, **kwargs):\n        res = self.validation_fn(data, *args, **kwargs)\n        if not res[0]:\n            exc = self.resulting_exception(\n                constraint_name=self.name, constraint_description=self.description, **res[1]\n            )\n\n            if self.raise_or_typecheck:\n                raise exc\n            else:\n                return exc.return_as_typecheck()\n\n        else:\n            if res[0]:\n                return TypeCheck(success=True)\n\n    # TODO:  composition of validations\n    def as_dagster_type(self, *args, **kwargs):\n        if self.raise_or_typecheck:\n            raise Exception(\n                "Dagster types can only be constructed from constraints that return typechecks"\n            )\n        return DagsterType(\n            name=self.name,\n            description=f"A Pandas DataFrame with the following validation: {self.description}",\n            type_check_fn=lambda x: self.validate(x, *args),\n            **kwargs,\n        )\n\n\nclass MultiConstraintWithMetadata(ConstraintWithMetadata):\n    """Use this class if you have multiple constraints to check over the entire dataframe.\n\n    Args:\n        description (str): description of the constraint\n        validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n                    a list of the 
validation functions to run over inputted data\n                    Each function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    def __init__(\n        self,\n        description,\n        validation_fn_arr,\n        resulting_exception,\n        raise_or_typecheck=True,\n        name=None,\n    ):\n        validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")\n\n        def validation_fn(data, *args, **kwargs):\n            results = [f(data, *args, **kwargs) for f in validation_fn_arr]\n            truthparam = all(item[0] for item in results)\n            metadict = defaultdict(dict)\n            for i, dicta in enumerate(item[1] for item in results):\n                if len(dicta.keys()) > 0:\n                    for key in dicta:\n                        metadict[key][validation_fn_arr[i].__name__] = dicta[key]\n            return (truthparam, metadict)\n\n        super(MultiConstraintWithMetadata, self).__init__(\n            description,\n            validation_fn,\n            resulting_exception,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass StrictColumnsWithMetadata(ConstraintWithMetadata):\n    def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):\n        self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n        self.column_list = 
check.list_param(column_list, "strict_column_list", of_type=str)\n\n        def validation_fcn(inframe):\n            if list(inframe.columns) == column_list:\n                return (True, {})\n            else:\n                if self.enforce_ordering:\n                    resdict = {"expectation": self.column_list, "actual": list(inframe.columns)}\n                    return (False, resdict)\n                else:\n                    if set(inframe.columns) == set(column_list):\n                        return (True, {})\n                    else:\n                        extra = [x for x in inframe.columns if x not in set(column_list)]\n                        missing = [x for x in set(column_list) if x not in inframe.columns]\n                        resdict = {\n                            "expectation": self.column_list,\n                            "actual": {"extra_columns": extra, "missing_columns": missing},\n                        }\n                        return (False, resdict)\n\n        basestr = f"ensuring that the right columns, {self.column_list} were present"\n        if enforce_ordering:\n            basestr += " in the right order"\n        super(StrictColumnsWithMetadata, self).__init__(\n            basestr,\n            validation_fcn,\n            DataFrameWithMetadataException,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass DataFrameConstraint(Constraint):\n    """Base constraint object that represent Dataframe shape constraints.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        super(DataFrameConstraint, self).__init__(\n            
error_description=error_description, markdown_description=markdown_description\n        )\n\n    def validate(self, dataframe):\n        raise NotImplementedError()\n\n\n
[docs]class StrictColumnsConstraint(DataFrameConstraint):\n """A dataframe constraint that validates column existence and ordering.\n\n Args:\n strict_column_list (List[str]): The exact list of columns that your dataframe must have.\n enforce_ordering (Optional[bool]): If true, will enforce that the ordering of column names must match.\n Default is False.\n """\n\n def __init__(self, strict_column_list, enforce_ordering=False):\n self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n self.strict_column_list = check.list_param(\n strict_column_list, "strict_column_list", of_type=str\n )\n description = f"No columns outside of {self.strict_column_list} allowed. "\n if enforce_ordering:\n description += "Columns must be in that order."\n super(StrictColumnsConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n columns_received = list(dataframe.columns)\n if self.enforce_ordering:\n if self.strict_column_list != columns_received:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n "Expected the following ordering of columns {expected}. Received:"\n " {received}".format(\n expected=self.strict_column_list, received=columns_received\n )\n ),\n )\n for column in columns_received:\n if column not in self.strict_column_list:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {}. Recevied {}.".format(\n self.strict_column_list, columns_received\n ),\n )
\n\n\n
[docs]class RowCountConstraint(DataFrameConstraint):\n """A dataframe constraint that validates the expected count of rows.\n\n Args:\n num_allowed_rows (int): The number of allowed rows in your dataframe.\n error_tolerance (Optional[int]): The acceptable threshold if you are not completely certain. Defaults to 0.\n """\n\n def __init__(self, num_allowed_rows, error_tolerance=0):\n self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")\n self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))\n if self.error_tolerance > self.num_allowed_rows:\n raise ValueError("Tolerance can't be greater than the number of rows you expect.")\n description = f"Dataframe must have {self.num_allowed_rows} +- {self.error_tolerance} rows."\n super(RowCountConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n\n if not (\n self.num_allowed_rows - self.error_tolerance\n <= len(dataframe)\n <= self.num_allowed_rows + self.error_tolerance\n ):\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n "Expected {expected} +- {tolerance} rows. Got {received}".format(\n expected=self.num_allowed_rows,\n tolerance=self.error_tolerance,\n received=len(dataframe),\n )\n ),\n )
\n\n\ndef apply_ignore_missing_data_to_mask(mask, column):\n return mask & ~column.isnull()\n\n\nclass ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):\n """Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.\n\n Args:\n description (str): description of the constraint\n validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n\n offending_columns = set()\n offending_values = {}\n for column in columns:\n # TODO: grab extra metadata\n res = self.validation_fn(relevant_data[column])\n if not res[0]:\n offending_columns.add(column)\n if res[1].get("actual") is not None:\n offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]\n else:\n offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]\n if len(offending_columns) == 0 and not self.raise_or_typecheck:\n return TypeCheck(success=True)\n elif len(offending_columns) > 0:\n metadict = {\n "expectation": self.description.replace("Confirms", ""),\n "actual": offending_values,\n "offending": offending_columns,\n }\n exc = self.resulting_exception(\n 
constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass ColumnConstraintWithMetadata(ConstraintWithMetadata):\n """This class is useful for constructing single constraints that you want to apply to multiple\n columns of your dataframe.\n\n The main difference from the base class in terms of construction is that now, your validation_fns should operate on\n individual values.\n\n Args:\n description (str): description of the constraint\n validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n offending = {}\n offending_values = {}\n # TODO: grab metadata from here\n inverse_validation = lambda x: not self.validation_fn(x)[0]\n for column in columns:\n results = relevant_data[relevant_data[column].apply(inverse_validation)]\n if len(results.index.tolist()) > 0:\n offending[column] = ["row " + str(i) for i in (results.index.tolist())]\n offending_values[column] = results[column].tolist()\n if len(offending) == 0:\n if not self.raise_or_typecheck:\n return TypeCheck(success=True)\n else:\n metadict = {\n 
"expectation": self.validation_fn.__doc__,\n "actual": offending_values,\n "offending": offending,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):\n """This class is useful for constructing more complicated relationships between columns\n and expectations -- i.e. you want some validations on column A, others on column B, etc.\n This lets you package up the metadata neatly, and also allows for cases like 'fail if any one of\n these constraints fails but still run all of them'.\n\n Args:\n description (str): description of the overall set of validations\n fn_and_columns_dict (Dict[str, List[Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is 'a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. 
Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n type_for_internal=ColumnConstraintWithMetadata,\n name=None,\n ):\n # TODO: support multiple descriptions\n self.column_to_fn_dict = check.dict_param(\n fn_and_columns_dict, "fn_and_columns_dict", key_type=str\n )\n\n def validation_fn(data, *args, **kwargs):\n metadict = defaultdict(dict)\n truthparam = True\n for column, fn_arr in self.column_to_fn_dict.items():\n if column not in data.columns:\n continue\n for fn in fn_arr:\n # TODO: do this more effectively\n new_validator = type_for_internal(\n fn.__doc__, fn, ColumnWithMetadataException, raise_or_typecheck=False\n )\n result = new_validator.validate(\n DataFrame(data[column]), column, *args, **kwargs\n )\n result_val = result.success\n if result_val:\n continue\n result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data\n truthparam = truthparam and result_val\n for key in result_dict.keys():\n if "constraint" not in key:\n if key == "expected":\n new_key = "expectation"\n result_dict[key] = result_dict[key].replace("returns", "").strip()\n if column not in metadict[new_key] or new_key not in metadict:\n metadict[new_key][column] = dict()\n metadict[new_key][column][fn.__name__] = result_dict[key]\n else:\n if column not in metadict[key] or key not in metadict:\n metadict[key][column] = dict()\n if isinstance(result_dict[key], dict):\n metadict[key][column][fn.__name__] = result_dict[key][column]\n else:\n metadict[key][column][fn.__name__] = "a violation"\n return truthparam, metadict\n\n super(MultiColumnConstraintWithMetadata, self).__init__(\n description,\n validation_fn,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n def validate(self, data, *args, **kwargs):\n return ConstraintWithMetadata.validate(self, data, *args, 
**kwargs)\n\n\nclass MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):\n """This class is similar to multicolumn, but takes in functions that operate on the whole column at once\n rather than ones that operate on each value --\n consider this similar to the difference between apply-map and apply aggregate.\n\n Args:\n description (str): description of the overall set of validations (TODO: support multiple descriptions)\n fn_and_columns_dict (Dict[str, List[Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n name=None,\n ):\n super(MultiAggregateConstraintWithMetadata, self).__init__(\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n type_for_internal=ColumnAggregateConstraintWithMetadata,\n name=name,\n )\n\n\ndef non_null_validation(x):\n """Validates that a particular value in a column is not null.\n\n Usage:\n pass this as a column validator to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Generally, you should prefer to use nonnull as a decorator/wrapper rather than using this\n directly.\n """\n return not pd.isnull(x), {}\n\n\ndef all_unique_validator(column, ignore_missing_vals=False):\n 
"""Validates that all values in an iterable are unique.\n\n Returns duplicated values as metadata.\n\n Usage:\n As a validation function for a\n :py:class:'~dagster_pandas.constraints.ColumnAggregateConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiAggregateConstraintWithMetadata'\n Example:\n .. code-block:: python\n aggregate_validator = MultiAggregateConstraintWithMetadata(\n "confirms all values are unique",\n {'bar': [all_unique_validator]},\n ConstraintWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_aggregate_validator=aggregate_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 3], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'bar': {'all_unique_validator': 'a violation'}}\n metadata['actual'] == {'bar': {'all_unique_validator': [10.0]}}\n """\n column = pd.Series(column)\n duplicated = column.duplicated()\n if ignore_missing_vals:\n duplicated = apply_ignore_missing_data_to_mask(duplicated, column)\n return not duplicated.any(), {"actual": column[duplicated]}\n\n\ndef nonnull(func):\n """Decorator for column validation functions to make them error on nulls.\n\n Usage:\n pass decorated functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Args:\n func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n the column validator you want to error on nulls.\n """\n\n @wraps(func)\n def nvalidator(val):\n origval = func(val)\n nval = non_null_validation(val)\n return origval[0] and nval[0], {}\n\n nvalidator.__doc__ += " and ensures no values are null"\n\n return nvalidator\n\n\ndef column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):\n 
"""Factory for validators testing if column values are within a range.\n\n Args:\n minim(Optional[Comparable]): the low end of the range\n maxim(Optional[Comparable]): the high end of the range\n ignore_missing_vals(Optional[bool]): whether to ignore nulls.\n\n Returns: a validation function for this constraint\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Examples:\n .. code-block:: python\n in_range_validator = column_range_validation_factory(1, 3, ignore_missing_vals=True)\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [in_range_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'in_range_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'in_range_validation_fn': [7]}}\n\n """\n if minim is None:\n if isinstance(maxim, datetime):\n minim = datetime.min\n else:\n minim = -1 * (sys.maxsize - 1)\n if maxim is None:\n if isinstance(minim, datetime):\n maxim = datetime.max\n else:\n maxim = sys.maxsize\n\n def in_range_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}\n\n in_range_validation_fn.__doc__ = f"checks whether values are between {minim} and {maxim}"\n if ignore_missing_vals:\n in_range_validation_fn.__doc__ += ", ignoring nulls"\n\n return in_range_validation_fn\n\n\ndef categorical_column_validator_factory(categories, 
ignore_missing_vals=False):\n """Factory for validators testing if all values are in some set.\n\n Args:\n categories(Union[Sequence, set]): the set of allowed values\n ignore_missing_vals(Optional[bool]): whether to ignore nulls.\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Example:\n .. code-block:: python\n categorical_validation_fn = categorical_column_validator_factory([1, 2])\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [categorical_validation_fn]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}\n\n """\n categories = set(categories)\n\n def categorical_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (x in categories), {}\n\n categorical_validation_fn.__doc__ = (\n f"checks whether values are within this set of values: {categories}"\n )\n if ignore_missing_vals:\n categorical_validation_fn.__doc__ += ", ignoring nulls"\n\n return categorical_validation_fn\n\n\ndef dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):\n """Factory for testing if the dtype of a val falls within some allowed set.\n\n Args:\n datatypes(Union[set[type], type]): which datatype/datatypes are allowed\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a 
validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Examples:\n .. code-block:: python\n dtype_is_num_validator = dtype_in_set_validation_factory((int, float, int64, float64))\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [dtype_is_num_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 'a', 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 1']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': ['a']}}\n\n """\n\n def dtype_in_set_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return isinstance(x, datatypes), {}\n\n dtype_in_set_validation_fn.__doc__ = f"checks whether values are this type/types: {datatypes}"\n if ignore_missing_vals:\n dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"\n\n return dtype_in_set_validation_fn\n\n\nclass ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):\n def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):\n self.name = self.__class__.__name__\n\n description = f"Confirms values are between {minim} and {maxim}"\n super(ColumnRangeConstraintWithMetadata, self).__init__(\n description=description,\n validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),\n resulting_exception=ColumnWithMetadataException,\n raise_or_typecheck=raise_or_typecheck,\n )\n self.columns = columns\n\n def validate(self, data, *args, 
**kwargs):\n if self.columns is None:\n self.columns = list(data.columns)\n self.columns.extend(args)\n return super(ColumnRangeConstraintWithMetadata, self).validate(\n data, *self.columns, **kwargs\n )\n\n\nclass ColumnConstraint(Constraint):\n """Base constraint object that represent dataframe column shape constraints.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n super(ColumnConstraint, self).__init__(\n error_description=error_description, markdown_description=markdown_description\n )\n\n def validate(self, dataframe, column_name):\n pass\n\n @staticmethod\n def get_offending_row_pairs(dataframe, column_name):\n return zip(dataframe.index.tolist(), dataframe[column_name].tolist())\n\n\nclass ColumnDTypeFnConstraint(ColumnConstraint):\n """A column constraint that applies a pandas dtype validation function to a columns dtype.\n\n Args:\n type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and\n returns if those dtypes match the types it expects. 
See pandas.core.dtypes.common for examples.\n """\n\n def __init__(self, type_fn):\n self.type_fn = check.callable_param(type_fn, "type_fn")\n description = f'Dtype must satisfy "{self.type_fn.__name__}"'\n super(ColumnDTypeFnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n column_dtype = dataframe[column_name].dtype\n if not self.type_fn(column_dtype):\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=f'{self.error_description}, but was "{column_dtype}"',\n column_name=column_name,\n )\n\n\nclass ColumnDTypeInSetConstraint(ColumnConstraint):\n """A column constraint that validates the pandas column dtypes based on the expected set of dtypes.\n\n Args:\n expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.\n """\n\n def __init__(self, expected_dtype_set):\n self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")\n description = f"Column dtype must be in the following set {self.expected_dtype_set}."\n super(ColumnDTypeInSetConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n received_dtypes = dataframe[column_name].dtype\n if str(received_dtypes) not in self.expected_dtype_set:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n f"{self.error_description}. 
DTypes received: {received_dtypes}"\n ),\n column_name=column_name,\n )\n\n\nclass NonNullableColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are not null."""\n\n def __init__(self):\n description = "No Null values allowed."\n super(NonNullableColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n rows_with_null_columns = dataframe[dataframe[column_name].isna()]\n if not rows_with_null_columns.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=self.get_offending_row_pairs(rows_with_null_columns, column_name),\n )\n\n\nclass UniqueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are unique.\n\n Args:\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, ignore_missing_vals):\n description = "Column must be unique."\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(UniqueColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name].duplicated()\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_duplicated_values = dataframe[invalid]\n if not rows_with_duplicated_values.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_duplicated_values,\n )\n\n\nclass CategoricalColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are a valid category.\n\n 
Args:\n categories (Set[str]): Set of categories that values in your pandas column must match.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, categories, ignore_missing_vals):\n self.categories = list(check.set_param(categories, "categories", of_type=str))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(CategoricalColumnConstraint, self).__init__(\n error_description=f"Expected Categories are {self.categories}",\n markdown_description=f"Category examples are {self.categories[:5]}...",\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].isin(self.categories)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_unexpected_buckets = dataframe[invalid]\n if not rows_with_unexpected_buckets.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_unexpected_buckets,\n )\n\n\nclass MinValueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are greater than the provided\n lower bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, min_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MinValueColumnConstraint, self).__init__(\n markdown_description=f"values > {self.min_value}",\n error_description=f"Column must have values > {self.min_value}",\n )\n\n def validate(self, dataframe, column_name):\n invalid = 
dataframe[column_name] < self.min_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass MaxValueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are less than the provided\n upper bound [inclusive].\n\n Args:\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, max_value, ignore_missing_vals):\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MaxValueColumnConstraint, self).__init__(\n markdown_description=f"values < {self.max_value}",\n error_description=f"Column must have values < {self.max_value}",\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] > self.max_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass InRangeColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are between the lower and upper\n bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n 
ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non\n missing values.\n """\n\n def __init__(self, min_value, max_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(InRangeColumnConstraint, self).__init__(\n markdown_description=f"{self.min_value} < values < {self.max_value}",\n error_description="Column must have values between {} and {} inclusive.".format(\n self.min_value, self.max_value\n ),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].between(self.min_value, self.max_value)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n
", "current_page_name": "_modules/dagster_pandas/constraints", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.constraints"}, "data_frame": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.data_frame

\nimport pandas as pd\nfrom dagster import (\n    DagsterInvariantViolationError,\n    DagsterType,\n    Field,\n    MetadataValue,\n    StringSource,\n    TableColumn,\n    TableSchema,\n    TableSchemaMetadataValue,\n    TypeCheck,\n    _check as check,\n    dagster_type_loader,\n)\nfrom dagster._annotations import experimental\nfrom dagster._config import Selector\nfrom dagster._core.definitions.metadata import normalize_metadata\nfrom dagster._utils import dict_without_keys\n\nfrom dagster_pandas.constraints import (\n    CONSTRAINT_METADATA_KEY,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    ConstraintViolationException,\n)\nfrom dagster_pandas.validation import PandasColumn, validate_constraints\n\nCONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}\n\n\n@dagster_type_loader(\n    Selector(\n        {\n            "csv": {\n                "path": StringSource,\n                "sep": Field(StringSource, is_required=False, default_value=","),\n            },\n            "parquet": {"path": StringSource},\n            "table": {"path": StringSource},\n            "pickle": {"path": StringSource},\n        },\n    )\n)\ndef dataframe_loader(_context, config):\n    file_type, file_options = next(iter(config.items()))\n\n    if file_type == "csv":\n        path = file_options["path"]\n        return pd.read_csv(path, **dict_without_keys(file_options, "path"))\n    elif file_type == "parquet":\n        return pd.read_parquet(file_options["path"])\n    elif file_type == "table":\n        return pd.read_csv(file_options["path"], sep="\\t")\n    elif file_type == "pickle":\n        return pd.read_pickle(file_options["path"])\n    else:\n        raise DagsterInvariantViolationError(f"Unsupported file_type {file_type}")\n\n\ndef df_type_check(_, value):\n    if not isinstance(value, pd.DataFrame):\n        return TypeCheck(success=False)\n    return TypeCheck(\n        success=True,\n        metadata={\n            
"row_count": str(len(value)),\n            # string cast columns since they may be things like datetime\n            "metadata": {"columns": list(map(str, value.columns))},\n        },\n    )\n\n\nDataFrame = DagsterType(\n    name="PandasDataFrame",\n    description="""Two-dimensional size-mutable, potentially heterogeneous\n    tabular data structure with labeled axes (rows and columns).\n    See http://pandas.pydata.org/""",\n    loader=dataframe_loader,\n    type_check_fn=df_type_check,\n    typing_type=pd.DataFrame,\n)\n\n\ndef _construct_constraint_list(constraints):\n    def add_bullet(constraint_list, constraint_description):\n        return constraint_list + f"+ {constraint_description}\\n"\n\n    constraint_list = ""\n    for constraint in constraints:\n        if constraint.__class__ not in CONSTRAINT_BLACKLIST:\n            constraint_list = add_bullet(constraint_list, constraint.markdown_description)\n    return constraint_list\n\n\ndef _build_column_header(column_name, constraints):\n    header = f"**{column_name}**"\n    for constraint in constraints:\n        if isinstance(constraint, ColumnDTypeInSetConstraint):\n            dtypes_tuple = tuple(constraint.expected_dtype_set)\n            return header + f": `{dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]}`"\n        elif isinstance(constraint, ColumnDTypeFnConstraint):\n            return header + f": Validator `{constraint.type_fn.__name__}`"\n    return header\n\n\ndef create_dagster_pandas_dataframe_description(description, columns):\n    title = "\\n".join([description, "### Columns", ""])\n    buildme = title\n    for column in columns:\n        buildme += "{}\\n{}\\n".format(\n            _build_column_header(column.name, column.constraints),\n            _construct_constraint_list(column.constraints),\n        )\n    return buildme\n\n\ndef create_table_schema_metadata_from_dataframe(\n    pandas_df: pd.DataFrame,\n) -> TableSchemaMetadataValue:\n    """This function takes a 
pandas DataFrame and returns its metadata as a Dagster TableSchema.\n\n    Args:\n        pandas_df (pandas.DataFrame): A pandas DataFrame for which to create metadata.\n\n    Returns:\n        TableSchemaMetadataValue: returns an object with the TableSchema for the DataFrame.\n    """\n    check.inst(pandas_df, pd.DataFrame, "Input must be a pandas DataFrame object")\n    return MetadataValue.table_schema(\n        TableSchema(\n            columns=[\n                TableColumn(name=str(name), type=str(dtype))\n                for name, dtype in pandas_df.dtypes.items()\n            ]\n        )\n    )\n\n\n
[docs]def create_dagster_pandas_dataframe_type(\n name,\n description=None,\n columns=None,\n metadata_fn=None,\n dataframe_constraints=None,\n loader=None,\n):\n """Constructs a custom pandas dataframe dagster type.\n\n Args:\n name (str): Name of the dagster pandas type.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects\n which express dataframe column schemas and constraints.\n metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]])\n A callable which takes your dataframe and returns a dict with string label keys and\n MetadataValue values.\n dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from\n :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n """\n # We allow for the plugging in of a dagster_type_loader so that users can load their custom\n # dataframes via configuration their own way if the default configs don't suffice. This is\n # purely optional.\n check.str_param(name, "name")\n metadata_fn = check.opt_callable_param(metadata_fn, "metadata_fn")\n description = create_dagster_pandas_dataframe_description(\n check.opt_str_param(description, "description", default=""),\n check.opt_list_param(columns, "columns", of_type=PandasColumn),\n )\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description=(\n f"Must be a pandas.DataFrame. Got value of type. 
{type(value).__name__}"\n ),\n )\n\n try:\n validate_constraints(\n value,\n pandas_columns=columns,\n dataframe_constraints=dataframe_constraints,\n )\n except ConstraintViolationException as e:\n return TypeCheck(success=False, description=str(e))\n\n return TypeCheck(\n success=True,\n metadata=_execute_summary_stats(name, value, metadata_fn) if metadata_fn else None,\n )\n\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n description=description,\n typing_type=pd.DataFrame,\n )
\n\n\n@experimental\ndef create_structured_dataframe_type(\n name,\n description=None,\n columns_validator=None,\n columns_aggregate_validator=None,\n dataframe_validator=None,\n loader=None,\n):\n """Args:\n name (str): the name of the new type\n description (Optional[str]): the description of the new type\n columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):\n what column-level row by row validation you want to have applied.\n Leave empty for no column-level row by row validation.\n columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,\n MultiAggregateConstraintWithMetadata]]):\n what column-level aggregate validation you want to have applied,\n Leave empty for no column-level aggregate validation.\n dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):\n what dataframe-wide validation you want to have applied.\n Leave empty for no dataframe-wide validation.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n\n Returns:\n a DagsterType with the corresponding name and packaged validation.\n\n """\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description=(\n f"Must be a pandas.DataFrame. Got value of type. 
{type(value).__name__}"\n ),\n )\n individual_result_dict = {}\n\n if dataframe_validator is not None:\n individual_result_dict["dataframe"] = dataframe_validator.validate(value)\n if columns_validator is not None:\n individual_result_dict["columns"] = columns_validator.validate(value)\n\n if columns_aggregate_validator is not None:\n individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(\n value\n )\n\n typechecks_succeeded = True\n metadata = {}\n overall_description = "Failed Constraints: {}"\n constraint_clauses = []\n for key, result in individual_result_dict.items():\n result_val = result.success\n if result_val:\n continue\n typechecks_succeeded = typechecks_succeeded and result_val\n result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data\n metadata[f"{key}-constraint-metadata"] = MetadataValue.json(result_dict)\n constraint_clauses.append(f"{key} failing constraints, {result.description}")\n # returns aggregates, then column, then dataframe\n return TypeCheck(\n success=typechecks_succeeded,\n description=overall_description.format(constraint_clauses),\n metadata=metadata,\n )\n\n description = check.opt_str_param(description, "description", default="")\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n description=description,\n )\n\n\ndef _execute_summary_stats(type_name, value, metadata_fn):\n if not metadata_fn:\n return []\n\n user_metadata = metadata_fn(value)\n try:\n return normalize_metadata(user_metadata)\n except:\n raise DagsterInvariantViolationError(\n "The return value of the user-defined summary_statistics function for pandas "\n f"data frame type {type_name} returned {value}. This function must return "\n "Dict[str, RawMetadataValue]."\n )\n
", "current_page_name": "_modules/dagster_pandas/data_frame", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.data_frame"}, "validation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.validation

\nfrom dagster import (\n    DagsterInvariantViolationError,\n    _check as check,\n)\nfrom pandas import DataFrame, Timestamp\nfrom pandas.core.dtypes.common import (\n    is_bool_dtype,\n    is_float_dtype,\n    is_integer_dtype,\n    is_numeric_dtype,\n    is_string_dtype,\n)\n\nfrom dagster_pandas.constraints import (\n    CategoricalColumnConstraint,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    Constraint,\n    ConstraintViolationException,\n    DataFrameConstraint,\n    InRangeColumnConstraint,\n    NonNullableColumnConstraint,\n    UniqueColumnConstraint,\n)\n\nPANDAS_NUMERIC_TYPES = {"int64", "float"}\n\n\ndef _construct_keyword_constraints(non_nullable, unique, ignore_missing_vals):\n    non_nullable = check.bool_param(non_nullable, "exists")\n    unique = check.bool_param(unique, "unique")\n    ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n    if non_nullable and ignore_missing_vals:\n        raise DagsterInvariantViolationError(\n            "PandasColumn cannot have a non-null constraint while also ignore missing values"\n        )\n    constraints = []\n    if non_nullable:\n        constraints.append(NonNullableColumnConstraint())\n    if unique:\n        constraints.append(UniqueColumnConstraint(ignore_missing_vals=ignore_missing_vals))\n    return constraints\n\n\n
[docs]class PandasColumn:\n """The main API for expressing column level schemas and constraints for your custom dataframe\n types.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If th column exists, the validate function will validate the column. Defaults to True.\n constraints (Optional[List[Constraint]]): List of constraint objects that indicate the\n validation rules for the pandas column.\n """\n\n def __init__(self, name, constraints=None, is_required=None):\n self.name = check.str_param(name, "name")\n self.is_required = check.opt_bool_param(is_required, "is_required", default=True)\n self.constraints = check.opt_list_param(constraints, "constraints", of_type=Constraint)\n\n def validate(self, dataframe):\n if self.name not in dataframe.columns:\n # Ignore validation if column is missing from dataframe and is not required\n if self.is_required:\n raise ConstraintViolationException(\n f"Required column {self.name} not in dataframe with columns {dataframe.columns}"\n )\n else:\n for constraint in self.constraints:\n constraint.validate(dataframe, self.name)\n\n @staticmethod\n def exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None):\n """Simple constructor for PandasColumns that expresses existence constraints.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. 
Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=_construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def boolean_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_bool_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def numeric_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_numeric_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def integer_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_integer_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def float_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_float_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def datetime_column(\n name,\n min_datetime=Timestamp.min,\n max_datetime=Timestamp.max,\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n tz=None,\n ):\n """Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_datetime (Optional[Union[int,float]]): The lower bound for values you expect in this column.\n Defaults to pandas.Timestamp.min.\n max_datetime (Optional[Union[int,float]]): The upper bound for values you expect in this column.\n Defaults to pandas.Timestamp.max.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin', tz='US/Eastern'.\n Defaults to None, meaning naive datetime values.\n """\n if tz is None:\n datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})\n else:\n datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})\n # One day more/less than absolute min/max to prevent OutOfBoundsDatetime errors when converting min/max to be tz aware\n if min_datetime.tz_localize(None) == Timestamp.min:\n min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")\n if max_datetime.tz_localize(None) == Timestamp.max:\n max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")\n # Convert bounds to same tz\n if Timestamp(min_datetime).tz is None:\n min_datetime = Timestamp(min_datetime).tz_localize(tz)\n if Timestamp(max_datetime).tz is None:\n max_datetime = Timestamp(max_datetime).tz_localize(tz)\n\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n datetime_constraint,\n InRangeColumnConstraint(\n min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def string_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """Simple constructor for PandasColumns that expresses constraints on string dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. 
If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_string_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def categorical_column(\n name,\n categories,\n of_types=frozenset({"category", "object"}),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n categories (List[Any]): The valid set of buckets that all values in the column must match.\n of_types (Optional[Union[str, Set[str]]]): The expected dtype[s] that your categories and values must\n abide by.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in\n the column ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the\n constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n of_types = {of_types} if isinstance(of_types, str) else of_types\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeInSetConstraint(of_types),\n CategoricalColumnConstraint(categories, ignore_missing_vals=ignore_missing_vals),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n\ndef validate_constraints(dataframe, pandas_columns=None, dataframe_constraints=None):\n dataframe = check.inst_param(dataframe, "dataframe", DataFrame)\n pandas_columns = check.opt_list_param(\n pandas_columns, "column_constraints", of_type=PandasColumn\n )\n dataframe_constraints = check.opt_list_param(\n dataframe_constraints, "dataframe_constraints", of_type=DataFrameConstraint\n )\n\n if pandas_columns:\n for column in pandas_columns:\n column.validate(dataframe)\n\n if dataframe_constraints:\n for dataframe_constraint in dataframe_constraints:\n dataframe_constraint.validate(dataframe)\n
", "current_page_name": "_modules/dagster_pandas/validation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.validation"}}, "dagster_postgres": {"event_log": {"event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.event_log.event_log

\nfrom typing import Any, ContextManager, Mapping, Optional, Sequence\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.config import pg_config\nfrom dagster._core.storage.event_log import (\n    AssetKeyTable,\n    DynamicPartitionsTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlEventLogStorageTable,\n)\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster._core.storage.event_log.polling_event_watcher import SqlPollingEventWatcher\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlalchemy_compat import db_select\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, deserialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\nCHANNEL_NAME = "run_events"\n\n\n
[docs]class PostgresEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """Postgres-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your event log storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 12-21\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = check.str_param(postgres_url, "postgres_url")\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n self._disposed = False\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n 
self._secondary_index_cache = {}\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n if "event_logs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self._connect() as conn:\n with conn.begin():\n SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n def upgrade(self) -> None:\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: Mapping[str, Any]\n ) -> "PostgresEventLogStorage":\n return PostgresEventLogStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n conn_string: str, 
should_autocreate_tables: bool = True\n ) -> "PostgresEventLogStorage":\n engine = create_engine(\n conn_string, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n return PostgresEventLogStorage(conn_string, should_autocreate_tables)\n\n def store_event(self, event: EventLogEntry) -> None:\n """Store an event corresponding to a run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event) # from SqlEventLogStorage.py\n with self._connect() as conn:\n result = conn.execute(\n insert_event_statement.returning(\n SqlEventLogStorageTable.c.run_id, SqlEventLogStorageTable.c.id\n )\n )\n res = result.fetchone()\n result.close()\n\n # LISTEN/NOTIFY no longer used for pg event watch - preserved here to support version skew\n conn.execute(\n db.text(f"""NOTIFY {CHANNEL_NAME}, :notify_id; """),\n {"notify_id": res[0] + "_" + str(res[1])}, # type: ignore\n )\n event_id = int(res[1]) # type: ignore\n\n if (\n event.is_dagster_event\n and event.dagster_event_type in ASSET_EVENTS\n and event.dagster_event.asset_key # type: ignore\n ):\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, event_id)\n\n def store_asset_event(self, event: EventLogEntry, event_id: int) -> None:\n check.inst_param(event, "event", EventLogEntry)\n if not (event.dagster_event and event.dagster_event.asset_key):\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # job, run_id, 
etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in the Dagster UI.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. 
This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n\n values = self._get_asset_entry_values(\n event, event_id, self.has_secondary_index(ASSET_KEY_INDEX_COLS)\n )\n with self.index_connection() as conn:\n query = db_dialects.postgresql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(),\n **values,\n )\n if values:\n query = query.on_conflict_do_update(\n index_elements=[AssetKeyTable.c.asset_key],\n set_=dict(**values),\n )\n else:\n query = query.on_conflict_do_nothing()\n conn.execute(query)\n\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n if not partition_keys:\n return\n\n # Overload base implementation to push upsert logic down into the db layer\n self._check_partitions_table()\n with self.index_connection() as conn:\n conn.execute(\n db_dialects.postgresql.insert(DynamicPartitionsTable)\n .values(\n [\n dict(partitions_def_name=partitions_def_name, partition=partition_key)\n for partition_key in partition_keys\n ]\n )\n .on_conflict_do_nothing(),\n )\n\n def _connect(self) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def run_connection(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return self._connect()\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n return bool(self._engine.dialect.has_table(self._engine.connect(), table_name))\n\n def has_secondary_index(self, name: str) -> bool:\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n PostgresEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name: str) -> None:\n 
super(PostgresEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(\n self,\n run_id: str,\n cursor: Optional[str],\n callback: EventHandlerFn,\n ) -> None:\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def _gen_event_log_entry_from_cursor(self, cursor) -> EventLogEntry:\n with self._engine.connect() as conn:\n cursor_res = conn.execute(\n db_select([SqlEventLogStorageTable.c.event]).where(\n SqlEventLogStorageTable.c.id == cursor\n ),\n )\n return deserialize_value(cursor_res.scalar(), EventLogEntry) # type: ignore\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n self._event_watcher.unwatch_run(run_id, handler)\n\n def __del__(self) -> None:\n # Keep the inherent limitations of __del__ in Python in mind!\n self.dispose()\n\n def dispose(self) -> None:\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/event_log/event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.run_storage.run_storage

\nimport zlib\nfrom typing import ContextManager, Mapping, Optional\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import PostgresStorageConfig, pg_config\nfrom dagster._core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster._core.storage.runs.schema import KeyValueStoreTable, SnapshotsTable\nfrom dagster._core.storage.runs.sql_run_storage import SnapshotType\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresRunStorage(SqlRunStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your run storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 1-10\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db_pool.NullPool,\n )\n\n self._index_migration_cache = {}\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because 
alembic config may be shared with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n if "runs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n with conn.begin():\n RunStorageSqlMetadata.create_all(conn)\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold 1 open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: PostgresStorageConfig\n ):\n return PostgresRunStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n postgres_url: str, should_autocreate_tables: bool = True\n ) -> "PostgresRunStorage":\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n 
RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresRunStorage(postgres_url, should_autocreate_tables)\n\n def connect(self) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def upgrade(self) -> None:\n with self.connect() as conn:\n run_alembic_upgrade(pg_alembic_config(__file__), conn)\n\n def has_built_index(self, migration_name: str) -> bool:\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n PostgresRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name: str) -> None:\n super(PostgresRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n # insert or update if already present, using postgres specific on_conflict\n conn.execute(\n db_dialects.postgresql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n .on_conflict_do_update(\n index_elements=[DaemonHeartbeatsTable.c.daemon_type],\n set_={\n "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n "daemon_id": daemon_heartbeat.daemon_id,\n "body": serialize_value(daemon_heartbeat),\n },\n )\n .returning(\n # required because sqlalchemy might by default return the declared primary key,\n # which might not exist\n DaemonHeartbeatsTable.c.daemon_type,\n )\n )\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n\n # pg speciic on_conflict_do_update\n insert_stmt = 
db_dialects.postgresql.insert(KeyValueStoreTable).values(\n [{"key": k, "value": v} for k, v in pairs.items()]\n )\n upsert_stmt = insert_stmt.on_conflict_do_update(\n index_elements=[\n KeyValueStoreTable.c.key,\n ],\n set_={"value": insert_stmt.excluded.value},\n ).returning(\n # required because sqlalchemy might by default return the declared primary key,\n # which might not exist\n KeyValueStoreTable.c.key\n )\n\n with self.connect() as conn:\n conn.execute(upsert_stmt)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n with self.connect() as conn:\n snapshot_insert = (\n db_dialects.postgresql.insert(SnapshotsTable)\n .values(\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(serialize_value(snapshot_obj).encode("utf-8")),\n snapshot_type=snapshot_type.value,\n )\n .on_conflict_do_nothing()\n )\n conn.execute(snapshot_insert)\n return snapshot_id\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/run_storage/run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.schedule_storage.schedule_storage

\nfrom typing import ContextManager, Optional\n\nimport dagster._check as check\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.scheduler.instigation import InstigatorState\nfrom dagster._core.storage.config import PostgresStorageConfig, pg_config\nfrom dagster._core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster._core.storage.schedules.schema import InstigatorsTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your schedule storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 23-32\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared 
with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n missing_main_table = "schedules" not in table_names and "jobs" not in table_names\n if missing_main_table:\n retry_pg_creation_fn(self._init_db)\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n with conn.begin():\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: PostgresStorageConfig\n ) -> "PostgresScheduleStorage":\n return PostgresScheduleStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n postgres_url: str, should_autocreate_tables: bool = True\n ) -> "PostgresScheduleStorage":\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n 
finally:\n engine.dispose()\n return PostgresScheduleStorage(postgres_url, should_autocreate_tables)\n\n def connect(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def upgrade(self) -> None:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn: Connection, state: InstigatorState) -> None:\n selector_id = state.selector_id\n conn.execute(\n db_dialects.postgresql.insert(InstigatorsTable)\n .values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n .on_conflict_do_update(\n index_elements=[InstigatorsTable.c.selector_id],\n set_={\n "status": state.status.value,\n "instigator_type": state.instigator_type.value,\n "instigator_body": serialize_value(state),\n "update_timestamp": pendulum.now("UTC"),\n },\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/schedule_storage/schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.schedule_storage.schedule_storage"}}}, "dagster_prometheus": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_prometheus.resources

\nimport prometheus_client\nfrom dagster import (\n    ConfigurableResource,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom prometheus_client.exposition import default_handler\nfrom pydantic import Field, PrivateAttr\n\n\n
[docs]class PrometheusClient:\n """Integrates with Prometheus via the prometheus_client library."""
\n\n\n
[docs]class PrometheusResource(ConfigurableResource):\n """This resource is used to send metrics to a Prometheus Pushgateway.\n\n **Example:**\n\n .. code-block:: python\n\n from dagster_prometheus import PrometheusResource\n from dagster import Definitions, job, op\n\n @op\n def example_prometheus_op(prometheus: PrometheusResource):\n prometheus.push_to_gateway(job="my_job")\n\n @job\n def my_job():\n example_prometheus_op()\n\n defs = Definitions(\n jobs=[my_job],\n resources={"prometheus": PrometheusResource(gateway="http://pushgateway.local")},\n )\n\n """\n\n gateway: str = Field(\n description=(\n "The url for your push gateway. Either of the"\n " form 'http://pushgateway.local', or 'pushgateway.local'."\n " Scheme defaults to 'http' if none is provided"\n )\n )\n timeout: int = Field(\n default=30,\n description="is how long delete will attempt to connect before giving up. Defaults to 30s.",\n )\n _registry: prometheus_client.CollectorRegistry = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def setup_for_execution(self, context: InitResourceContext) -> None:\n self._registry = prometheus_client.CollectorRegistry()\n\n @property\n def registry(self) -> prometheus_client.CollectorRegistry:\n return self._registry\n\n def push_to_gateway(self, job, grouping_key=None, handler=default_handler) -> None:\n """Push metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n If not None, the argument must be a function which accepts\n the following arguments:\n url, method, timeout, headers, and content\n May be used to implement additional functionality not\n supported by the built-in 
default handler (such as SSL\n client certicates, and HTTP authentication mechanisms).\n 'url' is the URL for the request, the 'gateway' argument\n described earlier will form the basis of this URL.\n 'method' is the HTTP method which should be used when\n carrying out the request.\n 'timeout' requests not successfully completed after this\n many seconds should be aborted. If timeout is None, then\n the handler should not set a timeout.\n 'headers' is a list of ("header-name","header-value") tuples\n which must be passed to the pushgateway in the form of HTTP\n request headers.\n The function should raise an exception (e.g. IOError) on\n failure.\n 'content' is the data which should be used to form the HTTP\n Message Body.\n This overwrites all metrics with the same job and grouping_key.\n This uses the PUT HTTP method.\n """\n prometheus_client.push_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self._registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def pushadd_to_gateway(self, job, grouping_key=None, handler=default_handler) -> None:\n """PushAdd metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `registry` is an instance of CollectorRegistry\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This replaces metrics with the same name, job and grouping_key.\n This uses the POST HTTP method.\n """\n prometheus_client.pushadd_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self._registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def delete_from_gateway(self, job, grouping_key=None, 
handler=default_handler) -> None:\n """Delete metrics from the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This deletes metrics with the given job and grouping_key.\n This uses the DELETE HTTP method.\n """\n prometheus_client.delete_from_gateway(\n gateway=self.gateway,\n job=job,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=PrometheusResource.to_config_schema(),\n description="""This resource is for sending metrics to a Prometheus Pushgateway.""",\n)\ndef prometheus_resource(context):\n return PrometheusResource(\n gateway=context.resource_config["gateway"], timeout=context.resource_config["timeout"]\n )
\n
", "current_page_name": "_modules/dagster_prometheus/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_prometheus.resources"}}, "dagster_pyspark": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pyspark.resources

\nfrom typing import Any, Dict\n\nimport dagster._check as check\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster_spark.configs_spark import spark_config\nfrom dagster_spark.utils import flatten_dict\nfrom pydantic import PrivateAttr\nfrom pyspark.sql import SparkSession\n\n\ndef spark_session_from_config(spark_conf=None):\n    spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n    builder = SparkSession.builder\n    flat = flatten_dict(spark_conf)\n    for key, value in flat:\n        builder = builder.config(key, value)\n\n    return builder.getOrCreate()\n\n\n
[docs]class PySparkResource(ConfigurableResource):\n """This resource provides access to a PySpark Session for executing PySpark code within Dagster.\n\n Example:\n .. code-block:: python\n\n @op\n def my_op(pyspark: PySparkResource)\n spark_session = pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n\n @job(\n resource_defs={\n "pyspark": PySparkResource(\n spark_config={\n "spark.executor.memory": "2g"\n }\n )\n }\n )\n def my_spark_job():\n my_op()\n """\n\n spark_config: Dict[str, Any]\n _spark_session = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def setup_for_execution(self, context: InitResourceContext) -> None:\n self._spark_session = spark_session_from_config(self.spark_config)\n\n @property\n def spark_session(self) -> Any:\n return self._spark_session\n\n @property\n def spark_context(self) -> Any:\n return self.spark_session.sparkContext
\n\n\n
[docs]@dagster_maintained_resource\n@resource({"spark_conf": spark_config()})\ndef pyspark_resource(init_context) -> PySparkResource:\n """This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.\n\n Example:\n .. code-block:: python\n\n @op(required_resource_keys={"pyspark"})\n def my_op(context):\n spark_session = context.resources.pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n """\n context_updated_config = init_context.replace_config(\n {"spark_config": init_context.resource_config["spark_conf"]}\n )\n return PySparkResource.from_resource_context(context_updated_config)
\n\n\nclass LazyPySparkResource(ConfigurableResource):\n """This resource provides access to a lazily-created PySpark SparkSession for executing PySpark\n code within Dagster, avoiding the creation of a SparkSession object until the .spark_session attribute\n of the resource is accessed. This is helpful for avoiding the creation (and startup penalty) of a SparkSession\n until it is actually needed / accessed by an op or IOManager.\n\n Example:\n .. code-block:: python\n\n @op\n def my_op(lazy_pyspark: LazyPySparkResource)\n spark_session = lazy_pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n @job(\n resource_defs={\n "lazy_pyspark": LazyPySparkResource(\n spark_config={\n "spark.executor.memory": "2g"\n }\n )\n }\n )\n def my_spark_job():\n my_op()\n """\n\n spark_config: Dict[str, Any]\n _spark_session = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def _init_session(self) -> None:\n if self._spark_session is None:\n self._spark_session = spark_session_from_config(self.spark_config)\n\n @property\n def spark_session(self) -> Any:\n self._init_session()\n return self._spark_session\n\n @property\n def spark_context(self) -> Any:\n self._init_session()\n return self._spark_session.sparkContext\n\n\n@dagster_maintained_resource\n@resource({"spark_conf": spark_config()})\ndef lazy_pyspark_resource(init_context: InitResourceContext) -> LazyPySparkResource:\n """This resource provides access to a lazily-created PySpark SparkSession for executing PySpark\n code within Dagster, avoiding the creation of a SparkSession object until the .spark_session attribute\n of the resource is accessed. This is helpful for avoiding the creation (and startup penalty) of a SparkSession\n until it is actually needed / accessed by an op or IOManager.\n\n Example:\n .. 
code-block:: python\n\n @op(required_resource_keys={"lazy_pyspark"})\n def my_op(context):\n spark_session = context.resources.lazy_pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = lazy_pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"lazy_pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n """\n context_updated_config = init_context.replace_config(\n {"spark_config": init_context.resource_config["spark_conf"]}\n )\n return LazyPySparkResource.from_resource_context(context_updated_config)\n
", "current_page_name": "_modules/dagster_pyspark/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pyspark.resources"}}, "dagster_shell": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_shell.ops

\nimport os\nfrom enum import Enum\nfrom typing import AbstractSet, Any, Dict, Mapping, Optional\n\nfrom dagster import (\n    Config,\n    Failure,\n    In,\n    Nothing,\n    OpExecutionContext,\n    Out,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom pydantic import Field\n\nfrom .utils import execute, execute_script_file\n\n\nclass OutputType(Enum):\n    STREAM = "STREAM"\n    """Stream script stdout/stderr."""\n\n    BUFFER = "BUFFER"\n    """Buffer shell script stdout/stderr, then log upon completion."""\n\n    NONE = "NONE"\n    """No logging."""\n\n\nclass ShellOpConfig(Config):\n    env: Optional[Dict[str, str]] = Field(\n        default=None,\n        description="An optional dict of environment variables to pass to the subprocess.",\n    )\n    output_logging: OutputType = Field(\n        OutputType.BUFFER.value,\n    )\n    cwd: Optional[str] = Field(\n        default=None, description="Working directory in which to execute shell script"\n    )\n\n    def to_execute_params(self) -> Dict[str, Any]:\n        return {\n            "env": {**os.environ, **(self.env or {})},\n            "output_logging": self.output_logging.value,\n            "cwd": self.cwd,\n        }\n\n\n
[docs]@op(\n name="shell_op",\n description=(\n "This op executes a shell command it receives as input.\\n\\n"\n "This op is suitable for uses where the command to execute is generated dynamically by "\n "upstream ops. If you know the command to execute at job construction time, "\n "consider `shell_command_op` instead."\n ),\n ins={"shell_command": In(str)},\n out=Out(str),\n)\ndef shell_op(context: OpExecutionContext, shell_command: str, config: ShellOpConfig) -> str:\n """This op executes a shell command it receives as input.\n This op is suitable for uses where the command to execute is generated dynamically by\n upstream ops. If you know the command to execute at job construction time,\n consider ``shell_command_op`` instead.\n\n Args:\n shell_command: The shell command to be executed\n config (ShellOpConfig): A ShellOpConfig object specifying configuration options\n\n Examples:\n .. code-block:: python\n\n @op\n def create_shell_command():\n return "echo hello world!"\n\n @graph\n def echo_graph():\n shell_op(create_shell_command())\n """\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output
\n\n\n
[docs]def create_shell_command_op(\n shell_command: str,\n name: str,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n) -> OpDefinition:\n """This function is a factory that constructs ops to execute a shell command.\n\n Note that you can only use ``shell_command_op`` if you know the command you'd like to execute\n at job construction time. If you'd like to construct shell commands dynamically during\n job execution and pass them between ops, you should use ``shell_op`` instead.\n\n The resulting op can take a single ``start`` argument that is a\n `Nothing dependency <https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies>`__\n to allow you to run ops before the shell op.\n\n Examples:\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_op.py\n :language: python\n\n .. code-block:: python\n\n @op\n def run_before_shell_op():\n do_some_work()\n\n @graph\n def my_graph():\n my_echo_op = create_shell_command_op("echo hello world!", name="echo_op")\n my_echo_op(start=run_before_shell_op())\n\n\n Args:\n shell_command (str): The shell command that the constructed op will execute.\n name (str): The name of the constructed op.\n description (Optional[str]): Human-readable description of this op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n\n @op(\n name=name,\n description=description,\n ins={"start": In(Nothing)},\n out=Out(str),\n required_resource_keys=required_resource_keys,\n tags=tags,\n )\n def _shell_fn(context, config: ShellOpConfig):\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output\n\n return _shell_fn
\n\n\n
[docs]def create_shell_script_op(\n shell_script_path,\n name="create_shell_script_op",\n ins: Optional[Mapping[str, In]] = None,\n **kwargs: Any,\n) -> OpDefinition:\n """This function is a factory which constructs an op that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@op\n <dagster.op>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@graph <dagster.graph>` to wrap this op\n in the cases where you'd like to configure the shell op with different config fields.\n\n If no ``ins`` are passed then the resulting op can take a single ``start`` argument that is a\n `Nothing dependency <https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies>`__\n to allow you to run ops before the shell op.\n\n\n Examples:\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_op.py\n :language: python\n\n .. code-block:: python\n\n @op\n def run_before_shell_op():\n do_some_work()\n\n @graph\n def my_graph():\n my_echo_op = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="echo_op")\n my_echo_op(start=run_before_shell_op())\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (Optional[str]): The name of this op. Defaults to "create_shell_script_op".\n ins (Optional[Mapping[str, In]]): Ins for the op. 
Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n check.str_param(shell_script_path, "shell_script_path")\n name = check.str_param(name, "name")\n check.opt_mapping_param(ins, "ins", value_type=In)\n\n if "config" in kwargs:\n raise TypeError("Overriding config for shell op is not supported.")\n\n @op(\n name=name,\n description=kwargs.pop("description", "An op to invoke a shell command."),\n ins=ins or {"start": In(Nothing)},\n out=Out(str),\n **kwargs,\n )\n def _shell_script_fn(context, config: ShellOpConfig):\n output, return_code = execute_script_file(\n shell_script_path=shell_script_path, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output\n\n return _shell_script_fn
\n
", "current_page_name": "_modules/dagster_shell/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_shell.ops"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_shell.utils

\n#\n# NOTE: This file is based on the bash operator from Apache Airflow, which can be found here:\n# https://github.com/apache/airflow/blob/master/airflow/operators/bash.py\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# "License"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n#   http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\nimport os\nimport signal\nfrom logging import Logger\nfrom subprocess import PIPE, STDOUT, Popen\nfrom typing import Mapping, Optional, Tuple\n\nimport dagster._check as check\nfrom dagster._utils import safe_tempfile_path\nfrom typing_extensions import Final\n\nOUTPUT_LOGGING_OPTIONS: Final = ["STREAM", "BUFFER", "NONE"]\n\n\ndef execute_script_file(\n    shell_script_path: str,\n    output_logging: str,\n    log: Logger,\n    cwd: Optional[str] = None,\n    env: Optional[Mapping[str, str]] = None,\n) -> Tuple[str, int]:\n    """Execute a shell script file specified by the argument ``shell_script_path``. The script will be\n    invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.\n\n    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr\n    output is retrieved.\n\n    Examples:\n        .. 
literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_utility.py\n           :language: python\n\n    Args:\n        shell_script_path (str): The shell script to execute.\n        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.\n        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()\n        cwd (str, optional): Working directory for the shell command to use. Defaults to the\n            temporary path where we store the shell command in a script file.\n        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.\n            Unused by default.\n\n    Raises:\n        Exception: When an invalid output_logging is selected. Unreachable from op-based\n            invocation since the config system will check output_logging against the config\n            enum.\n\n    Returns:\n        Tuple[str, int]: A tuple where the first element is the combined stdout/stderr output of running the shell\n        command and the second element is the return code.\n    """\n    check.str_param(shell_script_path, "shell_script_path")\n    check.str_param(output_logging, "output_logging")\n    check.opt_str_param(cwd, "cwd", default=os.path.dirname(shell_script_path))\n    env = check.opt_nullable_dict_param(env, "env", key_type=str, value_type=str)\n\n    if output_logging not in OUTPUT_LOGGING_OPTIONS:\n        raise Exception("Unrecognized output_logging %s" % output_logging)\n\n    def pre_exec():\n        # Restore default signal disposition and invoke setsid\n        for sig in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):\n            if hasattr(signal, sig):\n                signal.signal(getattr(signal, sig), signal.SIG_DFL)\n        os.setsid()\n\n    with open(shell_script_path, "rb") as f:\n        shell_command = f.read().decode("utf-8")\n\n    log.info(f"Running command:\\n{shell_command}")\n\n    sub_process = None\n  
  try:\n        stdout_pipe = PIPE\n        stderr_pipe = STDOUT\n        if output_logging == "NONE":\n            stdout_pipe = stderr_pipe = None\n\n        sub_process = Popen(\n            ["bash", shell_script_path],\n            stdout=stdout_pipe,\n            stderr=stderr_pipe,\n            cwd=cwd,\n            env=env,\n            preexec_fn=pre_exec,  # noqa: PLW1509\n            encoding="UTF-8",\n        )\n\n        log.info(f"Command pid: {sub_process.pid}")\n\n        output = ""\n        if output_logging == "STREAM":\n            assert sub_process.stdout is not None, "Setting stdout=PIPE should always set stdout."\n            # Stream back logs as they are emitted\n            lines = []\n            for line in sub_process.stdout:\n                log.info(line.rstrip())\n                lines.append(line)\n            output = "".join(lines)\n        elif output_logging == "BUFFER":\n            # Collect and buffer all logs, then emit\n            output, _ = sub_process.communicate()\n            log.info(output)\n\n        sub_process.wait()\n        log.info(f"Command exited with return code {sub_process.returncode}")\n\n        return output, sub_process.returncode\n    finally:\n        # Always terminate subprocess, including in cases where the run is terminated\n        if sub_process:\n            sub_process.terminate()\n\n\ndef execute(\n    shell_command: str,\n    output_logging: str,\n    log: Logger,\n    cwd: Optional[str] = None,\n    env: Optional[Mapping[str, str]] = None,\n) -> Tuple[str, int]:\n    """This function is a utility for executing shell commands from within a Dagster op (or from Python in general).\n    It can be used to execute shell commands on either op input data, or any data generated within a generic python op.\n\n    Internally, it executes a shell script specified by the argument ``shell_command``. 
The script will be written\n    to a temporary file first and invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.\n\n    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr\n    output is retrieved.\n\n    Examples:\n        .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_utility.py\n           :language: python\n\n    Args:\n        shell_command (str): The shell command to execute\n        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.\n        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()\n        cwd (str, optional): Working directory for the shell command to use. Defaults to the\n            temporary path where we store the shell command in a script file.\n        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.\n            Unused by default.\n\n    Returns:\n        Tuple[str, int]: A tuple where the first element is the combined stdout/stderr output of running the shell\n        command and the second element is the return code.\n    """\n    check.str_param(shell_command, "shell_command")\n    # other args checked in execute_file\n\n    with safe_tempfile_path() as tmp_file_path:\n        tmp_path = os.path.dirname(tmp_file_path)\n        log.info("Using temporary directory: %s" % tmp_path)\n\n        with open(tmp_file_path, "wb") as tmp_file:\n            tmp_file.write(shell_command.encode("utf-8"))\n            tmp_file.flush()\n            script_location = os.path.abspath(tmp_file.name)\n            log.info(f"Temporary script location: {script_location}")\n            return execute_script_file(\n                shell_script_path=tmp_file.name,\n                output_logging=output_logging,\n                log=log,\n                cwd=(cwd or tmp_path),\n                env=env,\n            )\n
", "current_page_name": "_modules/dagster_shell/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_shell.utils"}}, "dagster_slack": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import failure_hook, success_hook\nfrom dagster._core.execution.context.hook import HookContext\nfrom dagster._utils.warnings import normalize_renamed_param\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return f"Op {context.op.name} on job {context.job_name} {status}!\\nRun ID: {context.run_id}"\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef slack_on_failure(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_failure("#foo", webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} failed!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @failure_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"\\n<{webserver_base_url}/runs/{context.run_id}|View in Dagster UI>"\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef slack_on_success(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_success("#foo", webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} worked!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @success_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"\\n<{webserver_base_url}/runs/{context.run_id}|View in Dagster UI>"\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_slack/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\nfrom slack_sdk.web.client import WebClient\n\n\n
[docs]class SlackResource(ConfigurableResource):\n """This resource is for connecting to Slack.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import EnvVar, job, op\n from dagster_slack import SlackResource\n\n\n @op\n def slack_op(slack: SlackResource):\n slack.get_client().chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job\n def slack_job():\n slack_op()\n\n defs = Definitions(\n jobs=[slack_job],\n resources={\n "slack": SlackResource(token=EnvVar("MY_SLACK_TOKEN")),\n },\n )\n """\n\n token: str = Field(\n description=(\n "To configure access to the Slack API, you'll need an access"\n " token provisioned with access to your Slack workspace."\n " Tokens are typically either user tokens or bot tokens. For programmatic posting"\n " to Slack from this resource, you probably want to provision and use a bot token."\n " More in the Slack API documentation here: https://api.slack.com/docs/token-types"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> WebClient:\n """Returns a ``slack_sdk.WebClient`` for interacting with the Slack API."""\n return WebClient(self.token)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=SlackResource.to_config_schema(),\n)\ndef slack_resource(context) -> WebClient:\n """This resource is for connecting to Slack.\n\n The resource object is a `slack_sdk.WebClient`.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import job, op\n from dagster_slack import slack_resource\n\n\n @op(required_resource_keys={'slack'})\n def slack_op(context):\n context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job(resource_defs={'slack': slack_resource})\n def slack_job():\n slack_op()\n\n slack_job.execute_in_process(\n run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n )\n """\n return SlackResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_slack/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.resources"}, "sensors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.sensors

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n)\n\nfrom dagster import (\n    AssetSelection,\n    DefaultSensorStatus,\n    FreshnessPolicySensorContext,\n    freshness_policy_sensor,\n)\nfrom dagster._annotations import deprecated_param, experimental\nfrom dagster._core.definitions import GraphDefinition, JobDefinition\nfrom dagster._core.definitions.run_status_sensor_definition import (\n    RunFailureSensorContext,\n    run_failure_sensor,\n)\nfrom dagster._core.definitions.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom dagster._utils.warnings import normalize_renamed_param\nfrom slack_sdk.web.client import WebClient\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.selector import (\n        CodeLocationSelector,\n        JobSelector,\n        RepositorySelector,\n    )\n\nT = TypeVar("T", RunFailureSensorContext, FreshnessPolicySensorContext)\n\n\ndef _build_slack_blocks_and_text(\n    context: T,\n    text_fn: Callable[[T], str],\n    blocks_fn: Optional[Callable[[T], List[Dict[Any, Any]]]],\n    webserver_base_url: Optional[str],\n) -> Tuple[List[Dict[str, Any]], str]:\n    main_body_text = text_fn(context)\n    blocks: List[Dict[Any, Any]] = []\n    if blocks_fn:\n        blocks.extend(blocks_fn(context))\n    else:\n        if isinstance(context, RunFailureSensorContext):\n            text = (\n                f'*Job "{context.dagster_run.job_name}" failed.'\n                f' `{context.dagster_run.run_id.split("-")[0]}`*'\n            )\n        else:\n            text = (\n                f'*Asset "{context.asset_key.to_user_string()}" is now'\n                f' {"on time" if context.minutes_overdue == 0 else f"{context.minutes_overdue:.2f} minutes late.*"}'\n            )\n\n        blocks.extend(\n            [\n                {\n                    "type": "section",\n                    "text": {\n     
                   "type": "mrkdwn",\n                        "text": text,\n                    },\n                },\n                {\n                    "type": "section",\n                    "text": {"type": "mrkdwn", "text": main_body_text},\n                },\n            ]\n        )\n\n    if webserver_base_url:\n        if isinstance(context, RunFailureSensorContext):\n            url = f"{webserver_base_url}/runs/{context.dagster_run.run_id}"\n        else:\n            url = f"{webserver_base_url}/assets/{'/'.join(context.asset_key.path)}"\n        blocks.append(\n            {\n                "type": "actions",\n                "elements": [\n                    {\n                        "type": "button",\n                        "text": {"type": "plain_text", "text": "View in Dagster UI"},\n                        "url": url,\n                    }\n                ],\n            }\n        )\n    return blocks, main_body_text\n\n\ndef _default_failure_message_text_fn(context: RunFailureSensorContext) -> str:\n    return f"Error: ```{context.failure_event.message}```"\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\n@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef make_slack_on_run_failure_sensor(\n channel: str,\n slack_token: str,\n text_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message_text_fn,\n blocks_fn: Optional[Callable[[RunFailureSensorContext], List[Dict[Any, Any]]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor on job failures that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. 
If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with markdown.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]): Function which takes in\n the ``RunFailureSensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed job run.\n minimum_interval_seconds: (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]): The jobs in the\n current repository that will be monitored by this failure sensor. Defaults to None, which\n means the alert will be sent when any job in the repository fails. To monitor jobs in external repositories, use RepositorySelector and JobSelector\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]): (deprecated in favor of monitored_jobs)\n The jobs in the current repository that will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. 
Specify this to allow\n messages to include deeplinks to the failed job run.\n\n Examples:\n .. code-block:: python\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN")\n )\n\n @repository\n def my_repo():\n return [my_job + slack_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.dagster_run.job_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n slack_client = WebClient(token=slack_token)\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_failure_sensor(\n name=name,\n minimum_interval_seconds=minimum_interval_seconds,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n )\n def slack_on_run_failure(context: RunFailureSensorContext):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context,\n text_fn=text_fn,\n blocks_fn=blocks_fn,\n webserver_base_url=webserver_base_url,\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_run_failure
\n\n\ndef _default_freshness_message_text_fn(context: FreshnessPolicySensorContext) -> str:\n return (\n f"Asset `{context.asset_key.to_user_string()}` is now {context.minutes_overdue:.2f} minutes"\n " late."\n )\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\n@experimental\ndef make_slack_on_freshness_policy_status_change_sensor(\n channel: str,\n slack_token: str,\n asset_selection: AssetSelection,\n warn_after_minutes_overdue: float = 0,\n notify_when_back_on_time: bool = False,\n text_fn: Callable[[FreshnessPolicySensorContext], str] = _default_freshness_message_text_fn,\n blocks_fn: Optional[Callable[[FreshnessPolicySensorContext], List[Dict[Any, Any]]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor that will message the given Slack channel whenever an asset in the provided\n AssetSelection becomes out of date. Messages are only fired when the state changes, meaning\n only a single slack message will be sent (when the asset begins to be out of date). If\n `notify_when_back_on_time` is set to `True`, a second slack message will be sent once the asset\n is on time again.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. 
More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n asset_selection (AssetSelection): The selection of assets which this sensor will monitor.\n Alerts will only be fired for assets that have a FreshnessPolicy defined.\n warn_after_minutes_overdue (float): How many minutes past the specified FreshnessPolicy this\n sensor will wait before firing an alert (by default, an alert will be fired as soon as\n the policy is violated).\n notify_when_back_on_time (bool): If a success message should be sent when the asset becomes on\n time again.\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``FreshnessPolicySensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains the relevant asset key, and the number of\n minutes past its defined freshness policy it currently is.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with markdown.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[FreshnessPolicySensorContext], List[Dict]]): Function which takes in\n the ``FreshnessPolicySensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_freshness_policy".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the relevant asset page.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n webserver_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the relevant asset page.\n\n Examples:\n .. code-block:: python\n\n slack_on_freshness_policy = make_slack_on_freshness_policy_status_change_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN"),\n )\n\n .. code-block:: python\n\n def my_message_fn(context: FreshnessPolicySensorContext) -> str:\n if context.minutes_overdue == 0:\n return f"Asset {context.asset_key} is currently on time :)"\n return (\n f"Asset {context.asset_key} is currently {context.minutes_overdue} minutes late!!"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n slack_client = WebClient(token=slack_token)\n\n @freshness_policy_sensor(\n name=name, asset_selection=asset_selection, default_status=default_status\n )\n def slack_on_freshness_policy(context: FreshnessPolicySensorContext):\n if context.minutes_overdue is None or context.previous_minutes_overdue is None:\n return\n\n if (\n context.minutes_overdue > warn_after_minutes_overdue\n and context.previous_minutes_overdue <= warn_after_minutes_overdue\n ) or (\n notify_when_back_on_time\n and context.minutes_overdue == 0\n and context.previous_minutes_overdue != 0\n ):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context,\n text_fn=text_fn,\n blocks_fn=blocks_fn,\n webserver_base_url=webserver_base_url,\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_freshness_policy
\n
", "current_page_name": "_modules/dagster_slack/sensors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.sensors"}}, "dagster_snowflake": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.ops

\nfrom dagster import (\n    Nothing,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.input import In\n\n\ndef _core_create_snowflake_command(dagster_decorator, decorator_name, sql, parameters=None):\n    check.str_param(sql, "sql")\n    check.opt_dict_param(parameters, "parameters")\n\n    @dagster_decorator(\n        name=f"snowflake_{decorator_name}",\n        ins={"start": In(Nothing)},\n        required_resource_keys={"snowflake"},\n        tags={"kind": "sql", "sql": sql},\n    )\n    def snowflake_fn(context):\n        context.resources.snowflake.execute_query(sql=sql, parameters=parameters)\n\n    return snowflake_fn\n\n\ndef snowflake_solid_for_query(sql, parameters=None):\n    """This function is a solid factory that constructs solids to execute a snowflake query.\n\n    Note that you can only use `snowflake_solid_for_query` if you know the query you'd like to\n    execute at job construction time. If you'd like to execute queries dynamically during\n    job execution, you should manually execute those queries in your custom solid using the\n    snowflake resource.\n\n    Args:\n        sql (str): The sql query that will execute against the provided snowflake resource.\n        parameters (dict): The parameters for the sql query.\n\n    Returns:\n        SolidDefinition: Returns the constructed solid definition.\n    """\n    return _core_create_snowflake_command(op, "solid", sql, parameters)\n\n\n
[docs]def snowflake_op_for_query(sql, parameters=None):\n """This function is an op factory that constructs an op to execute a snowflake query.\n\n Note that you can only use `snowflake_op_for_query` if you know the query you'd like to\n execute at graph construction time. If you'd like to execute queries dynamically during\n job execution, you should manually execute those queries in your custom op using the\n snowflake resource.\n\n Args:\n sql (str): The sql query that will execute against the provided snowflake resource.\n parameters (dict): The parameters for the sql query.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return _core_create_snowflake_command(op, "op", sql, parameters)
\n
", "current_page_name": "_modules/dagster_snowflake/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.resources

\nimport base64\nimport sys\nimport warnings\nfrom contextlib import closing, contextmanager\nfrom typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom cryptography.hazmat.backends import default_backend\nfrom cryptography.hazmat.primitives import serialization\nfrom dagster import (\n    ConfigurableResource,\n    IAttachDifferentObjectToOpContext,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._annotations import public\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.storage.event_log.sql_event_log import SqlDbConnection\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field, root_validator, validator\n\ntry:\n    import snowflake.connector\nexcept ImportError:\n    msg = (\n        "Could not import snowflake.connector. This could mean you have an incompatible version "\n        "of azure-storage-blob installed. dagster-snowflake requires azure-storage-blob<12.0.0; "\n        "this conflicts with dagster-azure which requires azure-storage-blob~=12.0.0 and is "\n        "incompatible with dagster-snowflake. Please uninstall dagster-azure and reinstall "\n        "dagster-snowflake to fix this error."\n    )\n    warnings.warn(msg)\n    raise\n\n\n
[docs]class SnowflakeResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """A resource for connecting to the Snowflake data warehouse.\n\n If connector configuration is not set, SnowflakeResource.get_connection() will return a\n `snowflake.connector.Connection <https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#object-connection>`__\n object. If connector="sqlalchemy" configuration is set, then SnowflakeResource.get_connection() will\n return a `SQLAlchemy Connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Connection>`__\n or a `SQLAlchemy raw connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine.raw_connection>`__.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import SnowflakeResource\n\n @op\n def get_one(snowflake_resource: SnowflakeResource):\n with snowflake_resource.get_connection() as conn:\n # conn is a snowflake.connector.Connection object\n conn.cursor().execute("SELECT 1")\n\n @job\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n resources={\n 'snowflake_resource': SnowflakeResource(\n account=EnvVar("SNOWFLAKE_ACCOUNT"),\n user=EnvVar("SNOWFLAKE_USER"),\n password=EnvVar("SNOWFLAKE_PASSWORD")\n database="MY_DATABASE",\n schema="MY_SCHEMA",\n warehouse="MY_WAREHOUSE"\n )\n }\n )\n """\n\n account: Optional[str] = Field(\n default=None,\n description=(\n "Your Snowflake account name. 
For more details, see the `Snowflake documentation."\n " <https://docs.snowflake.com/developer-guide/python-connector/python-connector-api>`__"\n ),\n )\n\n user: str = Field(description="User login name.")\n\n password: Optional[str] = Field(default=None, description="User password.")\n\n database: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default database to use. After login, you can use ``USE DATABASE`` "\n " to change the database."\n ),\n )\n\n schema_: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default schema to use. After login, you can use ``USE SCHEMA`` to "\n "change the schema."\n ),\n alias="schema",\n ) # schema is a reserved word for pydantic\n\n role: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default role to use. After login, you can use ``USE ROLE`` to change "\n " the role."\n ),\n )\n\n warehouse: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default warehouse to use. After login, you can use ``USE WAREHOUSE`` "\n "to change the role."\n ),\n )\n\n private_key: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Alternately, set private_key_path and private_key_password. To avoid issues with"\n " newlines in the keys, you can base64 encode the key. You can retrieve the base64"\n " encoded key with this shell command: ``cat rsa_key.p8 | base64``"\n ),\n )\n\n private_key_password: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key password to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Required for both ``private_key`` and ``private_key_path`` if the private key is"\n " encrypted. 
For unencrypted keys, this config can be omitted or set to None."\n ),\n )\n\n private_key_path: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key path to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Alternately, set the raw private key as ``private_key``."\n ),\n )\n\n autocommit: Optional[bool] = Field(\n default=None,\n description=(\n "None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True "\n "or False to enable or disable autocommit mode in the session, respectively."\n ),\n )\n\n client_prefetch_threads: Optional[int] = Field(\n default=None,\n description=(\n "Number of threads used to download the results sets (4 by default). "\n "Increasing the value improves fetch performance but requires more memory."\n ),\n )\n\n client_session_keep_alive: Optional[bool] = Field(\n default=None,\n description=(\n "False by default. Set this to True to keep the session active indefinitely, "\n "even if there is no activity from the user. Make certain to call the close method to "\n "terminate the thread properly or the process may hang."\n ),\n )\n\n login_timeout: Optional[int] = Field(\n default=None,\n description=(\n "Timeout in seconds for login. By default, 60 seconds. The login request gives "\n 'up after the timeout length if the HTTP response is "success".'\n ),\n )\n\n network_timeout: Optional[int] = Field(\n default=None,\n description=(\n "Timeout in seconds for all other operations. By default, none/infinite. A general"\n " request gives up after the timeout length if the HTTP response is not 'success'."\n ),\n )\n\n ocsp_response_cache_filename: Optional[str] = Field(\n default=None,\n description=(\n "URI for the OCSP response cache file. 
By default, the OCSP response cache "\n "file is created in the cache directory."\n ),\n )\n\n validate_default_parameters: Optional[bool] = Field(\n default=None,\n description=(\n "If True, raise an exception if the warehouse, database, or schema doesn't exist."\n " Defaults to False."\n ),\n )\n\n paramstyle: Optional[str] = Field(\n default=None,\n description=(\n "pyformat by default for client side binding. Specify qmark or numeric to "\n "change bind variable formats for server side binding."\n ),\n )\n\n timezone: Optional[str] = Field(\n default=None,\n description=(\n "None by default, which honors the Snowflake parameter TIMEZONE. Set to a "\n "valid time zone (e.g. America/Los_Angeles) to set the session time zone."\n ),\n )\n\n connector: Optional[str] = Field(\n default=None,\n description=(\n "Indicate alternative database connection engine. Permissible option is "\n "'sqlalchemy' otherwise defaults to use the Snowflake Connector for Python."\n ),\n is_required=False,\n )\n\n cache_column_metadata: Optional[str] = Field(\n default=None,\n description=(\n "Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a"\n " flag ``cache_column_metadata=True`` such that all of column metadata for all tables"\n ' are "cached"'\n ),\n )\n\n numpy: Optional[bool] = Field(\n default=None,\n description=(\n "Optional parameter when connector is set to sqlalchemy. 
To enable fetching "\n "NumPy data types, add numpy=True to the connection parameters."\n ),\n )\n\n authenticator: Optional[str] = Field(\n default=None,\n description="Optional parameter to specify the authentication mechanism to use.",\n )\n\n @validator("paramstyle")\n def validate_paramstyle(cls, v: Optional[str]) -> Optional[str]:\n valid_config = ["pyformat", "qmark", "numeric"]\n if v is not None and v not in valid_config:\n raise ValueError(\n "Snowflake Resource: 'paramstyle' configuration value must be one of:"\n f" {','.join(valid_config)}."\n )\n return v\n\n @validator("connector")\n def validate_connector(cls, v: Optional[str]) -> Optional[str]:\n if v is not None and v != "sqlalchemy":\n raise ValueError(\n "Snowflake Resource: 'connector' configuration value must be None or sqlalchemy."\n )\n return v\n\n @root_validator\n def validate_authentication(cls, values):\n auths_set = 0\n auths_set += 1 if values.get("password") is not None else 0\n auths_set += 1 if values.get("private_key") is not None else 0\n auths_set += 1 if values.get("private_key_path") is not None else 0\n\n # if authenticator is set, there can be 0 or 1 additional auth method;\n # otherwise, ensure at least 1 method is provided\n check.invariant(\n auths_set > 0 or values.get("authenticator") is not None,\n "Missing config: Password, private key, or authenticator authentication required"\n " for Snowflake resource.",\n )\n\n # ensure that only 1 non-authenticator method is provided\n check.invariant(\n auths_set <= 1,\n "Incorrect config: Cannot provide both password and private key authentication to"\n " Snowflake Resource.",\n )\n\n return values\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _connection_args(self) -> Mapping[str, Any]:\n conn_args = {\n k: self._resolved_config_dict.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "autocommit",\n 
"client_prefetch_threads",\n "client_session_keep_alive",\n "login_timeout",\n "network_timeout",\n "ocsp_response_cache_filename",\n "validate_default_parameters",\n "paramstyle",\n "timezone",\n "authenticator",\n )\n if self._resolved_config_dict.get(k) is not None\n }\n if (\n self._resolved_config_dict.get("private_key", None) is not None\n or self._resolved_config_dict.get("private_key_path", None) is not None\n ):\n conn_args["private_key"] = self._snowflake_private_key(self._resolved_config_dict)\n\n return conn_args\n\n @property\n @cached_method\n def _sqlalchemy_connection_args(self) -> Mapping[str, Any]:\n conn_args: Dict[str, Any] = {\n k: self._resolved_config_dict.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "cache_column_metadata",\n "numpy",\n )\n if self._resolved_config_dict.get(k) is not None\n }\n\n return conn_args\n\n @property\n @cached_method\n def _sqlalchemy_engine_args(self) -> Mapping[str, Any]:\n config = self._resolved_config_dict\n sqlalchemy_engine_args = {}\n if (\n config.get("private_key", None) is not None\n or config.get("private_key_path", None) is not None\n ):\n # sqlalchemy passes private key args separately, so store them in a new dict\n sqlalchemy_engine_args["private_key"] = self._snowflake_private_key(config)\n if config.get("authenticator", None) is not None:\n sqlalchemy_engine_args["authenticator"] = config["authenticator"]\n\n return sqlalchemy_engine_args\n\n def _snowflake_private_key(self, config) -> bytes:\n # If the user has defined a path to a private key, we will use that.\n if config.get("private_key_path", None) is not None:\n # read the file from the path.\n with open(config.get("private_key_path"), "rb") as key:\n private_key = key.read()\n else:\n private_key = config.get("private_key", None)\n\n kwargs = {}\n if config.get("private_key_password", None) is not None:\n kwargs["password"] = config["private_key_password"].encode()\n else:\n 
kwargs["password"] = None\n\n try:\n p_key = serialization.load_pem_private_key(\n private_key, backend=default_backend(), **kwargs\n )\n except TypeError:\n try:\n private_key = base64.b64decode(private_key)\n p_key = serialization.load_pem_private_key(\n private_key, backend=default_backend(), **kwargs\n )\n except ValueError:\n raise ValueError(\n "Unable to load private key. You may need to base64 encode your private key."\n " You can retrieve the base64 encoded key with this shell command: cat"\n " rsa_key.p8 | base64"\n )\n\n pkb = p_key.private_bytes(\n encoding=serialization.Encoding.DER,\n format=serialization.PrivateFormat.PKCS8,\n encryption_algorithm=serialization.NoEncryption(),\n )\n\n return pkb\n\n @public\n @contextmanager\n def get_connection(\n self, raw_conn: bool = True\n ) -> Iterator[Union[SqlDbConnection, snowflake.connector.SnowflakeConnection]]:\n """Gets a connection to Snowflake as a context manager.\n\n If connector configuration is not set, SnowflakeResource.get_connection() will return a\n `snowflake.connector.Connection <https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#object-connection>`__\n If connector="sqlalchemy" configuration is set, then SnowflakeResource.get_connection() will\n return a `SQLAlchemy Connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Connection>`__\n or a `SQLAlchemy raw connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine.raw_connection>`__\n if raw_conn=True.\n\n\n Args:\n raw_conn (bool): If using the sqlalchemy connector, you can set raw_conn to True to create a raw\n connection. Defaults to True.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def get_query_status(snowflake: SnowflakeResource, query_id):\n with snowflake.get_connection() as conn:\n # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n return conn.get_query_status(query_id)\n\n """\n if self.connector == "sqlalchemy":\n from snowflake.sqlalchemy import URL\n from sqlalchemy import create_engine\n\n engine = create_engine(\n URL(**self._sqlalchemy_connection_args), connect_args=self._sqlalchemy_engine_args\n )\n conn = engine.raw_connection() if raw_conn else engine.connect()\n\n yield conn\n conn.close()\n engine.dispose()\n else:\n conn = snowflake.connector.connect(**self._connection_args)\n\n yield conn\n if not self.autocommit:\n conn.commit()\n conn.close()\n\n def get_object_to_set_on_execution_context(self) -> Any:\n # Directly create a SnowflakeConnection here for backcompat since the SnowflakeConnection\n # has methods this resource does not have\n return SnowflakeConnection(\n config=self._resolved_config_dict,\n log=get_dagster_logger(),\n snowflake_connection_resource=self,\n )
\n\n\n
[docs]class SnowflakeConnection:\n """A connection to Snowflake that can execute queries. In general this class should not be\n directly instantiated, but rather used as a resource in an op or asset via the\n :py:func:`snowflake_resource`.\n\n Note that the SnowflakeConnection is only used by the snowflake_resource. The Pythonic SnowflakeResource does\n not use this SnowflakeConnection class.\n """\n\n def __init__(\n self, config: Mapping[str, str], log, snowflake_connection_resource: SnowflakeResource\n ):\n self.snowflake_connection_resource = snowflake_connection_resource\n self.log = log\n\n
[docs] @public\n @contextmanager\n def get_connection(\n self, raw_conn: bool = True\n ) -> Iterator[Union[SqlDbConnection, snowflake.connector.SnowflakeConnection]]:\n """Gets a connection to Snowflake as a context manager.\n\n If using the execute_query, execute_queries, or load_table_from_local_parquet methods,\n you do not need to create a connection using this context manager.\n\n Args:\n raw_conn (bool): If using the sqlalchemy connector, you can set raw_conn to True to create a raw\n connection. Defaults to True.\n\n Examples:\n .. code-block:: python\n\n @op(\n required_resource_keys={"snowflake"}\n )\n def get_query_status(query_id):\n with context.resources.snowflake.get_connection() as conn:\n # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n return conn.get_query_status(query_id)\n\n """\n with self.snowflake_connection_resource.get_connection(raw_conn=raw_conn) as conn:\n yield conn
\n\n
[docs] @public\n def execute_query(\n self,\n sql: str,\n parameters: Optional[Union[Sequence[Any], Mapping[Any, Any]]] = None,\n fetch_results: bool = False,\n use_pandas_result: bool = False,\n ):\n """Execute a query in Snowflake.\n\n Args:\n sql (str): the query to be executed\n parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]): Parameters to be passed to the query. See the\n `Snowflake documentation <https://docs.snowflake.com/en/user-guide/python-connector-example.html#binding-data>`__\n for more information.\n fetch_results (bool): If True, will return the result of the query. Defaults to False. If True\n and use_pandas_result is also True, results will be returned as a Pandas DataFrame.\n use_pandas_result (bool): If True, will return the result of the query as a Pandas DataFrame.\n Defaults to False. If fetch_results is False and use_pandas_result is True, an error will be\n raised.\n\n Returns:\n The result of the query if fetch_results or use_pandas_result is True, otherwise returns None\n\n Examples:\n .. code-block:: python\n\n @op\n def drop_database(snowflake: SnowflakeResource):\n snowflake.execute_query(\n "DROP DATABASE IF EXISTS MY_DATABASE"\n )\n """\n check.str_param(sql, "sql")\n check.opt_inst_param(parameters, "parameters", (list, dict))\n check.bool_param(fetch_results, "fetch_results")\n if not fetch_results and use_pandas_result:\n check.failed("If use_pandas_result is True, fetch_results must also be True.")\n\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n\n self.log.info("Executing query: " + sql)\n parameters = dict(parameters) if isinstance(parameters, Mapping) else parameters\n cursor.execute(sql, parameters)\n if use_pandas_result:\n return cursor.fetch_pandas_all()\n if fetch_results:\n return cursor.fetchall()
\n\n
[docs] @public\n def execute_queries(\n self,\n sql_queries: Sequence[str],\n parameters: Optional[Union[Sequence[Any], Mapping[Any, Any]]] = None,\n fetch_results: bool = False,\n use_pandas_result: bool = False,\n ) -> Optional[Sequence[Any]]:\n """Execute multiple queries in Snowflake.\n\n Args:\n sql_queries (str): List of queries to be executed in series\n parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]): Parameters to be passed to every query. See the\n `Snowflake documentation <https://docs.snowflake.com/en/user-guide/python-connector-example.html#binding-data>`__\n for more information.\n fetch_results (bool): If True, will return the results of the queries as a list. Defaults to False. If True\n and use_pandas_result is also True, results will be returned as Pandas DataFrames.\n use_pandas_result (bool): If True, will return the results of the queries as a list of a Pandas DataFrames.\n Defaults to False. If fetch_results is False and use_pandas_result is True, an error will be\n raised.\n\n Returns:\n The results of the queries as a list if fetch_results or use_pandas_result is True,\n otherwise returns None\n\n Examples:\n .. 
code-block:: python\n\n @op\n def create_fresh_database(snowflake: SnowflakeResource):\n queries = ["DROP DATABASE IF EXISTS MY_DATABASE", "CREATE DATABASE MY_DATABASE"]\n snowflake.execute_queries(\n sql_queries=queries\n )\n\n """\n check.sequence_param(sql_queries, "sql_queries", of_type=str)\n check.opt_inst_param(parameters, "parameters", (list, dict))\n check.bool_param(fetch_results, "fetch_results")\n if not fetch_results and use_pandas_result:\n check.failed("If use_pandas_result is True, fetch_results must also be True.")\n\n results: List[Any] = []\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n for raw_sql in sql_queries:\n sql = raw_sql.encode("utf-8") if sys.version_info[0] < 3 else raw_sql\n self.log.info("Executing query: " + sql)\n parameters = dict(parameters) if isinstance(parameters, Mapping) else parameters\n cursor.execute(sql, parameters)\n if use_pandas_result:\n results = results.append(cursor.fetch_pandas_all()) # type: ignore\n elif fetch_results:\n results.append(cursor.fetchall())\n\n return results if len(results) > 0 else None
\n\n
[docs] @public\n def load_table_from_local_parquet(self, src: str, table: str):\n """Stores the content of a parquet file to a Snowflake table.\n\n Args:\n src (str): the name of the file to store in Snowflake\n table (str): the name of the table to store the data. If the table does not exist, it will\n be created. Otherwise the contents of the table will be replaced with the data in src\n\n Examples:\n .. code-block:: python\n\n import pandas as pd\n import pyarrow as pa\n import pyarrow.parquet as pq\n\n @op\n def write_parquet_file(snowflake: SnowflakeResource):\n df = pd.DataFrame({"one": [1, 2, 3], "ten": [11, 12, 13]})\n table = pa.Table.from_pandas(df)\n pq.write_table(table, "example.parquet')\n snowflake.load_table_from_local_parquet(\n src="example.parquet",\n table="MY_TABLE"\n )\n\n """\n check.str_param(src, "src")\n check.str_param(table, "table")\n\n sql_queries = [\n f"CREATE OR REPLACE TABLE {table} ( data VARIANT DEFAULT NULL);",\n "CREATE OR REPLACE FILE FORMAT parquet_format TYPE = 'parquet';",\n f"PUT {src} @%{table};",\n f"COPY INTO {table} FROM @%{table} FILE_FORMAT = (FORMAT_NAME = 'parquet_format');",\n ]\n\n self.execute_queries(sql_queries)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=SnowflakeResource.to_config_schema(),\n description="This resource is for connecting to the Snowflake data warehouse",\n)\ndef snowflake_resource(context) -> SnowflakeConnection:\n """A resource for connecting to the Snowflake data warehouse. The returned resource object is an\n instance of :py:class:`SnowflakeConnection`.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import snowflake_resource\n\n @op(required_resource_keys={'snowflake'})\n def get_one(context):\n context.resources.snowflake.execute_query('SELECT 1')\n\n @job(resource_defs={'snowflake': snowflake_resource})\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n run_config={\n 'resources': {\n 'snowflake': {\n 'config': {\n 'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n 'user': {'env': 'SNOWFLAKE_USER'},\n 'password': {'env': 'SNOWFLAKE_PASSWORD'},\n 'database': {'env': 'SNOWFLAKE_DATABASE'},\n 'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n 'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n }\n }\n }\n }\n )\n """\n snowflake_resource = SnowflakeResource.from_resource_context(context)\n return SnowflakeConnection(\n config=context, log=context.log, snowflake_connection_resource=snowflake_resource\n )
\n
", "current_page_name": "_modules/dagster_snowflake/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.resources"}, "snowflake_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.snowflake_io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Optional, Sequence, Type, cast\n\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom pydantic import Field\nfrom sqlalchemy.exc import ProgrammingError\n\nfrom .resources import SnowflakeResource\n\nSNOWFLAKE_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_snowflake_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of Snowflake tables and an in-memory type - e.g. a Pandas DataFrame. If only\n one DbTypeHandler is provided, it will be used as teh default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import build_snowflake_io_manager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n snowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()])\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_io_manager.configured({\n "database": "my_database",\n "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n ...\n })\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the schema. For example,\n if the asset ``my_table`` had the key prefix ``["snowflake", "my_schema"]``, the schema ``my_schema`` will be\n used. For ops, the schema can be specified by including a ``schema`` entry in output metadata. If ``schema`` is not provided\n via config or on the asset/op, ``public`` will be used for the schema.\n\n .. 
code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=SnowflakeIOManager.to_config_schema())\n def snowflake_io_manager(init_context):\n return DbIOManager(\n type_handlers=type_handlers,\n db_client=SnowflakeDbClient(),\n io_manager_name="SnowflakeIOManager",\n database=init_context.resource_config["database"],\n schema=init_context.resource_config.get("schema"),\n default_load_type=default_load_type,\n )\n\n return snowflake_io_manager
\n\n\n
[docs]class SnowflakeIOManager(ConfigurableIOManagerFactory):\n """Base class for an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the schema. For example,\n if the asset ``my_table`` had the key prefix ``["snowflake", "my_schema"]``, the schema ``my_schema`` will be\n used. For ops, the schema can be specified by including a ``schema`` entry in output metadata. If ``schema`` is not provided\n via config or on the asset/op, ``public`` will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n database: str = Field(description="Name of the database to use.")\n account: str = Field(\n description=(\n "Your Snowflake account name. For more details, see the `Snowflake documentation."\n " <https://docs.snowflake.com/developer-guide/python-connector/python-connector-api>`__"\n ),\n )\n user: str = Field(description="User login name.")\n schema_: Optional[str] = Field(\n default=None, alias="schema", description="Name of the schema to use."\n ) # schema is a reserved word for pydantic\n password: Optional[str] = Field(default=None, description="User password.")\n warehouse: Optional[str] = Field(default=None, description="Name of the warehouse to use.")\n role: Optional[str] = Field(default=None, description="Name of the role to use.")\n private_key: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details. To"\n " avoid issues with newlines in the keys, you can base64 encode the key. You can"\n " retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64"\n ),\n )\n private_key_path: Optional[str] = Field(\n default=None,\n description=(\n "Path to the private key. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n ),\n )\n private_key_password: Optional[str] = Field(\n default=None,\n description=(\n "The password of the private key. 
See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Required for both private_key and private_key_path if the private key is encrypted."\n " For unencrypted keys, this config can be omitted or set to None."\n ),\n )\n store_timestamps_as_strings: bool = Field(\n default=False,\n description=(\n "If using Pandas DataFrames, whether to convert time data to strings. If True, time"\n " data will be converted to strings when storing the DataFrame and converted back to"\n " time data when loading the DataFrame. If False, time data without a timezone will be"\n " set to UTC timezone to avoid a Snowflake bug. Defaults to False."\n ),\n )\n authenticator: Optional[str] = Field(\n default=None,\n description="Optional parameter to specify the authentication mechanism to use.",\n )\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n """type_handlers should return a list of the TypeHandlers that the I/O manager can use.\n\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n """\n ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n """If an asset or op is not annotated with an return type, default_load_type will be used to\n determine which TypeHandler to use to store and load the output.\n\n If left unimplemented, default_load_type will return None. In that case, if there is only\n one TypeHandler, the I/O manager will default to loading unannotated outputs with that\n TypeHandler.\n\n .. 
code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n import pandas as pd\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame\n """\n return None\n\n def create_io_manager(self, context) -> DbIOManager:\n return DbIOManager(\n db_client=SnowflakeDbClient(),\n io_manager_name="SnowflakeIOManager",\n database=self.database,\n schema=self.schema_,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n )
\n\n\nclass SnowflakeDbClient(DbClient):\n @staticmethod\n @contextmanager\n def connect(context, table_slice):\n no_schema_config = (\n {k: v for k, v in context.resource_config.items() if k != "schema"}\n if context.resource_config\n else {}\n )\n with SnowflakeResource(\n schema=table_slice.schema, connector="sqlalchemy", **no_schema_config\n ).get_connection(raw_conn=False) as conn:\n yield conn\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n schemas = connection.execute(\n f"show schemas like '{table_slice.schema}' in database {table_slice.database}"\n ).fetchall()\n if len(schemas) == 0:\n connection.execute(f"create schema {table_slice.schema};")\n\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.execute(_get_cleanup_statement(table_slice))\n except ProgrammingError:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"SELECT {col_str} FROM"\n f" {table_slice.database}.{table_slice.schema}.{table_slice.table} WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"""\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table} WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n 
return f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n # Snowflake BETWEEN is inclusive; start <= partition expr <= end. We don't want to remove the next partition so we instead\n # write this as start <= partition expr < end.\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_snowflake/snowflake_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.snowflake_io_manager"}}, "dagster_snowflake_pandas": {"snowflake_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake_pandas.snowflake_pandas_type_handler

\nfrom typing import Mapping, Optional, Sequence, Type\n\nimport pandas as pd\nimport pandas.core.dtypes.common as pd_core_dtypes_common\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_snowflake import build_snowflake_io_manager\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient, SnowflakeIOManager\nfrom snowflake.connector.pandas_tools import pd_writer\n\n\ndef _table_exists(table_slice: TableSlice, connection):\n    tables = connection.execute(\n        f"SHOW TABLES LIKE '{table_slice.table}' IN SCHEMA"\n        f" {table_slice.database}.{table_slice.schema}"\n    ).fetchall()\n    return len(tables) > 0\n\n\ndef _get_table_column_types(table_slice: TableSlice, connection) -> Optional[Mapping[str, str]]:\n    if _table_exists(table_slice, connection):\n        schema_list = connection.execute(f"DESCRIBE TABLE {table_slice.table}").fetchall()\n        return {item[0]: item[1] for item in schema_list}\n\n\ndef _convert_timestamp_to_string(\n    s: pd.Series, column_types: Optional[Mapping[str, str]], table_name: str\n) -> pd.Series:\n    """Converts columns of data of type pd.Timestamp to string so that it can be stored in\n    snowflake.\n    """\n    column_name = str(s.name)\n    if pd_core_dtypes_common.is_datetime_or_timedelta_dtype(s):  # type: ignore  # (bad stubs)\n        if column_types:\n            if "VARCHAR" not in column_types[column_name]:\n                raise DagsterInvariantViolationError(\n                    "Snowflake I/O manager: Snowflake I/O manager configured to convert time data"\n                    f" in DataFrame column {column_name} to strings, but the corresponding"\n                    f" {column_name.upper()} column in table {table_name} is not of type 
VARCHAR,"\n                    f" it is of type {column_types[column_name]}. Please set"\n                    " store_timestamps_as_strings=False in the Snowflake I/O manager configuration"\n                    " to store time data as TIMESTAMP types."\n                )\n        return s.dt.strftime("%Y-%m-%d %H:%M:%S.%f %z")\n    else:\n        return s\n\n\ndef _convert_string_to_timestamp(s: pd.Series) -> pd.Series:\n    """Converts columns of strings in Timestamp format to pd.Timestamp to undo the conversion in\n    _convert_timestamp_to_string.\n\n    This will not convert non-timestamp strings into timestamps (pd.to_datetime will raise an\n    exception if the string cannot be converted)\n    """\n    if isinstance(s[0], str):\n        try:\n            return pd.to_datetime(s.values)  # type: ignore  # (bad stubs)\n        except ValueError:\n            return s\n    else:\n        return s\n\n\ndef _add_missing_timezone(\n    s: pd.Series, column_types: Optional[Mapping[str, str]], table_name: str\n) -> pd.Series:\n    column_name = str(s.name)\n    if pd_core_dtypes_common.is_datetime_or_timedelta_dtype(s):  # type: ignore  # (bad stubs)\n        if column_types:\n            if "VARCHAR" in column_types[column_name]:\n                raise DagsterInvariantViolationError(\n                    f"Snowflake I/O manager: The Snowflake column {column_name.upper()} in table"\n                    f" {table_name} is of type {column_types[column_name]} and should be of type"\n                    f" TIMESTAMP to store the time data in dataframe column {column_name}. Please"\n                    " migrate this column to be of time TIMESTAMP_NTZ(9) to store time data."\n                )\n        return s.dt.tz_localize("UTC")\n    return s\n\n\n
[docs]class SnowflakePandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Plugin for the Snowflake I/O Manager that can store and load Pandas DataFrames as Snowflake tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ) -> Mapping[str, RawMetadataValue]:\n from snowflake import connector\n\n connector.paramstyle = "pyformat"\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n column_types = _get_table_column_types(table_slice, connection)\n if context.resource_config and context.resource_config.get(\n "store_timestamps_as_strings", False\n ):\n with_uppercase_cols = with_uppercase_cols.apply(\n lambda x: _convert_timestamp_to_string(x, column_types, table_slice.table),\n axis="index",\n )\n else:\n with_uppercase_cols = with_uppercase_cols.apply(\n lambda x: _add_missing_timezone(x, column_types, table_slice.table), axis="index"\n )\n with_uppercase_cols.to_sql(\n table_slice.table,\n con=connection.engine,\n if_exists="append",\n index=False,\n method=pd_writer,\n )\n\n return {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n 
columns=[\n TableColumn(name=str(name), type=str(dtype))\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n result = pd.read_sql(\n sql=SnowflakeDbClient.get_select_statement(table_slice), con=connection\n )\n if context.resource_config and context.resource_config.get(\n "store_timestamps_as_strings", False\n ):\n result = result.apply(_convert_string_to_timestamp, axis="index")\n result.columns = map(str.lower, result.columns) # type: ignore # (bad stubs)\n return result\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nsnowflake_pandas_io_manager = build_snowflake_io_manager(\n [SnowflakePandasTypeHandler()], default_load_type=pd.DataFrame\n)\nsnowflake_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\nusing the snowflake_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.\n\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_snowflake_pandas import snowflake_pandas_io_manager\n from dagster import asset, Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_pandas_io_manager.configured({\n "database": "my_database",\n "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n ...\n })\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class SnowflakePandasIOManager(SnowflakeIOManager):\n """An I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\n using the SnowflakePandasIOManager, any inputs and outputs without type annotations will be loaded\n as Pandas DataFrames.\n\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake_pandas import SnowflakePandasIOManager\n from dagster import asset, Definitions, EnvVar\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": SnowflakePandasIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_snowflake_pandas/snowflake_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake_pandas.snowflake_pandas_type_handler"}}, "dagster_snowflake_pyspark": {"snowflake_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake_pyspark.snowflake_pyspark_type_handler

\nfrom typing import Mapping, Optional, Sequence, Type\n\nimport dagster._check as check\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_snowflake import SnowflakeIOManager, build_snowflake_io_manager\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient\nfrom pyspark.sql import DataFrame, SparkSession\nfrom pyspark.sql.types import StructType\n\nSNOWFLAKE_CONNECTOR = "net.snowflake.spark.snowflake"\n\n\ndef _get_snowflake_options(config, table_slice: TableSlice) -> Mapping[str, str]:\n    check.invariant(\n        config.get("warehouse", None) is not None,\n        "Missing config: Warehouse is required when using PySpark with the Snowflake I/O manager.",\n    )\n\n    conf = {\n        "sfURL": f"{config['account']}.snowflakecomputing.com",\n        "sfUser": config["user"],\n        "sfPassword": config["password"],\n        "sfDatabase": config["database"],\n        "sfSchema": table_slice.schema,\n        "sfWarehouse": config["warehouse"],\n    }\n\n    return conf\n\n\n
[docs]class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):\n """Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), warehouse="my_warehouse", ...)\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _\n ) -> Mapping[str, RawMetadataValue]:\n options = _get_snowflake_options(context.resource_config, table_slice)\n\n with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])\n\n with_uppercase_cols.write.format(SNOWFLAKE_CONNECTOR).options(**options).option(\n "dbtable", table_slice.table\n ).mode("append").save()\n\n return {\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=field.name, type=field.dataType.typeName())\n for field in obj.schema.fields\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame:\n options = _get_snowflake_options(context.resource_config, table_slice)\n\n spark = SparkSession.builder.getOrCreate() # type: ignore\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n df = 
(\n spark.read.format(SNOWFLAKE_CONNECTOR)\n .options(**options)\n .option("query", SnowflakeDbClient.get_select_statement(table_slice))\n .load()\n )\n return df.toDF(*[c.lower() for c in df.columns])\n\n @property\n def supported_types(self):\n return [DataFrame]
\n\n\nsnowflake_pyspark_io_manager = build_snowflake_io_manager(\n [SnowflakePySparkTypeHandler()], default_load_type=DataFrame\n)\nsnowflake_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\nusing the snowflake_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_snowflake_pyspark import snowflake_pyspark_io_manager\n from pyspark.sql import DataFrame\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_pyspark_io_manager.configured({\n "database": "my_database",\n "warehouse": "my_warehouse", # required for snowflake_pyspark_io_manager\n "account" : {"env": "SNOWFLAKE_ACCOUNT"},\n "password": {"env": "SNOWFLAKE_PASSWORD"},\n ...\n })\n }\n )\n\n Note that the warehouse configuration value is required when using the snowflake_pyspark_io_manager\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: DataFrame) -> DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class SnowflakePySparkIOManager(SnowflakeIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\n using the SnowflakePySparkIOManager, any inputs and outputs without type annotations will be loaded\n as PySpark DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake_pyspark import SnowflakePySparkIOManager\n from pyspark.sql import DataFrame\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": SnowflakePySparkIOManager(\n database="my_database",\n warehouse="my_warehouse", # required for SnowflakePySparkIOManager\n account=EnvVar("SNOWFLAKE_ACCOUNT"),\n password=EnvVar("SNOWFLAKE_PASSWORD"),\n ...\n )\n }\n )\n\n Note that the warehouse configuration value is required when using the SnowflakePySparkIOManager\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: DataFrame) -> DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return DataFrame
\n
", "current_page_name": "_modules/dagster_snowflake_pyspark/snowflake_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake_pyspark.snowflake_pyspark_type_handler"}}, "dagster_spark": {"configs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.configs

\n"""Spark Configuration.\n\nIn this file we define the key configuration parameters for submitting Spark jobs. Spark can be run\nin a variety of deployment contexts. See the Spark documentation at\nhttps://spark.apache.org/docs/latest/submitting-applications.html for a more in-depth summary of\nSpark deployment contexts and configuration.\n"""\nfrom dagster import Field, StringSource\n\nfrom .configs_spark import spark_config\nfrom .types import SparkDeployMode\n\n\n
[docs]def define_spark_config():\n """Spark configuration.\n\n See the Spark documentation for reference:\n https://spark.apache.org/docs/latest/submitting-applications.html\n """\n master_url = Field(\n StringSource,\n description="The master URL for the cluster (e.g. spark://23.195.26.187:7077)",\n is_required=True,\n )\n\n deploy_mode = Field(\n SparkDeployMode,\n description="""Whether to deploy your driver on the worker nodes (cluster) or locally as an\n external client (client) (default: client). A common deployment strategy is to submit your\n application from a gateway machine that is physically co-located with your worker machines\n (e.g. Master node in a standalone EC2 cluster). In this setup, client mode is appropriate.\n In client mode, the driver is launched directly within the spark-submit process which acts\n as a client to the cluster. The input and output of the application is attached to the\n console. Thus, this mode is especially suitable for applications that involve the REPL (e.g.\n Spark shell).""",\n is_required=False,\n )\n\n application_jar = Field(\n StringSource,\n description="""Path to a bundled jar including your application and all\n dependencies. The URL must be globally visible inside of your cluster, for\n instance, an hdfs:// path or a file:// path that is present on all nodes.\n """,\n is_required=True,\n )\n\n application_arguments = Field(\n StringSource,\n description="Arguments passed to the main method of your main class, if any",\n is_required=False,\n )\n\n spark_home = Field(\n StringSource,\n description=(\n "The path to your spark installation. Defaults to $SPARK_HOME at runtime if not"\n " provided."\n ),\n is_required=False,\n )\n\n return {\n "master_url": master_url,\n "deploy_mode": deploy_mode,\n "application_jar": application_jar,\n "spark_conf": spark_config(),\n "spark_home": spark_home,\n "application_arguments": application_arguments,\n }
\n
", "current_page_name": "_modules/dagster_spark/configs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.configs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.ops

\nfrom dagster import (\n    In,\n    Nothing,\n    Out,\n    _check as check,\n    op,\n)\n\nfrom .configs import define_spark_config\n\n\n
[docs]def create_spark_op(\n name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n check.str_param(name, "name")\n check.str_param(main_class, "main_class")\n check.opt_str_param(description, "description", "A parameterized Spark job.")\n check.set_param(required_resource_keys, "required_resource_keys")\n\n @op(\n name=name,\n description=description,\n config_schema=define_spark_config(),\n ins={"start": In(Nothing)},\n out=Out(Nothing),\n tags={"kind": "spark", "main_class": main_class},\n required_resource_keys=required_resource_keys,\n )\n def spark_op(context):\n context.resources.spark.run_spark_job(context.op_config, main_class)\n\n return spark_op
\n
", "current_page_name": "_modules/dagster_spark/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.resources

\nimport os\nimport subprocess\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.log_manager import DagsterLogManager\n\nfrom .types import SparkOpError\nfrom .utils import construct_spark_shell_command\n\n\nclass SparkResource:\n    def __init__(self, logger):\n        self.logger = check.inst_param(logger, "logger", DagsterLogManager)\n\n    def run_spark_job(self, config, main_class):\n        check.dict_param(config, "config")\n        check.str_param(main_class, "main_class")\n\n        # Extract parameters from config\n        (\n            master_url,\n            deploy_mode,\n            application_jar,\n            spark_conf,\n            application_arguments,\n            spark_home,\n        ) = [\n            config.get(k)\n            for k in (\n                "master_url",\n                "deploy_mode",\n                "application_jar",\n                "spark_conf",\n                "application_arguments",\n                "spark_home",\n            )\n        ]\n\n        if not os.path.exists(application_jar):\n            raise SparkOpError(\n                f"Application jar {application_jar} does not exist. A valid jar must be "\n                "built before running this op."\n            )\n\n        spark_shell_cmd = construct_spark_shell_command(\n            application_jar=application_jar,\n            main_class=main_class,\n            master_url=master_url,\n            spark_conf=spark_conf,\n            deploy_mode=deploy_mode,\n            application_arguments=application_arguments,\n            spark_home=spark_home,\n        )\n        self.logger.info("Running spark-submit: " + " ".join(spark_shell_cmd))\n\n        retcode = subprocess.call(" ".join(spark_shell_cmd), shell=True)\n\n        if retcode != 0:\n            raise SparkOpError("Spark job failed. Please consult your logs.")\n\n\n
[docs]@dagster_maintained_resource\n@resource\ndef spark_resource(context):\n return SparkResource(context.log)
\n
", "current_page_name": "_modules/dagster_spark/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.types

\nfrom dagster import Enum, EnumValue\n\nSparkDeployModeCluster = EnumValue("cluster")\nSparkDeployModeClient = EnumValue("client")\nSparkDeployMode = Enum(\n    name="SparkDeployMode", enum_values=[SparkDeployModeCluster, SparkDeployModeClient]\n)\n\n\n
[docs]class SparkOpError(Exception):\n pass
\n
", "current_page_name": "_modules/dagster_spark/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.types"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.utils

\nimport itertools\nimport os\n\nimport dagster._check as check\n\nfrom .types import SparkOpError\n\n\ndef flatten_dict(d):\n    def _flatten_dict(d, result, key_path=None):\n        """Iterates an arbitrarily nested dictionary and yield dot-notation key:value tuples.\n\n        {'foo': {'bar': 3, 'baz': 1}, {'other': {'key': 1}} =>\n            [('foo.bar', 3), ('foo.baz', 1), ('other.key', 1)]\n\n        """\n        for k, v in d.items():\n            new_key_path = (key_path or []) + [k]\n            if isinstance(v, dict):\n                _flatten_dict(v, result, new_key_path)\n            else:\n                result.append((".".join(new_key_path), v))\n\n    result = []\n    if d is not None:\n        _flatten_dict(d, result)\n    return result\n\n\ndef parse_spark_config(spark_conf):\n    """Convert spark conf dict to list of CLI arguments.\n\n    For each key-value pair in spark conf, we need to pass to CLI in format:\n\n    --conf "key=value"\n    """\n    spark_conf_list = flatten_dict(spark_conf)\n    return format_for_cli(spark_conf_list)\n\n\ndef format_for_cli(spark_conf_list):\n    return list(\n        itertools.chain.from_iterable([("--conf", "{}={}".format(*c)) for c in spark_conf_list])\n    )\n\n\n
[docs]def construct_spark_shell_command(\n application_jar,\n main_class,\n master_url=None,\n spark_conf=None,\n deploy_mode=None,\n application_arguments=None,\n spark_home=None,\n):\n """Constructs the spark-submit command for a Spark job."""\n check.opt_str_param(master_url, "master_url")\n check.str_param(application_jar, "application_jar")\n spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n check.opt_str_param(deploy_mode, "deploy_mode")\n check.opt_str_param(application_arguments, "application_arguments")\n check.opt_str_param(spark_home, "spark_home")\n\n spark_home = spark_home if spark_home else os.environ.get("SPARK_HOME")\n if spark_home is None:\n raise SparkOpError(\n "No spark home set. You must either pass spark_home in config or "\n "set $SPARK_HOME in your environment (got None)."\n )\n\n master_url = ["--master", master_url] if master_url else []\n deploy_mode = ["--deploy-mode", deploy_mode] if deploy_mode else []\n\n spark_shell_cmd = (\n [f"{spark_home}/bin/spark-submit", "--class", main_class]\n + master_url\n + deploy_mode\n + parse_spark_config(spark_conf)\n + [application_jar]\n + [application_arguments]\n )\n return spark_shell_cmd
\n
", "current_page_name": "_modules/dagster_spark/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.utils"}}, "dagster_ssh": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_ssh.resources

\nimport getpass\nimport os\nfrom io import StringIO\n\nimport paramiko\nfrom dagster import (\n    BoolSource,\n    Field,\n    IntSource,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils import mkdir_p\nfrom dagster._utils.merger import merge_dicts\nfrom paramiko.config import SSH_PORT\nfrom sshtunnel import SSHTunnelForwarder\n\n\ndef key_from_str(key_str):\n    """Creates a paramiko SSH key from a string."""\n    check.str_param(key_str, "key_str")\n\n    # py2 StringIO doesn't support with\n    key_file = StringIO(key_str)\n    result = paramiko.RSAKey.from_private_key(key_file)\n    key_file.close()\n    return result\n\n\nclass SSHResource:\n    """Resource for ssh remote execution using Paramiko.\n\n    ref: https://github.com/paramiko/paramiko\n    """\n\n    def __init__(\n        self,\n        remote_host,\n        remote_port,\n        username=None,\n        password=None,\n        key_file=None,\n        key_string=None,\n        timeout=10,\n        keepalive_interval=30,\n        compress=True,\n        no_host_key_check=True,\n        allow_host_key_change=False,\n        logger=None,\n    ):\n        self.remote_host = check.str_param(remote_host, "remote_host")\n        self.remote_port = check.opt_int_param(remote_port, "remote_port")\n        self.username = check.opt_str_param(username, "username")\n        self.password = check.opt_str_param(password, "password")\n        self.key_file = check.opt_str_param(key_file, "key_file")\n        self.timeout = check.opt_int_param(timeout, "timeout")\n        self.keepalive_interval = check.opt_int_param(keepalive_interval, "keepalive_interval")\n        self.compress = check.opt_bool_param(compress, "compress")\n        self.no_host_key_check = check.opt_bool_param(no_host_key_check, "no_host_key_check")\n        self.log = logger\n\n        self.host_proxy = None\n\n        # Create 
RSAKey object from private key string\n        self.key_obj = key_from_str(key_string) if key_string is not None else None\n\n        # Auto detecting username values from system\n        if not self.username:\n            logger.debug(\n                "username to ssh to host: %s is not specified. Using system's default provided by"\n                " getpass.getuser()"\n                % self.remote_host\n            )\n            self.username = getpass.getuser()\n\n        user_ssh_config_filename = os.path.expanduser("~/.ssh/config")\n        if os.path.isfile(user_ssh_config_filename):\n            ssh_conf = paramiko.SSHConfig()\n            ssh_conf.parse(open(user_ssh_config_filename, encoding="utf8"))\n            host_info = ssh_conf.lookup(self.remote_host)\n            if host_info and host_info.get("proxycommand"):\n                self.host_proxy = paramiko.ProxyCommand(host_info.get("proxycommand"))\n\n            if not (self.password or self.key_file):\n                if host_info and host_info.get("identityfile"):\n                    self.key_file = host_info.get("identityfile")[0]\n\n    def get_connection(self):\n        """Opens a SSH connection to the remote host.\n\n        :rtype: paramiko.client.SSHClient\n        """\n        client = paramiko.SSHClient()\n        client.load_system_host_keys()\n        if self.no_host_key_check:\n            self.log.warning(\n                "No Host Key Verification. 
This won't protect against Man-In-The-Middle attacks"\n            )\n            # Default is RejectPolicy\n            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n\n        if self.password and self.password.strip():\n            client.connect(\n                hostname=self.remote_host,\n                username=self.username,\n                password=self.password,\n                key_filename=self.key_file,\n                pkey=self.key_obj,\n                timeout=self.timeout,\n                compress=self.compress,\n                port=self.remote_port,\n                sock=self.host_proxy,\n                look_for_keys=False,\n            )\n        else:\n            client.connect(\n                hostname=self.remote_host,\n                username=self.username,\n                key_filename=self.key_file,\n                pkey=self.key_obj,\n                timeout=self.timeout,\n                compress=self.compress,\n                port=self.remote_port,\n                sock=self.host_proxy,\n            )\n\n        if self.keepalive_interval:\n            client.get_transport().set_keepalive(self.keepalive_interval)\n\n        return client\n\n    def get_tunnel(self, remote_port, remote_host="localhost", local_port=None):\n        check.int_param(remote_port, "remote_port")\n        check.str_param(remote_host, "remote_host")\n        check.opt_int_param(local_port, "local_port")\n\n        if local_port is not None:\n            local_bind_address = ("localhost", local_port)\n        else:\n            local_bind_address = ("localhost",)\n\n        # Will prefer key string if specified, otherwise use the key file\n        pkey = self.key_obj if self.key_obj else self.key_file\n\n        if self.password and self.password.strip():\n            client = SSHTunnelForwarder(\n                self.remote_host,\n                ssh_port=self.remote_port,\n                ssh_username=self.username,\n                
ssh_password=self.password,\n                ssh_pkey=pkey,\n                ssh_proxy=self.host_proxy,\n                local_bind_address=local_bind_address,\n                remote_bind_address=(remote_host, remote_port),\n                logger=self.log,\n            )\n        else:\n            client = SSHTunnelForwarder(\n                self.remote_host,\n                ssh_port=self.remote_port,\n                ssh_username=self.username,\n                ssh_pkey=pkey,\n                ssh_proxy=self.host_proxy,\n                local_bind_address=local_bind_address,\n                remote_bind_address=(remote_host, remote_port),\n                host_pkey_directories=[],\n                logger=self.log,\n            )\n\n        return client\n\n    def sftp_get(self, remote_filepath, local_filepath):\n        check.str_param(remote_filepath, "remote_filepath")\n        check.str_param(local_filepath, "local_filepath")\n        conn = self.get_connection()\n        with conn.open_sftp() as sftp_client:\n            local_folder = os.path.dirname(local_filepath)\n\n            # Create intermediate directories if they don't exist\n            mkdir_p(local_folder)\n\n            self.log.info(f"Starting to transfer from {remote_filepath} to {local_filepath}")\n\n            sftp_client.get(remote_filepath, local_filepath)\n\n        conn.close()\n        return local_filepath\n\n    def sftp_put(self, remote_filepath, local_filepath, confirm=True):\n        check.str_param(remote_filepath, "remote_filepath")\n        check.str_param(local_filepath, "local_filepath")\n        conn = self.get_connection()\n        with conn.open_sftp() as sftp_client:\n            self.log.info(f"Starting to transfer file from {local_filepath} to {remote_filepath}")\n\n            sftp_client.put(local_filepath, remote_filepath, confirm=confirm)\n\n        conn.close()\n        return local_filepath\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "remote_host": Field(\n StringSource, description="remote host to connect to", is_required=True\n ),\n "remote_port": Field(\n IntSource,\n description="port of remote host to connect (Default is paramiko SSH_PORT)",\n is_required=False,\n default_value=SSH_PORT,\n ),\n "username": Field(\n StringSource, description="username to connect to the remote_host", is_required=False\n ),\n "password": Field(\n StringSource,\n description="password of the username to connect to the remote_host",\n is_required=False,\n ),\n "key_file": Field(\n StringSource,\n description="key file to use to connect to the remote_host.",\n is_required=False,\n ),\n "key_string": Field(\n StringSource,\n description="key string to use to connect to remote_host",\n is_required=False,\n ),\n "timeout": Field(\n IntSource,\n description="timeout for the attempt to connect to the remote_host.",\n is_required=False,\n default_value=10,\n ),\n "keepalive_interval": Field(\n IntSource,\n description="send a keepalive packet to remote host every keepalive_interval seconds",\n is_required=False,\n default_value=30,\n ),\n "compress": Field(BoolSource, is_required=False, default_value=True),\n "no_host_key_check": Field(BoolSource, is_required=False, default_value=True),\n "allow_host_key_change": Field(\n BoolSource, description="[Deprecated]", is_required=False, default_value=False\n ),\n }\n)\ndef ssh_resource(init_context):\n args = init_context.resource_config\n args = merge_dicts(init_context.resource_config, {"logger": init_context.log})\n return SSHResource(**args)
\n
", "current_page_name": "_modules/dagster_ssh/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_ssh.resources"}}, "dagster_twilio": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_twilio.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom pydantic import Field\nfrom twilio.rest import Client\n\n\n
[docs]class TwilioResource(ConfigurableResource):\n """This resource is for connecting to Twilio."""\n\n account_sid: str = Field(\n description=(\n "Twilio Account SID, created with yout Twilio account. This can be found on your Twilio"\n " dashboard, see"\n " https://www.twilio.com/blog/twilio-access-tokens-python"\n ),\n )\n auth_token: str = Field(\n description=(\n "Twilio Authentication Token, created with yout Twilio account. This can be found on"\n " your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_client(self) -> Client:\n return Client(self.account_sid, self.auth_token)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=TwilioResource.to_config_schema(),\n description="This resource is for connecting to Twilio",\n)\ndef twilio_resource(context: InitResourceContext) -> Client:\n return TwilioResource.from_resource_context(context).create_client()
\n
", "current_page_name": "_modules/dagster_twilio/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_twilio.resources"}}, "dagster_wandb": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.io_manager

\nimport datetime\nimport os\nimport pickle\nimport platform\nimport shutil\nimport sys\nimport time\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import List, Optional\n\nfrom dagster import (\n    Field,\n    InitResourceContext,\n    InputContext,\n    Int,\n    IOManager,\n    MetadataValue,\n    OutputContext,\n    String,\n    io_manager,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom wandb import Artifact\nfrom wandb.data_types import WBValue\n\nfrom .resources import WANDB_CLOUD_HOST\nfrom .utils.errors import (\n    WandbArtifactsIOManagerError,\n    raise_on_empty_configuration,\n    raise_on_unknown_partition_keys,\n    raise_on_unknown_read_configuration_keys,\n    raise_on_unknown_write_configuration_keys,\n)\nfrom .utils.pickling import (\n    ACCEPTED_SERIALIZATION_MODULES,\n    pickle_artifact_content,\n    unpickle_artifact_content,\n)\nfrom .version import __version__\n\nif sys.version_info >= (3, 8):\n    from typing import TypedDict\nelse:\n    from typing_extensions import TypedDict\n\n\nclass Config(TypedDict):\n    dagster_run_id: str\n    wandb_host: str\n    wandb_entity: str\n    wandb_project: str\n    wandb_run_name: Optional[str]\n    wandb_run_id: Optional[str]\n    wandb_run_tags: Optional[List[str]]\n    base_dir: str\n    cache_duration_in_minutes: Optional[int]\n\n\nclass ArtifactsIOManager(IOManager):\n    """IO Manager to handle Artifacts in Weights & Biases (W&B) .\n\n    It handles 3 different inputs:\n    - Pickable objects (the serialization module is configurable)\n    - W&B Objects (Audio, Table, Image, etc)\n    - W&B Artifacts\n    """\n\n    def __init__(self, wandb_client, config: Config):\n        self.wandb = wandb_client\n\n        dagster_run_id = config["dagster_run_id"]\n        self.dagster_run_id = dagster_run_id\n        self.wandb_host = config["wandb_host"]\n        self.wandb_entity = config["wandb_entity"]\n        self.wandb_project = 
config["wandb_project"]\n        self.wandb_run_id = config.get("wandb_run_id") or dagster_run_id\n        self.wandb_run_name = config.get("wandb_run_name") or f"dagster-run-{dagster_run_id[0:8]}"\n        # augments the run tags\n        wandb_run_tags = config["wandb_run_tags"] or []\n        if "dagster_wandb" not in wandb_run_tags:\n            wandb_run_tags = [*wandb_run_tags, "dagster_wandb"]\n        self.wandb_run_tags = wandb_run_tags\n\n        self.base_dir = config["base_dir"]\n        cache_duration_in_minutes = config["cache_duration_in_minutes"]\n        default_cache_expiration_in_minutes = 60 * 24 * 30  # 60 minutes * 24 hours * 30 days\n        self.cache_duration_in_minutes = (\n            cache_duration_in_minutes\n            if cache_duration_in_minutes is not None\n            else default_cache_expiration_in_minutes\n        )\n\n    def _get_local_storage_path(self):\n        path = self.base_dir\n        if os.path.basename(path) != "storage":\n            path = os.path.join(path, "storage")\n        path = os.path.join(path, "wandb_artifacts_manager")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _get_artifacts_path(self, name, version):\n        local_storage_path = self._get_local_storage_path()\n        path = os.path.join(local_storage_path, "artifacts", f"{name}.{version}")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _get_wandb_logs_path(self):\n        local_storage_path = self._get_local_storage_path()\n        # Adding a random uuid to avoid collisions in multi-process context\n        path = os.path.join(local_storage_path, "runs", self.dagster_run_id, str(uuid.uuid4()))\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _clean_local_storage_path(self):\n        local_storage_path = self._get_local_storage_path()\n        cache_duration_in_minutes = self.cache_duration_in_minutes\n        current_timestamp = int(time.time())\n        
expiration_timestamp = current_timestamp - (\n            cache_duration_in_minutes * 60  # convert to seconds\n        )\n\n        for root, dirs, files in os.walk(local_storage_path, topdown=False):\n            for name in files:\n                current_file_path = os.path.join(root, name)\n                most_recent_access = os.lstat(current_file_path).st_atime\n                if most_recent_access <= expiration_timestamp or cache_duration_in_minutes == 0:\n                    os.remove(current_file_path)\n            for name in dirs:\n                current_dir_path = os.path.join(root, name)\n                if not os.path.islink(current_dir_path):\n                    if len(os.listdir(current_dir_path)) == 0 or cache_duration_in_minutes == 0:\n                        shutil.rmtree(current_dir_path)\n\n    @contextmanager\n    def wandb_run(self):\n        self.wandb.init(\n            id=self.wandb_run_id,\n            name=self.wandb_run_name,\n            project=self.wandb_project,\n            entity=self.wandb_entity,\n            dir=self._get_wandb_logs_path(),\n            tags=self.wandb_run_tags,\n            anonymous="never",\n            resume="allow",\n        )\n        try:\n            yield self.wandb.run\n        finally:\n            self.wandb.finish()\n            self._clean_local_storage_path()\n\n    def _upload_artifact(self, context: OutputContext, obj):\n        if not context.has_partition_key and context.has_asset_partitions:\n            raise WandbArtifactsIOManagerError(\n                "Sorry, but the Weights & Biases (W&B) IO Manager can't handle processing several"\n                " partitions at the same time within a single run. Please process each partition"\n                " separately. 
If you think this might be an error, don't hesitate to reach out to"\n                " Weights & Biases Support."\n            )\n\n        with self.wandb_run() as run:\n            parameters = {}\n            if context.metadata is not None:\n                parameters = context.metadata.get("wandb_artifact_configuration", {})\n\n            raise_on_unknown_write_configuration_keys(parameters)\n\n            serialization_module = parameters.get("serialization_module", {})\n            serialization_module_name = serialization_module.get("name", "pickle")\n\n            if serialization_module_name not in ACCEPTED_SERIALIZATION_MODULES:\n                raise WandbArtifactsIOManagerError(\n                    f"Oops! It looks like the value you provided, '{serialization_module_name}',"\n                    " isn't recognized as a valid serialization module. Here are the ones we do"\n                    f" support: {ACCEPTED_SERIALIZATION_MODULES}."\n                )\n\n            serialization_module_parameters = serialization_module.get("parameters", {})\n            serialization_module_parameters_with_protocol = {\n                "protocol": (\n                    pickle.HIGHEST_PROTOCOL\n                ),  # we use the highest available protocol if we don't pass one\n                **serialization_module_parameters,\n            }\n\n            artifact_type = parameters.get("type", "artifact")\n            artifact_description = parameters.get("description")\n            artifact_metadata = {\n                "source_integration": "dagster_wandb",\n                "source_integration_version": __version__,\n                "source_dagster_run_id": self.dagster_run_id,\n                "source_created_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),\n                "source_python_version": platform.python_version(),\n            }\n            if isinstance(obj, Artifact):\n                if parameters.get("name") is not None:\n       
             raise WandbArtifactsIOManagerError(\n                        "You've provided a 'name' property in the 'wandb_artifact_configuration'"\n                        " settings. However, this 'name' property should only be used when the"\n                        " output isn't already an Artifact object."\n                    )\n\n                if parameters.get("type") is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've provided a 'type' property in the 'wandb_artifact_configuration'"\n                        " settings. However, this 'type' property should only be used when the"\n                        " output isn't already an Artifact object."\n                    )\n\n                if obj.name is None:\n                    raise WandbArtifactsIOManagerError(\n                        "The Weights & Biases (W&B) Artifact you provided is missing a name."\n                        " Please, assign a name to your Artifact."\n                    )\n\n                if context.has_asset_key and obj.name != context.get_asset_identifier()[0]:\n                    asset_identifier = context.get_asset_identifier()[0]\n                    context.log.warning(\n                        f"Please note, the name '{obj.name}' of your Artifact is overwritten by the"\n                        f" name derived from the AssetKey '{asset_identifier}'. 
For consistency and"\n                        " to avoid confusion, we advise sharing a constant for both your asset's"\n                        " name and the artifact's name."\n                    )\n                    obj._name = asset_identifier  # noqa: SLF001\n\n                if context.has_partition_key:\n                    artifact_name = f"{obj.name}.{context.partition_key}"\n                    # The Artifact provided is produced in a partitioned execution we add the\n                    # partition as a suffix to the Artifact name\n                    obj._name = artifact_name  # noqa: SLF001\n\n                if len(serialization_module) != 0:  # not an empty dict\n                    context.log.warning(\n                        "You've included a 'serialization_module' in the"\n                        " 'wandb_artifact_configuration' settings. However, this doesn't have any"\n                        " impact when the output is already an Artifact object."\n                    )\n\n                # The obj is already an Artifact we augment its metadata\n                artifact = obj\n\n                artifact.metadata = {**artifact.metadata, **artifact_metadata}\n\n                if artifact.description is not None and artifact_description is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've given a 'description' in the 'wandb_artifact_configuration'"\n                        " settings for an existing Artifact that already has a description. 
Please,"\n                        " either set the description using 'wandb_artifact_argument' or when"\n                        " creating your Artifact."\n                    )\n                if artifact_description is not None:\n                    artifact.description = artifact_description\n            else:\n                if context.has_asset_key:\n                    if parameters.get("name") is not None:\n                        raise WandbArtifactsIOManagerError(\n                            "You've included a 'name' property in the"\n                            " 'wandb_artifact_configuration' settings. But, a 'name' is only needed"\n                            " when there's no 'AssetKey'. When an Artifact is created from an"\n                            " @asset, it uses the asset name. When it's created from an @op with an"\n                            " 'asset_key' for the output, that value is used. Please remove the"\n                            " 'name' property."\n                        )\n                    artifact_name = context.get_asset_identifier()[0]  # name of asset\n                else:\n                    name_parameter = parameters.get("name")\n                    if name_parameter is None:\n                        raise WandbArtifactsIOManagerError(\n                            "The 'name' property is missing in the 'wandb_artifact_configuration'"\n                            " settings. For Artifacts created from an @op, a 'name' property is"\n                            " needed. 
You could also use an @asset as an alternative."\n                        )\n                    assert name_parameter is not None\n                    artifact_name = name_parameter\n\n                if context.has_partition_key:\n                    artifact_name = f"{artifact_name}.{context.partition_key}"\n\n                # We replace the | character with - because it is not allowed in artifact names\n                # The | character is used in multi-dimensional partition keys\n                artifact_name = str(artifact_name).replace("|", "-")\n\n                # Creates an artifact to hold the obj\n                artifact = self.wandb.Artifact(\n                    name=artifact_name,\n                    type=artifact_type,\n                    description=artifact_description,\n                    metadata=artifact_metadata,\n                )\n                if isinstance(obj, WBValue):\n                    if len(serialization_module) != 0:  # not an empty dict\n                        context.log.warning(\n                            "You've included a 'serialization_module' in the"\n                            " 'wandb_artifact_configuration' settings. 
However, this doesn't have"\n                            " any impact when the output is already an W&B object like e.g Table or"\n                            " Image."\n                        )\n                    # Adds the WBValue object using the class name as the name for the file\n                    artifact.add(obj, obj.__class__.__name__)\n                elif obj is not None:\n                    # The output is not a native wandb Object, we serialize it\n                    pickle_artifact_content(\n                        context,\n                        serialization_module_name,\n                        serialization_module_parameters_with_protocol,\n                        artifact,\n                        obj,\n                    )\n\n            # Add any files: https://docs.wandb.ai/ref/python/artifact#add_file\n            add_files = parameters.get("add_files")\n            if add_files is not None and len(add_files) > 0:\n                for add_file in add_files:\n                    artifact.add_file(**add_file)\n\n            # Add any dirs: https://docs.wandb.ai/ref/python/artifact#add_dir\n            add_dirs = parameters.get("add_dirs")\n            if add_dirs is not None and len(add_dirs) > 0:\n                for add_dir in add_dirs:\n                    artifact.add_dir(**add_dir)\n\n            # Add any reference: https://docs.wandb.ai/ref/python/artifact#add_reference\n            add_references = parameters.get("add_references")\n            if add_references is not None and len(add_references) > 0:\n                for add_reference in add_references:\n                    artifact.add_reference(**add_reference)\n\n            # Augments the aliases\n            aliases = parameters.get("aliases", [])\n            aliases.append(f"dagster-run-{self.dagster_run_id[0:8]}")\n            if "latest" not in aliases:\n                aliases.append("latest")\n\n            # Logs the artifact\n            
self.wandb.log_artifact(artifact, aliases=aliases)\n            artifact.wait()\n\n            # Adds useful metadata to the output or Asset\n            artifacts_base_url = (\n                "https://wandb.ai"\n                if self.wandb_host == WANDB_CLOUD_HOST\n                else self.wandb_host.rstrip("/")\n            )\n            assert artifact.id is not None\n            output_metadata = {\n                "dagster_run_id": MetadataValue.dagster_run(self.dagster_run_id),\n                "wandb_artifact_id": MetadataValue.text(artifact.id),\n                "wandb_artifact_type": MetadataValue.text(artifact.type),\n                "wandb_artifact_version": MetadataValue.text(artifact.version),\n                "wandb_artifact_size": MetadataValue.int(artifact.size),\n                "wandb_artifact_url": MetadataValue.url(\n                    f"{artifacts_base_url}/{run.entity}/{run.project}/artifacts/{artifact.type}/{'/'.join(artifact.name.rsplit(':', 1))}"\n                ),\n                "wandb_entity": MetadataValue.text(run.entity),\n                "wandb_project": MetadataValue.text(run.project),\n                "wandb_run_id": MetadataValue.text(run.id),\n                "wandb_run_name": MetadataValue.text(run.name),\n                "wandb_run_path": MetadataValue.text(run.path),\n                "wandb_run_url": MetadataValue.url(run.url),\n            }\n            context.add_output_metadata(output_metadata)\n\n    def _download_artifact(self, context: InputContext):\n        with self.wandb_run() as run:\n            parameters = {}\n            if context.metadata is not None:\n                parameters = context.metadata.get("wandb_artifact_configuration", {})\n\n            raise_on_unknown_read_configuration_keys(parameters)\n\n            partitions_configuration = parameters.get("partitions", {})\n\n            if not context.has_asset_partitions and len(partitions_configuration) > 0:\n                raise 
WandbArtifactsIOManagerError(\n                    "You've included a 'partitions' value in the 'wandb_artifact_configuration'"\n                    " settings but it's not within a partitioned execution. Please only use"\n                    " 'partitions' within a partitioned context."\n                )\n\n            if context.has_asset_partitions:\n                # Note: this is currently impossible to unit test with current Dagster APIs but was\n                # tested thoroughly manually\n                name = parameters.get("get")\n                path = parameters.get("get_path")\n                if name is not None or path is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've given a value for 'get' and/or 'get_path' in the"\n                        " 'wandb_artifact_configuration' settings during a partitioned execution."\n                        " Please use the 'partitions' property to set 'get' or 'get_path' for each"\n                        " individual partition. 
To set a default value for all partitions, use '*'."\n                    )\n\n                artifact_name = parameters.get("name")\n                if artifact_name is None:\n                    artifact_name = context.asset_key[0][0]  # name of asset\n\n                partitions = [\n                    (key, f"{artifact_name}.{ str(key).replace('|', '-')}")\n                    for key in context.asset_partition_keys\n                ]\n\n                output = {}\n\n                for key, artifact_name in partitions:\n                    context.log.info(f"Handling partition with key '{key}'")\n                    partition_configuration = partitions_configuration.get(\n                        key, partitions_configuration.get("*")\n                    )\n\n                    raise_on_empty_configuration(key, partition_configuration)\n                    raise_on_unknown_partition_keys(key, partition_configuration)\n\n                    partition_version = None\n                    partition_alias = None\n                    if partition_configuration and partition_configuration is not None:\n                        partition_version = partition_configuration.get("version")\n                        partition_alias = partition_configuration.get("alias")\n                        if partition_version is not None and partition_alias is not None:\n                            raise WandbArtifactsIOManagerError(\n                                "You've provided both 'version' and 'alias' for the partition with"\n                                " key '{key}'. You should only use one of these properties at a"\n                                " time. If you choose not to use any, the latest version will be"\n                                " used by default. 
If this partition is configured with the '*'"\n                                " key, please correct the wildcard configuration."\n                            )\n                    partition_identifier = partition_version or partition_alias or "latest"\n\n                    artifact_uri = (\n                        f"{run.entity}/{run.project}/{artifact_name}:{partition_identifier}"\n                    )\n                    try:\n                        api = self.wandb.Api()\n                        api.artifact(artifact_uri)\n                    except Exception as exception:\n                        raise WandbArtifactsIOManagerError(\n                            "The artifact you're attempting to download might not exist, or you"\n                            " might have forgotten to include the 'name' property in the"\n                            " 'wandb_artifact_configuration' settings."\n                        ) from exception\n\n                    artifact = run.use_artifact(artifact_uri)\n\n                    artifacts_path = self._get_artifacts_path(artifact_name, artifact.version)\n                    if partition_configuration and partition_configuration is not None:\n                        partition_name = partition_configuration.get("get")\n                        partition_path = partition_configuration.get("get_path")\n                        if partition_name is not None and partition_path is not None:\n                            raise WandbArtifactsIOManagerError(\n                                "You've provided both 'get' and 'get_path' in the"\n                                " 'wandb_artifact_configuration' settings for the partition with"\n                                " key '{key}'. Only one of these properties should be used. If you"\n                                " choose not to use any, the whole Artifact will be returned. 
If"\n                                " this partition is configured with the '*' key, please correct the"\n                                " wildcard configuration."\n                            )\n\n                        if partition_name is not None:\n                            wandb_object = artifact.get(partition_name)\n                            if wandb_object is not None:\n                                output[key] = wandb_object\n                                continue\n\n                        if partition_path is not None:\n                            path = artifact.get_path(partition_path)\n                            download_path = path.download(root=artifacts_path)\n                            if download_path is not None:\n                                output[key] = download_path\n                                continue\n\n                    artifact_dir = artifact.download(root=artifacts_path, recursive=True)\n                    unpickled_content = unpickle_artifact_content(artifact_dir)\n                    if unpickled_content is not None:\n                        output[key] = unpickled_content\n                        continue\n\n                    artifact.verify(root=artifacts_path)\n                    output[key] = artifact\n\n                if len(output) == 1:\n                    # If there's only one partition, return the value directly\n                    return next(iter(output.values()))\n\n                return output\n\n            elif context.has_asset_key:\n                # Input is an asset\n                if parameters.get("name") is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "A conflict has been detected in the provided configuration settings. 
The"\n                        " 'name' parameter appears to be specified twice - once in the"\n                        " 'wandb_artifact_configuration' metadata dictionary, and again as an"\n                        " AssetKey. Kindly avoid setting the name directly, since the AssetKey will"\n                        " be used for this purpose."\n                    )\n                artifact_name = context.get_asset_identifier()[0]  # name of asset\n            else:\n                artifact_name = parameters.get("name")\n                if artifact_name is None:\n                    raise WandbArtifactsIOManagerError(\n                        "The 'name' property is missing in the 'wandb_artifact_configuration'"\n                        " settings. For Artifacts used in an @op, a 'name' property is required."\n                        " You could use an @asset as an alternative."\n                    )\n\n            if context.has_partition_key:\n                artifact_name = f"{artifact_name}.{context.partition_key}"\n\n            artifact_alias = parameters.get("alias")\n            artifact_version = parameters.get("version")\n\n            if artifact_alias is not None and artifact_version is not None:\n                raise WandbArtifactsIOManagerError(\n                    "You've provided both 'version' and 'alias' in the"\n                    " 'wandb_artifact_configuration' settings. 
Only one should be used at a time."\n                    " If you decide not to use any, the latest version will be applied"\n                    " automatically."\n                )\n\n            artifact_identifier = artifact_alias or artifact_version or "latest"\n            artifact_uri = f"{run.entity}/{run.project}/{artifact_name}:{artifact_identifier}"\n\n            # This try/except block is a workaround for a bug in the W&B SDK, this should be removed\n            # once the bug is fixed.\n            try:\n                artifact = run.use_artifact(artifact_uri)\n            except Exception:\n                api = self.wandb.Api()\n                artifact = api.artifact(artifact_uri)\n\n            name = parameters.get("get")\n            path = parameters.get("get_path")\n            if name is not None and path is not None:\n                raise WandbArtifactsIOManagerError(\n                    "You've provided both 'get' and 'get_path' in the"\n                    " 'wandb_artifact_configuration' settings. 
Only one should be used at a time."\n                    " If you decide not to use any, the entire Artifact will be returned."\n                )\n\n            if name is not None:\n                return artifact.get(name)\n\n            artifacts_path = self._get_artifacts_path(artifact_name, artifact.version)\n            if path is not None:\n                path = artifact.get_path(path)\n                return path.download(root=artifacts_path)\n\n            artifact_dir = artifact.download(root=artifacts_path, recursive=True)\n\n            unpickled_content = unpickle_artifact_content(artifact_dir)\n            if unpickled_content is not None:\n                return unpickled_content\n\n            artifact.verify(root=artifacts_path)\n            return artifact\n\n    def handle_output(self, context: OutputContext, obj) -> None:\n        if obj is None:\n            context.log.warning(\n                "The output value given to the Weights & Biases (W&B) IO Manager is empty. If this"\n                " was intended, you can disregard this warning."\n            )\n        else:\n            try:\n                self._upload_artifact(context, obj)\n            except WandbArtifactsIOManagerError as exception:\n                raise exception\n            except Exception as exception:\n                raise WandbArtifactsIOManagerError() from exception\n\n    def load_input(self, context: InputContext):\n        try:\n            return self._download_artifact(context)\n        except WandbArtifactsIOManagerError as exception:\n            raise exception\n        except Exception as exception:\n            raise WandbArtifactsIOManagerError() from exception\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n required_resource_keys={"wandb_resource", "wandb_config"},\n description="IO manager to read and write W&B Artifacts",\n config_schema={\n "run_name": Field(\n String,\n is_required=False,\n description=(\n "Short display name for this run, which is how you'll identify this run in the UI."\n " By default, it`s set to a string with the following format dagster-run-[8 first"\n " characters of the Dagster Run ID] e.g. dagster-run-7e4df022."\n ),\n ),\n "run_id": Field(\n String,\n is_required=False,\n description=(\n "Unique ID for this run, used for resuming. It must be unique in the project, and"\n " if you delete a run you can't reuse the ID. Use the name field for a short"\n " descriptive name, or config for saving hyperparameters to compare across runs."\n r" The ID cannot contain the following special characters: /\\#?%:.. You need to set"\n " the Run ID when you are doing experiment tracking inside Dagster to allow the IO"\n " Manager to resume the run. By default it`s set to the Dagster Run ID e.g "\n " 7e4df022-1bf2-44b5-a383-bb852df4077e."\n ),\n ),\n "run_tags": Field(\n [String],\n is_required=False,\n description=(\n "A list of strings, which will populate the list of tags on this run in the UI."\n " Tags are useful for organizing runs together, or applying temporary labels like"\n " 'baseline' or 'production'. It's easy to add and remove tags in the UI, or filter"\n " down to just runs with a specific tag. Any W&B Run used by the integration will"\n " have the dagster_wandb tag."\n ),\n ),\n "base_dir": Field(\n String,\n is_required=False,\n description=(\n "Base directory used for local storage and caching. W&B Artifacts and W&B Run logs"\n " will be written and read from that directory. 
By default, it`s using the"\n " DAGSTER_HOME directory."\n ),\n ),\n "cache_duration_in_minutes": Field(\n Int,\n is_required=False,\n description=(\n "Defines the amount of time W&B Artifacts and W&B Run logs should be kept in the"\n " local storage. Only files and directories that were not opened for that amount of"\n " time are removed from the cache. Cache purging happens at the end of an IO"\n " Manager execution. You can set it to 0, if you want to disable caching"\n " completely. Caching improves speed when an Artifact is reused between jobs"\n " running on the same machine. It defaults to 30 days."\n ),\n ),\n },\n)\ndef wandb_artifacts_io_manager(context: InitResourceContext):\n """Dagster IO Manager to create and consume W&B Artifacts.\n\n It allows any Dagster @op or @asset to create and consume W&B Artifacts natively.\n\n For a complete set of documentation, see `Dagster integration <https://docs.wandb.ai/guides/integrations/dagster>`_.\n\n **Example:**\n\n .. code-block:: python\n\n @repository\n def my_repository():\n return [\n *with_resources(\n load_assets_from_current_module(),\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n "wandb_artifacts_manager": wandb_artifacts_io_manager.configured(\n {"cache_duration_in_minutes": 60} # only cache files for one hour\n ),\n },\n resource_config_by_key={\n "wandb_config": {\n "config": {\n "entity": "my_entity",\n "project": "my_project"\n }\n }\n },\n ),\n ]\n\n\n @asset(\n name="my_artifact",\n metadata={\n "wandb_artifact_configuration": {\n "type": "dataset",\n }\n },\n io_manager_key="wandb_artifacts_manager",\n )\n def create_dataset():\n return [1, 2, 3]\n\n """\n wandb_client = context.resources.wandb_resource["sdk"]\n wandb_host = context.resources.wandb_resource["host"]\n wandb_entity = context.resources.wandb_config["entity"]\n wandb_project = 
context.resources.wandb_config["project"]\n\n wandb_run_name = None\n wandb_run_id = None\n wandb_run_tags = None\n base_dir = (\n context.instance.storage_directory() if context.instance else os.environ["DAGSTER_HOME"]\n )\n cache_duration_in_minutes = None\n if context.resource_config is not None:\n wandb_run_name = context.resource_config.get("run_name")\n wandb_run_id = context.resource_config.get("run_id")\n wandb_run_tags = context.resource_config.get("run_tags")\n base_dir = context.resource_config.get("base_dir", base_dir)\n cache_duration_in_minutes = context.resource_config.get("cache_duration_in_minutes")\n\n if "PYTEST_CURRENT_TEST" in os.environ:\n dagster_run_id = "unit-testing"\n else:\n dagster_run_id = context.run_id\n\n assert dagster_run_id is not None\n\n config: Config = {\n "dagster_run_id": dagster_run_id,\n "wandb_host": wandb_host,\n "wandb_entity": wandb_entity,\n "wandb_project": wandb_project,\n "wandb_run_name": wandb_run_name,\n "wandb_run_id": wandb_run_id,\n "wandb_run_tags": wandb_run_tags,\n "base_dir": base_dir,\n "cache_duration_in_minutes": cache_duration_in_minutes,\n }\n return ArtifactsIOManager(wandb_client, config)
\n
", "current_page_name": "_modules/dagster_wandb/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.io_manager"}, "launch": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.launch.ops

\nfrom dagster import OpExecutionContext, op\nfrom wandb.sdk.launch import launch\nfrom wandb.sdk.launch.launch_add import launch_add\n\nfrom .configs import launch_agent_config, launch_config\n\n\ndef raise_on_invalid_config(context: OpExecutionContext):\n    entity = context.resources.wandb_config["entity"]\n    if entity == "":\n        raise RuntimeError(\n            "(dagster_wandb) An empty string was provided for the 'entity' property of the"\n            " 'wandb_config'."\n        )\n\n    project = context.resources.wandb_config["project"]\n    if project == "":\n        raise RuntimeError(\n            "(dagster_wandb) An empty string was provided for the 'project' property of the"\n            " 'wandb_config'."\n        )\n\n\n
[docs]@op(\n required_resource_keys={"wandb_resource", "wandb_config"},\n config_schema=launch_agent_config(),\n)\ndef run_launch_agent(context: OpExecutionContext):\n """It starts a Launch Agent and runs it as a long running process until stopped manually.\n\n Agents are processes that poll launch queues and execute the jobs (or dispatch them to external\n services to be executed) in order.\n\n **Example:**\n\n .. code-block:: YAML\n\n # config.yaml\n\n resources:\n wandb_config:\n config:\n entity: my_entity\n project: my_project\n ops:\n run_launch_agent:\n config:\n max_jobs: -1\n queues:\n - my_dagster_queue\n\n .. code-block:: python\n\n from dagster_wandb.launch.ops import run_launch_agent\n from dagster_wandb.resources import wandb_resource\n\n from dagster import job, make_values_resource\n\n\n @job(\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n },\n )\n def run_launch_agent_example():\n run_launch_agent()\n\n """\n raise_on_invalid_config(context)\n config = {\n "entity": context.resources.wandb_config["entity"],\n "project": context.resources.wandb_config["project"],\n **context.op_config,\n }\n context.log.info(f"Launch agent configuration: {config}")\n context.log.info("Running Launch agent...")\n launch.create_and_run_agent(api=context.resources.wandb_resource["api"], config=config)
\n\n\n
[docs]@op(\n required_resource_keys={\n "wandb_resource",\n "wandb_config",\n },\n config_schema=launch_config(),\n)\ndef run_launch_job(context: OpExecutionContext):\n """Executes a Launch job.\n\n A Launch job is assigned to a queue in order to be executed. You can create a queue or use the\n default one. Make sure you have an active agent listening to that queue. You can run an agent\n inside your Dagster instance but can also consider using a deployable agent in Kubernetes.\n\n **Example:**\n\n .. code-block:: YAML\n\n # config.yaml\n\n resources:\n wandb_config:\n config:\n entity: my_entity\n project: my_project\n ops:\n my_launched_job:\n config:\n entry_point:\n - python\n - train.py\n queue: my_dagster_queue\n uri: https://github.com/wandb/example-dagster-integration-with-launch\n\n .. code-block:: python\n\n from dagster_wandb.launch.ops import run_launch_job\n from dagster_wandb.resources import wandb_resource\n\n from dagster import job, make_values_resource\n\n\n @job(\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n },\n )\n def run_launch_job_example():\n run_launch_job.alias("my_launched_job")() # we rename the job with an alias\n\n """\n raise_on_invalid_config(context)\n config = {\n "entity": context.resources.wandb_config["entity"],\n "project": context.resources.wandb_config["project"],\n **context.op_config,\n }\n context.log.info(f"Launch job configuration: {config}")\n\n queue = context.op_config.get("queue")\n if queue is None:\n context.log.info("No queue provided, running Launch job locally")\n launch.run(api=context.resources.wandb_resource["api"], config=config)\n else:\n synchronous = config.get("synchronous", True)\n config.pop("synchronous", None)\n queued_run = launch_add(**config)\n if synchronous is True:\n context.log.info(\n f"Synchronous Launch job added to queue with name={queue}. 
Waiting for"\n " completion..."\n )\n queued_run.wait_until_finished()\n else:\n context.log.info(f"Asynchronous Launch job added to queue with name={queue}")
\n
", "current_page_name": "_modules/dagster_wandb/launch/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.launch.ops"}}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.resources

\nfrom typing import Any, Dict\n\nimport wandb\nfrom dagster import Field, InitResourceContext, String, StringSource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom wandb.sdk.internal.internal_api import Api\n\nWANDB_CLOUD_HOST: str = "https://api.wandb.ai"\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n description="W&B API key necessary to communicate with the W&B API.",\n is_required=True,\n ),\n "host": Field(\n String,\n description=(\n "API host server you wish to use. Only required if you are using W&B Server."\n ),\n is_required=False,\n default_value=WANDB_CLOUD_HOST,\n ),\n },\n description="Resource for interacting with Weights & Biases",\n)\ndef wandb_resource(context: InitResourceContext) -> Dict[str, Any]:\n """Dagster resource used to communicate with the W&B API. It's useful when you want to use the\n wandb client within your ops and assets. It's a required resources if you are using the W&B IO\n Manager.\n\n It automatically authenticates using the provided API key.\n\n For a complete set of documentation, see `Dagster integration <https://docs.wandb.ai/guides/integrations/dagster>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Example:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_wandb import wandb_resource\n\n my_wandb_resource = wandb_resource.configured({"api_key": {"env": "WANDB_API_KEY"}})\n\n @job(resource_defs={"wandb_resource": my_wandb_resource})\n def my_wandb_job():\n ...\n\n """\n api_key = context.resource_config["api_key"]\n host = context.resource_config["host"]\n wandb.login(\n key=api_key,\n host=host,\n anonymous="never",\n )\n client_settings = wandb.Settings(\n api_key=api_key,\n base_url=host,\n anonymous="never",\n launch=True,\n )\n api = Api(default_settings=client_settings, load_settings=False)\n return {"sdk": wandb, "api": api, "host": host}
\n
", "current_page_name": "_modules/dagster_wandb/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.types

\nimport sys\n\nif sys.version_info >= (3, 8):\n    from typing import TypedDict\nelse:\n    from typing_extensions import TypedDict\n\nfrom typing import Any, Dict, List\n\n\n
[docs]class SerializationModule(TypedDict, total=False):\n """W&B Artifacts IO Manager configuration of the serialization module. Useful for type checking."""\n\n name: str\n parameters: Dict[str, Any]
\n\n\n
[docs]class WandbArtifactConfiguration(TypedDict, total=False):\n """W&B Artifacts IO Manager configuration. Useful for type checking."""\n\n name: str\n type: str\n description: str\n aliases: List[str]\n add_dirs: List[Dict[str, Any]]\n add_files: List[Dict[str, Any]]\n add_references: List[Dict[str, Any]]\n serialization_module: SerializationModule\n partitions: Dict[str, Dict[str, Any]]
\n
", "current_page_name": "_modules/dagster_wandb/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.types"}, "utils": {"errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.utils.errors

\n
[docs]class WandbArtifactsIOManagerError(Exception):\n """Represents an execution error of the W&B Artifacts IO Manager."""\n\n def __init__(self, message="A W&B Artifacts IO Manager error occurred."):\n self.message = message\n super().__init__(self.message)
\n\n\nSUPPORTED_READ_CONFIG_KEYS = [\n "alias",\n "get_path",\n "get",\n "name",\n "partitions",\n "version",\n]\nSUPPORTED_WRITE_CONFIG_KEYS = [\n "add_dirs",\n "add_files",\n "add_references",\n "aliases",\n "description",\n "name",\n "partitions",\n "serialization_module",\n "type",\n]\nSUPPORTED_PARTITION_CONFIG_KEYS = ["get", "get_path", "version", "alias"]\n\n\ndef raise_on_empty_configuration(partition_key, dictionary):\n if dictionary is not None and len(dictionary) == 0:\n raise WandbArtifactsIOManagerError(\n f"The configuration is empty for the partition identified by the key '{partition_key}'."\n " This happened within the 'wandb_artifact_configuration' metadata dictionary."\n )\n\n\ndef raise_on_unknown_keys(supported_config_keys, dictionary, is_read_config):\n if dictionary is None:\n return\n\n unsupported_keys = [key for key in dictionary.keys() if key not in supported_config_keys]\n if len(unsupported_keys) > 0:\n if is_read_config:\n raise WandbArtifactsIOManagerError(\n f"The configuration keys '{unsupported_keys}' you are trying to use are not"\n " supported within the 'wandb_artifact_configuration' metadata dictionary when"\n " reading an Artifact."\n )\n else:\n raise WandbArtifactsIOManagerError(\n f"The configuration keys '{unsupported_keys}' you are trying to use are not"\n " supported within the 'wandb_artifact_configuration' metadata dictionary when"\n " writing an Artifact."\n )\n\n\ndef raise_on_unknown_write_configuration_keys(dictionary):\n raise_on_unknown_keys(SUPPORTED_WRITE_CONFIG_KEYS, dictionary, False)\n\n\ndef raise_on_unknown_read_configuration_keys(dictionary):\n raise_on_unknown_keys(SUPPORTED_READ_CONFIG_KEYS, dictionary, True)\n\n\ndef raise_on_unknown_partition_keys(partition_key, dictionary):\n if dictionary is None:\n return\n\n unsupported_keys = [\n key for key in dictionary.keys() if key not in SUPPORTED_PARTITION_CONFIG_KEYS\n ]\n if len(unsupported_keys) > 0:\n raise WandbArtifactsIOManagerError(\n f"The 
configuration keys '{unsupported_keys}' you are trying to use are not supported"\n f" for the partition identified by the key '{partition_key}'. This happened within the"\n " 'wandb_artifact_configuration' metadata dictionary."\n )\n
", "current_page_name": "_modules/dagster_wandb/utils/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.utils.errors"}}}, "dagstermill": {"asset_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.asset_factory

\nimport pickle\nimport tempfile\nfrom typing import Any, Callable, Iterable, Mapping, Optional, Set, Type, Union, cast\n\nimport dagster._check as check\nfrom dagster import (\n    AssetIn,\n    AssetKey,\n    AssetsDefinition,\n    Failure,\n    Output,\n    PartitionsDefinition,\n    ResourceDefinition,\n    RetryPolicy,\n    RetryRequested,\n    SourceAsset,\n    asset,\n)\nfrom dagster._config.pythonic_config import Config, infer_schema_from_config_class\nfrom dagster._config.pythonic_config.type_check_utils import safe_is_subclass\nfrom dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.execution.context.compute import OpExecutionContext\n\nfrom dagstermill.factory import _clean_path_for_windows, execute_notebook\n\n\ndef _make_dagstermill_asset_compute_fn(\n    name: str,\n    notebook_path: str,\n    save_notebook_on_failure: bool,\n) -> Callable:\n    def _t_fn(context: OpExecutionContext, **inputs) -> Iterable:\n        check.param_invariant(\n            isinstance(context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            executed_notebook_path = execute_notebook(\n                context.get_step_execution_context(),\n                name=name,\n                inputs=inputs,\n                save_notebook_on_failure=save_notebook_on_failure,\n                notebook_path=notebook_path,\n                output_notebook_dir=output_notebook_dir,\n            )\n\n            with open(executed_notebook_path, "rb") as fd:\n                yield Output(fd.read())\n\n            # deferred import for perf\n            import scrapbook\n\n            output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n            for key, value in output_nb.scraps.items():\n                if 
key.startswith("event-"):\n                    with open(value.data, "rb") as fd:\n                        event = pickle.loads(fd.read())\n                        if isinstance(event, (Failure, RetryRequested)):\n                            raise event\n                        else:\n                            yield event\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_asset(\n name: str,\n notebook_path: str,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n save_notebook_on_failure: bool = False,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n) -> AssetsDefinition:\n """Creates a Dagster asset for a Jupyter notebook.\n\n Arguments:\n name (str): The name for the asset\n notebook_path (str): Path to the backing notebook\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in dagster (ie only\n contains letters, numbers, and _) and may not contain python reserved keywords.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]): The assets\n that are upstream dependencies, but do not pass an input value to the notebook.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. 
If not set, Dagster will accept any config provided for the op.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the notebook.\n description (Optional[str]): Description of the asset to display in the Dagster UI.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If not provided,\n the name "default" is used.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]):\n (Experimental) A mapping of resource keys to resource definitions. These resources\n will be initialized during execution, and can be accessed from the\n context within the notebook.\n io_manager_key (Optional[str]): A string key for the IO manager used to store the output notebook.\n If not provided, the default key output_notebook_io_manager will be used.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n save_notebook_on_failure (bool): If True and the notebook fails during execution, the failed notebook will be\n written to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\n Defaults to False.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead. Set of asset keys that are\n upstream dependencies, but do not pass an input to the asset.\n\n Examples:\n .. 
code-block:: python\n\n from dagstermill import define_dagstermill_asset\n from dagster import asset, AssetIn, AssetKey\n from sklearn import datasets\n import pandas as pd\n import numpy as np\n\n @asset\n def iris_dataset():\n sk_iris = datasets.load_iris()\n return pd.DataFrame(\n data=np.c_[sk_iris["data"], sk_iris["target"]],\n columns=sk_iris["feature_names"] + ["target"],\n )\n\n iris_kmeans_notebook = define_dagstermill_asset(\n name="iris_kmeans_notebook",\n notebook_path="/path/to/iris_kmeans.ipynb",\n ins={\n "iris": AssetIn(key=AssetKey("iris_dataset"))\n }\n )\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.bool_param(save_notebook_on_failure, "save_notebook_on_failure")\n\n required_resource_keys = set(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n ins = check.opt_mapping_param(ins, "ins", key_type=str, value_type=AssetIn)\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n key_prefix = check.opt_list_param(key_prefix, "key_prefix", of_type=str)\n\n default_description = f"This asset is backed by the notebook at {notebook_path}"\n description = check.opt_str_param(description, "description", default=default_description)\n\n io_mgr_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="output_notebook_io_manager"\n )\n\n user_tags = validate_tags(op_tags)\n if op_tags is not None:\n check.invariant(\n "notebook_path" not in op_tags,\n "user-defined op tags contains the `notebook_path` key, but the `notebook_path` key"\n " is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in op_tags,\n "user-defined op tags contains the `kind` key, but the `kind` key is reserved for"\n " use by Dagster",\n )\n\n default_tags = {"notebook_path": _clean_path_for_windows(notebook_path), "kind": "ipynb"}\n\n if safe_is_subclass(config_schema, Config):\n config_schema = infer_schema_from_config_class(cast(Type[Config], 
config_schema))\n\n return asset(\n name=name,\n key_prefix=key_prefix,\n ins=ins,\n deps=deps,\n metadata=metadata,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n partitions_def=partitions_def,\n op_tags={**user_tags, **default_tags},\n group_name=group_name,\n output_required=False,\n io_manager_key=io_mgr_key,\n retry_policy=retry_policy,\n non_argument_deps=non_argument_deps,\n )(\n _make_dagstermill_asset_compute_fn(\n name=name,\n notebook_path=notebook_path,\n save_notebook_on_failure=save_notebook_on_failure,\n )\n )
\n
", "current_page_name": "_modules/dagstermill/asset_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.asset_factory"}, "context": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.context

\nfrom typing import AbstractSet, Any, Mapping, Optional, cast\n\nfrom dagster import (\n    DagsterRun,\n    JobDefinition,\n    OpDefinition,\n    _check as check,\n)\nfrom dagster._annotations import public\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.execution.context.compute import AbstractComputeExecutionContext\nfrom dagster._core.execution.context.system import PlanExecutionContext, StepExecutionContext\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]class DagstermillExecutionContext(AbstractComputeExecutionContext):\n """Dagstermill-specific execution context.\n\n Do not initialize directly: use :func:`dagstermill.get_context`.\n """\n\n def __init__(\n self,\n job_context: PlanExecutionContext,\n job_def: JobDefinition,\n resource_keys_to_init: AbstractSet[str],\n op_name: str,\n node_handle: NodeHandle,\n op_config: Any = None,\n ):\n self._job_context = check.inst_param(job_context, "job_context", PlanExecutionContext)\n self._job_def = check.inst_param(job_def, "job_def", JobDefinition)\n self._resource_keys_to_init = check.set_param(\n resource_keys_to_init, "resource_keys_to_init", of_type=str\n )\n self.op_name = check.str_param(op_name, "op_name")\n self.node_handle = check.inst_param(node_handle, "node_handle", NodeHandle)\n self._op_config = op_config\n\n def has_tag(self, key: str) -> bool:\n """Check if a logging tag is defined on the context.\n\n Args:\n key (str): The key to check.\n\n Returns:\n bool\n """\n check.str_param(key, "key")\n return self._job_context.has_tag(key)\n\n def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag defined on the context.\n\n Args:\n key (str): The key to get.\n\n Returns:\n str\n """\n check.str_param(key, "key")\n return self._job_context.get_tag(key)\n\n @public\n @property\n def run_id(self) -> str:\n """str: The run_id for the context."""\n return self._job_context.run_id\n\n @public\n @property\n def run_config(self) -> Mapping[str, Any]:\n """dict: The run_config for the context."""\n return self._job_context.run_config\n\n @property\n def resolved_run_config(self) -> ResolvedRunConfig:\n """:class:`dagster.ResolvedRunConfig`: The resolved_run_config for the context."""\n return self._job_context.resolved_run_config\n\n @public\n @property\n def logging_tags(self) -> Mapping[str, str]:\n """dict: The logging tags for the context."""\n return self._job_context.logging_tags\n\n @public\n @property\n def job_name(self) -> str:\n """str: 
The name of the executing job."""\n return self._job_context.job_name\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """:class:`dagster.JobDefinition`: The job definition for the context.\n\n This will be a dagstermill-specific shim.\n """\n return self._job_def\n\n @property\n def resources(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n resources.\n """\n return self._job_context.scoped_resources_builder.build(\n required_resource_keys=self._resource_keys_to_init,\n )\n\n @public\n @property\n def run(self) -> DagsterRun:\n """:class:`dagster.DagsterRun`: The job run for the context."""\n return cast(DagsterRun, self._job_context.dagster_run)\n\n @property\n def log(self) -> DagsterLogManager:\n """:class:`dagster.DagsterLogManager`: The log manager for the context.\n\n Call, e.g., ``log.info()`` to log messages through the Dagster machinery.\n """\n return self._job_context.log\n\n @public\n @property\n def op_def(self) -> OpDefinition:\n """:class:`dagster.OpDefinition`: The op definition for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether an\n op definition was passed to ``dagstermill.get_context``.\n """\n return cast(OpDefinition, self._job_def.node_def_named(self.op_name))\n\n @property\n def node(self) -> Node:\n """:class:`dagster.Node`: The node for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether an\n op definition was passed to ``dagstermill.get_context``.\n """\n return self.job_def.get_node(self.node_handle)\n\n @public\n @property\n def op_config(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n op-specific config.\n """\n if self._op_config:\n return self._op_config\n\n op_config = self.resolved_run_config.ops.get(self.op_name)\n return op_config.config if op_config else None
\n\n\nclass DagstermillRuntimeExecutionContext(DagstermillExecutionContext):\n def __init__(\n self,\n job_context: PlanExecutionContext,\n job_def: JobDefinition,\n resource_keys_to_init: AbstractSet[str],\n op_name: str,\n step_context: StepExecutionContext,\n node_handle: NodeHandle,\n op_config: Any = None,\n ):\n self._step_context = check.inst_param(step_context, "step_context", StepExecutionContext)\n super().__init__(\n job_context,\n job_def,\n resource_keys_to_init,\n op_name,\n node_handle,\n op_config,\n )\n\n @property\n def step_context(self) -> StepExecutionContext:\n return self._step_context\n
", "current_page_name": "_modules/dagstermill/context", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.context"}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.errors

\nfrom dagster._core.errors import DagsterError\n\n\n
[docs]class DagstermillError(DagsterError):\n """Base class for errors raised by dagstermill."""
\n
", "current_page_name": "_modules/dagstermill/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.errors"}, "factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.factory

\nimport copy\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport uuid\nfrom typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Set, Type, Union, cast\n\nimport nbformat\nimport papermill\nfrom dagster import (\n    In,\n    OpDefinition,\n    Out,\n    Output,\n    _check as check,\n    _seven,\n)\nfrom dagster._config.pythonic_config import Config, infer_schema_from_config_class\nfrom dagster._config.pythonic_config.type_check_utils import safe_is_subclass\nfrom dagster._core.definitions.events import AssetMaterialization, Failure, RetryRequested\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dagster._core.execution.context.input import build_input_context\nfrom dagster._core.execution.context.system import StepExecutionContext\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._serdes import pack_value\nfrom dagster._seven import get_system_temp_directory\nfrom dagster._utils import mkdir_p, safe_tempfile_path\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom papermill.engines import papermill_engines\nfrom papermill.iorw import load_notebook_node, write_ipynb\n\nfrom .compat import ExecutionError\nfrom .engine import DagstermillEngine\nfrom .errors import DagstermillError\nfrom .translator import DagsterTranslator\n\n\ndef _clean_path_for_windows(notebook_path: str) -> str:\n    """In windows, the notebook can't render in the Dagster UI unless the C: prefix is removed.\n    os.path.splitdrive will split the path into (drive, tail), so just return the tail.\n    """\n    return os.path.splitdrive(notebook_path)[1]\n\n\n# https://github.com/nteract/papermill/blob/17d4bbb3960c30c263bca835e48baf34322a3530/papermill/parameterize.py\ndef 
_find_first_tagged_cell_index(nb, tag):\n    parameters_indices = []\n    for idx, cell in enumerate(nb.cells):\n        if tag in cell.metadata.tags:\n            parameters_indices.append(idx)\n    if not parameters_indices:\n        return -1\n    return parameters_indices[0]\n\n\n# This is based on papermill.parameterize.parameterize_notebook\n# Typically, papermill injects the injected-parameters cell *below* the parameters cell\n# but we want to *replace* the parameters cell, which is what this function does.\ndef replace_parameters(context, nb, parameters):\n    """Assigned parameters into the appropriate place in the input notebook.\n\n    Args:\n        nb (NotebookNode): Executable notebook object\n        parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.\n    """\n    check.dict_param(parameters, "parameters")\n\n    # Copy the nb object to avoid polluting the input\n    nb = copy.deepcopy(nb)\n\n    # papermill method chooses translator based on kernel_name and language, but we just call the\n    # DagsterTranslator to generate parameter content based on the kernel_name\n    param_content = DagsterTranslator.codify(parameters)\n\n    newcell = nbformat.v4.new_code_cell(source=param_content)\n    newcell.metadata["tags"] = ["injected-parameters"]\n\n    param_cell_index = _find_first_tagged_cell_index(nb, "parameters")\n    injected_cell_index = _find_first_tagged_cell_index(nb, "injected-parameters")\n    if injected_cell_index >= 0:\n        # Replace the injected cell with a new version\n        before = nb.cells[:injected_cell_index]\n        after = nb.cells[injected_cell_index + 1 :]\n        check.int_value_param(param_cell_index, -1, "param_cell_index")\n        # We should have blown away the parameters cell if there is an injected-parameters cell\n    elif param_cell_index >= 0:\n        # Replace the parameter cell with the injected-parameters cell\n        before = nb.cells[:param_cell_index]\n        after = 
nb.cells[param_cell_index + 1 :]\n    else:\n        # Inject to the top of the notebook, presumably first cell includes dagstermill import\n        context.log.debug(\n            "Executing notebook with no tagged parameters cell: injecting boilerplate in first "\n            "cell."\n        )\n        before = []\n        after = nb.cells\n\n    nb.cells = before + [newcell] + after\n    nb.metadata.papermill["parameters"] = _seven.json.dumps(parameters)\n\n    return nb\n\n\ndef get_papermill_parameters(\n    step_context: StepExecutionContext,\n    inputs: Mapping[str, object],\n    output_log_path: str,\n    compute_descriptor: str,\n) -> Mapping[str, object]:\n    check.param_invariant(\n        isinstance(step_context.run_config, dict),\n        "step_context",\n        "StepExecutionContext must have valid run_config",\n    )\n\n    run_id = step_context.run_id\n    temp_dir = get_system_temp_directory()\n    marshal_dir = os.path.normpath(os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))\n    mkdir_p(marshal_dir)\n\n    if not isinstance(step_context.job, ReconstructableJob):\n        if compute_descriptor == "asset":\n            raise DagstermillError(\n                "Can't execute a dagstermill asset that is not reconstructable. "\n                "Use the reconstructable() function if executing from python"\n            )\n        else:\n            raise DagstermillError(\n                "Can't execute a dagstermill op from a job that is not reconstructable. 
"\n                "Use the reconstructable() function if executing from python"\n            )\n\n    dm_executable_dict = step_context.job.to_dict()\n\n    dm_context_dict = {\n        "output_log_path": output_log_path,\n        "marshal_dir": marshal_dir,\n        "run_config": step_context.run_config,\n    }\n\n    dm_node_handle_kwargs = step_context.node_handle._asdict()\n    dm_step_key = step_context.step.key\n\n    parameters = {}\n\n    parameters["__dm_context"] = dm_context_dict\n    parameters["__dm_executable_dict"] = dm_executable_dict\n    parameters["__dm_pipeline_run_dict"] = pack_value(step_context.dagster_run)\n    parameters["__dm_node_handle_kwargs"] = dm_node_handle_kwargs\n    parameters["__dm_instance_ref_dict"] = pack_value(step_context.instance.get_ref())\n    parameters["__dm_step_key"] = dm_step_key\n    parameters["__dm_input_names"] = list(inputs.keys())\n\n    return parameters\n\n\ndef execute_notebook(\n    step_context: StepExecutionContext,\n    name: str,\n    save_notebook_on_failure: bool,\n    notebook_path: str,\n    output_notebook_dir: str,\n    inputs: Mapping[str, object],\n) -> str:\n    with safe_tempfile_path() as output_log_path:\n        prefix = str(uuid.uuid4())\n        parameterized_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-inter.ipynb")\n\n        executed_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-out.ipynb")\n\n        # Scaffold the registration here\n        nb = load_notebook_node(notebook_path)\n        compute_descriptor = "op"\n        nb_no_parameters = replace_parameters(\n            step_context,\n            nb,\n            get_papermill_parameters(\n                step_context,\n                inputs,\n                output_log_path,\n                compute_descriptor,\n            ),\n        )\n        write_ipynb(nb_no_parameters, parameterized_notebook_path)\n\n        try:\n            papermill_engines.register("dagstermill", DagstermillEngine)\n    
        papermill.execute_notebook(\n                input_path=parameterized_notebook_path,\n                output_path=executed_notebook_path,\n                engine_name="dagstermill",\n                log_output=True,\n            )\n\n        except Exception as ex:\n            step_context.log.warn(\n                "Error when attempting to materialize executed notebook: {exc}".format(\n                    exc=str(serializable_error_info_from_exc_info(sys.exc_info()))\n                )\n            )\n\n            if isinstance(ex, ExecutionError):\n                exception_name = ex.ename  # type: ignore\n                if exception_name in ["RetryRequested", "Failure"]:\n                    step_context.log.warn(\n                        f"Encountered raised {exception_name} in notebook. Use"\n                        " dagstermill.yield_event with RetryRequested or Failure to trigger"\n                        " their behavior."\n                    )\n\n            if save_notebook_on_failure:\n                storage_dir = step_context.instance.storage_directory()\n                storage_path = os.path.join(storage_dir, f"{prefix}-out.ipynb")\n                with open(storage_path, "wb") as dest_file_obj:\n                    with open(executed_notebook_path, "rb") as obj:\n                        dest_file_obj.write(obj.read())\n\n                step_context.log.info(f"Failed notebook written to {storage_path}")\n\n            raise\n\n    step_context.log.debug(f"Notebook execution complete for {name} at {executed_notebook_path}.")\n\n    return executed_notebook_path\n\n\ndef _handle_events_from_notebook(\n    step_context: StepExecutionContext, executed_notebook_path: str\n) -> Iterable:\n    # deferred import for perf\n    import scrapbook\n\n    output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n    for output_name in step_context.op_def.output_dict.keys():\n        data_dict = output_nb.scraps.data_dict\n        if output_name 
in data_dict:\n            # read outputs that were passed out of process via io manager from `yield_result`\n            step_output_handle = StepOutputHandle(\n                step_key=step_context.step.key,\n                output_name=output_name,\n            )\n            output_context = step_context.get_output_context(step_output_handle)\n            io_manager = step_context.get_io_manager(step_output_handle)\n            value = io_manager.load_input(\n                build_input_context(\n                    upstream_output=output_context, dagster_type=output_context.dagster_type\n                )\n            )\n\n            yield Output(value, output_name)\n\n    for key, value in output_nb.scraps.items():\n        if key.startswith("event-"):\n            with open(value.data, "rb") as fd:\n                event = pickle.loads(fd.read())\n                if isinstance(event, (Failure, RetryRequested)):\n                    raise event\n                else:\n                    yield event\n\n\ndef _make_dagstermill_compute_fn(\n    dagster_factory_name: str,\n    name: str,\n    notebook_path: str,\n    output_notebook_name: Optional[str] = None,\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    output_notebook: Optional[str] = None,\n    save_notebook_on_failure: bool = False,\n) -> Callable:\n    def _t_fn(op_context: OpExecutionContext, inputs: Mapping[str, object]) -> Iterable:\n        check.param_invariant(\n            isinstance(op_context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        step_context = op_context.get_step_execution_context()\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            executed_notebook_path = execute_notebook(\n                step_context,\n                name=name,\n                inputs=inputs,\n                save_notebook_on_failure=save_notebook_on_failure,\n                
notebook_path=notebook_path,\n                output_notebook_dir=output_notebook_dir,\n            )\n\n            if output_notebook_name is not None:\n                # yield output notebook binary stream as an op output\n                with open(executed_notebook_path, "rb") as fd:\n                    yield Output(fd.read(), output_notebook_name)\n\n            else:\n                # backcompat\n                executed_notebook_file_handle = None\n                try:\n                    # use binary mode when when moving the file since certain file_managers such as S3\n                    # may try to hash the contents\n                    with open(executed_notebook_path, "rb") as fd:\n                        executed_notebook_file_handle = op_context.resources.file_manager.write(\n                            fd, mode="wb", ext="ipynb"\n                        )\n                        executed_notebook_materialization_path = (\n                            executed_notebook_file_handle.path_desc\n                        )\n\n                    yield AssetMaterialization(\n                        asset_key=[*(asset_key_prefix or []), f"{name}_output_notebook"],\n                        description="Location of output notebook in file manager",\n                        metadata={\n                            "path": MetadataValue.path(executed_notebook_materialization_path),\n                        },\n                    )\n\n                except Exception:\n                    # if file manager writing errors, e.g. 
file manager is not provided, we throw a warning\n                    # and fall back to the previously stored temp executed notebook.\n                    op_context.log.warning(\n                        "Error when attempting to materialize executed notebook using file"\n                        " manager:"\n                        f" {serializable_error_info_from_exc_info(sys.exc_info())}\\nNow"\n                        " falling back to local: notebook execution was temporarily materialized"\n                        f" at {executed_notebook_path}\\nIf you have supplied a file manager and"\n                        " expect to use it for materializing the notebook, please include"\n                        ' "file_manager" in the `required_resource_keys` argument to'\n                        f" `{dagster_factory_name}`"\n                    )\n\n                if output_notebook is not None:\n                    yield Output(executed_notebook_file_handle, output_notebook)\n\n            yield from _handle_events_from_notebook(step_context, executed_notebook_path)\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_op(\n name: str,\n notebook_path: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n output_notebook_name: Optional[str] = None,\n asset_key_prefix: Optional[Union[Sequence[str], str]] = None,\n description: Optional[str] = None,\n tags: Optional[Mapping[str, Any]] = None,\n io_manager_key: Optional[str] = None,\n save_notebook_on_failure: bool = False,\n) -> OpDefinition:\n """Wrap a Jupyter notebook in a op.\n\n Arguments:\n name (str): The name of the op.\n notebook_path (str): Path to the backing notebook.\n ins (Optional[Mapping[str, In]]): The op's inputs.\n outs (Optional[Mapping[str, Out]]): The op's outputs. Your notebook should\n call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n output_notebook_name: (Optional[str]): If set, will be used as the name of an injected output\n of type of :py:class:`~dagster.BufferedIOBase` that is the file object of the executed\n notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always\n created). 
It allows the downstream ops to access the executed notebook via a file\n object.\n asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n asset keys for materialized notebooks.\n description (Optional[str]): If set, description used for op.\n tags (Optional[Dict[str, str]]): If set, additional tags used to annotate op.\n Dagster uses the tag keys `notebook_path` and `kind`, which cannot be\n overwritten by the user.\n io_manager_key (Optional[str]): If using output_notebook_name, you can additionally provide\n a string key for the IO manager used to store the output notebook.\n If not provided, the default key output_notebook_io_manager will be used.\n save_notebook_on_failure (bool): If True and the notebook fails during execution, the failed notebook will be\n written to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\n Defaults to False.\n\n Returns:\n :py:class:`~dagster.OpDefinition`\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.bool_param(save_notebook_on_failure, "save_notebook_on_failure")\n\n required_resource_keys = set(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n outs = check.opt_mapping_param(outs, "outs", key_type=str, value_type=Out)\n ins = check.opt_mapping_param(ins, "ins", key_type=str, value_type=In)\n\n if output_notebook_name is not None:\n io_mgr_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="output_notebook_io_manager"\n )\n required_resource_keys.add(io_mgr_key)\n outs = {\n **outs,\n cast(str, output_notebook_name): Out(io_manager_key=io_mgr_key),\n }\n\n if isinstance(asset_key_prefix, str):\n asset_key_prefix = [asset_key_prefix]\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n default_description = f"This op is backed by the notebook at {notebook_path}"\n description = 
check.opt_str_param(description, "description", default=default_description)\n\n user_tags = validate_tags(tags)\n if tags is not None:\n check.invariant(\n "notebook_path" not in tags,\n "user-defined op tags contains the `notebook_path` key, but the `notebook_path` key"\n " is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in tags,\n "user-defined op tags contains the `kind` key, but the `kind` key is reserved for"\n " use by Dagster",\n )\n default_tags = {"notebook_path": _clean_path_for_windows(notebook_path), "kind": "ipynb"}\n\n if safe_is_subclass(config_schema, Config):\n config_schema = infer_schema_from_config_class(cast(Type[Config], config_schema))\n\n return OpDefinition(\n name=name,\n compute_fn=_make_dagstermill_compute_fn(\n "define_dagstermill_op",\n name,\n notebook_path,\n output_notebook_name,\n asset_key_prefix=asset_key_prefix,\n save_notebook_on_failure=save_notebook_on_failure,\n ),\n ins=ins,\n outs=outs,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n description=description,\n tags={**user_tags, **default_tags},\n )
\n
", "current_page_name": "_modules/dagstermill/factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.factory"}, "io_managers": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.io_managers

\nimport os\nfrom pathlib import Path\nfrom typing import Any, List, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    ConfigurableIOManagerFactory,\n    InitResourceContext,\n    IOManager,\n)\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager, io_manager\nfrom dagster._utils import mkdir_p\nfrom pydantic import Field\n\nfrom dagstermill.factory import _clean_path_for_windows\n\n\nclass OutputNotebookIOManager(IOManager):\n    def __init__(self, asset_key_prefix: Optional[Sequence[str]] = None):\n        self.asset_key_prefix = asset_key_prefix if asset_key_prefix else []\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        raise NotImplementedError\n\n    def load_input(self, context: InputContext) -> Any:\n        raise NotImplementedError\n\n\nclass LocalOutputNotebookIOManager(OutputNotebookIOManager):\n    def __init__(self, base_dir: str, asset_key_prefix: Optional[Sequence[str]] = None):\n        super(LocalOutputNotebookIOManager, self).__init__(asset_key_prefix=asset_key_prefix)\n        self.base_dir = base_dir\n        self.write_mode = "wb"\n        self.read_mode = "rb"\n\n    def _get_path(self, context: OutputContext) -> str:\n        """Automatically construct filepath."""\n        if context.has_asset_key:\n            keys = context.get_asset_identifier()\n        else:\n            keys = context.get_run_scoped_output_identifier()\n        return str(Path(self.base_dir, *keys).with_suffix(".ipynb"))\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        """obj: bytes."""\n        check.inst_param(context, "context", OutputContext)\n\n        # the output notebook itself is stored at output_file_path\n        
output_notebook_path = self._get_path(context)\n        mkdir_p(os.path.dirname(output_notebook_path))\n        with open(output_notebook_path, self.write_mode) as dest_file_obj:\n            dest_file_obj.write(obj)\n\n        metadata = {\n            "Executed notebook": MetadataValue.notebook(\n                _clean_path_for_windows(output_notebook_path)\n            )\n        }\n\n        if context.has_asset_key:\n            context.add_output_metadata(metadata)\n        else:\n            context.log_event(\n                AssetMaterialization(\n                    asset_key=AssetKey(\n                        [*self.asset_key_prefix, f"{context.step_key}_output_notebook"]\n                    ),\n                    metadata=metadata,\n                )\n            )\n\n    def load_input(self, context: InputContext) -> bytes:\n        check.inst_param(context, "context", InputContext)\n        # pass output notebook to downstream ops as File Object\n        output_context = check.not_none(context.upstream_output)\n        with open(self._get_path(output_context), self.read_mode) as file_obj:\n            return file_obj.read()\n\n\n
[docs]class ConfigurableLocalOutputNotebookIOManager(ConfigurableIOManagerFactory):\n """Built-in IO Manager for handling output notebook."""\n\n base_dir: Optional[str] = Field(\n default=None,\n description=(\n "Base directory to use for output notebooks. Defaults to the Dagster instance storage"\n " directory if not provided."\n ),\n )\n asset_key_prefix: List[str] = Field(\n default=[],\n description=(\n "Asset key prefix to apply to assets materialized for output notebooks. Defaults to no"\n " prefix."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_io_manager(self, context: InitResourceContext) -> "LocalOutputNotebookIOManager":\n return LocalOutputNotebookIOManager(\n base_dir=self.base_dir or check.not_none(context.instance).storage_directory(),\n asset_key_prefix=self.asset_key_prefix,\n )
\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema=ConfigurableLocalOutputNotebookIOManager.to_config_schema())\ndef local_output_notebook_io_manager(init_context) -> LocalOutputNotebookIOManager:\n """Built-in IO Manager that handles output notebooks."""\n return ConfigurableLocalOutputNotebookIOManager.from_resource_context(init_context)\n
", "current_page_name": "_modules/dagstermill/io_managers", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.io_managers"}, "manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.manager

\nimport os\nimport pickle\nimport uuid\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Mapping, Optional, cast\n\nfrom dagster import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    Failure,\n    LoggerDefinition,\n    ResourceDefinition,\n    StepExecutionContext,\n    TypeCheck,\n    _check as check,\n)\nfrom dagster._core.definitions.dependency import NodeHandle\nfrom dagster._core.definitions.events import RetryRequested\nfrom dagster._core.definitions.graph_definition import GraphDefinition\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.api import create_execution_plan, scoped_job_context\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.plan.step import ExecutionStep\nfrom dagster._core.execution.resources_init import (\n    get_required_resource_keys_to_init,\n    resource_initialization_event_generator,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.system_config.objects import ResolvedRunConfig, ResourceConfig\nfrom dagster._core.utils import make_new_run_id\nfrom dagster._loggers import colored_console_logger\nfrom dagster._serdes import unpack_value\nfrom dagster._utils import EventGenerationManager\n\nfrom .context import DagstermillExecutionContext, 
DagstermillRuntimeExecutionContext\nfrom .errors import DagstermillError\nfrom .serialize import PICKLE_PROTOCOL\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.node_definition import NodeDefinition\n\n\nclass DagstermillResourceEventGenerationManager(EventGenerationManager):\n    """Utility class to explicitly manage setup/teardown of resource events. Overrides the default\n    `generate_teardown_events` method so that teardown is deferred until explicitly called by the\n    dagstermill Manager.\n    """\n\n    def generate_teardown_events(self):\n        return iter(())\n\n    def teardown(self):\n        return [\n            teardown_event\n            for teardown_event in super(\n                DagstermillResourceEventGenerationManager, self\n            ).generate_teardown_events()\n        ]\n\n\nclass Manager:\n    def __init__(self):\n        self.job = None\n        self.op_def: Optional[NodeDefinition] = None\n        self.in_job: bool = False\n        self.marshal_dir: Optional[str] = None\n        self.context = None\n        self.resource_manager = None\n\n    def _setup_resources(\n        self,\n        resource_defs: Mapping[str, ResourceDefinition],\n        resource_configs: Mapping[str, ResourceConfig],\n        log_manager: DagsterLogManager,\n        execution_plan: Optional[ExecutionPlan],\n        dagster_run: Optional[DagsterRun],\n        resource_keys_to_init: Optional[AbstractSet[str]],\n        instance: Optional[DagsterInstance],\n        emit_persistent_events: Optional[bool],\n    ):\n        """Drop-in replacement for\n        `dagster._core.execution.resources_init.resource_initialization_manager`.  
It uses a\n        `DagstermillResourceEventGenerationManager` and explicitly calls `teardown` on it.\n        """\n        generator = resource_initialization_event_generator(\n            resource_defs=resource_defs,\n            resource_configs=resource_configs,\n            log_manager=log_manager,\n            execution_plan=execution_plan,\n            dagster_run=dagster_run,\n            resource_keys_to_init=resource_keys_to_init,\n            instance=instance,\n            emit_persistent_events=emit_persistent_events,\n        )\n        self.resource_manager = DagstermillResourceEventGenerationManager(\n            generator, ScopedResourcesBuilder\n        )\n        return self.resource_manager\n\n    def reconstitute_job_context(\n        self,\n        executable_dict: Mapping[str, Any],\n        job_run_dict: Mapping[str, Any],\n        node_handle_kwargs: Mapping[str, Any],\n        instance_ref_dict: Mapping[str, Any],\n        step_key: str,\n        output_log_path: Optional[str] = None,\n        marshal_dir: Optional[str] = None,\n        run_config: Optional[Mapping[str, Any]] = None,\n    ):\n        """Reconstitutes a context for dagstermill-managed execution.\n\n        You'll see this function called to reconstruct a job context within the ``injected\n        parameters`` cell of a dagstermill output notebook. Users should not call this function\n        interactively except when debugging output notebooks.\n\n        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a\n        context for interactive exploration and development. 
This call will be replaced by one to\n        :func:`dagstermill.reconstitute_job_context` when the notebook is executed by\n        dagstermill.\n        """\n        check.opt_str_param(output_log_path, "output_log_path")\n        check.opt_str_param(marshal_dir, "marshal_dir")\n        run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n        check.mapping_param(job_run_dict, "job_run_dict")\n        check.mapping_param(executable_dict, "executable_dict")\n        check.mapping_param(node_handle_kwargs, "node_handle_kwargs")\n        check.mapping_param(instance_ref_dict, "instance_ref_dict")\n        check.str_param(step_key, "step_key")\n\n        job = ReconstructableJob.from_dict(executable_dict)\n        job_def = job.get_definition()\n\n        try:\n            instance_ref = unpack_value(instance_ref_dict, InstanceRef)\n            instance = DagsterInstance.from_ref(instance_ref)\n        except Exception as err:\n            raise DagstermillError(\n                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"\n            ) from err\n\n        dagster_run = unpack_value(job_run_dict, DagsterRun)\n\n        node_handle = NodeHandle.from_dict(node_handle_kwargs)\n        op = job_def.get_node(node_handle)\n        op_def = op.definition\n\n        self.marshal_dir = marshal_dir\n        self.in_job = True\n        self.op_def = op_def\n        self.job = job\n\n        ResolvedRunConfig.build(job_def, run_config)\n\n        execution_plan = create_execution_plan(\n            self.job,\n            run_config,\n            step_keys_to_execute=dagster_run.step_keys_to_execute,\n        )\n\n        with scoped_job_context(\n            execution_plan,\n            job,\n            run_config,\n            dagster_run,\n            instance,\n            scoped_resources_builder_cm=self._setup_resources,\n            # Set this flag even though we're not in test for clearer error reporting\n   
         raise_on_error=True,\n        ) as job_context:\n            known_state = None\n            if dagster_run.parent_run_id:\n                known_state = KnownExecutionState.build_for_reexecution(\n                    instance=instance,\n                    parent_run=check.not_none(instance.get_run_by_id(dagster_run.parent_run_id)),\n                )\n            self.context = DagstermillRuntimeExecutionContext(\n                job_context=job_context,\n                job_def=job_def,\n                op_config=run_config.get("ops", {}).get(op.name, {}).get("config"),\n                resource_keys_to_init=get_required_resource_keys_to_init(\n                    execution_plan,\n                    job_def,\n                ),\n                op_name=op.name,\n                node_handle=node_handle,\n                step_context=cast(\n                    StepExecutionContext,\n                    job_context.for_step(\n                        cast(ExecutionStep, execution_plan.get_step_by_key(step_key)),\n                        known_state=known_state,\n                    ),\n                ),\n            )\n\n        return self.context\n\n    def get_context(\n        self,\n        op_config: Any = None,\n        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        run_config: Optional[dict] = None,\n    ) -> DagstermillExecutionContext:\n        """Get a dagstermill execution context for interactive exploration and development.\n\n        Args:\n            op_config (Optional[Any]): If specified, this value will be made available on the\n                context as its ``op_config`` property.\n            resource_defs (Optional[Mapping[str, ResourceDefinition]]): Specifies resources to provide to context.\n            logger_defs (Optional[Mapping[str, LoggerDefinition]]): Specifies loggers to provide to context.\n            
run_config(Optional[dict]): The config dict with which to construct\n                the context.\n\n        Returns:\n            :py:class:`~dagstermill.DagstermillExecutionContext`\n        """\n        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n\n        # If we are running non-interactively, and there is already a context reconstituted, return\n        # that context rather than overwriting it.\n        if self.context is not None and isinstance(\n            self.context, DagstermillRuntimeExecutionContext\n        ):\n            return self.context\n\n        if not logger_defs:\n            logger_defs = {"dagstermill": colored_console_logger}\n            run_config["loggers"] = {"dagstermill": {}}\n        logger_defs = check.opt_mapping_param(logger_defs, "logger_defs")\n        resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n\n        op_def = OpDefinition(\n            name="this_op",\n            compute_fn=lambda *args, **kwargs: None,\n            description="Ephemeral op constructed by dagstermill.get_context()",\n            required_resource_keys=set(resource_defs.keys()),\n        )\n\n        job_def = JobDefinition(\n            graph_def=GraphDefinition(name="ephemeral_dagstermill_pipeline", node_defs=[op_def]),\n            logger_defs=logger_defs,\n            resource_defs=resource_defs,\n        )\n\n        run_id = make_new_run_id()\n\n        # construct stubbed DagsterRun for notebook exploration...\n        # The actual dagster run during job execution will be serialized and reconstituted\n        # in the `reconstitute_job_context` call\n        dagster_run = DagsterRun(\n            job_name=job_def.name,\n            run_id=run_id,\n            run_config=run_config,\n            step_keys_to_execute=None,\n            status=DagsterRunStatus.NOT_STARTED,\n            tags=None,\n        )\n\n        self.in_job = False\n        self.op_def = op_def\n        self.job = 
job_def\n\n        job = InMemoryJob(job_def)\n        execution_plan = create_execution_plan(job, run_config)\n\n        with scoped_job_context(\n            execution_plan,\n            job,\n            run_config,\n            dagster_run,\n            DagsterInstance.ephemeral(),\n            scoped_resources_builder_cm=self._setup_resources,\n        ) as job_context:\n            self.context = DagstermillExecutionContext(\n                job_context=job_context,\n                job_def=job_def,\n                op_config=op_config,\n                resource_keys_to_init=get_required_resource_keys_to_init(\n                    execution_plan,\n                    job_def,\n                ),\n                op_name=op_def.name,\n                node_handle=NodeHandle(op_def.name, parent=None),\n            )\n\n        return self.context\n\n    def yield_result(self, value, output_name="result"):\n        """Yield a result directly from notebook code.\n\n        When called interactively or in development, returns its input.\n\n        Args:\n            value (Any): The value to yield.\n            output_name (Optional[str]): The name of the result to yield (default: ``'result'``).\n        """\n        if not self.in_job:\n            return value\n\n        # deferred import for perf\n        import scrapbook\n\n        if not self.op_def.has_output(output_name):\n            raise DagstermillError(\n                f"Op {self.op_def.name} does not have output named {output_name}.Expected one of"\n                f" {[str(output_def.name) for output_def in self.op_def.output_defs]}"\n            )\n\n        # pass output value cross process boundary using io manager\n        step_context = self.context._step_context  # noqa: SLF001\n        # Note: yield_result currently does not support DynamicOutput\n\n        # dagstermill assets do not support yielding additional results within the notebook:\n        if 
len(step_context.job_def.asset_layer.asset_keys) > 0:\n            raise DagstermillError(\n                "dagstermill assets do not currently support dagstermill.yield_result"\n            )\n\n        step_output_handle = StepOutputHandle(\n            step_key=step_context.step.key, output_name=output_name\n        )\n        output_context = step_context.get_output_context(step_output_handle)\n        io_manager = step_context.get_io_manager(step_output_handle)\n\n        # Note that we assume io manager is symmetric, i.e handle_input(handle_output(X)) == X\n        io_manager.handle_output(output_context, value)\n\n        # record that the output has been yielded\n        scrapbook.glue(output_name, "")\n\n    def yield_event(self, dagster_event):\n        """Yield a dagster event directly from notebook code.\n\n        When called interactively or in development, returns its input.\n\n        Args:\n            dagster_event (Union[:class:`dagster.AssetMaterialization`, :class:`dagster.ExpectationResult`, :class:`dagster.TypeCheck`, :class:`dagster.Failure`, :class:`dagster.RetryRequested`]):\n                An event to yield back to Dagster.\n        """\n        valid_types = (\n            AssetMaterialization,\n            AssetObservation,\n            ExpectationResult,\n            TypeCheck,\n            Failure,\n            RetryRequested,\n        )\n        if not isinstance(dagster_event, valid_types):\n            raise DagstermillError(\n                f"Received invalid type {dagster_event} in yield_event. 
Expected a Dagster event"\n                f" type, one of {valid_types}."\n            )\n\n        if not self.in_job:\n            return dagster_event\n\n        # deferred import for perf\n        import scrapbook\n\n        event_id = f"event-{uuid.uuid4()}"\n        out_file_path = os.path.join(self.marshal_dir, event_id)\n        with open(out_file_path, "wb") as fd:\n            fd.write(pickle.dumps(dagster_event, PICKLE_PROTOCOL))\n\n        scrapbook.glue(event_id, out_file_path)\n\n    def teardown_resources(self):\n        if self.resource_manager is not None:\n            self.resource_manager.teardown()\n\n    def load_input_parameter(self, input_name: str):\n        # load input from source\n        dm_context = check.not_none(self.context)\n        if not isinstance(dm_context, DagstermillRuntimeExecutionContext):\n            check.failed("Expected DagstermillRuntimeExecutionContext")\n        step_context = dm_context.step_context\n        step_input = step_context.step.step_input_named(input_name)\n        input_def = step_context.op_def.input_def_named(input_name)\n        for event_or_input_value in step_input.source.load_input_object(step_context, input_def):\n            if isinstance(event_or_input_value, DagsterEvent):\n                continue\n            else:\n                return event_or_input_value\n\n\nMANAGER_FOR_NOTEBOOK_INSTANCE = Manager()\n
", "current_page_name": "_modules/dagstermill/manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.manager"}}} \ No newline at end of file +{"": {"dagster_pandera": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandera

\nimport itertools\nimport re\nfrom typing import TYPE_CHECKING, Callable, Sequence, Type, Union\n\nimport dagster._check as check\nimport pandas as pd\nimport pandera as pa\nfrom dagster import (\n    DagsterType,\n    TableColumn,\n    TableColumnConstraints,\n    TableConstraints,\n    TableSchema,\n    TypeCheck,\n    TypeCheckContext,\n)\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.libraries import DagsterLibraryRegistry\n\nfrom .version import __version__\n\n# NOTE: Pandera supports multiple dataframe libraries. Most of the alternatives\n# to pandas implement a pandas-like API wrapper around an underlying library\n# that can handle big data (a weakness of pandas). Typically this means the\n# data is only partly loaded into memory, or is distributed across multiple\n# nodes. Because Dagster types perform runtime validation within a single\n# Python process, it's not clear at present how to interface the more complex\n# validation computations on distributed dataframes with Dagster Types.\n\n# Therefore, for the time being dagster-pandera only supports pandas dataframes.\n# However, some commented-out scaffolding has been left in place for support of\n# alternatives in the future. These sections are marked with "TODO: pending\n# alternative dataframe support".\n\nif TYPE_CHECKING:\n    ValidatableDataFrame = pd.DataFrame\n\nDagsterLibraryRegistry.register("dagster-pandera", __version__)\n\n# ########################\n# ##### VALID DATAFRAME CLASSES\n# ########################\n\n# This layer of indirection is used because we may support alternative dataframe classes in the\n# future.\nVALID_DATAFRAME_CLASSES = (pd.DataFrame,)\n\n\n# ########################\n# ##### PANDERA SCHEMA TO DAGSTER TYPE\n# ########################\n\n\n
[docs]def pandera_schema_to_dagster_type(\n schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],\n) -> DagsterType:\n """Convert a Pandera dataframe schema to a `DagsterType`.\n\n The generated Dagster type will be given an automatically generated `name`. The schema's `title`\n property, `name` property, or class name (in that order) will be used. If neither `title` or\n `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.\n\n Additional metadata is also extracted from the Pandera schema and attached to the returned\n `DagsterType` as a metadata dictionary. The extracted metadata includes:\n\n - Descriptions on the schema and constituent columns and checks.\n - Data types for each column.\n - String representations of all column-wise checks.\n - String representations of all row-wise (i.e. "wide") checks.\n\n The returned `DagsterType` type will call the Pandera schema's `validate()` method in its type\n check function. Validation is done in `lazy` mode, i.e. 
pandera will attempt to validate all\n values in the dataframe, rather than stopping on the first error.\n\n If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:\n\n - `num_failures` total number of validation errors.\n - `failure_sample` a table containing up to the first 10 validation errors.\n\n Args:\n schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]):\n\n Returns:\n DagsterType: Dagster Type constructed from the Pandera schema.\n\n """\n if not (\n isinstance(schema, pa.DataFrameSchema)\n or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))\n ):\n raise TypeError(\n "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"\n )\n\n name = _extract_name_from_pandera_schema(schema)\n norm_schema = (\n schema.to_schema()\n if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)\n else schema\n )\n tschema = _pandera_schema_to_table_schema(norm_schema)\n type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)\n\n return DagsterType(\n type_check_fn=type_check_fn,\n name=name,\n description=norm_schema.description,\n metadata={\n "schema": MetadataValue.table_schema(tschema),\n },\n typing_type=pd.DataFrame,\n )
\n\n\n# call next() on this to generate next unique Dagster Type name for anonymous schemas\n_anonymous_schema_name_generator = (f"DagsterPanderaDataframe{i}" for i in itertools.count(start=1))\n\n\ndef _extract_name_from_pandera_schema(\n schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],\n) -> str:\n if isinstance(schema, type) and issubclass(schema, pa.SchemaModel):\n return (\n getattr(schema.Config, "title", None)\n or getattr(schema.Config, "name", None)\n or schema.__name__\n )\n elif isinstance(schema, pa.DataFrameSchema):\n return schema.title or schema.name or next(_anonymous_schema_name_generator)\n\n\ndef _pandera_schema_to_type_check_fn(\n schema: pa.DataFrameSchema,\n table_schema: TableSchema,\n) -> Callable[[TypeCheckContext, object], TypeCheck]:\n def type_check_fn(_context, value: object) -> TypeCheck:\n if isinstance(value, VALID_DATAFRAME_CLASSES):\n try:\n # `lazy` instructs pandera to capture every (not just the first) validation error\n schema.validate(value, lazy=True)\n except pa.errors.SchemaErrors as e:\n return _pandera_errors_to_type_check(e, table_schema)\n except Exception as e:\n return TypeCheck(\n success=False,\n description=f"Unexpected error during validation: {e}",\n )\n else:\n return TypeCheck(\n success=False,\n description=(\n f"Must be one of {VALID_DATAFRAME_CLASSES}, not {type(value).__name__}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check_fn\n\n\nPANDERA_FAILURE_CASES_SCHEMA = TableSchema(\n columns=[\n TableColumn(\n name="schema_context",\n type="string",\n description="`Column` for column-wise checks, or `DataFrameSchema`",\n ),\n TableColumn(\n name="column",\n type="string",\n description="Column of value that failed the check, or `None` for wide checks.",\n ),\n TableColumn(\n name="check", type="string", description="Description of the failed Pandera check."\n ),\n TableColumn(name="check_number", description="Index of the failed check."),\n TableColumn(\n name="failure_case", 
type="number | string", description="Value that failed a check."\n ),\n TableColumn(\n name="index",\n type="number | string",\n description="Index (row) of value that failed a check.",\n ),\n ]\n)\n\n\ndef _pandera_errors_to_type_check(\n error: pa.errors.SchemaErrors, _table_schema: TableSchema\n) -> TypeCheck:\n return TypeCheck(\n success=False,\n description=str(error),\n )\n\n\ndef _pandera_schema_to_table_schema(schema: pa.DataFrameSchema) -> TableSchema:\n df_constraints = _pandera_schema_wide_checks_to_table_constraints(schema.checks)\n columns = [_pandera_column_to_table_column(col) for k, col in schema.columns.items()]\n return TableSchema(columns=columns, constraints=df_constraints)\n\n\ndef _pandera_schema_wide_checks_to_table_constraints(\n checks: Sequence[Union[pa.Check, pa.Hypothesis]]\n) -> TableConstraints:\n return TableConstraints(other=[_pandera_check_to_table_constraint(check) for check in checks])\n\n\ndef _pandera_check_to_table_constraint(pa_check: Union[pa.Check, pa.Hypothesis]) -> str:\n return _get_pandera_check_identifier(pa_check)\n\n\ndef _pandera_column_to_table_column(pa_column: pa.Column) -> TableColumn:\n constraints = TableColumnConstraints(\n nullable=pa_column.nullable,\n unique=pa_column.unique,\n other=[_pandera_check_to_column_constraint(pa_check) for pa_check in pa_column.checks],\n )\n name = check.not_none(pa_column.name, "name")\n name = name if isinstance(name, str) else "/".join(name)\n return TableColumn(\n name=name,\n type=str(pa_column.dtype),\n description=pa_column.description,\n constraints=constraints,\n )\n\n\nCHECK_OPERATORS = {\n "equal_to": "==",\n "not_equal_to": "!=",\n "less_than": "<",\n "less_than_or_equal_to": "<=",\n "greater_than": ">",\n "greater_than_or_equal_to": ">=",\n}\n\n\ndef _extract_operand(error_str: str) -> str:\n match = re.search(r"(?<=\\().+(?=\\))", error_str)\n return match.group(0) if match else ""\n\n\ndef _pandera_check_to_column_constraint(pa_check: pa.Check) -> str:\n if 
pa_check.description:\n return pa_check.description\n elif pa_check.name in CHECK_OPERATORS:\n assert isinstance(\n pa_check.error, str\n ), "Expected pandera check to have string `error` attr."\n return f"{CHECK_OPERATORS[pa_check.name]} {_extract_operand(pa_check.error)}"\n else:\n return _get_pandera_check_identifier(pa_check)\n\n\ndef _get_pandera_check_identifier(pa_check: Union[pa.Check, pa.Hypothesis]) -> str:\n return pa_check.description or pa_check.error or pa_check.name or str(pa_check)\n\n\n__all__ = [\n "pandera_schema_to_dagster_type",\n]\n
", "current_page_name": "_modules/dagster_pandera", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandera"}, "dagster_pipes": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pipes

\nimport base64\nimport datetime\nimport json\nimport logging\nimport os\nimport sys\nimport time\nimport warnings\nimport zlib\nfrom abc import ABC, abstractmethod\nfrom contextlib import ExitStack, contextmanager\nfrom io import StringIO\nfrom queue import Queue\nfrom threading import Event, Thread\nfrom typing import (\n    IO,\n    TYPE_CHECKING,\n    Any,\n    ClassVar,\n    Dict,\n    Generic,\n    Iterable,\n    Iterator,\n    Literal,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    TextIO,\n    Type,\n    TypedDict,\n    TypeVar,\n    Union,\n    cast,\n    get_args,\n)\n\nif TYPE_CHECKING:\n    from unittest.mock import MagicMock\n\n# ########################\n# ##### PROTOCOL\n# ########################\n\n# This represents the version of the protocol, rather than the version of the package. It must be\n# manually updated whenever there are changes to the protocol.\nPIPES_PROTOCOL_VERSION = "0.1"\n\nPipesExtras = Mapping[str, Any]\nPipesParams = Mapping[str, Any]\n\n\n# ##### MESSAGE\n\n\ndef _make_message(method: str, params: Optional[Mapping[str, Any]]) -> "PipesMessage":\n    return {\n        PIPES_PROTOCOL_VERSION_FIELD: PIPES_PROTOCOL_VERSION,\n        "method": method,\n        "params": params,\n    }\n\n\n# Can't use a constant for TypedDict key so this value is repeated in `ExtMessage` defn.\nPIPES_PROTOCOL_VERSION_FIELD = "__dagster_pipes_version"\n\n\nclass PipesMessage(TypedDict):\n    """A message sent from the external process to the orchestration process."""\n\n    __dagster_pipes_version: str\n    method: str\n    params: Optional[Mapping[str, Any]]\n\n\n###### PIPES CONTEXT\n\n\nclass PipesContextData(TypedDict):\n    """The serializable data passed from the orchestration process to the external process. 
This gets\n    wrapped in a :py:class:`PipesContext`.\n    """\n\n    asset_keys: Optional[Sequence[str]]\n    code_version_by_asset_key: Optional[Mapping[str, Optional[str]]]\n    provenance_by_asset_key: Optional[Mapping[str, Optional["PipesDataProvenance"]]]\n    partition_key: Optional[str]\n    partition_key_range: Optional["PipesPartitionKeyRange"]\n    partition_time_window: Optional["PipesTimeWindow"]\n    run_id: str\n    job_name: Optional[str]\n    retry_number: int\n    extras: Mapping[str, Any]\n\n\nclass PipesPartitionKeyRange(TypedDict):\n    """A range of partition keys."""\n\n    start: str\n    end: str\n\n\nclass PipesTimeWindow(TypedDict):\n    """A span of time delimited by a start and end timestamp. This is defined for time-based partitioning schemes."""\n\n    start: str  # timestamp\n    end: str  # timestamp\n\n\nclass PipesDataProvenance(TypedDict):\n    """Provenance information for an asset."""\n\n    code_version: str\n    input_data_versions: Mapping[str, str]\n    is_user_provided: bool\n\n\nPipesAssetCheckSeverity = Literal["WARN", "ERROR"]\n\nPipesMetadataRawValue = Union[int, float, str, Mapping[str, Any], Sequence[Any], bool, None]\n\n\nclass PipesMetadataValue(TypedDict):\n    type: "PipesMetadataType"\n    raw_value: PipesMetadataRawValue\n\n\n# Infer the type from the raw value on the orchestration end\nPIPES_METADATA_TYPE_INFER = "__infer__"\n\nPipesMetadataType = Literal[\n    "__infer__",\n    "text",\n    "url",\n    "path",\n    "notebook",\n    "json",\n    "md",\n    "float",\n    "int",\n    "bool",\n    "dagster_run",\n    "asset",\n    "null",\n]\n\n# ########################\n# ##### UTIL\n# ########################\n\n_T = TypeVar("_T")\n\n\n
[docs]class DagsterPipesError(Exception):\n pass
\n\n\n
[docs]class DagsterPipesWarning(Warning):\n pass
\n\n\ndef _assert_not_none(value: Optional[_T], desc: Optional[str] = None) -> _T:\n if value is None:\n raise DagsterPipesError(f"Missing required property: {desc}")\n return value\n\n\ndef _assert_defined_asset_property(value: Optional[_T], key: str) -> _T:\n return _assert_not_none(value, f"`{key}` is undefined. Current step does not target an asset.")\n\n\n# This should only be called under the precondition that the current step targets assets.\ndef _assert_single_asset(data: PipesContextData, key: str) -> None:\n asset_keys = data["asset_keys"]\n assert asset_keys is not None\n if len(asset_keys) != 1:\n raise DagsterPipesError(f"`{key}` is undefined. Current step targets multiple assets.")\n\n\ndef _resolve_optionally_passed_asset_key(\n data: PipesContextData,\n asset_key: Optional[str],\n method: str,\n) -> str:\n asset_key = _assert_opt_param_type(asset_key, str, method, "asset_key")\n\n defined_asset_keys = data["asset_keys"]\n if defined_asset_keys:\n if asset_key and asset_key not in defined_asset_keys:\n raise DagsterPipesError(\n f"Invalid asset key. Expected one of `{defined_asset_keys}`, got `{asset_key}`."\n )\n if not asset_key:\n if len(defined_asset_keys) != 1:\n raise DagsterPipesError(\n f"Calling `{method}` without passing an asset key is undefined. Current step"\n " targets multiple assets."\n )\n asset_key = defined_asset_keys[0]\n\n if not asset_key:\n raise DagsterPipesError(\n f"Calling `{method}` without passing an asset key is undefined. Current step"\n " does not target a specific asset."\n )\n\n return asset_key\n\n\ndef _assert_defined_partition_property(value: Optional[_T], key: str) -> _T:\n return _assert_not_none(\n value, f"`{key}` is undefined. 
Current step does not target any partitions."\n )\n\n\n# This should only be called under the precondition that the current steps targets assets.\ndef _assert_single_partition(data: PipesContextData, key: str) -> None:\n partition_key_range = data["partition_key_range"]\n assert partition_key_range is not None\n if partition_key_range["start"] != partition_key_range["end"]:\n raise DagsterPipesError(f"`{key}` is undefined. Current step targets multiple partitions.")\n\n\ndef _assert_defined_extra(extras: PipesExtras, key: str) -> Any:\n if key not in extras:\n raise DagsterPipesError(f"Extra `{key}` is undefined. Extras must be provided by user.")\n return extras[key]\n\n\ndef _assert_param_type(value: _T, expected_type: Any, method: str, param: str) -> _T:\n if not isinstance(value, expected_type):\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. Expected `{expected_type}`, got"\n f" `{type(value)}`."\n )\n return value\n\n\ndef _assert_opt_param_type(value: _T, expected_type: Any, method: str, param: str) -> _T:\n if not (isinstance(value, expected_type) or value is None):\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. Expected"\n f" `Optional[{expected_type}]`, got `{type(value)}`."\n )\n return value\n\n\ndef _assert_env_param_type(\n env_params: PipesParams, key: str, expected_type: Type[_T], cls: Type\n) -> _T:\n value = env_params.get(key)\n if not isinstance(value, expected_type):\n raise DagsterPipesError(\n f"Invalid type for parameter `{key}` passed from orchestration side to"\n f" `{cls.__name__}`. 
Expected `{expected_type}`, got `{type(value)}`."\n )\n return value\n\n\ndef _assert_opt_env_param_type(\n env_params: PipesParams, key: str, expected_type: Type[_T], cls: Type\n) -> Optional[_T]:\n value = env_params.get(key)\n if value is not None and not isinstance(value, expected_type):\n raise DagsterPipesError(\n f"Invalid type for parameter `{key}` passed from orchestration side to"\n f" `{cls.__name__}`. Expected `Optional[{expected_type}]`, got `{type(value)}`."\n )\n return value\n\n\ndef _assert_param_value(value: _T, expected_values: Iterable[_T], method: str, param: str) -> _T:\n if value not in expected_values:\n raise DagsterPipesError(\n f"Invalid value for parameter `{param}` of `{method}`. Expected one of"\n f" `{expected_values}`, got `{value}`."\n )\n return value\n\n\ndef _assert_opt_param_value(\n value: _T, expected_values: Sequence[_T], method: str, param: str\n) -> _T:\n if value is not None and value not in expected_values:\n raise DagsterPipesError(\n f"Invalid value for parameter `{param}` of `{method}`. Expected one of"\n f" `{expected_values}`, got `{value}`."\n )\n return value\n\n\ndef _json_serialize_param(value: Any, method: str, param: str) -> str:\n try:\n serialized = json.dumps(value)\n except (TypeError, OverflowError):\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. 
Expected a JSON-serializable"\n f" type, got `{type(value)}`."\n )\n return serialized\n\n\n_METADATA_VALUE_KEYS = frozenset(PipesMetadataValue.__annotations__.keys())\n_METADATA_TYPES = frozenset(get_args(PipesMetadataType))\n\n\ndef _normalize_param_metadata(\n metadata: Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]],\n method: str,\n param: str,\n) -> Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]:\n _assert_param_type(metadata, dict, method, param)\n new_metadata: Dict[str, PipesMetadataValue] = {}\n for key, value in metadata.items():\n if not isinstance(key, str):\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. Expected a dict with string"\n f" keys, got a key `{key}` of type `{type(key)}`."\n )\n elif isinstance(value, dict):\n if not {*value.keys()} == _METADATA_VALUE_KEYS:\n raise DagsterPipesError(\n f"Invalid type for parameter `{param}` of `{method}`. Expected a dict with"\n " string keys and values that are either raw metadata values or dictionaries"\n f" with schema `{{raw_value: ..., type: ...}}`. Got a value `{value}`."\n )\n _assert_param_value(value["type"], _METADATA_TYPES, method, f"{param}.{key}.type")\n new_metadata[key] = cast(PipesMetadataValue, value)\n else:\n new_metadata[key] = {"raw_value": value, "type": PIPES_METADATA_TYPE_INFER}\n return new_metadata\n\n\ndef _param_from_env_var(env_var: str) -> Any:\n raw_value = os.environ.get(env_var)\n return decode_env_var(raw_value) if raw_value is not None else None\n\n\n
[docs]def encode_env_var(value: Any) -> str:\n """Encode value by serializing to JSON, compressing with zlib, and finally encoding with base64.\n `base64_encode(compress(to_json(value)))` in function notation.\n\n Args:\n value (Any): The value to encode. Must be JSON-serializable.\n\n Returns:\n str: The encoded value.\n """\n serialized = _json_serialize_param(value, "encode_env_var", "value")\n compressed = zlib.compress(serialized.encode("utf-8"))\n encoded = base64.b64encode(compressed)\n return encoded.decode("utf-8") # as string
\n\n\n
[docs]def decode_env_var(value: str) -> Any:\n """Decode a value by decoding from base64, decompressing with zlib, and finally deserializing from\n JSON. `from_json(decompress(base64_decode(value)))` in function notation.\n\n Args:\n value (Any): The value to decode.\n\n Returns:\n Any: The decoded value.\n """\n decoded = base64.b64decode(value)\n decompressed = zlib.decompress(decoded)\n return json.loads(decompressed.decode("utf-8"))
\n\n\ndef _emit_orchestration_inactive_warning() -> None:\n warnings.warn(\n "This process was not launched by a Dagster orchestration process. All calls to the"\n " `dagster-pipes` context or attempts to initialize `dagster-pipes` abstractions"\n " are no-ops.",\n category=DagsterPipesWarning,\n )\n\n\ndef _get_mock() -> "MagicMock":\n from unittest.mock import MagicMock\n\n return MagicMock()\n\n\nclass _PipesLogger(logging.Logger):\n def __init__(self, context: "PipesContext") -> None:\n super().__init__(name="dagster-pipes")\n self.addHandler(_PipesLoggerHandler(context))\n\n\nclass _PipesLoggerHandler(logging.Handler):\n def __init__(self, context: "PipesContext") -> None:\n super().__init__()\n self._context = context\n\n def emit(self, record: logging.LogRecord) -> None:\n self._context._write_message( # noqa: SLF001\n "log", {"message": record.getMessage(), "level": record.levelname}\n )\n\n\n# ########################\n# ##### IO - BASE\n# ########################\n\n\n
[docs]class PipesContextLoader(ABC):\n
[docs] @abstractmethod\n @contextmanager\n def load_context(self, params: PipesParams) -> Iterator[PipesContextData]:\n """A `@contextmanager` that loads context data injected by the orchestration process.\n\n This method should read and yield the context data from the location specified by the passed in\n `PipesParams`.\n\n Args:\n params (PipesParams): The params provided by the context injector in the orchestration\n process.\n\n Yields:\n PipesContextData: The context data.\n """
\n\n\nT_MessageChannel = TypeVar("T_MessageChannel", bound="PipesMessageWriterChannel")\n\n\n
[docs]class PipesMessageWriter(ABC, Generic[T_MessageChannel]):\n
[docs] @abstractmethod\n @contextmanager\n def open(self, params: PipesParams) -> Iterator[T_MessageChannel]:\n """A `@contextmanager` that initializes a channel for writing messages back to Dagster.\n\n This method should take the params passed by the orchestration-side\n :py:class:`PipesMessageReader` and use them to construct and yield a\n :py:class:`PipesMessageWriterChannel`.\n\n Args:\n params (PipesParams): The params provided by the message reader in the orchestration\n process.\n\n Yields:\n PipesMessageWriterChannel: Channel for writing messages back to Dagster.\n """
\n\n\n
[docs]class PipesMessageWriterChannel(ABC, Generic[T_MessageChannel]):\n """Object that writes messages back to the Dagster orchestration process."""\n\n
[docs] @abstractmethod\n def write_message(self, message: PipesMessage) -> None:\n """Write a message to the orchestration process.\n\n Args:\n message (PipesMessage): The message to write.\n """
\n\n\n
[docs]class PipesParamsLoader(ABC):\n """Object that loads params passed from the orchestration process by the context injector and\n message reader. These params are used to respectively bootstrap the\n :py:class:`PipesContextLoader` and :py:class:`PipesMessageWriter`.\n """\n\n
[docs] @abstractmethod\n def is_dagster_pipes_process(self) -> bool:\n """Whether or not this process has been provided with information to create\n a PipesContext or should instead return a mock.\n """
\n\n
[docs] @abstractmethod\n def load_context_params(self) -> PipesParams:\n """PipesParams: Load params passed by the orchestration-side context injector."""
\n\n
[docs] @abstractmethod\n def load_messages_params(self) -> PipesParams:\n """PipesParams: Load params passed by the orchestration-side message reader."""
\n\n\nT_BlobStoreMessageWriterChannel = TypeVar(\n "T_BlobStoreMessageWriterChannel", bound="PipesBlobStoreMessageWriterChannel"\n)\n\n\n
[docs]class PipesBlobStoreMessageWriter(PipesMessageWriter[T_BlobStoreMessageWriterChannel]):\n """Message writer that periodically uploads message chunks to some blob store endpoint."""\n\n def __init__(self, *, interval: float = 10):\n self.interval = interval\n\n
[docs] @contextmanager\n def open(self, params: PipesParams) -> Iterator[T_BlobStoreMessageWriterChannel]:\n """Construct and yield a :py:class:`PipesBlobStoreMessageWriterChannel`.\n\n Args:\n params (PipesParams): The params provided by the message reader in the orchestration\n process.\n\n Yields:\n PipesBlobStoreMessageWriterChannel: Channel that periodically uploads message chunks to\n a blob store.\n """\n channel = self.make_channel(params)\n with channel.buffered_upload_loop():\n yield channel
\n\n
[docs] @abstractmethod\n def make_channel(self, params: PipesParams) -> T_BlobStoreMessageWriterChannel: ...
\n\n\n
[docs]class PipesBlobStoreMessageWriterChannel(PipesMessageWriterChannel):\n """Message writer channel that periodically uploads message chunks to some blob store endpoint."""\n\n def __init__(self, *, interval: float = 10):\n self._interval = interval\n self._buffer: Queue[PipesMessage] = Queue()\n self._counter = 1\n\n
[docs] def write_message(self, message: PipesMessage) -> None:\n self._buffer.put(message)
\n\n
[docs] def flush_messages(self) -> Sequence[PipesMessage]:\n items = []\n while not self._buffer.empty():\n items.append(self._buffer.get())\n return items
\n\n
[docs] @abstractmethod\n def upload_messages_chunk(self, payload: StringIO, index: int) -> None: ...
\n\n
[docs] @contextmanager\n def buffered_upload_loop(self) -> Iterator[None]:\n thread = None\n is_task_complete = Event()\n try:\n thread = Thread(target=self._upload_loop, args=(is_task_complete,), daemon=True)\n thread.start()\n yield\n finally:\n is_task_complete.set()\n if thread:\n thread.join(timeout=60)
\n\n def _upload_loop(self, is_task_complete: Event) -> None:\n start_or_last_upload = datetime.datetime.now()\n while True:\n now = datetime.datetime.now()\n if self._buffer.empty() and is_task_complete.is_set():\n break\n elif is_task_complete.is_set() or (now - start_or_last_upload).seconds > self._interval:\n payload = "\\n".join([json.dumps(message) for message in self.flush_messages()])\n if len(payload) > 0:\n self.upload_messages_chunk(StringIO(payload), self._counter)\n start_or_last_upload = now\n self._counter += 1\n time.sleep(1)
\n\n\n
[docs]class PipesBufferedFilesystemMessageWriterChannel(PipesBlobStoreMessageWriterChannel):\n """Message writer channel that periodically writes message chunks to an endpoint mounted on the filesystem.\n\n Args:\n interval (float): interval in seconds between chunk uploads\n """\n\n def __init__(self, path: str, *, interval: float = 10):\n super().__init__(interval=interval)\n self._path = path\n\n
[docs] def upload_messages_chunk(self, payload: IO, index: int) -> None:\n message_path = os.path.join(self._path, f"{index}.json")\n with open(message_path, "w") as f:\n f.write(payload.read())
\n\n\n# ########################\n# ##### IO - DEFAULT\n# ########################\n\n\n
[docs]class PipesDefaultContextLoader(PipesContextLoader):\n """Context loader that loads context data from either a file or directly from the provided params.\n\n The location of the context data is configured by the params received by the loader. If the params\n include a key `path`, then the context data will be loaded from a file at the specified path. If\n the params instead include a key `data`, then the corresponding value should be a dict\n representing the context data.\n """\n\n FILE_PATH_KEY = "path"\n DIRECT_KEY = "data"\n\n
[docs] @contextmanager\n def load_context(self, params: PipesParams) -> Iterator[PipesContextData]:\n if self.FILE_PATH_KEY in params:\n path = _assert_env_param_type(params, self.FILE_PATH_KEY, str, self.__class__)\n with open(path, "r") as f:\n data = json.load(f)\n yield data\n elif self.DIRECT_KEY in params:\n data = _assert_env_param_type(params, self.DIRECT_KEY, dict, self.__class__)\n yield cast(PipesContextData, data)\n else:\n raise DagsterPipesError(\n f'Invalid params for {self.__class__.__name__}, expected key "{self.FILE_PATH_KEY}"'\n f' or "{self.DIRECT_KEY}", received {params}',\n )
\n\n\n
[docs]class PipesDefaultMessageWriter(PipesMessageWriter):\n """Message writer that writes messages to either a file or the stdout or stderr stream.\n\n The write location is configured by the params received by the writer. If the params include a\n key `path`, then messages will be written to a file at the specified path. If the params instead\n include a key `stdio`, then the corresponding value must specify either `stderr`\n or `stdout`, and messages will be written to the selected stream.\n """\n\n FILE_PATH_KEY = "path"\n STDIO_KEY = "stdio"\n STDERR = "stderr"\n STDOUT = "stdout"\n\n
[docs] @contextmanager\n def open(self, params: PipesParams) -> Iterator[PipesMessageWriterChannel]:\n if self.FILE_PATH_KEY in params:\n path = _assert_env_param_type(params, self.FILE_PATH_KEY, str, self.__class__)\n yield PipesFileMessageWriterChannel(path)\n elif self.STDIO_KEY in params:\n stream = _assert_env_param_type(params, self.STDIO_KEY, str, self.__class__)\n if stream == self.STDERR:\n yield PipesStreamMessageWriterChannel(sys.stderr)\n elif stream == self.STDOUT:\n yield PipesStreamMessageWriterChannel(sys.stdout)\n else:\n raise DagsterPipesError(\n f'Invalid value for key "std", expected "{self.STDERR}" or "{self.STDOUT}" but'\n f" received {stream}"\n )\n else:\n raise DagsterPipesError(\n f'Invalid params for {self.__class__.__name__}, expected key "path" or "std",'\n f" received {params}"\n )
\n\n\n
[docs]class PipesFileMessageWriterChannel(PipesMessageWriterChannel):\n """Message writer channel that writes one message per line to a file."""\n\n def __init__(self, path: str):\n self._path = path\n\n
[docs] def write_message(self, message: PipesMessage) -> None:\n with open(self._path, "a") as f:\n f.write(json.dumps(message) + "\\n")
\n\n\n
[docs]class PipesStreamMessageWriterChannel(PipesMessageWriterChannel):\n """Message writer channel that writes one message per line to a `TextIO` stream."""\n\n def __init__(self, stream: TextIO):\n self._stream = stream\n\n
[docs] def write_message(self, message: PipesMessage) -> None:\n self._stream.writelines((json.dumps(message), "\\n"))
\n\n\nDAGSTER_PIPES_CONTEXT_ENV_VAR = "DAGSTER_PIPES_CONTEXT"\nDAGSTER_PIPES_MESSAGES_ENV_VAR = "DAGSTER_PIPES_MESSAGES"\n\n\n
[docs]class PipesEnvVarParamsLoader(PipesParamsLoader):\n """Params loader that extracts params from environment variables."""\n\n
[docs] def is_dagster_pipes_process(self) -> bool:\n # use the presence of DAGSTER_PIPES_CONTEXT to discern if we are in a pipes process\n return DAGSTER_PIPES_CONTEXT_ENV_VAR in os.environ
\n\n
[docs] def load_context_params(self) -> PipesParams:\n return _param_from_env_var(DAGSTER_PIPES_CONTEXT_ENV_VAR)
\n\n
[docs] def load_messages_params(self) -> PipesParams:\n return _param_from_env_var(DAGSTER_PIPES_MESSAGES_ENV_VAR)
\n\n\n# ########################\n# ##### IO - S3\n# ########################\n\n\n
[docs]class PipesS3MessageWriter(PipesBlobStoreMessageWriter):\n """Message writer that writes messages by periodically writing message chunks to an S3 bucket.\n\n Args:\n client (Any): A boto3.client("s3") object.\n interval (float): interval in seconds between chunk uploads\n """\n\n # client is a boto3.client("s3") object\n def __init__(self, client: Any, *, interval: float = 10):\n super().__init__(interval=interval)\n # Not checking client type for now because it's a boto3.client object and we don't want to\n # depend on boto3.\n self._client = client\n\n
[docs] def make_channel(\n self,\n params: PipesParams,\n ) -> "PipesS3MessageWriterChannel":\n bucket = _assert_env_param_type(params, "bucket", str, self.__class__)\n key_prefix = _assert_opt_env_param_type(params, "key_prefix", str, self.__class__)\n return PipesS3MessageWriterChannel(\n client=self._client,\n bucket=bucket,\n key_prefix=key_prefix,\n interval=self.interval,\n )
\n\n\n
[docs]class PipesS3MessageWriterChannel(PipesBlobStoreMessageWriterChannel):\n """Message writer channel for writing messages by periodically writing message chunks to an S3 bucket.\n\n Args:\n client (Any): A boto3.client("s3") object.\n bucket (str): The name of the S3 bucket to write to.\n key_prefix (Optional[str]): An optional prefix to use for the keys of written blobs.\n interval (float): interval in seconds between chunk uploads\n """\n\n # client is a boto3.client("s3") object\n def __init__(\n self, client: Any, bucket: str, key_prefix: Optional[str], *, interval: float = 10\n ):\n super().__init__(interval=interval)\n self._client = client\n self._bucket = bucket\n self._key_prefix = key_prefix\n\n
[docs] def upload_messages_chunk(self, payload: IO, index: int) -> None:\n key = f"{self._key_prefix}/{index}.json" if self._key_prefix else f"{index}.json"\n self._client.put_object(\n Body=payload.read(),\n Bucket=self._bucket,\n Key=key,\n )
\n\n\n# ########################\n# ##### IO - DBFS\n# ########################\n\n\n
[docs]class PipesDbfsContextLoader(PipesContextLoader):\n """Context loader that reads context from a JSON file on DBFS."""\n\n
[docs] @contextmanager\n def load_context(self, params: PipesParams) -> Iterator[PipesContextData]:\n unmounted_path = _assert_env_param_type(params, "path", str, self.__class__)\n path = os.path.join("/dbfs", unmounted_path.lstrip("/"))\n with open(path, "r") as f:\n yield json.load(f)
\n\n\n
[docs]class PipesDbfsMessageWriter(PipesBlobStoreMessageWriter):\n """Message writer that writes messages by periodically writing message chunks to a directory on DBFS."""\n\n
[docs] def make_channel(\n self,\n params: PipesParams,\n ) -> "PipesBufferedFilesystemMessageWriterChannel":\n unmounted_path = _assert_env_param_type(params, "path", str, self.__class__)\n return PipesBufferedFilesystemMessageWriterChannel(\n path=os.path.join("/dbfs", unmounted_path.lstrip("/")),\n interval=self.interval,\n )
\n\n\n# ########################\n# ##### CONTEXT\n# ########################\n\n\n
[docs]def open_dagster_pipes(\n *,\n context_loader: Optional[PipesContextLoader] = None,\n message_writer: Optional[PipesMessageWriter] = None,\n params_loader: Optional[PipesParamsLoader] = None,\n) -> "PipesContext":\n """Initialize the Dagster Pipes context.\n\n This function should be called near the entry point of a pipes process. It will load injected\n context information from Dagster and spin up the machinery for streaming messages back to\n Dagster.\n\n If the process was not launched by Dagster, this function will emit a warning and return a\n `MagicMock` object. This should make all operations on the context no-ops and prevent your code\n from crashing.\n\n Args:\n context_loader (Optional[PipesContextLoader]): The context loader to use. Defaults to\n :py:class:`PipesDefaultContextLoader`.\n message_writer (Optional[PipesMessageWriter]): The message writer to use. Defaults to\n :py:class:`PipesDefaultMessageWriter`.\n params_loader (Optional[PipesParamsLoader]): The params loader to use. Defaults to\n :py:class:`PipesEnvVarParamsLoader`.\n\n Returns:\n PipesContext: The initialized context.\n """\n if PipesContext.is_initialized():\n return PipesContext.get()\n\n params_loader = params_loader or PipesEnvVarParamsLoader()\n if params_loader.is_dagster_pipes_process():\n context_loader = context_loader or PipesDefaultContextLoader()\n message_writer = message_writer or PipesDefaultMessageWriter()\n context = PipesContext(params_loader, context_loader, message_writer)\n else:\n _emit_orchestration_inactive_warning()\n context = _get_mock()\n PipesContext.set(context)\n return context
\n\n\n
[docs]class PipesContext:\n """The context for a Dagster Pipes process.\n\n This class is analogous to :py:class:`~dagster.OpExecutionContext` on the Dagster side of the Pipes\n connection. It provides access to information such as the asset key(s) and partition key(s) in\n scope for the current step. It also provides methods for logging and emitting results that will\n be streamed back to Dagster.\n\n This class should not be directly instantiated by the user. Instead it should be initialized by\n calling :py:func:`open_dagster_pipes()`, which will return the singleton instance of this class.\n After `open_dagster_pipes()` has been called, the singleton instance can also be retrieved by\n calling :py:func:`PipesContext.get`.\n """\n\n _instance: ClassVar[Optional["PipesContext"]] = None\n\n
[docs] @classmethod\n def is_initialized(cls) -> bool:\n """bool: Whether the context has been initialized."""\n return cls._instance is not None
\n\n
[docs] @classmethod\n def set(cls, context: "PipesContext") -> None:\n """Set the singleton instance of the context."""\n cls._instance = context
\n\n
[docs] @classmethod\n def get(cls) -> "PipesContext":\n """Get the singleton instance of the context. Raises an error if the context has not been initialized."""\n if cls._instance is None:\n raise Exception(\n "PipesContext has not been initialized. You must call `open_dagster_pipes()`."\n )\n return cls._instance
\n\n def __init__(\n self,\n params_loader: PipesParamsLoader,\n context_loader: PipesContextLoader,\n message_writer: PipesMessageWriter,\n ) -> None:\n context_params = params_loader.load_context_params()\n messages_params = params_loader.load_messages_params()\n self._io_stack = ExitStack()\n self._data = self._io_stack.enter_context(context_loader.load_context(context_params))\n self._message_channel = self._io_stack.enter_context(message_writer.open(messages_params))\n self._message_channel.write_message(_make_message("opened", {}))\n self._logger = _PipesLogger(self)\n self._materialized_assets: Set[str] = set()\n self._closed: bool = False\n\n def __enter__(self) -> "PipesContext":\n return self\n\n def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:\n self.close()\n\n
[docs] def close(self) -> None:\n """Close the pipes connection. This will flush all buffered messages to the orchestration\n process and cause any further attempt to write a message to raise an error. This method is\n idempotent-- subsequent calls after the first have no effect.\n """\n if not self._closed:\n self._message_channel.write_message(_make_message("closed", {}))\n self._io_stack.close()\n self._closed = True
\n\n @property\n def is_closed(self) -> bool:\n """bool: Whether the context has been closed."""\n return self._closed\n\n def _write_message(self, method: str, params: Optional[Mapping[str, Any]] = None) -> None:\n if self._closed:\n raise DagsterPipesError("Cannot send message after pipes context is closed.")\n message = _make_message(method, params)\n self._message_channel.write_message(message)\n\n # ########################\n # ##### PUBLIC API\n # ########################\n\n @property\n def is_asset_step(self) -> bool:\n """bool: Whether the current step targets assets."""\n return self._data["asset_keys"] is not None\n\n @property\n def asset_key(self) -> str:\n """str: The AssetKey for the currently scoped asset. Raises an error if 0 or multiple assets\n are in scope.\n """\n asset_keys = _assert_defined_asset_property(self._data["asset_keys"], "asset_key")\n _assert_single_asset(self._data, "asset_key")\n return asset_keys[0]\n\n @property\n def asset_keys(self) -> Sequence[str]:\n """Sequence[str]: The AssetKeys for the currently scoped assets. Raises an error if no\n assets are in scope.\n """\n asset_keys = _assert_defined_asset_property(self._data["asset_keys"], "asset_keys")\n return asset_keys\n\n @property\n def provenance(self) -> Optional[PipesDataProvenance]:\n """Optional[PipesDataProvenance]: The provenance for the currently scoped asset. Raises an\n error if 0 or multiple assets are in scope.\n """\n provenance_by_asset_key = _assert_defined_asset_property(\n self._data["provenance_by_asset_key"], "provenance"\n )\n _assert_single_asset(self._data, "provenance")\n return next(iter(provenance_by_asset_key.values()))\n\n @property\n def provenance_by_asset_key(self) -> Mapping[str, Optional[PipesDataProvenance]]:\n """Mapping[str, Optional[PipesDataProvenance]]: Mapping of asset key to provenance for the\n currently scoped assets. 
Raises an error if no assets are in scope.\n """\n provenance_by_asset_key = _assert_defined_asset_property(\n self._data["provenance_by_asset_key"], "provenance_by_asset_key"\n )\n return provenance_by_asset_key\n\n @property\n def code_version(self) -> Optional[str]:\n """Optional[str]: The code version for the currently scoped asset. Raises an error if 0 or\n multiple assets are in scope.\n """\n code_version_by_asset_key = _assert_defined_asset_property(\n self._data["code_version_by_asset_key"], "code_version"\n )\n _assert_single_asset(self._data, "code_version")\n return next(iter(code_version_by_asset_key.values()))\n\n @property\n def code_version_by_asset_key(self) -> Mapping[str, Optional[str]]:\n """Mapping[str, Optional[str]]: Mapping of asset key to code version for the currently\n scoped assets. Raises an error if no assets are in scope.\n """\n code_version_by_asset_key = _assert_defined_asset_property(\n self._data["code_version_by_asset_key"], "code_version_by_asset_key"\n )\n return code_version_by_asset_key\n\n @property\n def is_partition_step(self) -> bool:\n """bool: Whether the current step is scoped to one or more partitions."""\n return self._data["partition_key_range"] is not None\n\n @property\n def partition_key(self) -> str:\n """str: The partition key for the currently scoped partition. Raises an error if 0 or\n multiple partitions are in scope.\n """\n partition_key = _assert_defined_partition_property(\n self._data["partition_key"], "partition_key"\n )\n return partition_key\n\n @property\n def partition_key_range(self) -> "PipesPartitionKeyRange":\n """PipesPartitionKeyRange: The partition key range for the currently scoped partition or\n partitions. 
Raises an error if no partitions are in scope.\n """\n partition_key_range = _assert_defined_partition_property(\n self._data["partition_key_range"], "partition_key_range"\n )\n return partition_key_range\n\n @property\n def partition_time_window(self) -> Optional["PipesTimeWindow"]:\n """Optional[PipesTimeWindow]: The partition time window for the currently scoped partition\n or partitions. Returns None if partitions in scope are not temporal. Raises an error if no\n partitions are in scope.\n """\n # None is a valid value for partition_time_window, but we check that a partition key range\n # is defined.\n _assert_defined_partition_property(\n self._data["partition_key_range"], "partition_time_window"\n )\n return self._data["partition_time_window"]\n\n @property\n def run_id(self) -> str:\n """str: The run ID for the currently executing pipeline run."""\n return self._data["run_id"]\n\n @property\n def job_name(self) -> Optional[str]:\n """Optional[str]: The job name for the currently executing run. Returns None if the run is\n not derived from a job.\n """\n return self._data["job_name"]\n\n @property\n def retry_number(self) -> int:\n """int: The retry number for the currently executing run."""\n return self._data["retry_number"]\n\n
[docs] def get_extra(self, key: str) -> Any:\n """Get the value of an extra provided by the user. Raises an error if the extra is not defined.\n\n Args:\n key (str): The key of the extra.\n\n Returns:\n Any: The value of the extra.\n """\n return _assert_defined_extra(self._data["extras"], key)
\n\n @property\n def extras(self) -> Mapping[str, Any]:\n """Mapping[str, Any]: Key-value map for all extras provided by the user."""\n return self._data["extras"]\n\n # ##### WRITE\n\n
[docs] def report_asset_materialization(\n self,\n metadata: Optional[Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]] = None,\n data_version: Optional[str] = None,\n asset_key: Optional[str] = None,\n ) -> None:\n """Report to Dagster that an asset has been materialized. Streams a payload containing\n materialization information back to Dagster. If no assets are in scope, raises an error.\n\n Args:\n metadata (Optional[Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]]):\n Metadata for the materialized asset. Defaults to None.\n data_version (Optional[str]): The data version for the materialized asset.\n Defaults to None.\n asset_key (Optional[str]): The asset key for the materialized asset. If only a\n single asset is in scope, default to that asset's key. If multiple assets are in scope,\n this must be set explicitly or an error will be raised.\n """\n asset_key = _resolve_optionally_passed_asset_key(\n self._data, asset_key, "report_asset_materialization"\n )\n if asset_key in self._materialized_assets:\n raise DagsterPipesError(\n f"Calling `report_asset_materialization` with asset key `{asset_key}` is undefined."\n " Asset has already been materialized, so no additional data can be reported"\n " for it."\n )\n metadata = (\n _normalize_param_metadata(metadata, "report_asset_materialization", "metadata")\n if metadata\n else None\n )\n data_version = _assert_opt_param_type(\n data_version, str, "report_asset_materialization", "data_version"\n )\n self._write_message(\n "report_asset_materialization",\n {"asset_key": asset_key, "data_version": data_version, "metadata": metadata},\n )\n self._materialized_assets.add(asset_key)
\n\n
[docs] def report_asset_check(\n self,\n check_name: str,\n passed: bool,\n severity: PipesAssetCheckSeverity = "ERROR",\n metadata: Optional[Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]] = None,\n asset_key: Optional[str] = None,\n ) -> None:\n """Report to Dagster that an asset check has been performed. Streams a payload containing\n check result information back to Dagster. If no assets or associated checks are in scope, raises an error.\n\n Args:\n check_name (str): The name of the check.\n passed (bool): Whether the check passed.\n severity (PipesAssetCheckSeverity): The severity of the check. Defaults to "ERROR".\n metadata (Optional[Mapping[str, Union[PipesMetadataRawValue, PipesMetadataValue]]]):\n Metadata for the check. Defaults to None.\n asset_key (Optional[str]): The asset key for the check. If only a single asset is in\n scope, default to that asset's key. If multiple assets are in scope, this must be\n set explicitly or an error will be raised.\n """\n asset_key = _resolve_optionally_passed_asset_key(\n self._data, asset_key, "report_asset_check"\n )\n check_name = _assert_param_type(check_name, str, "report_asset_check", "check_name")\n passed = _assert_param_type(passed, bool, "report_asset_check", "passed")\n metadata = (\n _normalize_param_metadata(metadata, "report_asset_check", "metadata")\n if metadata\n else None\n )\n self._write_message(\n "report_asset_check",\n {\n "asset_key": asset_key,\n "check_name": check_name,\n "passed": passed,\n "metadata": metadata,\n "severity": severity,\n },\n )
\n\n @property\n def log(self) -> logging.Logger:\n """logging.Logger: A logger that streams log messages back to Dagster."""\n return self._logger
\n
", "current_page_name": "_modules/dagster_pipes", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pipes"}, "index": {"alabaster_version": "0.7.13", "body": "

All modules for which code is available

\n", "current_page_name": "_modules/index", "customsidebar": null, "favicon_url": null, "logo_url": null, "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "Overview: module code"}}, "dagster": {"_config": {"config_schema": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.config_schema

\nfrom typing import TYPE_CHECKING, Any, Dict, List, Mapping, Sequence, Type, Union\n\nfrom typing_extensions import TypeAlias\n\nif TYPE_CHECKING:\n    from dagster._config import ConfigType, Field\n\n# Eventually, the below `UserConfigSchema` should be renamed to `ConfigSchema` and the class\n# definition should be dropped. The reason we don't do this now is that sphinx autodoc doesn't\n# support type aliases, so there is no good way to gracefully attach a docstring to this and have it\n# show up in the docs. See: https://github.com/sphinx-doc/sphinx/issues/8934\n#\n# Unfortunately mypy doesn't support recursive types, which would be used to properly define the\n# List/Dict elements of this union: `Dict[str, ConfigSchema]`, `List[ConfigSchema]`.\nUserConfigSchema: TypeAlias = Union[\n    Type[Union[bool, float, int, str]],\n    Type[Union[Dict[Any, Any], List[Any]]],\n    "ConfigType",\n    "Field",\n    Mapping[str, Any],\n    Sequence[Any],\n]\n\n\n
[docs]class ConfigSchema:\n """Placeholder type for config schemas.\n\n Any time that it appears in documentation, it means that any of the following types are\n acceptable:\n\n #. A Python scalar type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`). For example:\n\n * ``@op(config_schema=int)``\n * ``@op(config_schema=str)``\n\n #. A built-in python collection (:py:class:`~python:list`, or :py:class:`~python:dict`).\n :py:class:`~python:list` is exactly equivalent to :py:class:`~dagster.Array` [\n :py:class:`~dagster.Any` ] and :py:class:`~python:dict` is equivalent to\n :py:class:`~dagster.Permissive`. For example:\n\n * ``@op(config_schema=list)``\n * ``@op(config_schema=dict)``\n\n #. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.Map`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n\n #. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules. For example:\n\n * ``{'some_config': str}`` is equivalent to ``Shape({'some_config: str})``.\n\n * ``{'some_config1': {'some_config2': str}}`` is equivalent to\n ``Shape({'some_config1: Shape({'some_config2: str})})``.\n\n #. A bare python list of length one, whose single element will be wrapped in a\n :py:class:`~dagster.Array` is resolved recursively according to the same\n rules. 
For example:\n\n * ``[str]`` is equivalent to ``Array[str]``.\n\n * ``[[str]]`` is equivalent to ``Array[Array[str]]``.\n\n * ``[{'some_config': str}]`` is equivalent to ``Array(Shape({'some_config: str}))``.\n\n #. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self):\n raise NotImplementedError(\n "ConfigSchema is a placeholder type and should not be instantiated."\n )
\n
", "current_page_name": "_modules/dagster/_config/config_schema", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.config_schema"}, "config_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.config_type

\nimport typing\nfrom enum import Enum as PythonEnum\nfrom typing import TYPE_CHECKING, Dict, Iterator, Optional, Sequence, cast\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import UserConfigSchema\nfrom dagster._serdes import whitelist_for_serdes\n\nif TYPE_CHECKING:\n    from .snap import ConfigSchemaSnapshot, ConfigTypeSnap\n\n\n@whitelist_for_serdes\nclass ConfigTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    ENUM = "ENUM"\n\n    SELECTOR = "SELECTOR"\n    STRICT_SHAPE = "STRICT_SHAPE"\n    PERMISSIVE_SHAPE = "PERMISSIVE_SHAPE"\n    SCALAR_UNION = "SCALAR_UNION"\n\n    MAP = "MAP"\n\n    # Closed generic types\n    ARRAY = "ARRAY"\n    NONEABLE = "NONEABLE"\n\n    @staticmethod\n    def has_fields(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR or ConfigTypeKind.is_shape(kind)\n\n    @staticmethod\n    def is_closed_generic(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return (\n            kind == ConfigTypeKind.ARRAY\n            or kind == ConfigTypeKind.NONEABLE\n            or kind == ConfigTypeKind.SCALAR_UNION\n            or kind == ConfigTypeKind.MAP\n        )\n\n    @staticmethod\n    def is_shape(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.STRICT_SHAPE or kind == ConfigTypeKind.PERMISSIVE_SHAPE\n\n    @staticmethod\n    def is_selector(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR\n\n\nclass ConfigType:\n    """The class backing DagsterTypes as they are used processing configuration data."""\n\n    def __init__(\n        self,\n        key: str,\n        kind: ConfigTypeKind,\n        given_name: Optional[str] = None,\n        
description: Optional[str] = None,\n        type_params: Optional[Sequence["ConfigType"]] = None,\n    ):\n        self.key: str = check.str_param(key, "key")\n        self.kind: ConfigTypeKind = check.inst_param(kind, "kind", ConfigTypeKind)\n        self.given_name: Optional[str] = check.opt_str_param(given_name, "given_name")\n        self._description: Optional[str] = check.opt_str_param(description, "description")\n        self.type_params: Optional[Sequence[ConfigType]] = (\n            check.sequence_param(type_params, "type_params", of_type=ConfigType)\n            if type_params\n            else None\n        )\n\n        # memoized snap representation\n        self._snap: Optional["ConfigTypeSnap"] = None\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @staticmethod\n    def from_builtin_enum(builtin_enum: typing.Any) -> "ConfigType":\n        check.invariant(BuiltinEnum.contains(builtin_enum), "param must be member of BuiltinEnum")\n        return _CONFIG_MAP[builtin_enum]\n\n    def post_process(self, value):\n        """Implement this in order to take a value provided by the user\n        and perform computation on it. This can be done to coerce data types,\n        fetch things from the environment (e.g. environment variables), or\n        to do custom validation. If the value is not valid, throw a\n        PostProcessingError. 
Otherwise return the coerced value.\n        """\n        return value\n\n    def get_snapshot(self) -> "ConfigTypeSnap":\n        from .snap import snap_from_config_type\n\n        if self._snap is None:\n            self._snap = snap_from_config_type(self)\n\n        return self._snap\n\n    def type_iterator(self) -> Iterator["ConfigType"]:\n        yield self\n\n    def get_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n        from .snap import ConfigSchemaSnapshot\n\n        return ConfigSchemaSnapshot({ct.key: ct.get_snapshot() for ct in self.type_iterator()})\n\n\n@whitelist_for_serdes\nclass ConfigScalarKind(PythonEnum):\n    INT = "INT"\n    STRING = "STRING"\n    FLOAT = "FLOAT"\n    BOOL = "BOOL"\n\n\n# Scalars, Composites, Selectors, Lists, Optional, Any\n\n\nclass ConfigScalar(ConfigType):\n    def __init__(\n        self,\n        key: str,\n        given_name: Optional[str],\n        scalar_kind: ConfigScalarKind,\n        **kwargs: typing.Any,\n    ):\n        self.scalar_kind = check.inst_param(scalar_kind, "scalar_kind", ConfigScalarKind)\n        super(ConfigScalar, self).__init__(\n            key, kind=ConfigTypeKind.SCALAR, given_name=given_name, **kwargs\n        )\n\n\nclass BuiltinConfigScalar(ConfigScalar):\n    def __init__(self, scalar_kind, description=None):\n        super(BuiltinConfigScalar, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=scalar_kind,\n            description=description,\n        )\n\n\nclass Int(BuiltinConfigScalar):\n    def __init__(self):\n        super(Int, self).__init__(scalar_kind=ConfigScalarKind.INT, description="")\n\n\nclass String(BuiltinConfigScalar):\n    def __init__(self):\n        super(String, self).__init__(scalar_kind=ConfigScalarKind.STRING, description="")\n\n\nclass Bool(BuiltinConfigScalar):\n    def __init__(self):\n        super(Bool, self).__init__(scalar_kind=ConfigScalarKind.BOOL, description="")\n\n\nclass 
Float(BuiltinConfigScalar):\n    def __init__(self):\n        super(Float, self).__init__(scalar_kind=ConfigScalarKind.FLOAT, description="")\n\n    def post_process(self, value):\n        return float(value)\n\n\nclass Any(ConfigType):\n    def __init__(self):\n        super(Any, self).__init__(\n            key="Any",\n            given_name="Any",\n            kind=ConfigTypeKind.ANY,\n        )\n\n\n
[docs]class Noneable(ConfigType):\n """Defines a configuration type that is the union of ``NoneType`` and the type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n\n **Examples:**\n\n .. code-block:: python\n\n config_schema={"name": Noneable(str)}\n\n config={"name": "Hello"} # Ok\n config={"name": None} # Ok\n config={} # Error\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Noneable, self).__init__(\n key=f"Noneable.{self.inner_type.key}",\n kind=ConfigTypeKind.NONEABLE,\n type_params=[self.inner_type],\n )\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\n
[docs]class Array(ConfigType):\n """Defines an array (list) configuration type that contains values of type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Array, self).__init__(\n key=f"Array.{self.inner_type.key}",\n type_params=[self.inner_type],\n kind=ConfigTypeKind.ARRAY,\n )\n\n @public\n @property\n def description(self) -> str:\n """A human-readable description of this Array type."""\n return f"List of {self.key}"\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\n
[docs]class EnumValue:\n """Define an entry in a :py:class:`Enum`.\n\n Args:\n config_value (str):\n The string representation of the config to accept when passed.\n python_value (Optional[Any]):\n The python value to convert the enum entry in to. Defaults to the ``config_value``.\n description (Optional[str]):\n A human-readable description of the enum entry.\n\n """\n\n def __init__(\n self,\n config_value: str,\n python_value: Optional[object] = None,\n description: Optional[str] = None,\n ):\n self.config_value = check.str_param(config_value, "config_value")\n self.python_value = config_value if python_value is None else python_value\n self.description = check.opt_str_param(description, "description")
\n\n\n
[docs]class Enum(ConfigType):\n """Defines a enum configuration type that allows one of a defined set of possible values.\n\n Args:\n name (str):\n The name of the enum configuration type.\n enum_values (List[EnumValue]):\n The set of possible values for the enum configuration type.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Enum(\n 'CowboyType',\n [\n EnumValue('good'),\n EnumValue('bad'),\n EnumValue('ugly'),\n ]\n )\n )\n )\n def resolve_standoff(context):\n # ...\n """\n\n def __init__(self, name: str, enum_values: Sequence[EnumValue]):\n check.str_param(name, "name")\n super(Enum, self).__init__(key=name, given_name=name, kind=ConfigTypeKind.ENUM)\n self.enum_values = check.sequence_param(enum_values, "enum_values", of_type=EnumValue)\n self._valid_python_values = {ev.python_value for ev in enum_values}\n check.invariant(len(self._valid_python_values) == len(enum_values))\n self._valid_config_values = {ev.config_value for ev in enum_values}\n check.invariant(len(self._valid_config_values) == len(enum_values))\n\n @property\n def config_values(self):\n return [ev.config_value for ev in self.enum_values]\n\n def is_valid_config_enum_value(self, config_value):\n return config_value in self._valid_config_values\n\n def post_process(self, value: typing.Any) -> typing.Any:\n if isinstance(value, PythonEnum):\n value = value.name\n\n for ev in self.enum_values:\n if ev.config_value == value:\n return ev.python_value\n\n check.failed(f"Should never reach this. config_value should be pre-validated. Got {value}")\n\n @classmethod\n def from_python_enum(cls, enum, name=None):\n """Create a Dagster enum corresponding to an existing Python enum.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. 
code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n assert context.op_config["color"] == Color.RED\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v) for v in enum])\n\n @classmethod\n def from_python_enum_direct_values(cls, enum, name=None):\n """Create a Dagster enum corresponding to an existing Python enum, where the direct values are passed instead of symbolic values (IE, enum.symbol.value as opposed to enum.symbol).\n\n This is necessary for internal usage, as the symbolic values are not serializable.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n assert context.op_config["color"] == Color.RED.value\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v.value) for v in enum])
\n\n\n
[docs]class ScalarUnion(ConfigType):\n """Defines a configuration type that accepts a scalar value OR a non-scalar value like a\n :py:class:`~dagster.List`, :py:class:`~dagster.Dict`, or :py:class:`~dagster.Selector`.\n\n This allows runtime scalars to be configured without a dictionary with the key ``value`` and\n instead just use the scalar value directly. However this still leaves the option to\n load scalars from a json or pickle file.\n\n Args:\n scalar_type (type):\n The scalar type of values that this configuration type can hold. For example,\n :py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`.\n non_scalar_schema (ConfigSchema):\n The schema of a non-scalar Dagster configuration type. For example, :py:class:`List`,\n :py:class:`Dict`, or :py:class:`~dagster.Selector`.\n key (Optional[str]):\n The configuation type's unique key. If not set, then the key will be set to\n ``ScalarUnion.{scalar_type}-{non_scalar_schema}``.\n\n **Examples:**\n\n .. code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word:\n value: foobar\n\n\n becomes, optionally,\n\n\n .. 
code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word: foobar\n """\n\n def __init__(\n self,\n scalar_type: typing.Any,\n non_scalar_schema: UserConfigSchema,\n _key: Optional[str] = None,\n ):\n from .field import resolve_to_config_type\n\n self.scalar_type = check.inst(\n cast(ConfigType, resolve_to_config_type(scalar_type)), ConfigType\n )\n self.non_scalar_type = resolve_to_config_type(non_scalar_schema)\n\n check.param_invariant(self.scalar_type.kind == ConfigTypeKind.SCALAR, "scalar_type")\n check.param_invariant(\n self.non_scalar_type.kind\n in {ConfigTypeKind.STRICT_SHAPE, ConfigTypeKind.SELECTOR, ConfigTypeKind.ARRAY},\n "non_scalar_type",\n )\n\n # https://github.com/dagster-io/dagster/issues/2133\n key = check.opt_str_param(\n _key, "_key", f"ScalarUnion.{self.scalar_type.key}-{self.non_scalar_type.key}"\n )\n\n super(ScalarUnion, self).__init__(\n key=key,\n kind=ConfigTypeKind.SCALAR_UNION,\n type_params=[self.scalar_type, self.non_scalar_type],\n )\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.scalar_type.type_iterator()\n yield from self.non_scalar_type.type_iterator()\n yield from super().type_iterator()
\n\n\nConfigAnyInstance: Any = Any()\nConfigBoolInstance: Bool = Bool()\nConfigFloatInstance: Float = Float()\nConfigIntInstance: Int = Int()\nConfigStringInstance: String = String()\n\n_CONFIG_MAP: Dict[check.TypeOrTupleOfTypes, ConfigType] = {\n BuiltinEnum.ANY: ConfigAnyInstance,\n BuiltinEnum.BOOL: ConfigBoolInstance,\n BuiltinEnum.FLOAT: ConfigFloatInstance,\n BuiltinEnum.INT: ConfigIntInstance,\n BuiltinEnum.STRING: ConfigStringInstance,\n}\n\n\n_CONFIG_MAP_BY_NAME: Dict[str, ConfigType] = {\n "Any": ConfigAnyInstance,\n "Bool": ConfigBoolInstance,\n "Float": ConfigFloatInstance,\n "Int": ConfigIntInstance,\n "String": ConfigStringInstance,\n}\n\nALL_CONFIG_BUILTINS = set(_CONFIG_MAP.values())\n\n\ndef get_builtin_scalar_by_name(type_name: str):\n if type_name not in _CONFIG_MAP_BY_NAME:\n check.failed(f"Scalar {type_name} is not supported")\n return _CONFIG_MAP_BY_NAME[type_name]\n
", "current_page_name": "_modules/dagster/_config/config_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.config_type"}, "field": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.field

\nfrom typing import Any, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster._serdes import serialize_value\nfrom dagster._seven import is_subclass\nfrom dagster._utils import is_enum_value\nfrom dagster._utils.typing_api import is_closed_python_optional_type, is_typing_type\n\nfrom .config_type import Array, ConfigAnyInstance, ConfigType, ConfigTypeKind\nfrom .field_utils import FIELD_NO_DEFAULT_PROVIDED, Map, all_optional_type\n\n\ndef _is_config_type_class(obj) -> bool:\n    return isinstance(obj, type) and is_subclass(obj, ConfigType)\n\n\ndef helpful_list_error_string() -> str:\n    return "Please use a python list (e.g. [int]) or dagster.Array (e.g. Array(int)) instead."\n\n\nVALID_CONFIG_DESC = """\n1. A Python primitive type that resolve to dagster config\n   types: int, float, bool, str.\n\n2. A dagster config type: Int, Float, Bool, String, StringSource, Path, Any,\n   Array, Noneable, Selector, Shape, Permissive, etc.\n\n3. A bare python dictionary, which is wrapped in Shape. Any\n   values in the dictionary get resolved by the same rules, recursively.\n\n4. 
A bare python list of length one which itself is config type.\n   Becomes Array with list element as an argument.\n"""\n\n\n@overload\ndef resolve_to_config_type(obj: Union[ConfigType, UserConfigSchema]) -> ConfigType:\n    pass\n\n\n@overload\ndef resolve_to_config_type(obj: object) -> Union[ConfigType, bool]:\n    pass\n\n\ndef resolve_to_config_type(obj: object) -> Union[ConfigType, bool]:\n    from .field_utils import convert_fields_to_dict_type\n\n    # Short circuit if it's already a Config Type\n    if isinstance(obj, ConfigType):\n        return obj\n\n    if isinstance(obj, dict):\n        # Dicts of the special form {type: value} are treated as Maps\n        # mapping from the type to value type, otherwise treat as dict type\n        if len(obj) == 1:\n            key = next(iter(obj.keys()))\n            key_type = resolve_to_config_type(key)\n            if not isinstance(key, str):\n                if not key_type:\n                    raise DagsterInvalidDefinitionError(\n                        f"Invalid key in map specification: {key!r} in map {obj}"\n                    )\n\n                if not key_type.kind == ConfigTypeKind.SCALAR:  # type: ignore\n                    raise DagsterInvalidDefinitionError(\n                        f"Non-scalar key in map specification: {key!r} in map {obj}"\n                    )\n\n                inner_type = resolve_to_config_type(obj[key])\n\n                if not inner_type:\n                    raise DagsterInvalidDefinitionError(\n                        f"Invalid value in map specification: {obj[str]!r} in map {obj}"\n                    )\n                return Map(key_type, inner_type)\n        return convert_fields_to_dict_type(obj)\n\n    if isinstance(obj, list):\n        if len(obj) != 1:\n            raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")\n\n        inner_type = resolve_to_config_type(obj[0])\n\n        if not inner_type:\n            raise 
DagsterInvalidDefinitionError(\n                f"Invalid member of array specification: {obj[0]!r} in list {obj}"\n            )\n        return Array(inner_type)\n\n    if BuiltinEnum.contains(obj):\n        return ConfigType.from_builtin_enum(obj)\n\n    from .primitive_mapping import (\n        is_supported_config_python_builtin,\n        remap_python_builtin_for_config,\n    )\n\n    if is_supported_config_python_builtin(obj):\n        return remap_python_builtin_for_config(obj)\n\n    if obj is None:\n        return ConfigAnyInstance\n\n    # Special error messages for passing a DagsterType\n    from dagster._core.types.dagster_type import DagsterType, List, ListType\n    from dagster._core.types.python_set import Set, _TypedPythonSet\n    from dagster._core.types.python_tuple import Tuple, _TypedPythonTuple\n\n    if _is_config_type_class(obj):\n        check.param_invariant(\n            False,\n            "dagster_type",\n            f"Cannot pass config type class {obj} to resolve_to_config_type. This error usually"\n            " occurs when you pass a dagster config type class instead of a class instance into"\n            ' another dagster config type. E.g. "Noneable(Permissive)" should instead be'\n            ' "Noneable(Permissive())".',\n        )\n\n    if isinstance(obj, type) and is_subclass(obj, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed a DagsterType class {obj!r} to the config system. "\n            "The DagsterType and config schema systems are separate. "\n            f"Valid config values are:\\n{VALID_CONFIG_DESC}"\n        )\n\n    if is_closed_python_optional_type(obj):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use typing.Optional as a config type. 
If you want this field to be "\n            "optional, please use Field(<type>, is_required=False), and if you want this field to "\n            "be required, but accept a value of None, use dagster.Noneable(<type>)."\n        )\n\n    if is_typing_type(obj):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed in {obj} to the config system. Types from "\n            "the typing module in python are not allowed in the config system. "\n            "You must use types that are imported from dagster or primitive types "\n            "such as bool, int, etc."\n        )\n\n    if obj is List or isinstance(obj, ListType):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use List in the context of config. " + helpful_list_error_string()\n        )\n\n    if obj is Set or isinstance(obj, _TypedPythonSet):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Set in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if obj is Tuple or isinstance(obj, _TypedPythonTuple):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Tuple in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if isinstance(obj, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed an instance of DagsterType {obj.display_name} to the config "\n            f"system (Repr of type: {obj!r}). "\n            "The DagsterType and config schema systems are separate. "\n            f"Valid config values are:\\n{VALID_CONFIG_DESC}",\n        )\n\n    # This means that this is an error and we are return False to a callsite\n    # We do the error reporting there because those callsites have more context\n    return False\n\n\ndef has_implicit_default(config_type):\n    if config_type.kind == ConfigTypeKind.NONEABLE:\n        return True\n\n    return all_optional_type(config_type)\n\n\n
[docs]class Field:\n """Defines the schema for a configuration field.\n\n Fields are used in config schema instead of bare types when one wants to add a description,\n a default value, or to mark it as not required.\n\n Config fields are parsed according to their schemas in order to yield values available at\n job execution time through the config system. Config fields can be set on ops, on\n loaders for custom, and on other pluggable components of the system, such as resources, loggers,\n and executors.\n\n\n Args:\n config (Any): The schema for the config. This value can be any of:\n\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n default_value (Any):\n A default value for this field, conformant to the schema set by the ``dagster_type``\n argument. 
If a default value is provided, ``is_required`` should be ``False``.\n\n Note: for config types that do post processing such as Enum, this value must be\n the pre processed version, ie use ``ExampleEnum.VALUE.name`` instead of\n ``ExampleEnum.VALUE``\n\n is_required (bool):\n Whether the presence of this field is required. Defaults to true. If ``is_required``\n is ``True``, no default value should be provided.\n\n description (str):\n A human-readable description of this config field.\n\n Examples:\n .. code-block:: python\n\n @op(\n config_schema={\n 'word': Field(str, description='I am a word.'),\n 'repeats': Field(Int, default_value=1, is_required=False),\n }\n )\n def repeat_word(context):\n return context.op_config['word'] * context.op_config['repeats']\n """\n\n def _resolve_config_arg(self, config):\n if isinstance(config, ConfigType):\n return config\n\n config_type = resolve_to_config_type(config)\n if not config_type:\n raise DagsterInvalidDefinitionError(\n f"Attempted to pass {config!r} to a Field that expects a valid "\n "dagster type usable in config (e.g. 
Dict, Int, String et al)."\n )\n return config_type\n\n def __init__(\n self,\n config: Any,\n default_value: Any = FIELD_NO_DEFAULT_PROVIDED,\n is_required: Optional[bool] = None,\n description: Optional[str] = None,\n ):\n from .post_process import resolve_defaults\n from .validate import validate_config\n\n self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)\n\n self._description = check.opt_str_param(description, "description")\n\n check.opt_bool_param(is_required, "is_required")\n\n if default_value != FIELD_NO_DEFAULT_PROVIDED:\n check.param_invariant(\n not (callable(default_value)), "default_value", "default_value cannot be a callable"\n )\n\n if is_required is True:\n check.param_invariant(\n default_value == FIELD_NO_DEFAULT_PROVIDED,\n "default_value",\n "required arguments should not specify default values",\n )\n\n self._default_value = default_value\n\n # check explicit default value\n if self.default_provided:\n if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed into a python enum value as the default value "\n "into of a config enum type {name}. You must pass in the underlying "\n "string represention as the default value. 
One of {value_set}."\n ).format(\n value_set=[ev.config_value for ev in self.config_type.enum_values],\n name=self.config_type.given_name,\n )\n )\n\n evr = validate_config(self.config_type, default_value)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Invalid default_value for Field.",\n evr.errors,\n default_value,\n )\n\n if is_required is None:\n is_optional = has_implicit_default(self.config_type) or self.default_provided\n is_required = not is_optional\n\n # on implicitly optional - set the default value\n # by resolving the defaults of the type\n if is_optional and not self.default_provided:\n evr = resolve_defaults(self.config_type, None)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Unable to resolve implicit default_value for Field.",\n evr.errors,\n None,\n )\n self._default_value = evr.value\n self._is_required = is_required\n\n @public\n @property\n def is_required(self) -> bool:\n """Whether a value for this field must be provided at runtime.\n\n Cannot be True if a default value is provided.\n """\n return self._is_required\n\n @public\n @property\n def default_provided(self) -> bool:\n """Was a default value provided.\n\n Returns:\n bool: Yes or no\n """\n return self._default_value != FIELD_NO_DEFAULT_PROVIDED\n\n @public\n @property\n def default_value(self) -> Any:\n """The default value for the field.\n\n Raises an exception if no default value was provided.\n """\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return self._default_value\n\n @public\n @property\n def description(self) -> Optional[str]:\n """A human-readable description of this config field, if provided."""\n return self._description\n\n @property\n def default_value_as_json_str(self) -> str:\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return serialize_value(self.default_value)\n\n def __repr__(self) -> str:\n return ("Field({config_type}, default={default}, 
is_required={is_required})").format(\n config_type=self.config_type,\n default=(\n "@" if self._default_value == FIELD_NO_DEFAULT_PROVIDED else self._default_value\n ),\n is_required=self.is_required,\n )
\n\n\ndef check_opt_field_param(obj: object, param_name: str) -> Optional[Field]:\n return check.opt_inst_param(cast(Optional[Field], obj), param_name, Field)\n
", "current_page_name": "_modules/dagster/_config/field", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.field"}, "field_utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.field_utils

\n# encoding: utf-8\nimport hashlib\nimport os\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, List, Mapping, Optional, Sequence, Type\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidConfigDefinitionError\n\nfrom .config_type import Array, ConfigType, ConfigTypeKind\n\nif TYPE_CHECKING:\n    from dagster._config import Field\n\n\ndef all_optional_type(config_type: ConfigType) -> bool:\n    check.inst_param(config_type, "config_type", ConfigType)\n\n    if ConfigTypeKind.is_shape(config_type.kind):\n        for field in config_type.fields.values():  # type: ignore\n            if field.is_required:\n                return False\n        return True\n\n    if ConfigTypeKind.is_selector(config_type.kind):\n        if len(config_type.fields) == 1:  # type: ignore\n            for field in config_type.fields.values():  # type: ignore\n                if field.is_required:\n                    return False\n            return True\n\n    return False\n\n\nclass __FieldValueSentinel:\n    pass\n\n\nclass __InferOptionalCompositeFieldSentinel:\n    pass\n\n\nFIELD_NO_DEFAULT_PROVIDED = __FieldValueSentinel\n\nINFER_OPTIONAL_COMPOSITE_FIELD = __InferOptionalCompositeFieldSentinel\n\n\nclass _ConfigHasFields(ConfigType):\n    def __init__(self, fields, **kwargs):\n        self.fields = expand_fields_dict(fields)\n        super(_ConfigHasFields, self).__init__(**kwargs)\n\n    def type_iterator(self) -> Iterator["ConfigType"]:\n        for field in self.fields.values():\n            yield from field.config_type.type_iterator()\n        yield from super().type_iterator()\n\n\nFIELD_HASH_CACHE: Dict[str, Any] = {}\n\n\ndef _memoize_inst_in_field_cache(passed_cls, defined_cls, key):\n    if key in FIELD_HASH_CACHE:\n        return FIELD_HASH_CACHE[key]\n\n    defined_cls_inst = super(defined_cls, passed_cls).__new__(defined_cls)\n    defined_cls_inst._initialized = False  # noqa: SLF001\n    
FIELD_HASH_CACHE[key] = defined_cls_inst\n    return defined_cls_inst\n\n\ndef _add_hash(m, string):\n    m.update(string.encode("utf-8"))\n\n\ndef compute_fields_hash(fields, description, field_aliases=None):\n    m = hashlib.sha1()  # so that hexdigest is 40, not 64 bytes\n    if description:\n        _add_hash(m, ":description: " + description)\n\n    for field_name in sorted(list(fields.keys())):\n        field = fields[field_name]\n        _add_hash(m, ":fieldname:" + field_name)\n        if field.default_provided:\n            _add_hash(m, ":default_value: " + field.default_value_as_json_str)\n        _add_hash(m, ":is_required: " + str(field.is_required))\n        _add_hash(m, ":type_key: " + field.config_type.key)\n        if field.description:\n            _add_hash(m, ":description: " + field.description)\n\n    field_aliases = check.opt_dict_param(\n        field_aliases, "field_aliases", key_type=str, value_type=str\n    )\n    for field_name in sorted(list(field_aliases.keys())):\n        field_alias = field_aliases[field_name]\n        _add_hash(m, ":fieldname: " + field_name)\n        _add_hash(m, ":fieldalias: " + field_alias)\n\n    return m.hexdigest()\n\n\ndef _define_shape_key_hash(fields, description, field_aliases):\n    return "Shape." + compute_fields_hash(fields, description, field_aliases=field_aliases)\n\n\n
[docs]class Shape(_ConfigHasFields):\n """Schema for configuration data with string keys and typed values via :py:class:`Field`.\n\n Unlike :py:class:`Permissive`, unspecified fields are not allowed and will throw a\n :py:class:`~dagster.DagsterInvalidConfigError`.\n\n Args:\n fields (Dict[str, Field]):\n The specification of the config dict.\n field_aliases (Dict[str, str]):\n Maps a string key to an alias that can be used instead of the original key. For example,\n an entry {"foo": "bar"} means that someone could use "bar" instead of "foo" as a\n top level string key.\n """\n\n def __new__(\n cls,\n fields,\n description=None,\n field_aliases=None,\n ):\n return _memoize_inst_in_field_cache(\n cls,\n Shape,\n _define_shape_key_hash(expand_fields_dict(fields), description, field_aliases),\n )\n\n def __init__(\n self,\n fields,\n description=None,\n field_aliases=None,\n ):\n # if we hit in the field cache - skip double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields)\n super(Shape, self).__init__(\n kind=ConfigTypeKind.STRICT_SHAPE,\n key=_define_shape_key_hash(fields, description, field_aliases),\n description=description,\n fields=fields,\n )\n self.field_aliases = check.opt_dict_param(\n field_aliases, "field_aliases", key_type=str, value_type=str\n )\n self._initialized = True
\n\n\n
[docs]class Map(ConfigType):\n """Defines a config dict with arbitrary scalar keys and typed values.\n\n A map can contrain arbitrary keys of the specified scalar type, each of which has\n type checked values. Unlike :py:class:`Shape` and :py:class:`Permissive`, scalar\n keys other than strings can be used, and unlike :py:class:`Permissive`, all\n values are type checked.\n\n Args:\n key_type (type):\n The type of keys this map can contain. Must be a scalar type.\n inner_type (type):\n The type of the values that this map type can contain.\n key_label_name (string):\n Optional name which describes the role of keys in the map.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Map({str: int})))\n def partially_specified_config(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __init__(self, key_type, inner_type, key_label_name=None):\n from .field import resolve_to_config_type\n\n self.key_type = resolve_to_config_type(key_type)\n self.inner_type = resolve_to_config_type(inner_type)\n self.given_name = key_label_name\n\n check.inst_param(self.key_type, "key_type", ConfigType)\n check.inst_param(self.inner_type, "inner_type", ConfigType)\n check.param_invariant(\n self.key_type.kind == ConfigTypeKind.SCALAR, "key_type", "Key type must be a scalar"\n )\n check.opt_str_param(self.given_name, "name")\n\n super(Map, self).__init__(\n key="Map.{key_type}.{inner_type}{name_key}".format(\n key_type=self.key_type.key,\n inner_type=self.inner_type.key,\n name_key=f":name: {key_label_name}" if key_label_name else "",\n ),\n # We use the given name field to store the key label name\n # this is used elsewhere to give custom types names\n given_name=key_label_name,\n type_params=[self.key_type, self.inner_type],\n kind=ConfigTypeKind.MAP,\n )\n\n @public\n @property\n def key_label_name(self) -> Optional[str]:\n """Name which describes the role of keys in the map, if provided."""\n return self.given_name\n\n def 
type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.key_type.type_iterator()\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\ndef _define_permissive_dict_key(fields, description):\n return (\n "Permissive." + compute_fields_hash(fields, description=description)\n if fields\n else "Permissive"\n )\n\n\n
[docs]class Permissive(_ConfigHasFields):\n """Defines a config dict with a partially specified schema.\n\n A permissive dict allows partial specification of the config schema. Any fields with a\n specified schema will be type checked. Other fields will be allowed, but will be ignored by\n the type checker.\n\n Args:\n fields (Dict[str, Field]): The partial specification of the config dict.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Permissive({'required': Field(String)})))\n def map_config_op(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __new__(cls, fields=None, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Permissive,\n _define_permissive_dict_key(\n expand_fields_dict(fields) if fields else None, description\n ),\n )\n\n def __init__(self, fields=None, description=None):\n # if we hit in field cache avoid double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields) if fields else None\n super(Permissive, self).__init__(\n key=_define_permissive_dict_key(fields, description),\n kind=ConfigTypeKind.PERMISSIVE_SHAPE,\n fields=fields or dict(),\n description=description,\n )\n self._initialized = True
\n\n\ndef _define_selector_key(fields, description):\n return "Selector." + compute_fields_hash(fields, description=description)\n\n\n
[docs]class Selector(_ConfigHasFields):\n """Define a config field requiring the user to select one option.\n\n Selectors are used when you want to be able to present several different options in config but\n allow only one to be selected. For example, a single input might be read in from either a csv\n file or a parquet file, but not both at once.\n\n Note that in some other type systems this might be called an 'input union'.\n\n Functionally, a selector is like a :py:class:`Dict`, except that only one key from the dict can\n be specified in valid config.\n\n Args:\n fields (Dict[str, Field]): The fields from which the user must select.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Selector(\n {\n 'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n 'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n 'en': {'whom': Field(String, default_value='world', is_required=False)},\n }\n ),\n is_required=False,\n default_value={'en': {'whom': 'world'}},\n )\n )\n def hello_world_with_default(context):\n if 'haw' in context.op_config:\n return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n if 'cn' in context.op_config:\n return '\u4f60\u597d, {whom}!'.format(whom=context.op_config['cn']['whom'])\n if 'en' in context.op_config:\n return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n """\n\n def __new__(cls, fields, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Selector,\n _define_selector_key(expand_fields_dict(fields), description),\n )\n\n def __init__(self, fields, description=None):\n # if we hit in field cache avoid double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields)\n super(Selector, self).__init__(\n key=_define_selector_key(fields, description),\n kind=ConfigTypeKind.SELECTOR,\n fields=fields,\n description=description,\n )\n self._initialized = True
\n\n\n# Config syntax expansion code below\n\n\ndef is_potential_field(potential_field: object) -> bool:\n from .field import Field, resolve_to_config_type\n\n return isinstance(potential_field, (Field, dict, list)) or bool(\n resolve_to_config_type(potential_field)\n )\n\n\ndef convert_fields_to_dict_type(fields: Mapping[str, object]):\n return _convert_fields_to_dict_type(fields, fields, [])\n\n\ndef _convert_fields_to_dict_type(\n original_root: object, fields: Mapping[str, object], stack: List[str]\n) -> Shape:\n return Shape(_expand_fields_dict(original_root, fields, stack))\n\n\ndef expand_fields_dict(fields: Mapping[str, object]) -> Mapping[str, "Field"]:\n return _expand_fields_dict(fields, fields, [])\n\n\ndef _expand_fields_dict(\n original_root: object, fields: Mapping[str, object], stack: List[str]\n) -> Mapping[str, "Field"]:\n check.mapping_param(fields, "fields")\n return {\n name: _convert_potential_field(original_root, value, stack + [name])\n for name, value in fields.items()\n }\n\n\ndef expand_list(original_root: object, the_list: Sequence[object], stack: List[str]) -> Array:\n if len(the_list) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_list, stack, "List must be of length 1"\n )\n\n inner_type = _convert_potential_type(original_root, the_list[0], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_list,\n stack,\n "List have a single item and contain a valid type i.e. [int]. 
Got item {}".format(\n repr(the_list[0])\n ),\n )\n\n return Array(inner_type)\n\n\ndef expand_map(original_root: object, the_dict: Mapping[object, object], stack: List[str]) -> Map:\n if len(the_dict) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_dict, stack, "Map dict must be of length 1"\n )\n\n key = next(iter(the_dict.keys()))\n key_type = _convert_potential_type(original_root, key, stack)\n if not key_type or not key_type.kind == ConfigTypeKind.SCALAR:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n f"Map dict must have a scalar type as its only key. Got key {key!r}",\n )\n\n inner_type = _convert_potential_type(original_root, the_dict[key], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n "Map must have a single value and contain a valid type i.e. {{str: int}}. Got item {}"\n .format(repr(the_dict[key])),\n )\n\n return Map(key_type, inner_type)\n\n\ndef convert_potential_field(potential_field: object) -> "Field":\n return _convert_potential_field(potential_field, potential_field, [])\n\n\ndef _convert_potential_type(original_root: object, potential_type, stack: List[str]):\n from .field import resolve_to_config_type\n\n if isinstance(potential_type, Mapping):\n # A dictionary, containing a single key which is a type (int, str, etc) and not a string is interpreted as a Map\n if len(potential_type) == 1:\n key = next(iter(potential_type.keys()))\n if not isinstance(key, str) and _convert_potential_type(original_root, key, stack):\n return expand_map(original_root, potential_type, stack)\n\n # Otherwise, the dictionary is interpreted as a Shape\n return Shape(_expand_fields_dict(original_root, potential_type, stack))\n\n if isinstance(potential_type, list):\n return expand_list(original_root, potential_type, stack)\n\n return resolve_to_config_type(potential_type)\n\n\ndef _convert_potential_field(\n original_root: object, 
potential_field: object, stack: List[str]\n) -> "Field":\n from .field import Field\n\n if potential_field is None:\n raise DagsterInvalidConfigDefinitionError(\n original_root, potential_field, stack, reason="Fields cannot be None"\n )\n\n if not is_potential_field(potential_field):\n raise DagsterInvalidConfigDefinitionError(original_root, potential_field, stack)\n\n if isinstance(potential_field, Field):\n return potential_field\n\n return Field(_convert_potential_type(original_root, potential_field, stack))\n\n\ndef config_dictionary_from_values(\n values: Mapping[str, Any], config_field: "Field"\n) -> Dict[str, Any]:\n """Converts a set of config values into a dictionary representation,\n in particular converting EnvVar objects into Dagster config inputs\n and processing data structures such as dicts, lists, and structured Config classes.\n """\n assert ConfigTypeKind.is_shape(config_field.config_type.kind)\n\n from dagster._config.pythonic_config import _config_value_to_dict_representation\n\n return check.is_dict(_config_value_to_dict_representation(None, values))\n\n\ndef _create_direct_access_exception(cls: Type, env_var_name: str) -> Exception:\n return RuntimeError(\n f'Attempted to directly retrieve environment variable {cls.__name__}("{env_var_name}").'\n f" {cls.__name__} defers resolution of the environment variable value until run time, and"\n " should only be used as input to Dagster config or resources.\\n\\nTo access the"\n f" environment variable value, call `get_value` on the {cls.__name__}, or use os.getenv"\n " directly."\n )\n\n\nclass IntEnvVar(int):\n """Class used to represent an environment variable in the Dagster config system.\n\n The environment variable will be resolved to an int value when the config is\n loaded.\n """\n\n name: str\n\n @classmethod\n def create(cls, name: str) -> "IntEnvVar":\n var = IntEnvVar(0)\n var.name = name\n return var\n\n def __int__(self) -> int:\n """Raises an exception of the EnvVar value is directly 
accessed. Users should instead use\n the `get_value` method, or use the EnvVar as an input to Dagster config or resources.\n """\n raise _create_direct_access_exception(self.__class__, self.env_var_name)\n\n def __str__(self) -> str:\n return str(int(self))\n\n def get_value(self, default: Optional[int] = None) -> Optional[int]:\n """Returns the value of the environment variable, or the default value if the\n environment variable is not set. If no default is provided, None will be returned.\n """\n value = os.getenv(self.name, default=default)\n return int(value) if value else None\n\n @property\n def env_var_name(self) -> str:\n """Returns the name of the environment variable."""\n return self.name\n\n\nclass EnvVar(str):\n """Class used to represent an environment variable in the Dagster config system.\n\n This class is intended to be used to populate config fields or resources.\n The environment variable will be resolved to a string value when the config is\n loaded.\n\n To access the value of the environment variable, use the `get_value` method.\n """\n\n @classmethod\n def int(cls, name: str) -> "IntEnvVar":\n return IntEnvVar.create(name=name)\n\n def __str__(self) -> str:\n """Raises an exception of the EnvVar value is directly accessed. Users should instead use\n the `get_value` method, or use the EnvVar as an input to Dagster config or resources.\n """\n raise _create_direct_access_exception(self.__class__, self.env_var_name)\n\n @property\n def env_var_name(self) -> str:\n """Returns the name of the environment variable."""\n return super().__str__()\n\n def get_value(self, default: Optional[str] = None) -> Optional[str]:\n """Returns the value of the environment variable, or the default value if the\n environment variable is not set. If no default is provided, None will be returned.\n """\n return os.getenv(self.env_var_name, default=default)\n
", "current_page_name": "_modules/dagster/_config/field_utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.field_utils"}, "pythonic_config": {"config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.config

\nimport re\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Dict,\n    List,\n    Mapping,\n    Optional,\n    Set,\n    Type,\n    cast,\n)\n\nfrom pydantic import BaseModel\nfrom typing_extensions import TypeVar\n\nimport dagster._check as check\nfrom dagster import (\n    Field as DagsterField,\n    Shape,\n)\nfrom dagster._config.field_utils import (\n    EnvVar,\n    IntEnvVar,\n    Permissive,\n)\nfrom dagster._core.definitions.definition_config_schema import (\n    DefinitionConfigSchema,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidConfigDefinitionError,\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidPythonicConfigDefinitionError,\n)\nfrom dagster._utils.cached_method import CACHED_METHOD_FIELD_SUFFIX\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .conversion_utils import _convert_pydantic_field, safe_is_subclass\nfrom .pydantic_compat_layer import (\n    USING_PYDANTIC_2,\n    ModelFieldCompat,\n    model_config,\n    model_fields,\n)\nfrom .typing_utils import BaseConfigMeta\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nINTERNAL_MARKER = "__internal__"\n\n# ensure that this ends with the internal marker so we can do a single check\nassert CACHED_METHOD_FIELD_SUFFIX.endswith(INTERNAL_MARKER)\n\n\ndef _is_frozen_pydantic_error(e: Exception) -> bool:\n    """Parses an error to determine if it is a Pydantic error indicating that the instance\n    is immutable. 
We use this to attach a more helpful error message.\n    """\n    return "Instance is frozen" in str(  # Pydantic 2.x error\n        e\n    ) or "is immutable and does not support item assignment" in str(  # Pydantic 1.x error\n        e\n    )\n\n\nclass MakeConfigCacheable(BaseModel):\n    """This class centralizes and implements all the chicanery we need in order\n    to support caching decorators. If we decide this is a bad idea we can remove it\n    all in one go.\n    """\n\n    # Pydantic config for this class\n    # Cannot use kwargs for base class as this is not support for pydnatic<1.8\n    class Config:\n        # Various pydantic model config (https://docs.pydantic.dev/usage/model_config/)\n        # Necessary to allow for caching decorators\n        arbitrary_types_allowed = True\n        # Avoid pydantic reading a cached property class as part of the schema\n        if USING_PYDANTIC_2:\n            ignored_types = (cached_property,)\n        else:\n            keep_untouched = (cached_property,)\n        # Ensure the class is serializable, for caching purposes\n        frozen = True\n\n    def __setattr__(self, name: str, value: Any):\n        from .resource import ConfigurableResourceFactory\n\n        # This is a hack to allow us to set attributes on the class that are not part of the\n        # config schema. 
Pydantic will normally raise an error if you try to set an attribute\n        # that is not part of the schema.\n\n        if self._is_field_internal(name):\n            object.__setattr__(self, name, value)\n            return\n\n        try:\n            return super().__setattr__(name, value)\n        except (TypeError, ValueError) as e:\n            clsname = self.__class__.__name__\n            if _is_frozen_pydantic_error(e):\n                if isinstance(self, ConfigurableResourceFactory):\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic resource and does not support item assignment,"\n                        " as it inherits from 'pydantic.BaseModel' with frozen=True. If trying to"\n                        " maintain state on this resource, consider building a separate, stateful"\n                        " client class, and provide a method on the resource to construct and"\n                        " return the stateful client."\n                    ) from e\n                else:\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic config class and does not support item"\n                        " assignment, as it inherits from 'pydantic.BaseModel' with frozen=True."\n                    ) from e\n            elif "object has no field" in str(e):\n                field_name = check.not_none(\n                    re.search(r"object has no field \\"(.*)\\"", str(e))\n                ).group(1)\n                if isinstance(self, ConfigurableResourceFactory):\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic resource and does not support manipulating"\n                        f" undeclared attribute '{field_name}' as it inherits from"\n                        " 'pydantic.BaseModel' without extra=\\"allow\\". 
If trying to maintain"\n                        " state on this resource, consider building a separate, stateful client"\n                        " class, and provide a method on the resource to construct and return the"\n                        " stateful client."\n                    ) from e\n                else:\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic config class and does not support manipulating"\n                        f" undeclared attribute '{field_name}' as it inherits from"\n                        " 'pydantic.BaseModel' without extra=\\"allow\\"."\n                    ) from e\n            else:\n                raise\n\n    def _is_field_internal(self, name: str) -> bool:\n        return name.endswith(INTERNAL_MARKER)\n\n\nT = TypeVar("T")\n\n\ndef ensure_env_vars_set_post_init(set_value: T, input_value: Any) -> T:\n    """Pydantic 2.x utility. Ensures that Pydantic field values are set to the appropriate\n    EnvVar or IntEnvVar objects post-model-instantiation, since Pydantic 2.x will cast\n    EnvVar or IntEnvVar values to raw strings or ints as part of the model instantiation process.\n    """\n    if isinstance(set_value, dict) and isinstance(input_value, dict):\n        for key, value in input_value.items():\n            if isinstance(value, (EnvVar, IntEnvVar)):\n                set_value[key] = value\n            elif isinstance(value, (dict, list)):\n                set_value[key] = ensure_env_vars_set_post_init(set_value[key], value)\n    if isinstance(set_value, List) and isinstance(input_value, List):\n        for i in range(len(set_value)):\n            value = input_value[i]\n            if isinstance(value, (EnvVar, IntEnvVar)):\n                set_value[i] = value\n            elif isinstance(value, (dict, list)):\n                set_value[i] = ensure_env_vars_set_post_init(set_value[i], value)\n\n    return set_value\n\n\n
[docs]class Config(MakeConfigCacheable, metaclass=BaseConfigMeta):\n """Base class for Dagster configuration models, used to specify config schema for\n ops and assets. Subclasses :py:class:`pydantic.BaseModel`.\n\n Example definition:\n\n .. code-block:: python\n\n from pydantic import Field\n\n class MyAssetConfig(Config):\n my_str: str = "my_default_string"\n my_int_list: List[int]\n my_bool_with_metadata: bool = Field(default=False, description="A bool field")\n\n\n Example usage:\n\n .. code-block:: python\n\n @asset\n def asset_with_config(config: MyAssetConfig):\n assert config.my_str == "my_default_string"\n assert config.my_int_list == [1, 2, 3]\n assert config.my_bool_with_metadata == False\n\n asset_with_config(MyAssetConfig(my_int_list=[1, 2, 3], my_bool_with_metadata=True))\n\n """\n\n def __init__(self, **config_dict) -> None:\n """This constructor is overridden to handle any remapping of raw config dicts to\n the appropriate config classes. For example, discriminated unions are represented\n in Dagster config as dicts with a single key, which is the discriminator value.\n """\n modified_data = {}\n for key, value in config_dict.items():\n field = model_fields(self).get(key)\n\n # This is useful in Pydantic 2.x when reconstructing a config object from a dict\n # e.g. 
when instantiating a resource at runtime from its config dict\n # In Pydantic 1.x, this is a no-op, since a non-required field without a\n # value provided will default to None (required & optional are the same in 1.x)\n if field and not field.is_required() and value is None:\n continue\n\n if field and field.discriminator:\n nested_dict = value\n\n discriminator_key = check.not_none(field.discriminator)\n if isinstance(value, Config):\n nested_dict = _discriminated_union_config_dict_to_selector_config_dict(\n discriminator_key,\n value._get_non_none_public_field_values(), # noqa: SLF001\n )\n\n nested_items = list(check.is_dict(nested_dict).items())\n check.invariant(\n len(nested_items) == 1,\n "Discriminated union must have exactly one key",\n )\n discriminated_value, nested_values = nested_items[0]\n\n modified_data[key] = {\n **nested_values,\n discriminator_key: discriminated_value,\n }\n else:\n modified_data[key] = value\n\n for key, field in model_fields(self).items():\n if field.is_required() and key not in modified_data:\n modified_data[key] = None\n\n super().__init__(**modified_data)\n if USING_PYDANTIC_2:\n self.__dict__ = ensure_env_vars_set_post_init(self.__dict__, modified_data)\n\n def _convert_to_config_dictionary(self) -> Mapping[str, Any]:\n """Converts this Config object to a Dagster config dictionary, in the same format as the dictionary\n accepted as run config or as YAML in the launchpad.\n\n Inner fields are recursively converted to dictionaries, meaning nested config objects\n or EnvVars will be converted to the appropriate dictionary representation.\n """\n public_fields = self._get_non_none_public_field_values()\n return {\n k: _config_value_to_dict_representation(model_fields(self).get(k), v)\n for k, v in public_fields.items()\n }\n\n def _get_non_none_public_field_values(self) -> Mapping[str, Any]:\n """Returns a dictionary representation of this config object,\n ignoring any private fields, and any optional fields that are None.\n\n 
Inner fields are returned as-is in the dictionary,\n meaning any nested config objects will be returned as config objects, not dictionaries.\n """\n output = {}\n for key, value in self.__dict__.items():\n if self._is_field_internal(key):\n continue\n field = model_fields(self).get(key)\n\n if field:\n resolved_field_name = field.alias or key\n output[resolved_field_name] = value\n else:\n output[key] = value\n return output\n\n @classmethod\n def to_config_schema(cls) -> DefinitionConfigSchema:\n """Converts the config structure represented by this class into a DefinitionConfigSchema."""\n return DefinitionConfigSchema(infer_schema_from_config_class(cls))\n\n @classmethod\n def to_fields_dict(cls) -> Dict[str, DagsterField]:\n """Converts the config structure represented by this class into a dictionary of dagster.Fields.\n This is useful when interacting with legacy code that expects a dictionary of fields but you\n want the source of truth to be a config class.\n """\n return cast(Shape, cls.to_config_schema().as_field().config_type).fields
\n\n\ndef _discriminated_union_config_dict_to_selector_config_dict(\n discriminator_key: str, config_dict: Mapping[str, Any]\n):\n """Remaps a config dictionary which is a member of a discriminated union to\n the appropriate structure for a Dagster config selector.\n\n A discriminated union with key "my_key" and value "my_value" will be represented\n as {"my_key": "my_value", "my_field": "my_field_value"}. When converted to a selector,\n this should be represented as {"my_value": {"my_field": "my_field_value"}}.\n """\n updated_dict = dict(config_dict)\n discriminator_value = updated_dict.pop(discriminator_key)\n wrapped_dict = {discriminator_value: updated_dict}\n return wrapped_dict\n\n\ndef _config_value_to_dict_representation(field: Optional[ModelFieldCompat], value: Any):\n """Converts a config value to a dictionary representation. If a field is provided, it will be used\n to determine the appropriate dictionary representation in the case of discriminated unions.\n """\n from dagster._config.field_utils import EnvVar, IntEnvVar\n\n if isinstance(value, dict):\n return {k: _config_value_to_dict_representation(None, v) for k, v in value.items()}\n elif isinstance(value, list):\n return [_config_value_to_dict_representation(None, v) for v in value]\n elif isinstance(value, EnvVar):\n return {"env": value.env_var_name}\n elif isinstance(value, IntEnvVar):\n return {"env": value.name}\n if isinstance(value, Config):\n if field and field.discriminator:\n return {\n k: v\n for k, v in _discriminated_union_config_dict_to_selector_config_dict(\n field.discriminator,\n value._convert_to_config_dictionary(), # noqa: SLF001\n ).items()\n }\n else:\n return {k: v for k, v in value._convert_to_config_dictionary().items()} # noqa: SLF001\n elif isinstance(value, Enum):\n return value.name\n\n return value\n\n\n
[docs]class PermissiveConfig(Config):\n """Subclass of :py:class:`Config` that allows arbitrary extra fields. This is useful for\n config classes which may have open-ended inputs.\n\n Example definition:\n\n .. code-block:: python\n\n class MyPermissiveOpConfig(PermissiveConfig):\n my_explicit_parameter: bool\n my_other_explicit_parameter: str\n\n\n Example usage:\n\n .. code-block:: python\n\n @op\n def op_with_config(config: MyPermissiveOpConfig):\n assert config.my_explicit_parameter == True\n assert config.my_other_explicit_parameter == "foo"\n assert config.dict().get("my_implicit_parameter") == "bar"\n\n op_with_config(\n MyPermissiveOpConfig(\n my_explicit_parameter=True,\n my_other_explicit_parameter="foo",\n my_implicit_parameter="bar"\n )\n )\n\n """\n\n # Pydantic config for this class\n # Cannot use kwargs for base class as this is not support for pydantic<1.8\n class Config:\n extra = "allow"
\n\n\ndef infer_schema_from_config_class(\n model_cls: Type["Config"],\n description: Optional[str] = None,\n fields_to_omit: Optional[Set[str]] = None,\n) -> DagsterField:\n from .config import Config\n from .resource import ConfigurableResourceFactory, _is_annotated_as_resource_type\n\n """Parses a structured config class and returns a corresponding Dagster config Field."""\n fields_to_omit = fields_to_omit or set()\n\n check.param_invariant(\n safe_is_subclass(model_cls, Config),\n "Config type annotation must inherit from dagster.Config",\n )\n\n fields: Dict[str, DagsterField] = {}\n for key, pydantic_field_info in model_fields(model_cls).items():\n if _is_annotated_as_resource_type(\n pydantic_field_info.annotation, pydantic_field_info.metadata\n ):\n continue\n\n resolved_field_name = pydantic_field_info.alias if pydantic_field_info.alias else key\n if key not in fields_to_omit:\n if isinstance(pydantic_field_info.default, DagsterField):\n raise DagsterInvalidDefinitionError(\n "Using 'dagster.Field' is not supported within a Pythonic config or resource"\n " definition. 'dagster.Field' should only be used in legacy Dagster config"\n " schemas. Did you mean to use 'pydantic.Field' instead?"\n )\n\n try:\n fields[resolved_field_name] = _convert_pydantic_field(pydantic_field_info)\n except DagsterInvalidConfigDefinitionError as e:\n raise DagsterInvalidPythonicConfigDefinitionError(\n config_class=model_cls,\n field_name=key,\n invalid_type=e.current_value,\n is_resource=model_cls is not None\n and safe_is_subclass(model_cls, ConfigurableResourceFactory),\n )\n\n shape_cls = Permissive if model_config(model_cls).get("extra") == "allow" else Shape\n\n docstring = model_cls.__doc__.strip() if model_cls.__doc__ else None\n\n return DagsterField(config=shape_cls(fields), description=description or docstring)\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.config"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.io_manager

\nfrom abc import abstractmethod\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Mapping,\n    Optional,\n    Type,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeVar\n\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n)\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    ResourceFunction,\n)\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster._utils.cached_method import cached_method\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .config import Config\nfrom .conversion_utils import TResValue\nfrom .resource import (\n    AllowDelayedDependencies,\n    ConfigurableResourceFactory,\n    PartialResource,\n    ResourceId,\n    ResourceWithKeyMapping,\n    Self,\n)\nfrom .type_check_utils import safe_is_subclass\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nTIOManagerValue = TypeVar("TIOManagerValue", bound=IOManager)\n\n\nclass ConfigurableIOManagerFactoryResourceDefinition(IOManagerDefinition, AllowDelayedDependencies):\n    def __init__(\n        self,\n        configurable_resource_cls: Type,\n        resource_fn: ResourceFunction,\n        config_schema: Any,\n        description: Optional[str],\n        resolve_resource_keys: Callable[[Mapping[int, str]], AbstractSet[str]],\n        nested_resources: Mapping[str, Any],\n        input_config_schema: Optional[Union[CoercableToConfigSchema, Type[Config]]] = None,\n        output_config_schema: Optional[Union[CoercableToConfigSchema, Type[Config]]] = None,\n        dagster_maintained: bool = False,\n    ):\n        input_config_schema_resolved: CoercableToConfigSchema = 
(\n            cast(Type[Config], input_config_schema).to_config_schema()\n            if safe_is_subclass(input_config_schema, Config)\n            else cast(CoercableToConfigSchema, input_config_schema)\n        )\n        output_config_schema_resolved: CoercableToConfigSchema = (\n            cast(Type[Config], output_config_schema).to_config_schema()\n            if safe_is_subclass(output_config_schema, Config)\n            else cast(CoercableToConfigSchema, output_config_schema)\n        )\n        super().__init__(\n            resource_fn=resource_fn,\n            config_schema=config_schema,\n            description=description,\n            input_config_schema=input_config_schema_resolved,\n            output_config_schema=output_config_schema_resolved,\n        )\n        self._resolve_resource_keys = resolve_resource_keys\n        self._nested_resources = nested_resources\n        self._configurable_resource_cls = configurable_resource_cls\n        self._dagster_maintained = dagster_maintained\n\n    @property\n    def configurable_resource_cls(self) -> Type:\n        return self._configurable_resource_cls\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        return self._resolve_resource_keys(resource_mapping)\n\n\nclass IOManagerWithKeyMapping(ResourceWithKeyMapping, IOManagerDefinition):\n    """Version of ResourceWithKeyMapping wrapper that also implements IOManagerDefinition."""\n\n    def __init__(\n        self,\n        resource: ResourceDefinition,\n        resource_id_to_key_mapping: Dict[ResourceId, str],\n    ):\n        ResourceWithKeyMapping.__init__(self, resource, resource_id_to_key_mapping)\n        IOManagerDefinition.__init__(\n            self, resource_fn=self.resource_fn, config_schema=resource.config_schema\n        )\n\n\n
[docs]class ConfigurableIOManagerFactory(ConfigurableResourceFactory[TIOManagerValue]):\n """Base class for Dagster IO managers that utilize structured config. This base class\n is useful for cases in which the returned IO manager is not the same as the class itself\n (e.g. when it is a wrapper around the actual IO manager implementation).\n\n This class is a subclass of both :py:class:`IOManagerDefinition` and :py:class:`Config`.\n Implementers should provide an implementation of the :py:meth:`resource_function` method,\n which should return an instance of :py:class:`IOManager`.\n\n\n Example definition:\n\n .. code-block:: python\n\n class ExternalIOManager(IOManager):\n\n def __init__(self, connection):\n self._connection = connection\n\n def handle_output(self, context, obj):\n ...\n\n def load_input(self, context):\n ...\n\n class ConfigurableExternalIOManager(ConfigurableIOManagerFactory):\n username: str\n password: str\n\n def create_io_manager(self, context) -> IOManager:\n with database.connect(username, password) as connection:\n return MyExternalIOManager(connection)\n\n defs = Definitions(\n ...,\n resources={\n "io_manager": ConfigurableExternalIOManager(\n username="dagster",\n password=EnvVar("DB_PASSWORD")\n )\n }\n )\n\n """\n\n def __init__(self, **data: Any):\n ConfigurableResourceFactory.__init__(self, **data)\n\n @abstractmethod\n def create_io_manager(self, context) -> TIOManagerValue:\n """Implement as one would implement a @io_manager decorator function."""\n raise NotImplementedError()\n\n def create_resource(self, context: InitResourceContext) -> TIOManagerValue:\n return self.create_io_manager(context)\n\n @classmethod\n def configure_at_launch(cls: "Type[Self]", **kwargs) -> "PartialIOManager[Self]":\n """Returns a partially initialized copy of the IO manager, with remaining config fields\n set at runtime.\n """\n return PartialIOManager(cls, data=kwargs)\n\n @cached_method\n def get_resource_definition(self) -> 
ConfigurableIOManagerFactoryResourceDefinition:\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.__class__,\n resource_fn=self._get_initialize_and_run_fn(),\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n input_config_schema=self.__class__.input_config_schema(),\n output_config_schema=self.__class__.output_config_schema(),\n dagster_maintained=self._is_dagster_maintained(),\n )\n\n @classmethod\n def input_config_schema(\n cls,\n ) -> Optional[Union[CoercableToConfigSchema, Type[Config]]]:\n return None\n\n @classmethod\n def output_config_schema(\n cls,\n ) -> Optional[Union[CoercableToConfigSchema, Type[Config]]]:\n return None
\n\n\nclass PartialIOManager(Generic[TResValue], PartialResource[TResValue]):\n def __init__(\n self,\n resource_cls: Type[ConfigurableResourceFactory[TResValue]],\n data: Dict[str, Any],\n ):\n PartialResource.__init__(self, resource_cls, data)\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableIOManagerFactoryResourceDefinition:\n input_config_schema = None\n output_config_schema = None\n if safe_is_subclass(self.resource_cls, ConfigurableIOManagerFactory):\n factory_cls: Type[ConfigurableIOManagerFactory] = cast(\n Type[ConfigurableIOManagerFactory], self.resource_cls\n )\n input_config_schema = factory_cls.input_config_schema()\n output_config_schema = factory_cls.output_config_schema()\n\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.resource_cls,\n resource_fn=self._state__internal__.resource_fn,\n config_schema=self._state__internal__.config_schema,\n description=self._state__internal__.description,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self._state__internal__.nested_resources,\n input_config_schema=input_config_schema,\n output_config_schema=output_config_schema,\n dagster_maintained=self.resource_cls._is_dagster_maintained(), # noqa: SLF001\n )\n\n\n
[docs]class ConfigurableIOManager(ConfigurableIOManagerFactory, IOManager):\n """Base class for Dagster IO managers that utilize structured config.\n\n This class is a subclass of both :py:class:`IOManagerDefinition`, :py:class:`Config`,\n and :py:class:`IOManager`. Implementers must provide an implementation of the\n :py:meth:`handle_output` and :py:meth:`load_input` methods.\n\n Example definition:\n\n .. code-block:: python\n\n class MyIOManager(ConfigurableIOManager):\n path_prefix: List[str]\n\n def _get_path(self, context) -> str:\n return "/".join(context.asset_key.path)\n\n def handle_output(self, context, obj):\n write_csv(self._get_path(context), obj)\n\n def load_input(self, context):\n return read_csv(self._get_path(context))\n\n defs = Definitions(\n ...,\n resources={\n "io_manager": MyIOManager(path_prefix=["my", "prefix"])\n }\n )\n\n """\n\n def create_io_manager(self, context) -> IOManager:\n return self
\n\n\nclass ConfigurableLegacyIOManagerAdapter(ConfigurableIOManagerFactory):\n """Adapter base class for wrapping a decorated, function-style I/O manager\n with structured config.\n\n To use this class, subclass it, define config schema fields using Pydantic,\n and implement the ``wrapped_io_manager`` method.\n\n Example:\n .. code-block:: python\n\n class OldIOManager(IOManager):\n def __init__(self, base_path: str):\n ...\n\n @io_manager(config_schema={"base_path": str})\n def old_io_manager(context):\n base_path = context.resource_config["base_path"]\n\n return OldIOManager(base_path)\n\n class MyIOManager(ConfigurableLegacyIOManagerAdapter):\n base_path: str\n\n @property\n def wrapped_io_manager(self) -> IOManagerDefinition:\n return old_io_manager\n """\n\n @property\n @abstractmethod\n def wrapped_io_manager(self) -> IOManagerDefinition:\n raise NotImplementedError()\n\n def create_io_manager(self, context) -> IOManager:\n raise NotImplementedError(\n "Because we override resource_fn in the adapter, this is never called."\n )\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableIOManagerFactoryResourceDefinition:\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.__class__,\n resource_fn=self.wrapped_io_manager.resource_fn,\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self._is_dagster_maintained(),\n )\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.io_manager"}, "resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.resource

\nimport contextlib\nimport inspect\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generator,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, TypeGuard, get_args, get_origin\n\nfrom dagster import (\n    Field as DagsterField,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._config.field_utils import config_dictionary_from_values\nfrom dagster._config.pythonic_config.typing_utils import (\n    TypecheckAllowPartialResourceInitParams,\n)\nfrom dagster._config.validate import validate_config\nfrom dagster._core.definitions.definition_config_schema import (\n    ConfiguredDefinitionConfigSchema,\n    DefinitionConfigSchema,\n)\nfrom dagster._core.errors import DagsterInvalidConfigError\nfrom dagster._core.execution.context.init import InitResourceContext, build_init_resource_context\nfrom dagster._utils.cached_method import cached_method\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .pydantic_compat_layer import (\n    model_fields,\n)\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nfrom abc import ABC, abstractmethod\n\nfrom pydantic import BaseModel\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    ResourceFunction,\n    ResourceFunctionWithContext,\n    ResourceFunctionWithoutContext,\n    has_at_least_one_parameter,\n)\nfrom dagster._core.storage.io_manager import IOManagerDefinition\n\nfrom .config import Config, MakeConfigCacheable, infer_schema_from_config_class\nfrom .conversion_utils import (\n    TResValue,\n    _curry_config_schema,\n)\nfrom .typing_utils 
import BaseResourceMeta, LateBoundTypesForResourceTypeChecking\n\nSelf = TypeVar("Self", bound="ConfigurableResourceFactory")\nResourceId: TypeAlias = int\n\n\nclass AllowDelayedDependencies:\n    _nested_partial_resources: Mapping[str, ResourceDefinition] = {}\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n        # All dependent resources which are not fully configured\n        # must be specified to the Definitions object so that the\n        # resource can be configured at runtime by the user\n        nested_partial_resource_keys = {\n            attr_name: resource_mapping.get(id(resource_def))\n            for attr_name, resource_def in self._nested_partial_resources.items()\n        }\n        check.invariant(\n            all(pointer_key is not None for pointer_key in nested_partial_resource_keys.values()),\n            "Any partially configured, nested resources must be provided to Definitions"\n            f" object: {nested_partial_resource_keys}",\n        )\n\n        # Recursively get all nested resource keys\n        nested_resource_required_keys: Set[str] = set()\n        for v in self._nested_partial_resources.values():\n            nested_resource_required_keys.update(\n                _resolve_required_resource_keys_for_resource(v, resource_mapping)\n            )\n\n        resources, _ = separate_resource_params(\n            cast(Type[BaseModel], self.__class__), self.__dict__\n        )\n        for v in resources.values():\n            nested_resource_required_keys.update(\n                _resolve_required_resource_keys_for_resource(\n                    wrap_resource_for_execution(v), resource_mapping\n                )\n            )\n\n        out = set(cast(Set[str], nested_partial_resource_keys.values())).union(\n            nested_resource_required_keys\n        )\n     
   return out\n\n\nclass InitResourceContextWithKeyMapping(InitResourceContext):\n    """Passes along a mapping from ResourceDefinition id to resource key alongside the\n    InitResourceContext. This is used to resolve the required resource keys for\n    resources which may hold nested partial resources.\n    """\n\n    def __init__(\n        self,\n        context: InitResourceContext,\n        resource_id_to_key_mapping: Mapping[ResourceId, str],\n    ):\n        super().__init__(\n            resource_config=context.resource_config,\n            resources=context.resources,\n            instance=context.instance,\n            resource_def=context.resource_def,\n            dagster_run=context.dagster_run,\n            log_manager=context.log,\n        )\n        self._resource_id_to_key_mapping = resource_id_to_key_mapping\n        self._resources_by_id = {\n            resource_id: getattr(context.resources, resource_key, None)\n            for resource_id, resource_key in resource_id_to_key_mapping.items()\n        }\n\n    @property\n    def resources_by_id(self) -> Mapping[ResourceId, Any]:\n        return self._resources_by_id\n\n    def replace_config(self, config: Any) -> "InitResourceContext":\n        return InitResourceContextWithKeyMapping(\n            super().replace_config(config), self._resource_id_to_key_mapping\n        )\n\n\nclass ResourceWithKeyMapping(ResourceDefinition):\n    """Wrapper around a ResourceDefinition which helps the inner resource resolve its required\n    resource keys. This is useful for resources which may hold nested resources. 
At construction\n    time, they are unaware of the resource keys of their nested resources - the resource id to\n    key mapping is used to resolve this.\n    """\n\n    def __init__(\n        self,\n        resource: ResourceDefinition,\n        resource_id_to_key_mapping: Dict[ResourceId, str],\n    ):\n        self._resource = resource\n        self._resource_id_to_key_mapping = resource_id_to_key_mapping\n\n        ResourceDefinition.__init__(\n            self,\n            resource_fn=self.setup_context_resources_and_call,\n            config_schema=resource.config_schema,\n            description=resource.description,\n            version=resource.version,\n        )\n\n    def setup_context_resources_and_call(self, context: InitResourceContext):\n        """Wrapper around the wrapped resource's resource_fn which attaches its\n        resource id to key mapping to the context, and then calls the nested resource's resource_fn.\n        """\n        context_with_key_mapping = InitResourceContextWithKeyMapping(\n            context, self._resource_id_to_key_mapping\n        )\n\n        if has_at_least_one_parameter(self._resource.resource_fn):\n            return self._resource.resource_fn(context_with_key_mapping)\n        else:\n            return cast(ResourceFunctionWithoutContext, self._resource.resource_fn)()\n\n    @property\n    def required_resource_keys(self) -> AbstractSet[str]:\n        return _resolve_required_resource_keys_for_resource(\n            self._resource, self._resource_id_to_key_mapping\n        )\n\n    @property\n    def wrapped_resource(self) -> ResourceDefinition:\n        return self._resource\n\n    @property\n    def inner_resource(self):\n        return self._resource\n\n\ndef attach_resource_id_to_key_mapping(\n    resource_def: Any, resource_id_to_key_mapping: Dict[ResourceId, str]\n) -> Any:\n    from .io_manager import IOManagerWithKeyMapping\n\n    if isinstance(resource_def, (ConfigurableResourceFactory, 
PartialResource)):\n        defn = resource_def.get_resource_definition()\n        return (\n            IOManagerWithKeyMapping(defn, resource_id_to_key_mapping)\n            if isinstance(defn, IOManagerDefinition)\n            else ResourceWithKeyMapping(defn, resource_id_to_key_mapping)\n        )\n    return resource_def\n\n\nCoercibleToResource: TypeAlias = Union[\n    ResourceDefinition, "ConfigurableResourceFactory", "PartialResource"\n]\n\n\ndef is_coercible_to_resource(val: Any) -> TypeGuard[CoercibleToResource]:\n    return isinstance(val, (ResourceDefinition, ConfigurableResourceFactory, PartialResource))\n\n\nclass ConfigurableResourceFactoryResourceDefinition(ResourceDefinition, AllowDelayedDependencies):\n    def __init__(\n        self,\n        configurable_resource_cls: Type,\n        resource_fn: ResourceFunction,\n        config_schema: Any,\n        description: Optional[str],\n        resolve_resource_keys: Callable[[Mapping[int, str]], AbstractSet[str]],\n        nested_resources: Mapping[str, Any],\n        dagster_maintained: bool = False,\n    ):\n        super().__init__(\n            resource_fn=resource_fn,\n            config_schema=config_schema,\n            description=description,\n        )\n        self._configurable_resource_cls = configurable_resource_cls\n        self._resolve_resource_keys = resolve_resource_keys\n        self._nested_resources = nested_resources\n        self._dagster_maintained = dagster_maintained\n\n    @property\n    def configurable_resource_cls(self) -> Type:\n        return self._configurable_resource_cls\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        return self._resolve_resource_keys(resource_mapping)\n\n    def _is_dagster_maintained(self) -> bool:\n        return 
self._dagster_maintained\n\n\nclass ConfigurableResourceFactoryState(NamedTuple):\n    nested_partial_resources: Mapping[str, Any]\n    resolved_config_dict: Dict[str, Any]\n    config_schema: DefinitionConfigSchema\n    schema: DagsterField\n    nested_resources: Dict[str, Any]\n    resource_context: Optional[InitResourceContext]\n\n\nclass ConfigurableResourceFactory(\n    Generic[TResValue],\n    Config,\n    TypecheckAllowPartialResourceInitParams,\n    AllowDelayedDependencies,\n    ABC,\n    metaclass=BaseResourceMeta,\n):\n    """Base class for creating and managing the lifecycle of Dagster resources that utilize structured config.\n\n    Users should directly inherit from this class when they want the object passed to user-defined\n    code (such as an asset or op) to be different than the object that defines the configuration\n    schema and is passed to the :py:class:`Definitions` object. Cases where this is useful include is\n    when the object passed to user code is:\n\n    * An existing class from a third-party library that the user does not control.\n    * A complex class that requires substantial internal state management or itself requires arguments beyond its config values.\n    * A class with expensive initialization that should not be invoked on code location load, but rather lazily on first use in an op or asset during a run.\n    * A class that you desire to be a plain Python class, rather than a Pydantic class, for whatever reason.\n\n    This class is a subclass of both :py:class:`ResourceDefinition` and :py:class:`Config`, and\n    must implement ``create_resource``, which creates the resource to pass to user code.\n\n    Example definition:\n\n    .. 
code-block:: python\n\n        class DatabaseResource(ConfigurableResourceFactory[Database]):\n            connection_uri: str\n\n            def create_resource(self, _init_context) -> Database:\n                # For example Database could be from a third-party library or require expensive setup.\n                # Or you could just prefer to separate the concerns of configuration and runtime representation\n                return Database(self.connection_uri)\n\n    To use a resource created by a factory in a job, you must use the Resource type annotation.\n\n    Example usage:\n\n\n    .. code-block:: python\n\n        @asset\n        def asset_that_uses_database(database: ResourceParam[Database]):\n            # Database used directly in user code\n            database.query("SELECT * FROM table")\n\n        defs = Definitions(\n            assets=[asset_that_uses_database],\n            resources={"database": DatabaseResource(connection_uri="some_uri")},\n        )\n\n    """\n\n    def __init__(self, **data: Any):\n        resource_pointers, data_without_resources = separate_resource_params(self.__class__, data)\n\n        schema = infer_schema_from_config_class(\n            self.__class__, fields_to_omit=set(resource_pointers.keys())\n        )\n\n        # Populate config values\n        Config.__init__(self, **{**data_without_resources, **resource_pointers})\n\n        # We pull the values from the Pydantic config object, which may cast values\n        # to the correct type under the hood - useful in particular for enums\n        casted_data_without_resources = {\n            k: v\n            for k, v in self._convert_to_config_dictionary().items()\n            if k in data_without_resources\n        }\n        resolved_config_dict = config_dictionary_from_values(casted_data_without_resources, schema)\n\n        self._state__internal__ = ConfigurableResourceFactoryState(\n            # We keep track of any resources we depend on which are not fully 
configured\n            # so that we can retrieve them at runtime\n            nested_partial_resources={\n                k: v for k, v in resource_pointers.items() if (not _is_fully_configured(v))\n            },\n            resolved_config_dict=resolved_config_dict,\n            # These are unfortunately named very similarily\n            config_schema=_curry_config_schema(schema, resolved_config_dict),\n            schema=schema,\n            nested_resources={k: v for k, v in resource_pointers.items()},\n            resource_context=None,\n        )\n\n    @property\n    def _schema(self):\n        return self._state__internal__.schema\n\n    @property\n    def _config_schema(self):\n        return self._state__internal__.config_schema\n\n    @property\n    def _nested_partial_resources(self):\n        return self._state__internal__.nested_partial_resources\n\n    @property\n    def _nested_resources(self):\n        return self._state__internal__.nested_resources\n\n    @property\n    def _resolved_config_dict(self):\n        return self._state__internal__.resolved_config_dict\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        """This should be overridden to return True by all dagster maintained resources and IO managers."""\n        return False\n\n    @classmethod\n    def _is_cm_resource_cls(cls: Type["ConfigurableResourceFactory"]) -> bool:\n        return (\n            cls.yield_for_execution != ConfigurableResourceFactory.yield_for_execution\n            or cls.teardown_after_execution != ConfigurableResourceFactory.teardown_after_execution\n        )\n\n    @property\n    def _is_cm_resource(self) -> bool:\n        return self.__class__._is_cm_resource_cls()  # noqa: SLF001\n\n    def _get_initialize_and_run_fn(self) -> Callable:\n        return self._initialize_and_run_cm if self._is_cm_resource else self._initialize_and_run\n\n    @cached_method\n    def get_resource_definition(self) -> 
ConfigurableResourceFactoryResourceDefinition:\n        return ConfigurableResourceFactoryResourceDefinition(\n            self.__class__,\n            resource_fn=self._get_initialize_and_run_fn(),\n            config_schema=self._config_schema,\n            description=self.__doc__,\n            resolve_resource_keys=self._resolve_required_resource_keys,\n            nested_resources=self.nested_resources,\n            dagster_maintained=self._is_dagster_maintained(),\n        )\n\n    @abstractmethod\n    def create_resource(self, context: InitResourceContext) -> TResValue:\n        """Returns the object that this resource hands to user code, accessible by ops or assets\n        through the context or resource parameters. This works like the function decorated\n        with @resource when using function-based resources.\n        """\n        raise NotImplementedError()\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    @classmethod\n    def configure_at_launch(cls: "Type[Self]", **kwargs) -> "PartialResource[Self]":\n        """Returns a partially initialized copy of the resource, with remaining config fields\n        set at runtime.\n        """\n        return PartialResource(cls, data=kwargs)\n\n    def _with_updated_values(\n        self, values: Optional[Mapping[str, Any]]\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        """Returns a new instance of the resource with the given values.\n        Used when initializing a resource at runtime.\n        """\n        values = check.opt_mapping_param(values, "values", key_type=str)\n        # Since Resource extends BaseModel and is a dataclass, we know that the\n        # signature of any __init__ method will always consist of the fields\n        # of this class. 
We can therefore safely pass in the values as kwargs.\n        out = self.__class__(**{**self._get_non_none_public_field_values(), **values})\n        out._state__internal__ = out._state__internal__._replace(  # noqa: SLF001\n            resource_context=self._state__internal__.resource_context\n        )\n        return out\n\n    @contextlib.contextmanager\n    def _resolve_and_update_nested_resources(\n        self, context: InitResourceContext\n    ) -> Generator["ConfigurableResourceFactory[TResValue]", None, None]:\n        """Updates any nested resources with the resource values from the context.\n        In this case, populating partially configured resources or\n        resources that return plain Python types.\n\n        Returns a new instance of the resource.\n        """\n        from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n        partial_resources_to_update: Dict[str, Any] = {}\n        if self._nested_partial_resources:\n            context_with_mapping = cast(\n                InitResourceContextWithKeyMapping,\n                check.inst(\n                    context,\n                    InitResourceContextWithKeyMapping,\n                    "This ConfiguredResource contains unresolved partially-specified nested"\n                    " resources, and so can only be initialized using a"\n                    " InitResourceContextWithKeyMapping",\n                ),\n            )\n            partial_resources_to_update = {\n                attr_name: context_with_mapping.resources_by_id[id(resource)]\n                for attr_name, resource in self._nested_partial_resources.items()\n            }\n\n        # Also evaluate any resources that are not partial\n        with contextlib.ExitStack() as stack:\n            resources_to_update, _ = separate_resource_params(self.__class__, self.__dict__)\n            resources_to_update = {\n                attr_name: _call_resource_fn_with_default(\n                   
 stack, wrap_resource_for_execution(resource), context\n                )\n                for attr_name, resource in resources_to_update.items()\n                if attr_name not in partial_resources_to_update\n            }\n\n            to_update = {**resources_to_update, **partial_resources_to_update}\n            yield self._with_updated_values(to_update)\n\n    @deprecated(\n        breaking_version="2.0", additional_warn_text="Use `with_replaced_resource_context` instead"\n    )\n    def with_resource_context(\n        self, resource_context: InitResourceContext\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        return self.with_replaced_resource_context(resource_context)\n\n    def with_replaced_resource_context(\n        self, resource_context: InitResourceContext\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        """Returns a new instance of the resource with the given resource init context bound."""\n        # This utility is used to create a copy of this resource, without adjusting\n        # any values in this case\n        copy = self._with_updated_values({})\n        copy._state__internal__ = copy._state__internal__._replace(  # noqa: SLF001\n            resource_context=resource_context\n        )\n        return copy\n\n    def _initialize_and_run(self, context: InitResourceContext) -> TResValue:\n        with self._resolve_and_update_nested_resources(context) as has_nested_resource:\n            updated_resource = has_nested_resource.with_replaced_resource_context(  # noqa: SLF001\n                context\n            )._with_updated_values(context.resource_config)\n\n            updated_resource.setup_for_execution(context)\n            return updated_resource.create_resource(context)\n\n    @contextlib.contextmanager\n    def _initialize_and_run_cm(\n        self, context: InitResourceContext\n    ) -> Generator[TResValue, None, None]:\n        with self._resolve_and_update_nested_resources(context) as has_nested_resource:\n 
           updated_resource = has_nested_resource.with_replaced_resource_context(  # noqa: SLF001\n                context\n            )._with_updated_values(context.resource_config)\n\n            with updated_resource.yield_for_execution(context) as value:\n                yield value\n\n    def setup_for_execution(self, context: InitResourceContext) -> None:\n        """Optionally override this method to perform any pre-execution steps\n        needed before the resource is used in execution.\n        """\n        pass\n\n    def teardown_after_execution(self, context: InitResourceContext) -> None:\n        """Optionally override this method to perform any post-execution steps\n        needed after the resource is used in execution.\n\n        teardown_after_execution will be called even if any part of the run fails.\n        It will not be called if setup_for_execution fails.\n        """\n        pass\n\n    @contextlib.contextmanager\n    def yield_for_execution(self, context: InitResourceContext) -> Generator[TResValue, None, None]:\n        """Optionally override this method to perform any lifecycle steps\n        before or after the resource is used in execution. 
By default, calls\n        setup_for_execution before yielding, and teardown_after_execution after yielding.\n\n        Note that if you override this method and want setup_for_execution or\n        teardown_after_execution to be called, you must invoke them yourself.\n        """\n        self.setup_for_execution(context)\n        try:\n            yield self.create_resource(context)\n        finally:\n            self.teardown_after_execution(context)\n\n    def get_resource_context(self) -> InitResourceContext:\n        """Returns the context that this resource was initialized with."""\n        return check.not_none(\n            self._state__internal__.resource_context,\n            additional_message="Attempted to get context before resource was initialized.",\n        )\n\n    def process_config_and_initialize(self) -> TResValue:\n        """Initializes this resource, fully processing its config and returning the prepared\n        resource value.\n        """\n        from dagster._config.post_process import post_process_config\n\n        return self.from_resource_context(\n            build_init_resource_context(\n                config=post_process_config(\n                    self._config_schema.config_type, self._convert_to_config_dictionary()\n                ).value\n            )\n        )\n\n    @classmethod\n    def from_resource_context(cls, context: InitResourceContext) -> TResValue:\n        """Creates a new instance of this resource from a populated InitResourceContext.\n        Useful when creating a resource from a function-based resource, for backwards\n        compatibility purposes.\n\n        For resources that have custom teardown behavior, use from_resource_context_cm instead.\n\n        Example usage:\n\n        .. 
code-block:: python\n\n            class MyResource(ConfigurableResource):\n                my_str: str\n\n            @resource(config_schema=MyResource.to_config_schema())\n            def my_resource(context: InitResourceContext) -> MyResource:\n                return MyResource.from_resource_context(context)\n\n        """\n        check.invariant(\n            not cls._is_cm_resource_cls(),\n            "Use from_resource_context_cm for resources which have custom teardown behavior,"\n            " e.g. overriding yield_for_execution or teardown_after_execution",\n        )\n        return cls(**context.resource_config or {})._initialize_and_run(context)  # noqa: SLF001\n\n    @classmethod\n    @contextlib.contextmanager\n    def from_resource_context_cm(\n        cls, context: InitResourceContext\n    ) -> Generator[TResValue, None, None]:\n        """Context which generates a new instance of this resource from a populated InitResourceContext.\n        Useful when creating a resource from a function-based resource, for backwards\n        compatibility purposes. Handles custom teardown behavior.\n\n        Example usage:\n\n        .. code-block:: python\n\n            class MyResource(ConfigurableResource):\n                my_str: str\n\n            @resource(config_schema=MyResource.to_config_schema())\n            def my_resource(context: InitResourceContext) -> Generator[MyResource, None, None]:\n                with MyResource.from_resource_context_cm(context) as my_resource:\n                    yield my_resource\n\n        """\n        with cls(**context.resource_config or {})._initialize_and_run_cm(  # noqa: SLF001\n            context\n        ) as value:\n            yield value\n\n\n
[docs]class ConfigurableResource(ConfigurableResourceFactory[TResValue]):\n """Base class for Dagster resources that utilize structured config.\n\n This class is a subclass of both :py:class:`ResourceDefinition` and :py:class:`Config`.\n\n Example definition:\n\n .. code-block:: python\n\n class WriterResource(ConfigurableResource):\n prefix: str\n\n def output(self, text: str) -> None:\n print(f"{self.prefix}{text}")\n\n Example usage:\n\n .. code-block:: python\n\n @asset\n def asset_that_uses_writer(writer: WriterResource):\n writer.output("text")\n\n defs = Definitions(\n assets=[asset_that_uses_writer],\n resources={"writer": WriterResource(prefix="a_prefix")},\n )\n\n """\n\n def create_resource(self, context: InitResourceContext) -> TResValue:\n """Returns the object that this resource hands to user code, accessible by ops or assets\n through the context or resource parameters. This works like the function decorated\n with @resource when using function-based resources.\n\n For ConfigurableResource, this function will return itself, passing\n the actual ConfigurableResource object to user code.\n """\n return cast(TResValue, self)
\n\n\ndef _is_fully_configured(resource: CoercibleToResource) -> bool:\n from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n actual_resource = wrap_resource_for_execution(resource)\n res = (\n validate_config(\n actual_resource.config_schema.config_type,\n (\n actual_resource.config_schema.default_value\n if actual_resource.config_schema.default_provided\n else {}\n ),\n ).success\n is True\n )\n\n return res\n\n\nclass PartialResourceState(NamedTuple):\n nested_partial_resources: Dict[str, Any]\n config_schema: DagsterField\n resource_fn: Callable[[InitResourceContext], Any]\n description: Optional[str]\n nested_resources: Dict[str, Any]\n\n\nclass PartialResource(Generic[TResValue], AllowDelayedDependencies, MakeConfigCacheable):\n data: Dict[str, Any]\n resource_cls: Type[ConfigurableResourceFactory[TResValue]]\n\n def __init__(\n self,\n resource_cls: Type[ConfigurableResourceFactory[TResValue]],\n data: Dict[str, Any],\n ):\n resource_pointers, _data_without_resources = separate_resource_params(resource_cls, data)\n\n MakeConfigCacheable.__init__(self, data=data, resource_cls=resource_cls) # type: ignore # extends BaseModel, takes kwargs\n\n def resource_fn(context: InitResourceContext):\n instantiated = resource_cls(\n **{**data, **context.resource_config}\n ) # So that collisions are resolved in favor of the latest provided run config\n return instantiated._get_initialize_and_run_fn()(context) # noqa: SLF001\n\n self._state__internal__ = PartialResourceState(\n # We keep track of any resources we depend on which are not fully configured\n # so that we can retrieve them at runtime\n nested_partial_resources={\n k: v for k, v in resource_pointers.items() if (not _is_fully_configured(v))\n },\n config_schema=infer_schema_from_config_class(\n resource_cls, fields_to_omit=set(resource_pointers.keys())\n ),\n resource_fn=resource_fn,\n description=resource_cls.__doc__,\n nested_resources={k: v for k, v in 
resource_pointers.items()},\n )\n\n # to make AllowDelayedDependencies work\n @property\n def _nested_partial_resources(\n self,\n ) -> Mapping[str, Any]:\n return self._state__internal__.nested_partial_resources\n\n @property\n def nested_resources(\n self,\n ) -> Mapping[str, Any]:\n return self._state__internal__.nested_resources\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableResourceFactoryResourceDefinition:\n return ConfigurableResourceFactoryResourceDefinition(\n self.resource_cls,\n resource_fn=self._state__internal__.resource_fn,\n config_schema=self._state__internal__.config_schema,\n description=self._state__internal__.description,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self.resource_cls._is_dagster_maintained(), # noqa: SLF001\n )\n\n\nResourceOrPartial: TypeAlias = Union[\n ConfigurableResourceFactory[TResValue], PartialResource[TResValue]\n]\nResourceOrPartialOrValue: TypeAlias = Union[\n ConfigurableResourceFactory[TResValue],\n PartialResource[TResValue],\n ResourceDefinition,\n TResValue,\n]\n\n\nV = TypeVar("V")\n\n\nclass ResourceDependency(Generic[V]):\n def __set_name__(self, _owner, name):\n self._name = name\n\n def __get__(self, obj: "ConfigurableResourceFactory", __owner: Any) -> V:\n return getattr(obj, self._name)\n\n def __set__(self, obj: Optional[object], value: ResourceOrPartialOrValue[V]) -> None:\n setattr(obj, self._name, value)\n\n\nclass ConfigurableLegacyResourceAdapter(ConfigurableResource, ABC):\n """Adapter base class for wrapping a decorated, function-style resource\n with structured config.\n\n To use this class, subclass it, define config schema fields using Pydantic,\n and implement the ``wrapped_resource`` method.\n\n Example:\n .. 
code-block:: python\n\n @resource(config_schema={"prefix": str})\n def writer_resource(context):\n prefix = context.resource_config["prefix"]\n\n def output(text: str) -> None:\n out_txt.append(f"{prefix}{text}")\n\n return output\n\n class WriterResource(ConfigurableLegacyResourceAdapter):\n prefix: str\n\n @property\n def wrapped_resource(self) -> ResourceDefinition:\n return writer_resource\n """\n\n @property\n @abstractmethod\n def wrapped_resource(self) -> ResourceDefinition:\n raise NotImplementedError()\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableResourceFactoryResourceDefinition:\n return ConfigurableResourceFactoryResourceDefinition(\n self.__class__,\n resource_fn=self.wrapped_resource.resource_fn,\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self._is_dagster_maintained(),\n )\n\n def __call__(self, *args, **kwargs):\n return self.wrapped_resource(*args, **kwargs)\n\n\nclass SeparatedResourceParams(NamedTuple):\n resources: Dict[str, Any]\n non_resources: Dict[str, Any]\n\n\ndef _is_annotated_as_resource_type(annotation: Type, metadata: List[str]) -> bool:\n """Determines if a field in a structured config class is annotated as a resource type or not."""\n from .type_check_utils import safe_is_subclass\n\n if metadata and metadata[0] == "resource_dependency":\n return True\n\n is_annotated_as_resource_dependency = get_origin(annotation) == ResourceDependency or getattr(\n annotation, "__metadata__", None\n ) == ("resource_dependency",)\n\n return is_annotated_as_resource_dependency or safe_is_subclass(\n annotation, (ResourceDefinition, ConfigurableResourceFactory)\n )\n\n\nclass ResourceDataWithAnnotation(NamedTuple):\n key: str\n value: Any\n annotation: Type\n annotation_metadata: List[str]\n\n\ndef separate_resource_params(cls: Type[BaseModel], data: Dict[str, Any]) -> 
SeparatedResourceParams:\n """Separates out the key/value inputs of fields in a structured config Resource class which\n are marked as resources (ie, using ResourceDependency) from those which are not.\n """\n fields_by_resolved_field_name = {\n field.alias if field.alias else key: field for key, field in model_fields(cls).items()\n }\n data_with_annotation: List[ResourceDataWithAnnotation] = [\n # No longer exists in Pydantic 2.x, will need to be updated when we upgrade\n ResourceDataWithAnnotation(\n key=field_name,\n value=field_value,\n annotation=fields_by_resolved_field_name[field_name].annotation,\n annotation_metadata=fields_by_resolved_field_name[field_name].metadata,\n )\n for field_name, field_value in data.items()\n if field_name in fields_by_resolved_field_name\n ]\n # We need to grab metadata from the annotation in order to tell if\n # this key was annotated with a typing.Annotated annotation (which we use for resource/resource deps),\n # since Pydantic 2.0 strips that info out and sticks any Annotated metadata in the\n # metadata field\n out = SeparatedResourceParams(\n resources={\n d.key: d.value\n for d in data_with_annotation\n if _is_annotated_as_resource_type(\n d.annotation,\n d.annotation_metadata,\n )\n },\n non_resources={\n d.key: d.value\n for d in data_with_annotation\n if not _is_annotated_as_resource_type(\n d.annotation,\n d.annotation_metadata,\n )\n },\n )\n return out\n\n\ndef _call_resource_fn_with_default(\n stack: contextlib.ExitStack, obj: ResourceDefinition, context: InitResourceContext\n) -> Any:\n from dagster._config.validate import process_config\n\n if isinstance(obj.config_schema, ConfiguredDefinitionConfigSchema):\n value = cast(Dict[str, Any], obj.config_schema.resolve_config({}).value)\n context = context.replace_config(value["config"])\n elif obj.config_schema.default_provided:\n # To explain why we need to process config here;\n # - The resource available on the init context (context.resource_config) has already 
been processed\n # - The nested resource's config has also already been processed, but is only available in the broader run config dictionary.\n # - The only information we have access to here is the unprocessed default value, so we need to process it a second time.\n unprocessed_config = obj.config_schema.default_value\n evr = process_config(\n {"config": obj.config_schema.config_type}, {"config": unprocessed_config}\n )\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Error in config for nested resource ",\n evr.errors,\n unprocessed_config,\n )\n context = context.replace_config(cast(dict, evr.value)["config"])\n\n if has_at_least_one_parameter(obj.resource_fn):\n result = cast(ResourceFunctionWithContext, obj.resource_fn)(context)\n else:\n result = cast(ResourceFunctionWithoutContext, obj.resource_fn)()\n\n is_fn_generator = inspect.isgenerator(obj.resource_fn) or isinstance(\n obj.resource_fn, contextlib.ContextDecorator\n )\n if is_fn_generator:\n return stack.enter_context(cast(contextlib.AbstractContextManager, result))\n else:\n return result\n\n\nLateBoundTypesForResourceTypeChecking.set_actual_types_for_type_checking(\n resource_dep_type=ResourceDependency,\n resource_type=ConfigurableResourceFactory,\n partial_resource_type=PartialResource,\n)\n\n\ndef validate_resource_annotated_function(fn) -> None:\n """Validates any parameters on the decorated function that are annotated with\n :py:class:`dagster.ResourceDefinition`, raising a :py:class:`dagster.DagsterInvalidDefinitionError`\n if any are not also instances of :py:class:`dagster.ConfigurableResource` (these resources should\n instead be wrapped in the :py:func:`dagster.Resource` Annotation).\n """\n from dagster import DagsterInvalidDefinitionError\n from dagster._config.pythonic_config.resource import (\n ConfigurableResource,\n ConfigurableResourceFactory,\n TResValue,\n )\n\n from .type_check_utils import safe_is_subclass\n\n malformed_params = [\n param\n for param in 
get_function_params(fn)\n if safe_is_subclass(param.annotation, (ResourceDefinition, ConfigurableResourceFactory))\n and not safe_is_subclass(param.annotation, ConfigurableResource)\n ]\n if len(malformed_params) > 0:\n malformed_param = malformed_params[0]\n output_type = None\n if safe_is_subclass(malformed_param.annotation, ConfigurableResourceFactory):\n orig_bases = getattr(malformed_param.annotation, "__orig_bases__", None)\n output_type = get_args(orig_bases[0])[0] if orig_bases and len(orig_bases) > 0 else None\n if output_type == TResValue:\n output_type = None\n\n output_type_name = getattr(output_type, "__name__", str(output_type))\n raise DagsterInvalidDefinitionError(\n """Resource param '{param_name}' is annotated as '{annotation_type}', but '{annotation_type}' outputs {value_message} value to user code such as @ops and @assets. This annotation should instead be {annotation_suggestion}""".format(\n param_name=malformed_param.name,\n annotation_type=malformed_param.annotation,\n value_message=f"a '{output_type}'" if output_type else "an unknown",\n annotation_suggestion=(\n f"'ResourceParam[{output_type_name}]'"\n if output_type\n else "'ResourceParam[Any]' or 'ResourceParam[<output type>]'"\n ),\n )\n )\n\n\ndef _resolve_required_resource_keys_for_resource(\n resource: ResourceDefinition, resource_id_to_key_mapping: Mapping[ResourceId, str]\n) -> AbstractSet[str]:\n """Gets the required resource keys for the provided resource, with the assistance of the passed\n resource-id-to-key mapping. For resources which may hold nested partial resources,\n this mapping is used to obtain the top-level resource keys to depend on.\n """\n if isinstance(resource, AllowDelayedDependencies):\n return resource._resolve_required_resource_keys(resource_id_to_key_mapping) # noqa: SLF001\n return resource.required_resource_keys\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.resource"}}, "source": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.source

\nimport os\n\nimport dagster._check as check\n\nfrom .config_type import ScalarUnion\nfrom .errors import PostProcessingError\nfrom .field_utils import Selector\n\nVALID_STRING_SOURCE_TYPES = (str, dict)\n\n\ndef _ensure_env_variable(var):\n    check.str_param(var, "var")\n    value = os.getenv(var)\n    if value is None:\n        raise PostProcessingError(\n            f'You have attempted to fetch the environment variable "{var}" '\n            "which is not set. In order for this execution to succeed it "\n            "must be set in this environment."\n        )\n    return value\n\n\nclass StringSourceType(ScalarUnion):\n    def __init__(self):\n        super(StringSourceType, self).__init__(\n            scalar_type=str,\n            non_scalar_schema=Selector({"env": str}),\n            _key="StringSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, VALID_STRING_SOURCE_TYPES), "value")\n\n        if not isinstance(value, dict):\n            return value\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        return str(_ensure_env_variable(cfg))\n\n\nclass IntSourceType(ScalarUnion):\n    def __init__(self):\n        super(IntSourceType, self).__init__(\n            scalar_type=int,\n            non_scalar_schema=Selector({"env": str}),\n            _key="IntSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, (dict, int)), "value", "Should be pre-validated")\n\n        if not isinstance(value, dict):\n            return value\n\n        check.invariant(len(value) == 1, "Selector should have one entry")\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        value = _ensure_env_variable(cfg)\n        try:\n            return int(value)\n        except ValueError as e:\n            raise PostProcessingError(\n       
         f'Value "{value}" stored in env variable "{cfg}" cannot be coerced into an int.'\n            ) from e\n\n\nclass BoolSourceType(ScalarUnion):\n    def __init__(self):\n        super(BoolSourceType, self).__init__(\n            scalar_type=bool,\n            non_scalar_schema=Selector({"env": str}),\n            _key="BoolSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, (dict, bool)), "value", "Should be pre-validated")\n\n        if not isinstance(value, dict):\n            return value\n\n        check.invariant(len(value) == 1, "Selector should have one entry")\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        value = _ensure_env_variable(cfg)\n        try:\n            return bool(value)\n        except ValueError as e:\n            raise PostProcessingError(\n                (\n                    'Value "{value}" stored in env variable "{var}" cannot be coerced into an bool.'\n                ).format(value=value, var=cfg)\n            ) from e\n\n\nStringSource: StringSourceType = StringSourceType()\nIntSource: IntSourceType = IntSourceType()\nBoolSource: BoolSourceType = BoolSourceType()\n
", "current_page_name": "_modules/dagster/_config/source", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.source"}}, "_core": {"definitions": {"asset_check_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_check_result

\nfrom typing import TYPE_CHECKING, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationTargetMaterializationData,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSeverity\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.compute import StepExecutionContext\n\n\n
[docs]@experimental\nclass AssetCheckResult(\n NamedTuple(\n "_AssetCheckResult",\n [\n ("passed", PublicAttr[bool]),\n ("asset_key", PublicAttr[Optional[AssetKey]]),\n ("check_name", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("severity", PublicAttr[AssetCheckSeverity]),\n ],\n )\n):\n """The result of an asset check.\n\n Attributes:\n asset_key (Optional[AssetKey]):\n The asset key that was checked.\n check_name (Optional[str]):\n The name of the check.\n passed (bool):\n The pass/fail result of the check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n severity (AssetCheckSeverity):\n Severity of the check. Defaults to ERROR.\n\n """\n\n def __new__(\n cls,\n *,\n passed: bool,\n asset_key: Optional[CoercibleToAssetKey] = None,\n check_name: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n severity: AssetCheckSeverity = AssetCheckSeverity.ERROR,\n ):\n normalized_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n return super().__new__(\n cls,\n asset_key=AssetKey.from_coercible(asset_key) if asset_key is not None else None,\n check_name=check.opt_str_param(check_name, "check_name"),\n passed=check.bool_param(passed, "passed"),\n metadata=normalized_metadata,\n severity=check.inst_param(severity, "severity", AssetCheckSeverity),\n )\n\n def to_asset_check_evaluation(\n self, step_context: "StepExecutionContext"\n ) -> AssetCheckEvaluation:\n spec_check_names_by_asset_key = (\n step_context.job_def.asset_layer.get_check_names_by_asset_key_for_node_handle(\n step_context.node_handle.root\n )\n )\n\n asset_keys_with_specs = spec_check_names_by_asset_key.keys()\n\n if self.asset_key is 
not None:\n if self.asset_key not in asset_keys_with_specs:\n raise DagsterInvariantViolationError(\n "Received unexpected AssetCheckResult. It targets asset"\n f" '{self.asset_key.to_user_string()}' which is not targeted by any of the"\n " checks currently being evaluated. Targeted assets:"\n f" {[asset_key.to_user_string() for asset_key in asset_keys_with_specs]}."\n )\n\n resolved_asset_key = self.asset_key\n\n else:\n if len(spec_check_names_by_asset_key) > 1:\n raise DagsterInvariantViolationError(\n "AssetCheckResult didn't specify an asset key, but there are multiple assets"\n " to choose from:"\n f" {[asset_key.to_user_string() for asset_key in spec_check_names_by_asset_key.keys()]}"\n )\n\n resolved_asset_key = next(iter(asset_keys_with_specs))\n\n check_names_with_specs = spec_check_names_by_asset_key[resolved_asset_key]\n if self.check_name is not None:\n if self.check_name not in check_names_with_specs:\n raise DagsterInvariantViolationError(\n "Received unexpected AssetCheckResult. No checks currently being evaluated"\n f" target asset '{resolved_asset_key.to_user_string()}' and have name"\n f" '{self.check_name}'. 
Checks being evaluated for this asset:"\n f" {check_names_with_specs}"\n )\n\n resolved_check_name = self.check_name\n else:\n if len(check_names_with_specs) > 1:\n raise DagsterInvariantViolationError(\n "AssetCheckResult result didn't specify a check name, but there are multiple"\n " checks to choose from for the this asset key:"\n f" {check_names_with_specs}"\n )\n\n resolved_check_name = next(iter(check_names_with_specs))\n\n input_asset_info = step_context.get_input_asset_version_info(resolved_asset_key)\n if input_asset_info is not None:\n target_materialization_data = AssetCheckEvaluationTargetMaterializationData(\n run_id=input_asset_info.run_id,\n storage_id=input_asset_info.storage_id,\n timestamp=input_asset_info.timestamp,\n )\n else:\n target_materialization_data = None\n\n return AssetCheckEvaluation(\n check_name=resolved_check_name,\n asset_key=resolved_asset_key,\n passed=self.passed,\n metadata=self.metadata,\n target_materialization_data=target_materialization_data,\n severity=self.severity,\n )\n\n def get_spec_python_identifier(\n self, *, asset_key: Optional[AssetKey] = None, check_name: Optional[str] = None\n ) -> str:\n """Returns a string uniquely identifying the asset check spec associated with this result.\n This is used for the output name associated with an `AssetCheckResult`.\n """\n asset_key = asset_key or self.asset_key\n check_name = check_name or self.check_name\n assert asset_key is not None, "Asset key must be provided if not set on spec"\n assert asset_key is not None, "Asset key must be provided if not set on spec"\n return f"{asset_key.to_python_identifier()}_{self.check_name}"
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_check_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_check_result"}, "asset_check_spec": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_check_spec

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._serdes.serdes import whitelist_for_serdes\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.source_asset import SourceAsset\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass AssetCheckSeverity(Enum):\n """Severity level for an asset check.\n\n Severities:\n\n - WARN: If the check fails, don't fail the step.\n - ERROR: If the check fails, fail the step and, within the run, skip materialization of any\n assets that are downstream of the asset being checked.\n """\n\n WARN = "WARN"\n ERROR = "ERROR"
\n\n\n
[docs]@experimental\n@whitelist_for_serdes(old_storage_names={"AssetCheckHandle"})\nclass AssetCheckKey(NamedTuple):\n """Check names are expected to be unique per-asset. Thus, this combination of asset key and\n check name uniquely identifies an asset check within a deployment.\n """\n\n asset_key: PublicAttr[AssetKey]\n name: PublicAttr[str]\n\n @staticmethod\n def from_graphql_input(graphql_input: Mapping[str, Any]) -> "AssetCheckKey":\n return AssetCheckKey(\n asset_key=AssetKey.from_graphql_input(graphql_input["assetKey"]),\n name=graphql_input["name"],\n )
\n\n\n
[docs]@experimental\nclass AssetCheckSpec(\n NamedTuple(\n "_AssetCheckSpec",\n [\n ("name", PublicAttr[str]),\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines information about an asset check, except how to execute it.\n\n AssetCheckSpec is often used as an argument to decorators that decorator a function that can\n execute multiple checks - e.g. `@asset`, and `@multi_asset`. It defines one of the checks that\n will be executed inside that function.\n\n Args:\n name (str): Name of the check.\n asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]): The asset that\n the check applies to.\n description (Optional[str]): Description for the check.\n """\n\n def __new__(\n cls,\n name: str,\n *,\n asset: Union[CoercibleToAssetKey, "AssetsDefinition", "SourceAsset"],\n description: Optional[str] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n asset_key=AssetKey.from_coercible_or_definition(asset),\n description=check.opt_str_param(description, "description"),\n )\n\n def get_python_identifier(self) -> str:\n """Returns a string uniquely identifying the asset check, that uses only the characters\n allowed in a Python identifier.\n """\n return f"{self.asset_key.to_python_identifier()}_{self.name}"\n\n @property\n def key(self) -> AssetCheckKey:\n return AssetCheckKey(self.asset_key, self.name)
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_check_spec", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_check_spec"}, "asset_dep": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_dep

\nfrom typing import NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_spec import AssetSpec\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.partition_mapping import PartitionMapping\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\n\nCoercibleToAssetDep = Union[\n    CoercibleToAssetKey, AssetSpec, AssetsDefinition, SourceAsset, "AssetDep"\n]\n\n\n
[docs]@experimental\nclass AssetDep(\n NamedTuple(\n "_AssetDep",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("partition_mapping", PublicAttr[Optional[PartitionMapping]]),\n ],\n )\n):\n """Specifies a dependency on an upstream asset.\n\n Attributes:\n asset (Union[AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset]): The upstream asset to depend on.\n partition_mapping (Optional[PartitionMapping]): Defines what partitions to depend on in\n the upstream asset. If not provided and the upstream asset is partitioned, defaults to\n the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n\n Examples:\n .. code-block:: python\n\n upstream_asset = AssetSpec("upstream_asset")\n downstream_asset = AssetSpec(\n "downstream_asset",\n deps=[\n AssetDep(\n upstream_asset,\n partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1)\n )\n ]\n )\n """\n\n def __new__(\n cls,\n asset: Union[CoercibleToAssetKey, AssetSpec, AssetsDefinition, SourceAsset],\n *,\n partition_mapping: Optional[PartitionMapping] = None,\n ):\n if isinstance(asset, list):\n check.list_param(asset, "asset", of_type=str)\n else:\n check.inst_param(\n asset, "asset", (AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset)\n )\n if isinstance(asset, AssetsDefinition) and len(asset.keys) > 1:\n # Only AssetsDefinition with a single asset can be passed\n raise DagsterInvalidDefinitionError(\n "Cannot create an AssetDep from a multi_asset AssetsDefinition."\n " Instead, specify dependencies on the assets created by the multi_asset"\n f" via AssetKeys or strings. 
For the multi_asset {asset.node_def.name}, the"\n f" available keys are: {asset.keys}."\n )\n\n asset_key = _get_asset_key(asset)\n\n return super().__new__(\n cls,\n asset_key=asset_key,\n partition_mapping=check.opt_inst_param(\n partition_mapping,\n "partition_mapping",\n PartitionMapping,\n ),\n )\n\n @staticmethod\n def from_coercible(arg: "CoercibleToAssetDep") -> "AssetDep":\n # if arg is AssetDep, return the original object to retain partition_mapping\n return arg if isinstance(arg, AssetDep) else AssetDep(asset=arg)
\n\n\ndef _get_asset_key(arg: "CoercibleToAssetDep") -> AssetKey:\n if isinstance(arg, (AssetsDefinition, SourceAsset, AssetSpec)):\n return arg.key\n elif isinstance(arg, AssetDep):\n return arg.asset_key\n else:\n return AssetKey.from_coercible(arg)\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_dep", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_dep"}, "asset_in": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_in

\nfrom typing import Mapping, NamedTuple, Optional, Sequence, Type, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n)\nfrom dagster._core.definitions.input import NoValueSentinel\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping\nfrom dagster._core.types.dagster_type import DagsterType, resolve_dagster_type\n\nfrom .partition_mapping import PartitionMapping\n\n\n
[docs]class AssetIn(\n NamedTuple(\n "_AssetIn",\n [\n ("key", PublicAttr[Optional[AssetKey]]),\n ("metadata", PublicAttr[Optional[ArbitraryMetadataMapping]]),\n ("key_prefix", PublicAttr[Optional[Sequence[str]]]),\n ("input_manager_key", PublicAttr[Optional[str]]),\n ("partition_mapping", PublicAttr[Optional[PartitionMapping]]),\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ],\n )\n):\n """Defines an asset dependency.\n\n Attributes:\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the input name. Only one of the "key_prefix" and\n "key" arguments should be provided.\n key (Optional[Union[str, Sequence[str], AssetKey]]): The asset's key. Only one of the\n "key_prefix" and "key" arguments should be provided.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the input.\n For example, if you only need a subset of columns from an upstream table, you could\n include that in metadata and the IO manager that loads the upstream table could use the\n metadata to determine which columns to load.\n partition_mapping (Optional[PartitionMapping]): Defines what partitions to depend on in\n the upstream asset. 
If not provided, defaults to the default partition mapping for the\n partitions definition, which is typically maps partition keys to the same partition keys\n in upstream assets.\n dagster_type (DagsterType): Allows specifying type validation functions that\n will be executed on the input of the decorated function before it runs.\n """\n\n def __new__(\n cls,\n key: Optional[CoercibleToAssetKey] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n input_manager_key: Optional[str] = None,\n partition_mapping: Optional[PartitionMapping] = None,\n dagster_type: Union[DagsterType, Type[NoValueSentinel]] = NoValueSentinel,\n ):\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n check.invariant(\n not (key and key_prefix), "key and key_prefix cannot both be set on AssetIn"\n )\n\n return super(AssetIn, cls).__new__(\n cls,\n key=AssetKey.from_coercible(key) if key is not None else None,\n metadata=check.opt_inst_param(metadata, "metadata", Mapping),\n key_prefix=check.opt_list_param(key_prefix, "key_prefix", of_type=str),\n input_manager_key=check.opt_str_param(input_manager_key, "input_manager_key"),\n partition_mapping=check.opt_inst_param(\n partition_mapping, "partition_mapping", PartitionMapping\n ),\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_in", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_in"}, "asset_out": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_out

\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence, Type, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n)\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.input import NoValueSentinel\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.definitions.output import Out\nfrom dagster._core.definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom dagster._core.types.dagster_type import DagsterType, resolve_dagster_type\n\n\n
[docs]class AssetOut(\n NamedTuple(\n "_AssetOut",\n [\n ("key", PublicAttr[Optional[AssetKey]]),\n ("key_prefix", PublicAttr[Optional[Sequence[str]]]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n ("io_manager_key", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("is_required", PublicAttr[bool]),\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("group_name", PublicAttr[Optional[str]]),\n ("code_version", PublicAttr[Optional[str]]),\n ("freshness_policy", PublicAttr[Optional[FreshnessPolicy]]),\n ("auto_materialize_policy", PublicAttr[Optional[AutoMaterializePolicy]]),\n ("backfill_policy", PublicAttr[Optional[BackfillPolicy]]),\n ],\n )\n):\n """Defines one of the assets produced by a :py:func:`@multi_asset <multi_asset>`.\n\n Attributes:\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name. When using ``@multi_asset``, the\n asset name defaults to the key of the "outs" dictionary Only one of the "key_prefix" and\n "key" arguments should be provided.\n key (Optional[Union[str, Sequence[str], AssetKey]]): The asset's key. Only one of the\n "key_prefix" and "key" arguments should be provided.\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. 
(default: True)\n io_manager_key (Optional[str]): The resource key of the IO manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If\n not provided, the name "default" is used.\n code_version (Optional[str]): The version of the code that generates this asset.\n freshness_policy (Optional[FreshnessPolicy]): A policy which indicates how up to date this\n asset is intended to be.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply to\n the specified asset.\n backfill_policy (Optional[BackfillPolicy]): BackfillPolicy to apply to the specified asset.\n """\n\n def __new__(\n cls,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n key: Optional[CoercibleToAssetKey] = None,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n group_name: Optional[str] = None,\n code_version: Optional[str] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n ):\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n return super(AssetOut, cls).__new__(\n cls,\n key=AssetKey.from_coercible(key) if key is not None else None,\n key_prefix=check.opt_list_param(key_prefix, "key_prefix", of_type=str),\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=check.opt_str_param(description, "description"),\n 
is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default=DEFAULT_IO_MANAGER_KEY\n ),\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n group_name=check.opt_str_param(group_name, "group_name"),\n code_version=check.opt_str_param(code_version, "code_version"),\n freshness_policy=check.opt_inst_param(\n freshness_policy, "freshness_policy", FreshnessPolicy\n ),\n auto_materialize_policy=check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n ),\n backfill_policy=check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n ),\n )\n\n def to_out(self) -> Out:\n return Out(\n dagster_type=self.dagster_type,\n description=self.description,\n metadata=self.metadata,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n code_version=self.code_version,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_out", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_out"}, "asset_selection": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_selection

\nimport collections.abc\nimport operator\nfrom abc import ABC, abstractmethod\nfrom functools import reduce\nfrom typing import AbstractSet, Iterable, Optional, Sequence, Union, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, public\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.errors import DagsterInvalidSubsetError\nfrom dagster._core.selector.subset_selector import (\n    fetch_connected,\n    fetch_sinks,\n    fetch_sources,\n    parse_clause,\n)\n\nfrom .asset_check_spec import AssetCheckKey\nfrom .asset_graph import AssetGraph, InternalAssetGraph\nfrom .assets import AssetsDefinition\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n    key_prefix_from_coercible,\n)\nfrom .source_asset import SourceAsset\n\nCoercibleToAssetSelection: TypeAlias = Union[\n    str,\n    Sequence[str],\n    Sequence[AssetKey],\n    Sequence[Union["AssetsDefinition", "SourceAsset"]],\n    "AssetSelection",\n]\n\n\n
[docs]class AssetSelection(ABC):\n """An AssetSelection defines a query over a set of assets and asset checks, normally all that are defined in a code location.\n\n You can use the "|", "&", and "-" operators to create unions, intersections, and differences of selections, respectively.\n\n AssetSelections are typically used with :py:func:`define_asset_job`.\n\n By default, selecting assets will also select all of the asset checks that target those assets.\n\n Examples:\n .. code-block:: python\n\n # Select all assets in group "marketing":\n AssetSelection.groups("marketing")\n\n # Select all assets in group "marketing", as well as the asset with key "promotion":\n AssetSelection.groups("marketing") | AssetSelection.keys("promotion")\n\n # Select all assets in group "marketing" that are downstream of asset "leads":\n AssetSelection.groups("marketing") & AssetSelection.keys("leads").downstream()\n\n # Select a list of assets:\n AssetSelection.assets(*my_assets_list)\n\n # Select all assets except for those in group "marketing"\n AssetSelection.all() - AssetSelection.groups("marketing")\n\n # Select all assets which are materialized by the same op as "projections":\n AssetSelection.keys("projections").required_multi_asset_neighbors()\n\n # Select all assets in group "marketing" and exclude their asset checks:\n AssetSelection.groups("marketing") - AssetSelection.all_asset_checks()\n\n # Select all asset checks that target a list of assets:\n AssetSelection.checks_for_assets(*my_assets_list)\n\n # Select a specific asset check:\n AssetSelection.checks(my_asset_check)\n\n """\n\n
[docs] @public\n @staticmethod\n def all() -> "AllSelection":\n """Returns a selection that includes all assets and asset checks."""\n return AllSelection()
\n\n
[docs] @public\n @staticmethod\n def all_asset_checks() -> "AllAssetCheckSelection":\n """Returns a selection that includes all asset checks."""\n return AllAssetCheckSelection()
\n\n
[docs] @public\n @staticmethod\n def assets(*assets_defs: AssetsDefinition) -> "KeysAssetSelection":\n """Returns a selection that includes all of the provided assets and asset checks that target them."""\n return KeysAssetSelection(*(key for assets_def in assets_defs for key in assets_def.keys))
\n\n
[docs] @public\n @staticmethod\n def keys(*asset_keys: CoercibleToAssetKey) -> "KeysAssetSelection":\n """Returns a selection that includes assets with any of the provided keys and all asset checks that target them.\n\n Examples:\n .. code-block:: python\n\n AssetSelection.keys(AssetKey(["a"]))\n\n AssetSelection.keys("a")\n\n AssetSelection.keys(AssetKey(["a"]), AssetKey(["b"]))\n\n AssetSelection.keys("a", "b")\n\n asset_key_list = [AssetKey(["a"]), AssetKey(["b"])]\n AssetSelection.keys(*asset_key_list)\n """\n _asset_keys = [\n AssetKey.from_user_string(key) if isinstance(key, str) else AssetKey.from_coercible(key)\n for key in asset_keys\n ]\n return KeysAssetSelection(*_asset_keys)
\n\n
[docs] @public\n @staticmethod\n def key_prefixes(\n *key_prefixes: CoercibleToAssetKeyPrefix, include_sources: bool = False\n ) -> "KeyPrefixesAssetSelection":\n """Returns a selection that includes assets that match any of the provided key prefixes and all the asset checks that target them.\n\n Args:\n include_sources (bool): If True, then include source assets matching the key prefix(es)\n in the selection.\n\n Examples:\n .. code-block:: python\n\n # match any asset key where the first segment is equal to "a" or "b"\n # e.g. AssetKey(["a", "b", "c"]) would match, but AssetKey(["abc"]) would not.\n AssetSelection.key_prefixes("a", "b")\n\n # match any asset key where the first two segments are ["a", "b"] or ["a", "c"]\n AssetSelection.key_prefixes(["a", "b"], ["a", "c"])\n """\n _asset_key_prefixes = [key_prefix_from_coercible(key_prefix) for key_prefix in key_prefixes]\n return KeyPrefixesAssetSelection(*_asset_key_prefixes, include_sources=include_sources)
\n\n
[docs] @public\n @staticmethod\n def groups(*group_strs, include_sources: bool = False) -> "GroupsAssetSelection":\n """Returns a selection that includes materializable assets that belong to any of the\n provided groups and all the asset checks that target them.\n\n Args:\n include_sources (bool): If True, then include source assets matching the group in the\n selection.\n """\n check.tuple_param(group_strs, "group_strs", of_type=str)\n return GroupsAssetSelection(*group_strs, include_sources=include_sources)
\n\n
[docs] @public\n @staticmethod\n def checks_for_assets(*assets_defs: AssetsDefinition) -> "AssetChecksForAssetKeys":\n """Returns a selection with the asset checks that target the provided assets."""\n return AssetChecksForAssetKeys(\n [key for assets_def in assets_defs for key in assets_def.keys]\n )
\n\n
[docs] @public\n @staticmethod\n def checks(*asset_checks: AssetChecksDefinition) -> "AssetChecksForHandles":\n """Returns a selection that includes all of the provided asset checks."""\n return AssetChecksForHandles(\n [\n AssetCheckKey(asset_key=AssetKey.from_coercible(spec.asset_key), name=spec.name)\n for checks_def in asset_checks\n for spec in checks_def.specs\n ]\n )
\n\n
[docs] @public\n def downstream(\n self, depth: Optional[int] = None, include_self: bool = True\n ) -> "DownstreamAssetSelection":\n """Returns a selection that includes all assets that are downstream of any of the assets in\n this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates through each\n asset in this selection and returns the union of all downstream assets.\n\n depth (Optional[int]): If provided, then only include assets to the given depth. A depth\n of 2 means all assets that are children or grandchildren of the assets in this\n selection.\n include_self (bool): If True, then include the assets in this selection in the result.\n If the include_self flag is False, return each downstream asset that is not part of the\n original selection. By default, set to True.\n """\n check.opt_int_param(depth, "depth")\n check.opt_bool_param(include_self, "include_self")\n return DownstreamAssetSelection(self, depth=depth, include_self=include_self)
\n\n
[docs] @public\n def upstream(\n self, depth: Optional[int] = None, include_self: bool = True\n ) -> "UpstreamAssetSelection":\n """Returns a selection that includes all materializable assets that are upstream of any of\n the assets in this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates\n through each asset in this selection and returns the union of all upstream assets.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as upstream of regular assets.\n\n Args:\n depth (Optional[int]): If provided, then only include assets to the given depth. A depth\n of 2 means all assets that are parents or grandparents of the assets in this\n selection.\n include_self (bool): If True, then include the assets in this selection in the result.\n If the include_self flag is False, return each upstream asset that is not part of the\n original selection. By default, set to True.\n """\n check.opt_int_param(depth, "depth")\n check.opt_bool_param(include_self, "include_self")\n return UpstreamAssetSelection(self, depth=depth, include_self=include_self)
\n\n
[docs] @public\n def sinks(self) -> "SinkAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the sink\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A sink asset is an asset that has no downstream dependencies within the asset selection.\n The sink asset can have downstream dependencies outside of the asset selection.\n """\n return SinkAssetSelection(self)
\n\n
[docs] @public\n def required_multi_asset_neighbors(self) -> "RequiredNeighborsAssetSelection":\n """Given an asset selection in which some assets are output from a multi-asset compute op\n which cannot be subset, returns a new asset selection that contains all of the assets\n required to execute the original asset selection. Includes the asset checks targeting the returned assets.\n """\n return RequiredNeighborsAssetSelection(self)
\n\n
[docs] @public\n def roots(self) -> "RootAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the root\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A root asset is an asset that has no upstream dependencies within the asset selection.\n The root asset can have downstream dependencies outside of the asset selection.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as roots. To select source assets,\n use the `upstream_source_assets` method.\n """\n return RootAssetSelection(self)
\n\n
[docs] @public\n @deprecated(breaking_version="2.0", additional_warn_text="Use AssetSelection.roots instead.")\n def sources(self) -> "RootAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the root\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A root asset is a materializable asset that has no upstream dependencies within the asset\n selection. The root asset can have downstream dependencies outside of the asset selection.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as roots. To select source assets,\n use the `upstream_source_assets` method.\n """\n return self.roots()
\n\n
[docs] @public\n def upstream_source_assets(self) -> "SourceAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the source\n assets upstream of assets in the original selection. Includes the asset checks targeting the returned assets.\n """\n return SourceAssetSelection(self)
\n\n
[docs] @public\n def without_checks(self) -> "AssetSelection":\n """Removes all asset checks in the selection."""\n return self - AssetSelection.all_asset_checks()
\n\n def __or__(self, other: "AssetSelection") -> "OrAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return OrAssetSelection(self, other)\n\n def __and__(self, other: "AssetSelection") -> "AndAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return AndAssetSelection(self, other)\n\n def __sub__(self, other: "AssetSelection") -> "SubAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return SubAssetSelection(self, other)\n\n def resolve(\n self, all_assets: Union[Iterable[Union[AssetsDefinition, SourceAsset]], AssetGraph]\n ) -> AbstractSet[AssetKey]:\n if isinstance(all_assets, AssetGraph):\n asset_graph = all_assets\n else:\n check.iterable_param(all_assets, "all_assets", (AssetsDefinition, SourceAsset))\n asset_graph = AssetGraph.from_assets(all_assets)\n\n resolved = self.resolve_inner(asset_graph)\n resolved_source_assets = asset_graph.source_asset_keys & resolved\n resolved_regular_assets = resolved - asset_graph.source_asset_keys\n check.invariant(\n not (len(resolved_source_assets) > 0 and len(resolved_regular_assets) > 0),\n "Asset selection specified both regular assets and source assets. This is not"\n " currently supported. Selections must be all regular assets or all source assets.",\n )\n return resolved\n\n @abstractmethod\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n raise NotImplementedError()\n\n def resolve_checks(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n """We don't need this method currently, but it makes things consistent with resolve_inner. Currently\n we don't store checks in the ExternalAssetGraph, so we only support InternalAssetGraph.\n """\n return self.resolve_checks_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n """By default, resolve to checks that target the selected assets. 
This is overriden for particular selections."""\n asset_keys = self.resolve(asset_graph)\n return {handle for handle in asset_graph.asset_check_keys if handle.asset_key in asset_keys}\n\n @staticmethod\n def _selection_from_string(string: str) -> "AssetSelection":\n from dagster._core.definitions import AssetSelection\n\n if string == "*":\n return AssetSelection.all()\n\n parts = parse_clause(string)\n if not parts:\n check.failed(f"Invalid selection string: {string}")\n u, item, d = parts\n\n selection: AssetSelection = AssetSelection.keys(item)\n if u:\n selection = selection.upstream(u)\n if d:\n selection = selection.downstream(d)\n return selection\n\n @classmethod\n def from_coercible(cls, selection: CoercibleToAssetSelection) -> "AssetSelection":\n if isinstance(selection, str):\n return cls._selection_from_string(selection)\n elif isinstance(selection, AssetSelection):\n return selection\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, str) for el in selection\n ):\n return reduce(\n operator.or_, [cls._selection_from_string(cast(str, s)) for s in selection]\n )\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, (AssetsDefinition, SourceAsset)) for el in selection\n ):\n return AssetSelection.keys(\n *(\n key\n for el in selection\n for key in (\n el.keys if isinstance(el, AssetsDefinition) else [cast(SourceAsset, el).key]\n )\n )\n )\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, AssetKey) for el in selection\n ):\n return cls.keys(*cast(Sequence[AssetKey], selection))\n else:\n check.failed(\n "selection argument must be one of str, Sequence[str], Sequence[AssetKey],"\n " Sequence[AssetsDefinition], Sequence[SourceAsset], AssetSelection. Was"\n f" {type(selection)}."\n )
\n\n\nclass AllSelection(AssetSelection):\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return asset_graph.materializable_asset_keys\n\n\nclass AllAssetCheckSelection(AssetSelection):\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return asset_graph.asset_check_keys\n\n\nclass AssetChecksForAssetKeys(AssetSelection):\n def __init__(self, keys: Sequence[AssetKey]):\n self._keys = keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return {handle for handle in asset_graph.asset_check_keys if handle.asset_key in self._keys}\n\n\nclass AssetChecksForHandles(AssetSelection):\n def __init__(self, asset_check_keys: Sequence[AssetCheckKey]):\n self._asset_check_keys = asset_check_keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return {\n handle for handle in asset_graph.asset_check_keys if handle in self._asset_check_keys\n }\n\n\nclass AndAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) & self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) & self._right.resolve_checks_inner(\n asset_graph\n )\n\n\nclass SubAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n 
def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) - self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) - self._right.resolve_checks_inner(\n asset_graph\n )\n\n\nclass SinkAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return fetch_sinks(asset_graph.asset_dep_graph, selection)\n\n\nclass RequiredNeighborsAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n output = set(selection)\n for asset_key in selection:\n output.update(asset_graph.get_required_multi_asset_keys(asset_key))\n return output\n\n\nclass RootAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return fetch_sources(asset_graph.asset_dep_graph, selection)\n\n\nclass DownstreamAssetSelection(AssetSelection):\n def __init__(\n self,\n child: AssetSelection,\n *,\n depth: Optional[int] = None,\n include_self: Optional[bool] = True,\n ):\n self._child = child\n self.depth = depth\n self.include_self = include_self\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return operator.sub(\n reduce(\n operator.or_,\n [\n {asset_key}\n | fetch_connected(\n item=asset_key,\n graph=asset_graph.asset_dep_graph,\n direction="downstream",\n depth=self.depth,\n )\n for asset_key in 
selection\n ],\n ),\n selection if not self.include_self else set(),\n )\n\n\nclass GroupsAssetSelection(AssetSelection):\n def __init__(self, *groups: str, include_sources: bool):\n self._groups = groups\n self._include_sources = include_sources\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n base_set = (\n asset_graph.all_asset_keys\n if self._include_sources\n else asset_graph.materializable_asset_keys\n )\n return {\n asset_key\n for asset_key, group in asset_graph.group_names_by_key.items()\n if group in self._groups and asset_key in base_set\n }\n\n\nclass KeysAssetSelection(AssetSelection):\n def __init__(self, *keys: AssetKey):\n self._keys = keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n specified_keys = set(self._keys)\n invalid_keys = {key for key in specified_keys if key not in asset_graph.all_asset_keys}\n if invalid_keys:\n raise DagsterInvalidSubsetError(\n f"AssetKey(s) {invalid_keys} were selected, but no AssetsDefinition objects supply "\n "these keys. 
Make sure all keys are spelled correctly, and all AssetsDefinitions "\n "are correctly added to the `Definitions`."\n )\n return specified_keys\n\n\nclass KeyPrefixesAssetSelection(AssetSelection):\n def __init__(self, *key_prefixes: Sequence[str], include_sources: bool):\n self._key_prefixes = key_prefixes\n self._include_sources = include_sources\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n base_set = (\n asset_graph.all_asset_keys\n if self._include_sources\n else asset_graph.materializable_asset_keys\n )\n return {\n key for key in base_set if any(key.has_prefix(prefix) for prefix in self._key_prefixes)\n }\n\n\nclass OrAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) | self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) | self._right.resolve_checks_inner(\n asset_graph\n )\n\n\ndef _fetch_all_upstream(\n selection: AbstractSet[AssetKey],\n asset_graph: AssetGraph,\n depth: Optional[int] = None,\n include_self: bool = True,\n) -> AbstractSet[AssetKey]:\n return operator.sub(\n reduce(\n operator.or_,\n [\n {asset_key}\n | fetch_connected(\n item=asset_key,\n graph=asset_graph.asset_dep_graph,\n direction="upstream",\n depth=depth,\n )\n for asset_key in selection\n ],\n set(),\n ),\n selection if not include_self else set(),\n )\n\n\nclass UpstreamAssetSelection(AssetSelection):\n def __init__(\n self,\n child: AssetSelection,\n *,\n depth: Optional[int] = None,\n include_self: bool = True,\n ):\n self._child = child\n self.depth = depth\n self.include_self = include_self\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = 
self._child.resolve_inner(asset_graph)\n if len(selection) == 0:\n return selection\n all_upstream = _fetch_all_upstream(selection, asset_graph, self.depth, self.include_self)\n return {key for key in all_upstream if key not in asset_graph.source_asset_keys}\n\n\nclass SourceAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n if len(selection) == 0:\n return selection\n all_upstream = _fetch_all_upstream(selection, asset_graph)\n return {key for key in all_upstream if key in asset_graph.source_asset_keys}\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_selection", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_selection"}, "asset_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_sensor_definition

\nimport inspect\nfrom typing import Any, Callable, NamedTuple, Optional, Sequence, Set\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.resource_annotation import get_resource_args\n\nfrom .events import AssetKey\nfrom .run_request import RunRequest, SkipReason\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    RawSensorEvaluationFunctionReturn,\n    SensorDefinition,\n    SensorType,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom .utils import check_valid_name\n\n\nclass AssetSensorParamNames(NamedTuple):\n    context_param_name: Optional[str]\n    event_log_entry_param_name: Optional[str]\n\n\ndef get_asset_sensor_param_names(fn: Callable) -> AssetSensorParamNames:\n    """Determines the names of the context and event log entry parameters for an asset sensor function.\n    These are assumed to be the first two non-resource params, in order (context param before event log entry).\n    """\n    resource_params = {param.name for param in get_resource_args(fn)}\n\n    non_resource_params = [\n        param.name for param in get_function_params(fn) if param.name not in resource_params\n    ]\n\n    context_param_name = non_resource_params[0] if len(non_resource_params) > 0 else None\n    event_log_entry_param_name = non_resource_params[1] if len(non_resource_params) > 1 else None\n\n    return AssetSensorParamNames(\n        context_param_name=context_param_name, event_log_entry_param_name=event_log_entry_param_name\n    )\n\n\n
[docs]class AssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a given\n asset.\n\n If the asset has been materialized multiple times between since the last sensor tick, the\n evaluation function will only be invoked once, with the latest materialization.\n\n Args:\n name (str): The name of the sensor to create.\n asset_key (AssetKey): The asset_key this sensor monitors.\n asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.SensorEvaluationContext` and\n an EventLogEntry corresponding to an AssetMaterialization event.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_key: AssetKey,\n job_name: Optional[str],\n asset_materialization_fn: Callable[\n ...,\n RawSensorEvaluationFunctionReturn,\n ],\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n self._asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(asset_materialization_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def _wrap_asset_fn(materialization_fn) -> Any:\n def _fn(context) -> Any:\n after_cursor = None\n if context.cursor:\n try:\n after_cursor = int(context.cursor)\n except ValueError:\n after_cursor = None\n\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=self._asset_key,\n after_cursor=after_cursor,\n ),\n ascending=False,\n limit=1,\n )\n\n if not event_records:\n yield SkipReason(\n f"No new materialization events found for asset key {self._asset_key}"\n )\n return\n\n event_record = event_records[0]\n\n (\n context_param_name,\n event_log_entry_param_name,\n ) = get_asset_sensor_param_names(materialization_fn)\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n # Build asset sensor function args, which can include any subset of\n # context arg, event log entry arg, and any resource args\n 
args = resource_args_populated\n if context_param_name:\n args[context_param_name] = context\n if event_log_entry_param_name:\n args[event_log_entry_param_name] = event_record.event_log_entry\n\n result = materialization_fn(**args)\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n context.update_cursor(str(event_record.storage_id))\n\n return _fn\n\n super(AssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n job_name=job_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn"),\n ),\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n required_resource_keys=combined_required_resource_keys,\n )\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """AssetKey: The key of the asset targeted by this sensor."""\n return self._asset_key\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.ASSET
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_sensor_definition"}, "asset_spec": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_spec

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Iterable, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nfrom .auto_materialize_policy import AutoMaterializePolicy\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\nfrom .freshness_policy import FreshnessPolicy\nfrom .metadata import MetadataUserInput\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_dep import AssetDep, CoercibleToAssetDep\n\n# SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE lives on the metadata of an asset\n# (which currently ends up on the Output associated with the asset key)\n# whih encodes the execution type the of asset. "Unexecutable" assets are assets\n# that cannot be materialized in Dagster, but can have events in the event\n# log keyed off of them, making Dagster usable as a observability and lineage tool\n# for externally materialized assets.\nSYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE = "dagster/asset_execution_type"\n\n\nclass AssetExecutionType(Enum):\n    OBSERVATION = "OBSERVATION"\n    UNEXECUTABLE = "UNEXECUTABLE"\n    MATERIALIZATION = "MATERIALIZATION"\n\n    @staticmethod\n    def is_executable(varietal_str: Optional[str]) -> bool:\n        return AssetExecutionType.str_to_enum(varietal_str) in {\n            AssetExecutionType.MATERIALIZATION,\n            AssetExecutionType.OBSERVATION,\n        }\n\n    @staticmethod\n    def str_to_enum(varietal_str: Optional[str]) -> "AssetExecutionType":\n        return (\n            AssetExecutionType.MATERIALIZATION\n            if varietal_str is None\n            else AssetExecutionType(varietal_str)\n        )\n\n\n
[docs]@experimental\nclass AssetSpec(\n NamedTuple(\n "_AssetSpec",\n [\n ("key", PublicAttr[AssetKey]),\n ("deps", PublicAttr[Iterable["AssetDep"]]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n ("group_name", PublicAttr[Optional[str]]),\n ("skippable", PublicAttr[bool]),\n ("code_version", PublicAttr[Optional[str]]),\n ("freshness_policy", PublicAttr[Optional[FreshnessPolicy]]),\n ("auto_materialize_policy", PublicAttr[Optional[AutoMaterializePolicy]]),\n ],\n )\n):\n """Specifies the core attributes of an asset. This object is attached to the decorated\n function that defines how it materialized.\n\n Attributes:\n key (AssetKey): The unique identifier for this asset.\n deps (Optional[AbstractSet[AssetKey]]): The asset keys for the upstream assets that\n materializing this asset depends on.\n description (Optional[str]): Human-readable description of this asset.\n metadata (Optional[Dict[str, Any]]): A dict of static metadata for this asset.\n For example, users can provide information about the database table this\n asset corresponds to.\n skippable (bool): Whether this asset can be omitted during materialization, causing downstream\n dependencies to skip.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. 
If\n not provided, the name "default" is used.\n code_version (Optional[str]): The version of the code for this specific asset,\n overriding the code version of the materialization function\n freshness_policy (Optional[FreshnessPolicy]): A policy which indicates how up to date this\n asset is intended to be.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply to\n the specified asset.\n backfill_policy (Optional[BackfillPolicy]): BackfillPolicy to apply to the specified asset.\n """\n\n def __new__(\n cls,\n key: CoercibleToAssetKey,\n *,\n deps: Optional[Iterable["CoercibleToAssetDep"]] = None,\n description: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n skippable: bool = False,\n group_name: Optional[str] = None,\n code_version: Optional[str] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n ):\n from dagster._core.definitions.asset_dep import AssetDep\n\n dep_set = {}\n if deps:\n for dep in deps:\n asset_dep = AssetDep.from_coercible(dep)\n\n # we cannot do deduplication via a set because MultiPartitionMappings have an internal\n # dictionary that cannot be hashed. 
Instead deduplicate by making a dictionary and checking\n # for existing keys.\n if asset_dep.asset_key in dep_set.keys():\n raise DagsterInvariantViolationError(\n f"Cannot set a dependency on asset {asset_dep.asset_key} more than once for"\n f" AssetSpec {key}"\n )\n dep_set[asset_dep.asset_key] = asset_dep\n\n return super().__new__(\n cls,\n key=AssetKey.from_coercible(key),\n deps=list(dep_set.values()),\n description=check.opt_str_param(description, "description"),\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n skippable=check.bool_param(skippable, "skippable"),\n group_name=check.opt_str_param(group_name, "group_name"),\n code_version=check.opt_str_param(code_version, "code_version"),\n freshness_policy=check.opt_inst_param(\n freshness_policy,\n "freshness_policy",\n FreshnessPolicy,\n ),\n auto_materialize_policy=check.opt_inst_param(\n auto_materialize_policy,\n "auto_materialize_policy",\n AutoMaterializePolicy,\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_spec", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_spec"}, "assets": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.assets

\nimport hashlib\nimport json\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey, AssetCheckSpec\nfrom dagster._core.definitions.asset_layer import get_dep_node_handles_of_graph_backed_asset\nfrom dagster._core.definitions.asset_spec import AssetExecutionType\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy, BackfillPolicyType\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_invocation import direct_invocation_result\nfrom dagster._core.definitions.op_selection import get_graph_subset\nfrom dagster._core.definitions.partition_mapping import MultiPartitionMapping\nfrom dagster._core.definitions.resource_requirement import (\n    RequiresResources,\n    ResourceAddable,\n    ResourceRequirement,\n    merge_resource_defs,\n)\nfrom dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\nfrom dagster._core.definitions.time_window_partitions import TimeWindowPartitionsDefinition\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import (\n    disable_dagster_warnings,\n)\n\nfrom .dependency import NodeHandle\nfrom .events import AssetKey, CoercibleToAssetKey, 
CoercibleToAssetKeyPrefix\nfrom .node_definition import NodeDefinition\nfrom .op_definition import OpDefinition\nfrom .partition import PartitionsDefinition\nfrom .partition_mapping import (\n    PartitionMapping,\n    get_builtin_partition_mapping_types,\n    infer_partition_mapping,\n)\nfrom .resource_definition import ResourceDefinition\nfrom .source_asset import SourceAsset\nfrom .utils import DEFAULT_GROUP_NAME, validate_group_name\n\nif TYPE_CHECKING:\n    from .graph_definition import GraphDefinition\n\n\n
[docs]class AssetsDefinition(ResourceAddable, RequiresResources, IHasInternalInit):\n """Defines a set of assets that are produced by the same op or graph.\n\n AssetsDefinitions are typically not instantiated directly, but rather produced using the\n :py:func:`@asset <asset>` or :py:func:`@multi_asset <multi_asset>` decorators.\n """\n\n _node_def: NodeDefinition\n _keys_by_input_name: Mapping[str, AssetKey]\n _keys_by_output_name: Mapping[str, AssetKey]\n _partitions_def: Optional[PartitionsDefinition]\n _partition_mappings: Mapping[AssetKey, PartitionMapping]\n _asset_deps: Mapping[AssetKey, AbstractSet[AssetKey]]\n _resource_defs: Mapping[str, ResourceDefinition]\n _group_names_by_key: Mapping[AssetKey, str]\n _selected_asset_keys: AbstractSet[AssetKey]\n _can_subset: bool\n _metadata_by_key: Mapping[AssetKey, ArbitraryMetadataMapping]\n _freshness_policies_by_key: Mapping[AssetKey, FreshnessPolicy]\n _auto_materialize_policies_by_key: Mapping[AssetKey, AutoMaterializePolicy]\n _backfill_policy: Optional[BackfillPolicy]\n _code_versions_by_key: Mapping[AssetKey, Optional[str]]\n _descriptions_by_key: Mapping[AssetKey, str]\n _selected_asset_check_keys: AbstractSet[AssetCheckKey]\n\n def __init__(\n self,\n *,\n keys_by_input_name: Mapping[str, AssetKey],\n keys_by_output_name: Mapping[str, AssetKey],\n node_def: NodeDefinition,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[AssetKey, PartitionMapping]] = None,\n asset_deps: Optional[Mapping[AssetKey, AbstractSet[AssetKey]]] = None,\n selected_asset_keys: Optional[AbstractSet[AssetKey]] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, object]] = None,\n group_names_by_key: Optional[Mapping[AssetKey, str]] = None,\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]] = None,\n freshness_policies_by_key: Optional[Mapping[AssetKey, FreshnessPolicy]] = None,\n auto_materialize_policies_by_key: Optional[Mapping[AssetKey, 
AutoMaterializePolicy]] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n descriptions_by_key: Optional[Mapping[AssetKey, str]] = None,\n check_specs_by_output_name: Optional[Mapping[str, AssetCheckSpec]] = None,\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]] = None,\n # if adding new fields, make sure to handle them in the with_attributes, from_graph, and\n # get_attributes_dict methods\n ):\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n from .graph_definition import GraphDefinition\n\n if isinstance(node_def, GraphDefinition):\n _validate_graph_def(node_def)\n\n self._node_def = node_def\n self._keys_by_input_name = check.mapping_param(\n keys_by_input_name,\n "keys_by_input_name",\n key_type=str,\n value_type=AssetKey,\n )\n self._keys_by_output_name = check.mapping_param(\n keys_by_output_name,\n "keys_by_output_name",\n key_type=str,\n value_type=AssetKey,\n )\n\n check.opt_mapping_param(\n check_specs_by_output_name,\n "check_specs_by_output_name",\n key_type=str,\n value_type=AssetCheckSpec,\n )\n\n # if not specified assume all output assets depend on all input assets\n all_asset_keys = set(keys_by_output_name.values())\n input_asset_keys = set(keys_by_input_name.values())\n\n self._partitions_def = partitions_def\n self._partition_mappings = partition_mappings or {}\n builtin_partition_mappings = get_builtin_partition_mapping_types()\n for asset_key, partition_mapping in self._partition_mappings.items():\n if not isinstance(partition_mapping, builtin_partition_mappings):\n warnings.warn(\n f"Non-built-in PartitionMappings, such as {type(partition_mapping).__name__} "\n "are deprecated and will not work with asset reconciliation. 
The built-in "\n "partition mappings are "\n + ", ".join(\n builtin_partition_mapping.__name__\n for builtin_partition_mapping in builtin_partition_mappings\n )\n + ".",\n category=DeprecationWarning,\n )\n\n if asset_key not in input_asset_keys:\n check.failed(\n f"While constructing AssetsDefinition outputting {all_asset_keys}, received a"\n f" partition mapping for {asset_key} that is not defined in the set of upstream"\n f" assets: {input_asset_keys}"\n )\n\n self._asset_deps = asset_deps or {\n out_asset_key: set(keys_by_input_name.values()) for out_asset_key in all_asset_keys\n }\n check.invariant(\n set(self._asset_deps.keys()) == all_asset_keys,\n "The set of asset keys with dependencies specified in the asset_deps argument must "\n "equal the set of asset keys produced by this AssetsDefinition. \\n"\n f"asset_deps keys: {set(self._asset_deps.keys())} \\n"\n f"expected keys: {all_asset_keys}",\n )\n self._resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resource_defs, "resource_defs")\n )\n\n group_names_by_key = (\n check.mapping_param(group_names_by_key, "group_names_by_key")\n if group_names_by_key\n else {}\n )\n self._group_names_by_key = {}\n # assets that don't have a group name get a DEFAULT_GROUP_NAME\n for key in all_asset_keys:\n group_name = group_names_by_key.get(key)\n self._group_names_by_key[key] = validate_group_name(group_name)\n\n all_check_keys = {spec.key for spec in (check_specs_by_output_name or {}).values()}\n\n # NOTE: this logic mirrors subsetting at the asset layer. 
This is ripe for consolidation.\n if selected_asset_keys is None and selected_asset_check_keys is None:\n # if no selections, include everything\n self._selected_asset_keys = all_asset_keys\n self._selected_asset_check_keys = all_check_keys\n else:\n self._selected_asset_keys = selected_asset_keys or set()\n\n if selected_asset_check_keys is None:\n # if assets were selected but checks are None, then include all checks for selected\n # assets\n self._selected_asset_check_keys = {\n key for key in all_check_keys if key.asset_key in self._selected_asset_keys\n }\n else:\n # otherwise, use the selected checks\n self._selected_asset_check_keys = selected_asset_check_keys\n\n self._check_specs_by_output_name = {\n name: spec\n for name, spec in (check_specs_by_output_name or {}).items()\n if spec.key in self._selected_asset_check_keys\n }\n self._check_specs_by_key = {\n spec.key: spec for spec in self._check_specs_by_output_name.values()\n }\n\n self._can_subset = can_subset\n\n self._code_versions_by_key = {}\n self._metadata_by_key = dict(\n check.opt_mapping_param(\n metadata_by_key, "metadata_by_key", key_type=AssetKey, value_type=dict\n )\n )\n self._descriptions_by_key = dict(\n check.opt_mapping_param(\n descriptions_by_key, "descriptions_by_key", key_type=AssetKey, value_type=str\n )\n )\n for output_name, asset_key in keys_by_output_name.items():\n output_def, _ = node_def.resolve_output_to_origin(output_name, None)\n self._metadata_by_key[asset_key] = merge_dicts(\n output_def.metadata,\n self._metadata_by_key.get(asset_key, {}),\n )\n # We construct description from three sources of truth here. This\n # highly unfortunate. 
See commentary in @multi_asset's call to dagster_internal_init.\n description = (\n self._descriptions_by_key.get(asset_key, output_def.description)\n or node_def.description\n )\n if description:\n self._descriptions_by_key[asset_key] = description\n self._code_versions_by_key[asset_key] = output_def.code_version\n\n for key, freshness_policy in (freshness_policies_by_key or {}).items():\n check.param_invariant(\n not (\n freshness_policy\n and self._partitions_def is not None\n and not isinstance(self._partitions_def, TimeWindowPartitionsDefinition)\n ),\n "freshness_policies_by_key",\n "FreshnessPolicies are currently unsupported for assets with partitions of type"\n f" {type(self._partitions_def)}.",\n )\n\n self._freshness_policies_by_key = check.opt_mapping_param(\n freshness_policies_by_key,\n "freshness_policies_by_key",\n key_type=AssetKey,\n value_type=FreshnessPolicy,\n )\n\n self._auto_materialize_policies_by_key = check.opt_mapping_param(\n auto_materialize_policies_by_key,\n "auto_materialize_policies_by_key",\n key_type=AssetKey,\n value_type=AutoMaterializePolicy,\n )\n\n self._backfill_policy = check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n )\n\n if self._partitions_def is None:\n # check if backfill policy is BackfillPolicyType.SINGLE_RUN if asset is not partitioned\n check.param_invariant(\n (\n backfill_policy.policy_type is BackfillPolicyType.SINGLE_RUN\n if backfill_policy\n else True\n ),\n "backfill_policy",\n "Non partitioned asset can only have single run backfill policy",\n )\n\n _validate_self_deps(\n input_keys=self._keys_by_input_name.values(),\n output_keys=self._selected_asset_keys,\n partition_mappings=self._partition_mappings,\n partitions_def=self._partitions_def,\n )\n\n @staticmethod\n def dagster_internal_init(\n *,\n keys_by_input_name: Mapping[str, AssetKey],\n keys_by_output_name: Mapping[str, AssetKey],\n node_def: NodeDefinition,\n partitions_def: Optional[PartitionsDefinition],\n 
partition_mappings: Optional[Mapping[AssetKey, PartitionMapping]],\n asset_deps: Optional[Mapping[AssetKey, AbstractSet[AssetKey]]],\n selected_asset_keys: Optional[AbstractSet[AssetKey]],\n can_subset: bool,\n resource_defs: Optional[Mapping[str, object]],\n group_names_by_key: Optional[Mapping[AssetKey, str]],\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]],\n freshness_policies_by_key: Optional[Mapping[AssetKey, FreshnessPolicy]],\n auto_materialize_policies_by_key: Optional[Mapping[AssetKey, AutoMaterializePolicy]],\n backfill_policy: Optional[BackfillPolicy],\n descriptions_by_key: Optional[Mapping[AssetKey, str]],\n check_specs_by_output_name: Optional[Mapping[str, AssetCheckSpec]],\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]],\n ) -> "AssetsDefinition":\n return AssetsDefinition(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n node_def=node_def,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n asset_deps=asset_deps,\n selected_asset_keys=selected_asset_keys,\n can_subset=can_subset,\n resource_defs=resource_defs,\n group_names_by_key=group_names_by_key,\n metadata_by_key=metadata_by_key,\n freshness_policies_by_key=freshness_policies_by_key,\n auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n backfill_policy=backfill_policy,\n descriptions_by_key=descriptions_by_key,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=selected_asset_check_keys,\n )\n\n def __call__(self, *args: object, **kwargs: object) -> object:\n from .composition import is_in_composition\n from .graph_definition import GraphDefinition\n\n # defer to GraphDefinition.__call__ for graph backed assets, or if invoked in composition\n if isinstance(self.node_def, GraphDefinition) or is_in_composition():\n return self._node_def(*args, **kwargs)\n\n # invoke against self to allow assets def information to be used\n return 
direct_invocation_result(self, *args, **kwargs)\n\n
@public
@experimental_param(param="resource_defs")
@staticmethod
def from_graph(
    graph_def: "GraphDefinition",
    *,
    keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,
    keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,
    key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
    internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,
    partitions_def: Optional[PartitionsDefinition] = None,
    partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,
    resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,
    group_name: Optional[str] = None,
    group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,
    descriptions_by_output_name: Optional[Mapping[str, str]] = None,
    metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,
    freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,
    auto_materialize_policies_by_output_name: Optional[
        Mapping[str, Optional[AutoMaterializePolicy]]
    ] = None,
    backfill_policy: Optional[BackfillPolicy] = None,
    can_subset: bool = False,
    check_specs: Optional[Sequence[AssetCheckSpec]] = None,
) -> "AssetsDefinition":
    """Constructs an AssetsDefinition from a GraphDefinition.

    Every argument is forwarded unchanged to ``AssetsDefinition._from_node``, with the graph
    used as the backing node.

    Args:
        graph_def (GraphDefinition): The GraphDefinition that is an asset.
        keys_by_input_name (Optional[Mapping[str, AssetKey]]): A mapping of the input
            names of the graph to their corresponding asset keys. If not provided,
            the input asset keys will be created from the graph input names.
        keys_by_output_name (Optional[Mapping[str, AssetKey]]): A mapping of the output
            names of the graph to their corresponding asset keys. If not provided,
            the output asset keys will be created from the graph output names.
        key_prefix (Optional[Union[str, Sequence[str]]]): If provided, key_prefix will be
            prepended to each key in keys_by_output_name. Each item in key_prefix must be a
            valid name in dagster (i.e. only contains letters, numbers, and _) and may not
            contain python reserved keywords.
        internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed
            that all assets produced by the graph depend on all assets that are consumed by
            that graph. If this default is not correct, pass in a map of output names to a
            corrected set of AssetKeys that they depend on. Any AssetKeys in this set must be
            either used as input to the asset or produced within the graph.
        partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that
            compose the assets.
        partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map
            partition keys for this asset to partition keys of upstream assets. Each key in
            the dictionary corresponds to one of the input assets, and each value is a
            PartitionMapping. If no entry is provided for a particular asset dependency, the
            partition mapping defaults to the default partition mapping for the partitions
            definition, which typically maps partition keys to the same partition keys in
            upstream assets.
        resource_defs (Optional[Mapping[str, ResourceDefinition]]):
            (Experimental) A mapping of resource keys to resource definitions. These resources
            will be initialized during execution, and can be accessed from the
            body of ops in the graph during execution.
        group_name (Optional[str]): A group name for the constructed asset. Assets without a
            group name are assigned to a group called "default".
        group_names_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a group
            name to be associated with some or all of the output assets for this node. Keys
            are names of the outputs, and values are the group name. Cannot be used with the
            group_name argument.
        descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a
            description to be associated with each of the output assets for this graph.
        metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]): Defines
            metadata to be associated with each of the output assets for this node. Keys are
            names of the outputs, and values are dictionaries of metadata to be associated
            with the related asset.
        freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]):
            Defines a FreshnessPolicy to be associated with some or all of the output assets
            for this node. Keys are the names of the outputs, and values are the
            FreshnessPolicies to be attached to the associated asset.
        auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]):
            Defines an AutoMaterializePolicy to be associated with some or all of the output
            assets for this node. Keys are the names of the outputs, and values are the
            AutoMaterializePolicies to be attached to the associated asset.
        backfill_policy (Optional[BackfillPolicy]): Defines this asset's BackfillPolicy.
        can_subset (bool): If True, the resulting AssetsDefinition may materialize any subset
            of its asset keys in a given computation. Defaults to False.
        check_specs (Optional[Sequence[AssetCheckSpec]]): Specs for the asset checks that can
            be executed while materializing the assets.

    Returns:
        AssetsDefinition: The definition built from the graph.
    """
    # Gather the pass-through arguments once; the graph itself becomes the backing node.
    node_options = dict(
        node_def=graph_def,
        keys_by_input_name=keys_by_input_name,
        keys_by_output_name=keys_by_output_name,
        key_prefix=key_prefix,
        internal_asset_deps=internal_asset_deps,
        partitions_def=partitions_def,
        partition_mappings=partition_mappings,
        resource_defs=resource_defs,
        group_name=group_name,
        group_names_by_output_name=group_names_by_output_name,
        descriptions_by_output_name=descriptions_by_output_name,
        metadata_by_output_name=metadata_by_output_name,
        freshness_policies_by_output_name=freshness_policies_by_output_name,
        auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,
        backfill_policy=backfill_policy,
        can_subset=can_subset,
        check_specs=check_specs,
    )
    return AssetsDefinition._from_node(**node_options)
\n\n
@public
@staticmethod
def from_op(
    op_def: OpDefinition,
    *,
    keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,
    keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,
    key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
    internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,
    partitions_def: Optional[PartitionsDefinition] = None,
    partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,
    group_name: Optional[str] = None,
    group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,
    descriptions_by_output_name: Optional[Mapping[str, str]] = None,
    metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,
    freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,
    auto_materialize_policies_by_output_name: Optional[
        Mapping[str, Optional[AutoMaterializePolicy]]
    ] = None,
    backfill_policy: Optional[BackfillPolicy] = None,
    can_subset: bool = False,
) -> "AssetsDefinition":
    """Constructs an AssetsDefinition from an OpDefinition.

    Every argument is forwarded unchanged to ``AssetsDefinition._from_node``, with the op
    used as the backing node.

    Args:
        op_def (OpDefinition): The OpDefinition that is an asset.
        keys_by_input_name (Optional[Mapping[str, AssetKey]]): A mapping of the input
            names of the op to their corresponding asset keys. If not provided,
            the input asset keys will be created from the op input names.
        keys_by_output_name (Optional[Mapping[str, AssetKey]]): A mapping of the output
            names of the op to their corresponding asset keys. If not provided,
            the output asset keys will be created from the op output names.
        key_prefix (Optional[Union[str, Sequence[str]]]): If provided, key_prefix will be
            prepended to each key in keys_by_output_name. Each item in key_prefix must be a
            valid name in dagster (i.e. only contains letters, numbers, and _) and may not
            contain python reserved keywords.
        internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed
            that all assets produced by the op depend on all assets that are consumed by that
            op. If this default is not correct, pass in a map of output names to a
            corrected set of AssetKeys that they depend on. Any AssetKeys in this set must be
            either used as input to the asset or produced within the op.
        partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that
            compose the assets.
        partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map
            partition keys for this asset to partition keys of upstream assets. Each key in
            the dictionary corresponds to one of the input assets, and each value is a
            PartitionMapping. If no entry is provided for a particular asset dependency, the
            partition mapping defaults to the default partition mapping for the partitions
            definition, which typically maps partition keys to the same partition keys in
            upstream assets.
        group_name (Optional[str]): A group name for the constructed asset. Assets without a
            group name are assigned to a group called "default".
        group_names_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a group
            name to be associated with some or all of the output assets for this node. Keys
            are names of the outputs, and values are the group name. Cannot be used with the
            group_name argument.
        descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a
            description to be associated with each of the output assets for this op.
        metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]): Defines
            metadata to be associated with each of the output assets for this node. Keys are
            names of the outputs, and values are dictionaries of metadata to be associated
            with the related asset.
        freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]):
            Defines a FreshnessPolicy to be associated with some or all of the output assets
            for this node. Keys are the names of the outputs, and values are the
            FreshnessPolicies to be attached to the associated asset.
        auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]):
            Defines an AutoMaterializePolicy to be associated with some or all of the output
            assets for this node. Keys are the names of the outputs, and values are the
            AutoMaterializePolicies to be attached to the associated asset.
        backfill_policy (Optional[BackfillPolicy]): Defines this asset's BackfillPolicy.
        can_subset (bool): If True, the resulting AssetsDefinition may materialize any subset
            of its asset keys in a given computation. Defaults to False.

    Returns:
        AssetsDefinition: The definition built from the op.
    """
    # Gather the pass-through arguments once; the op itself becomes the backing node.
    node_options = dict(
        node_def=op_def,
        keys_by_input_name=keys_by_input_name,
        keys_by_output_name=keys_by_output_name,
        key_prefix=key_prefix,
        internal_asset_deps=internal_asset_deps,
        partitions_def=partitions_def,
        partition_mappings=partition_mappings,
        group_name=group_name,
        group_names_by_output_name=group_names_by_output_name,
        descriptions_by_output_name=descriptions_by_output_name,
        metadata_by_output_name=metadata_by_output_name,
        freshness_policies_by_output_name=freshness_policies_by_output_name,
        auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,
        backfill_policy=backfill_policy,
        can_subset=can_subset,
    )
    return AssetsDefinition._from_node(**node_options)
@staticmethod
def _from_node(
    node_def: Union[OpDefinition, "GraphDefinition"],
    *,
    keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,
    keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,
    key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
    internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,
    partitions_def: Optional[PartitionsDefinition] = None,
    partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,
    resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,
    group_name: Optional[str] = None,
    group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,
    descriptions_by_output_name: Optional[Mapping[str, str]] = None,
    metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,
    freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,
    auto_materialize_policies_by_output_name: Optional[
        Mapping[str, Optional[AutoMaterializePolicy]]
    ] = None,
    backfill_policy: Optional[BackfillPolicy] = None,
    can_subset: bool = False,
    check_specs: Optional[Sequence[AssetCheckSpec]] = None,
) -> "AssetsDefinition":
    """Shared implementation behind ``from_graph`` and ``from_op``.

    Validates the arguments, resolves input/output asset keys, applies ``key_prefix`` to the
    output keys, re-keys every ``*_by_output_name`` mapping by the prefixed asset key, and
    hands the result to ``AssetsDefinition.dagster_internal_init``.

    Raises (via ``check``) when ``group_name`` and ``group_names_by_output_name`` are both
    given, or when ``internal_asset_deps`` names an output that is not present in the
    ``keys_by_output_name`` mapping passed by the caller.
    """
    from dagster._core.definitions.decorators.asset_decorator import (
        _validate_and_assign_output_names_to_check_specs,
    )

    node_def = check.inst_param(node_def, "node_def", NodeDefinition)
    keys_by_input_name = _infer_keys_by_input_names(
        node_def,
        check.opt_mapping_param(
            keys_by_input_name, "keys_by_input_name", key_type=str, value_type=AssetKey
        ),
    )
    keys_by_output_name = check.opt_mapping_param(
        keys_by_output_name,
        "keys_by_output_name",
        key_type=str,
        value_type=AssetKey,
    )
    internal_asset_deps = check.opt_mapping_param(
        internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set
    )
    resource_defs = check.opt_mapping_param(
        resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition
    )
    # Re-key internal_asset_deps from output name to asset key. This runs against the
    # caller-supplied keys_by_output_name, i.e. before output keys are inferred and before
    # key_prefix is applied.
    # NOTE(review): the resulting keys are therefore un-prefixed while the keys passed as
    # keys_by_output_name below are prefixed -- confirm this is intended.
    transformed_internal_asset_deps: Dict[AssetKey, AbstractSet[AssetKey]] = {}
    if internal_asset_deps:
        for output_name, asset_keys in internal_asset_deps.items():
            check.invariant(
                output_name in keys_by_output_name,
                f"output_name {output_name} specified in internal_asset_deps does not exist"
                " in the decorated function",
            )
            transformed_internal_asset_deps[keys_by_output_name[output_name]] = asset_keys

    # Check specs are matched against the caller-supplied output keys (pre-inference).
    check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(
        check_specs, list(keys_by_output_name.values())
    )

    keys_by_output_name = _infer_keys_by_output_names(
        node_def, keys_by_output_name or {}, check_specs_by_output_name
    )

    keys_by_output_name_with_prefix: Dict[str, AssetKey] = {}
    # A bare string prefix is treated as a single-component prefix.
    key_prefix_list = [key_prefix] if isinstance(key_prefix, str) else key_prefix
    for output_name, key in keys_by_output_name.items():
        # add key_prefix to the beginning of each asset key
        # (filter(None, ...) drops falsy path components, e.g. empty strings)
        key_with_key_prefix = AssetKey(
            list(filter(None, [*(key_prefix_list or []), *key.path]))
        )
        keys_by_output_name_with_prefix[output_name] = key_with_key_prefix

    check.param_invariant(
        group_name is None or group_names_by_output_name is None,
        "group_name",
        "Cannot use both group_name and group_names_by_output_name",
    )

    if group_name:
        # A single group name applies to every output asset.
        group_names_by_key = {
            asset_key: group_name for asset_key in keys_by_output_name_with_prefix.values()
        }
    elif group_names_by_output_name:
        # Per-output group names; outputs mapped to None are left without a group entry.
        group_names_by_key = {
            keys_by_output_name_with_prefix[output_name]: group_name
            for output_name, group_name in group_names_by_output_name.items()
            if group_name is not None
        }
    else:
        group_names_by_key = None

    # Each *_by_output_name mapping below is re-keyed by prefixed asset key; entries whose
    # value is None are dropped, and an empty/missing mapping is passed through as None.
    return AssetsDefinition.dagster_internal_init(
        keys_by_input_name=keys_by_input_name,
        keys_by_output_name=keys_by_output_name_with_prefix,
        node_def=node_def,
        asset_deps=transformed_internal_asset_deps or None,
        partitions_def=check.opt_inst_param(
            partitions_def,
            "partitions_def",
            PartitionsDefinition,
        ),
        group_names_by_key=group_names_by_key,
        resource_defs=resource_defs,
        partition_mappings=(
            {
                keys_by_input_name[input_name]: partition_mapping
                for input_name, partition_mapping in partition_mappings.items()
            }
            if partition_mappings
            else None
        ),
        metadata_by_key=(
            {
                keys_by_output_name_with_prefix[output_name]: metadata
                for output_name, metadata in metadata_by_output_name.items()
                if metadata is not None
            }
            if metadata_by_output_name
            else None
        ),
        freshness_policies_by_key=(
            {
                keys_by_output_name_with_prefix[output_name]: freshness_policy
                for output_name, freshness_policy in freshness_policies_by_output_name.items()
                if freshness_policy is not None
            }
            if freshness_policies_by_output_name
            else None
        ),
        auto_materialize_policies_by_key=(
            {
                keys_by_output_name_with_prefix[output_name]: auto_materialize_policy
                for output_name, auto_materialize_policy in auto_materialize_policies_by_output_name.items()
                if auto_materialize_policy is not None
            }
            if auto_materialize_policies_by_output_name
            else None
        ),
        backfill_policy=check.opt_inst_param(
            backfill_policy, "backfill_policy", BackfillPolicy
        ),
        descriptions_by_key=(
            {
                keys_by_output_name_with_prefix[output_name]: description
                for output_name, description in descriptions_by_output_name.items()
                if description is not None
            }
            if descriptions_by_output_name
            else None
        ),
        can_subset=can_subset,
        selected_asset_keys=None,  # node has no subselection info
        check_specs_by_output_name=check_specs_by_output_name,
        selected_asset_check_keys=None,
    )

@public
@property
def can_subset(self) -> bool:
    """bool: If True, indicates that this AssetsDefinition may materialize any subset of its
    asset keys in a given computation (as opposed to being required to materialize all asset
    keys).
    """
    return self._can_subset

@public
@property
def group_names_by_key(self) -> Mapping[AssetKey, str]:
    """Mapping[AssetKey, str]: Returns a mapping from the asset keys in this AssetsDefinition
    to the group names assigned to them. If there is no assigned group name for a given AssetKey,
    it will not be present in this dictionary.
    """
    return self._group_names_by_key

@public
@property
def descriptions_by_key(self) -> Mapping[AssetKey, str]:
    """Mapping[AssetKey, str]: Returns a mapping from the asset keys in this AssetsDefinition
    to the descriptions assigned to them. If there is no assigned description for a given AssetKey,
    it will not be present in this dictionary.
    """
    return self._descriptions_by_key

@public
@property
def op(self) -> OpDefinition:
    """OpDefinition: Returns the OpDefinition that is used to materialize the assets in this
    AssetsDefinition.
    """
    # Fails the invariant when the backing node is not an OpDefinition.
    check.invariant(
        isinstance(self._node_def, OpDefinition),
        "The NodeDefinition for this AssetsDefinition is not of type OpDefinition.",
    )
    return cast(OpDefinition, self._node_def)

@public
@property
def node_def(self) -> NodeDefinition:
    """NodeDefinition: Returns the OpDefinition or GraphDefinition that is used to materialize
    the assets in this AssetsDefinition.
    """
    return self._node_def

@public
@property
def asset_deps(self) -> Mapping[AssetKey, AbstractSet[AssetKey]]:
    """Maps assets that are produced by this definition to assets that they depend on. The
    dependencies can be either "internal", meaning that they refer to other assets that are
    produced by this definition, or "external", meaning that they refer to assets that aren't
    produced by this definition.
    """
    return self._asset_deps

@property
def input_names(self) -> Iterable[str]:
    """Iterable[str]: The set of input names of the underlying NodeDefinition for this
    AssetsDefinition.
    """
    # Derived from keys_by_input_name, so only inputs upstream of a selected asset appear.
    return self.keys_by_input_name.keys()

@public
@property
def key(self) -> AssetKey:
    """AssetKey: The asset key associated with this AssetsDefinition. If this AssetsDefinition
    has more than one asset key, this will produce an error.
    """
    # The check counts the selected keys (self.keys); the message lists every node output key.
    check.invariant(
        len(self.keys) == 1,
        "Tried to retrieve asset key from an assets definition with multiple asset keys: "
        + ", ".join([str(ak.to_string()) for ak in self._keys_by_output_name.values()]),
    )

    return next(iter(self.keys))

@public
@property
def resource_defs(self) -> Mapping[str, ResourceDefinition]:
    """Mapping[str, ResourceDefinition]: A mapping from resource name to ResourceDefinition for
    the resources bound to this AssetsDefinition.
    """
    # Returns a fresh dict on each access, not the internal mapping itself.
    return dict(self._resource_defs)

@public
@property
def keys(self) -> AbstractSet[AssetKey]:
    """AbstractSet[AssetKey]: The asset keys associated with this AssetsDefinition."""
    return self._selected_asset_keys

@public
@property
def dependency_keys(self) -> Iterable[AssetKey]:
    """Iterable[AssetKey]: The asset keys which are upstream of any asset included in this
    AssetsDefinition.
    """
    # the input asset keys that are directly upstream of a selected asset key
    upstream_keys = {dep_key for key in self.keys for dep_key in self.asset_deps[key]}
    input_keys = set(self._keys_by_input_name.values())
    return upstream_keys.intersection(input_keys)

@property
def node_keys_by_output_name(self) -> Mapping[str, AssetKey]:
    """AssetKey for each output on the underlying NodeDefinition."""
    return self._keys_by_output_name

@property
def node_keys_by_input_name(self) -> Mapping[str, AssetKey]:
    """AssetKey for each input on the underlying NodeDefinition."""
    return self._keys_by_input_name

@property
def check_specs_by_output_name(self) -> Mapping[str, AssetCheckSpec]:
    """Mapping[str, AssetCheckSpec]: The asset check specs of this definition, by output name."""
    return self._check_specs_by_output_name

def get_spec_for_check_key(self, asset_check_key: AssetCheckKey) -> AssetCheckSpec:
    """Look up the AssetCheckSpec for the given check key.

    Raises KeyError if the key is not present in the internal mapping.
    """
    return self._check_specs_by_key[asset_check_key]

@property
def keys_by_output_name(self) -> Mapping[str, AssetKey]:
    """Mapping[str, AssetKey]: Node output names mapped to asset keys, restricted to the
    asset keys that are currently selected (``self.keys``).
    """
    return {
        name: key for name, key in self.node_keys_by_output_name.items() if key in self.keys
    }

@property
def keys_by_input_name(self) -> Mapping[str, AssetKey]:
    """Mapping[str, AssetKey]: Node input names mapped to asset keys, restricted to inputs
    whose key is a dependency of at least one currently selected asset.
    """
    upstream_keys = {dep_key for key in self.keys for dep_key in self.asset_deps[key]}
    return {
        name: key for name, key in self.node_keys_by_input_name.items() if key in upstream_keys
    }

@property
def freshness_policies_by_key(self) -> Mapping[AssetKey, FreshnessPolicy]:
    """Mapping[AssetKey, FreshnessPolicy]: The FreshnessPolicy attached to each asset key."""
    return self._freshness_policies_by_key

@property
def auto_materialize_policies_by_key(self) -> Mapping[AssetKey, AutoMaterializePolicy]:
    """Mapping[AssetKey, AutoMaterializePolicy]: The AutoMaterializePolicy attached to each
    asset key.
    """
    return self._auto_materialize_policies_by_key

@property
def backfill_policy(self) -> Optional[BackfillPolicy]:
    """Optional[BackfillPolicy]: The BackfillPolicy for this AssetsDefinition (if any)."""
    return self._backfill_policy

@public
@property
def partitions_def(self) -> Optional[PartitionsDefinition]:
    """Optional[PartitionsDefinition]: The PartitionsDefinition for this AssetsDefinition (if any)."""
    return self._partitions_def

@property
def metadata_by_key(self) -> Mapping[AssetKey, ArbitraryMetadataMapping]:
    """Mapping[AssetKey, ArbitraryMetadataMapping]: The metadata attached to each asset key."""
    return self._metadata_by_key

@property
def code_versions_by_key(self) -> Mapping[AssetKey, Optional[str]]:
    """Mapping[AssetKey, Optional[str]]: The code version recorded for each asset key."""
    return self._code_versions_by_key

@property
def partition_mappings(self) -> Mapping[AssetKey, PartitionMapping]:
    """Mapping[AssetKey, PartitionMapping]: The PartitionMapping registered for each upstream
    (input) asset key.
    """
    return self._partition_mappings
@public
def get_partition_mapping(self, in_asset_key: AssetKey) -> Optional[PartitionMapping]:
    """Returns the partition mapping between keys in this AssetsDefinition and a given input
    asset key (if any).

    Args:
        in_asset_key (AssetKey): The upstream (input) asset key to look up.

    Returns:
        Optional[PartitionMapping]: The mapping registered for that key, or None when there
        is no entry for it.
    """
    mapping_for_input = self._partition_mappings.get(in_asset_key)
    return mapping_for_input
@public
@property
def check_specs(self) -> Iterable[AssetCheckSpec]:
    """Returns the asset check specs defined on this AssetsDefinition, i.e. the checks that can
    be executed while materializing the assets.

    Returns:
        Iterable[AssetCheckSpec]:
    """
    return self._check_specs_by_output_name.values()

@property
def check_keys(self) -> AbstractSet[AssetCheckKey]:
    """Returns the selected asset checks associated with this AssetsDefinition.

    Returns:
        AbstractSet[AssetCheckKey]: The selected asset checks. An asset check is
            identified by the asset key and the name of the check.
    """
    return self._selected_asset_check_keys

def is_asset_executable(self, asset_key: AssetKey) -> bool:
    """Returns True if the asset key is materializable by this AssetsDefinition.

    Args:
        asset_key (AssetKey): The asset key to check.

    Returns:
        bool: True if the asset key is materializable by this AssetsDefinition.
    """
    from dagster._core.definitions.asset_spec import (
        SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE,
        AssetExecutionType,
    )

    # The execution type is read from the asset's metadata; a missing entry yields None.
    return AssetExecutionType.is_executable(
        self._metadata_by_key.get(asset_key, {}).get(SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE)
    )

def asset_execution_type_for_asset(self, asset_key: AssetKey) -> AssetExecutionType:
    """Returns the AssetExecutionType recorded in the metadata of the given asset key.

    Args:
        asset_key (AssetKey): The asset key to look up.

    Returns:
        AssetExecutionType: The result of ``AssetExecutionType.str_to_enum`` applied to the
        stored metadata value (None is passed when the metadata has no such entry).
    """
    from dagster._core.definitions.asset_spec import (
        SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE,
        AssetExecutionType,
    )

    return AssetExecutionType.str_to_enum(
        self._metadata_by_key.get(asset_key, {}).get(SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE)
    )

def get_partition_mapping_for_input(self, input_name: str) -> Optional[PartitionMapping]:
    """Returns the partition mapping registered for the asset key behind ``input_name``.

    Raises KeyError if ``input_name`` is not an input of the underlying node.
    """
    return self._partition_mappings.get(self._keys_by_input_name[input_name])

def infer_partition_mapping(
    self, upstream_asset_key: AssetKey, upstream_partitions_def: Optional[PartitionsDefinition]
) -> PartitionMapping:
    """Returns the partition mapping to use for ``upstream_asset_key``.

    Passes the explicitly registered mapping (if any), this definition's partitions
    definition and the upstream partitions definition to the module-level
    ``infer_partition_mapping`` helper, with Dagster warnings disabled.
    """
    with disable_dagster_warnings():
        partition_mapping = self._partition_mappings.get(upstream_asset_key)
        return infer_partition_mapping(
            partition_mapping, self._partitions_def, upstream_partitions_def
        )

def get_output_name_for_asset_key(self, key: AssetKey) -> str:
    """Returns the node output name that produces the given (selected) asset key.

    Raises:
        DagsterInvariantViolationError: If no selected output maps to ``key``.
    """
    for output_name, asset_key in self.keys_by_output_name.items():
        if key == asset_key:
            return output_name

    raise DagsterInvariantViolationError(
        f"Asset key {key.to_user_string()} not found in AssetsDefinition"
    )

def get_op_def_for_asset_key(self, key: AssetKey) -> OpDefinition:
    """If this is an op-backed asset, returns the op def. If it's a graph-backed asset,
    returns the op def within the graph that produces the given asset key.
    """
    output_name = self.get_output_name_for_asset_key(key)
    return self.node_def.resolve_output_to_origin_op_def(output_name)

def with_attributes(
    self,
    *,
    output_asset_key_replacements: Optional[Mapping[AssetKey, AssetKey]] = None,
    input_asset_key_replacements: Optional[Mapping[AssetKey, AssetKey]] = None,
    group_names_by_key: Optional[Mapping[AssetKey, str]] = None,
    descriptions_by_key: Optional[Mapping[AssetKey, str]] = None,
    metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]] = None,
    freshness_policy: Optional[
        Union[FreshnessPolicy, Mapping[AssetKey, FreshnessPolicy]]
    ] = None,
    auto_materialize_policy: Optional[
        Union[AutoMaterializePolicy, Mapping[AssetKey, AutoMaterializePolicy]]
    ] = None,
    backfill_policy: Optional[BackfillPolicy] = None,
) -> "AssetsDefinition":
    """Returns a new instance of this class with asset keys and per-asset attributes replaced.

    Args:
        output_asset_key_replacements: Maps current output asset keys to their replacements.
            Keys without an entry are kept unchanged.
        input_asset_key_replacements: Maps current input asset keys to their replacements.
            Keys without an entry are kept unchanged.
        group_names_by_key: Group names to assign. They are merged on top of the existing
            (re-keyed) group names.
        descriptions_by_key: Descriptions handed to the new instance, re-keyed through
            ``output_asset_key_replacements``.
        metadata_by_key: Metadata to use; when empty or omitted, the current metadata is used.
        freshness_policy: Either one FreshnessPolicy applied to every selected asset key, or a
            mapping from asset key to FreshnessPolicy.
        auto_materialize_policy: Either one AutoMaterializePolicy applied to every selected
            asset key, or a mapping from asset key to AutoMaterializePolicy.
        backfill_policy: Replaces the current backfill policy when provided.

    Raises:
        DagsterInvalidDefinitionError: If a group name is supplied for an asset that already
            has a non-default group name, or if a freshness / auto-materialize policy is
            supplied for an asset that already has one.
    """
    output_asset_key_replacements = check.opt_mapping_param(
        output_asset_key_replacements,
        "output_asset_key_replacements",
        key_type=AssetKey,
        value_type=AssetKey,
    )
    input_asset_key_replacements = check.opt_mapping_param(
        input_asset_key_replacements,
        "input_asset_key_replacements",
        key_type=AssetKey,
        value_type=AssetKey,
    )
    group_names_by_key = check.opt_mapping_param(
        group_names_by_key, "group_names_by_key", key_type=AssetKey, value_type=str
    )
    descriptions_by_key = check.opt_mapping_param(
        descriptions_by_key, "descriptions_by_key", key_type=AssetKey, value_type=str
    )
    metadata_by_key = check.opt_mapping_param(
        metadata_by_key, "metadata_by_key", key_type=AssetKey, value_type=dict
    )

    backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)

    if group_names_by_key:
        # Only the default group name may be overwritten.
        group_name_conflicts = [
            asset_key
            for asset_key in group_names_by_key
            if asset_key in self.group_names_by_key
            and self.group_names_by_key[asset_key] != DEFAULT_GROUP_NAME
        ]
        if group_name_conflicts:
            raise DagsterInvalidDefinitionError(
                "Group name already exists on assets"
                f" {', '.join(asset_key.to_user_string() for asset_key in group_name_conflicts)}"
            )

    replaced_group_names_by_key = {
        output_asset_key_replacements.get(key, key): group_name
        for key, group_name in self.group_names_by_key.items()
    }

    if freshness_policy:
        # A single policy conflicts with any existing policy; a mapping conflicts only on
        # the keys it shares with the existing policies.
        freshness_policy_conflicts = (
            self.freshness_policies_by_key.keys()
            if isinstance(freshness_policy, FreshnessPolicy)
            else (freshness_policy.keys() & self.freshness_policies_by_key.keys())
        )
        if freshness_policy_conflicts:
            raise DagsterInvalidDefinitionError(
                "FreshnessPolicy already exists on assets"
                f" {', '.join(key.to_string() for key in freshness_policy_conflicts)}"
            )

    replaced_freshness_policies_by_key = {}
    for key in self.keys:
        if isinstance(freshness_policy, FreshnessPolicy):
            replaced_freshness_policy = freshness_policy
        elif freshness_policy:
            replaced_freshness_policy = freshness_policy.get(key)
        else:
            replaced_freshness_policy = self.freshness_policies_by_key.get(key)

        if replaced_freshness_policy:
            replaced_freshness_policies_by_key[output_asset_key_replacements.get(key, key)] = (
                replaced_freshness_policy
            )

    if auto_materialize_policy:
        # Same conflict rule as for freshness policies above.
        auto_materialize_policy_conflicts = (
            self.auto_materialize_policies_by_key.keys()
            if isinstance(auto_materialize_policy, AutoMaterializePolicy)
            else (auto_materialize_policy.keys() & self.auto_materialize_policies_by_key.keys())
        )
        if auto_materialize_policy_conflicts:
            raise DagsterInvalidDefinitionError(
                "AutoMaterializePolicy already exists on assets"
                f" {', '.join(key.to_string() for key in auto_materialize_policy_conflicts)}"
            )

    replaced_auto_materialize_policies_by_key = {}
    for key in self.keys:
        if isinstance(auto_materialize_policy, AutoMaterializePolicy):
            replaced_auto_materialize_policy = auto_materialize_policy
        elif auto_materialize_policy:
            replaced_auto_materialize_policy = auto_materialize_policy.get(key)
        else:
            replaced_auto_materialize_policy = self.auto_materialize_policies_by_key.get(key)

        if replaced_auto_materialize_policy:
            replaced_auto_materialize_policies_by_key[
                output_asset_key_replacements.get(key, key)
            ] = replaced_auto_materialize_policy

    # Built from the descriptions_by_key argument only.
    replaced_descriptions_by_key = {
        output_asset_key_replacements.get(key, key): description
        for key, description in descriptions_by_key.items()
    }

    if not metadata_by_key:
        metadata_by_key = self.metadata_by_key

    replaced_metadata_by_key = {
        output_asset_key_replacements.get(key, key): metadata
        for key, metadata in metadata_by_key.items()
    }

    replaced_attributes = dict(
        keys_by_input_name={
            input_name: input_asset_key_replacements.get(key, key)
            for input_name, key in self._keys_by_input_name.items()
        },
        keys_by_output_name={
            output_name: output_asset_key_replacements.get(key, key)
            for output_name, key in self._keys_by_output_name.items()
        },
        partition_mappings={
            input_asset_key_replacements.get(key, key): partition_mapping
            for key, partition_mapping in self._partition_mappings.items()
        },
        asset_deps={
            # replace both the keys and the values in this mapping
            output_asset_key_replacements.get(key, key): {
                input_asset_key_replacements.get(
                    upstream_key,
                    output_asset_key_replacements.get(upstream_key, upstream_key),
                )
                for upstream_key in value
            }
            for key, value in self.asset_deps.items()
        },
        selected_asset_keys={
            output_asset_key_replacements.get(key, key) for key in self._selected_asset_keys
        },
        group_names_by_key={
            **replaced_group_names_by_key,
            **group_names_by_key,
        },
        metadata_by_key=replaced_metadata_by_key,
        freshness_policies_by_key=replaced_freshness_policies_by_key,
        auto_materialize_policies_by_key=replaced_auto_materialize_policies_by_key,
        backfill_policy=backfill_policy if backfill_policy else self.backfill_policy,
        descriptions_by_key=replaced_descriptions_by_key,
    )

    return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))

def _subset_graph_backed_asset(
    self,
    selected_asset_keys: AbstractSet[AssetKey],
):
    """Returns the subset of the backing graph needed to produce ``selected_asset_keys``.

    Raises:
        DagsterInvalidInvocationError: If the backing node is not a GraphDefinition.
    """
    from dagster._core.definitions.graph_definition import GraphDefinition

    if not isinstance(self.node_def, GraphDefinition):
        raise DagsterInvalidInvocationError(
            "Method _subset_graph_backed_asset cannot subset an asset that is not a graph"
        )

    # All asset keys in selected_asset_keys are outputted from the same top-level graph backed asset
    dep_node_handles_by_asset_key = get_dep_node_handles_of_graph_backed_asset(
        self.node_def, self
    )
    op_selection: List[str] = []
    for asset_key in selected_asset_keys:
        dep_node_handles = dep_node_handles_by_asset_key[asset_key]
        for dep_node_handle in dep_node_handles:
            # path[1:] drops the first path component of each handle
            op_selection.append(".".join(dep_node_handle.path[1:]))

    return get_graph_subset(self.node_def, op_selection)

def subset_for(
    self,
    selected_asset_keys: AbstractSet[AssetKey],
    selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]],
) -> "AssetsDefinition":
    """Create a subset of this AssetsDefinition that will only materialize the assets and checks
    in the selected set.

    Args:
        selected_asset_keys (AbstractSet[AssetKey]): The total set of selected asset keys; only
            the keys that belong to this AssetsDefinition are used.
        selected_asset_check_keys (Optional[AbstractSet[AssetCheckKey]]): The selected asset
            checks. If None, the checks that target the selected assets are used.

    Returns:
        AssetsDefinition: ``self`` when every asset and check is selected, otherwise a new
        instance restricted to the selection.
    """
    from dagster._core.definitions.graph_definition import GraphDefinition

    check.invariant(
        self.can_subset,
        f"Attempted to subset AssetsDefinition for {self.node_def.name}, but can_subset=False.",
    )

    # Set of assets within selected_asset_keys which are outputted by this AssetDefinition
    asset_subselection = selected_asset_keys & self.keys
    if selected_asset_check_keys is None:
        # filter to checks that target selected asset keys
        asset_check_subselection = {
            key for key in self.check_keys if key.asset_key in asset_subselection
        }
    else:
        asset_check_subselection = selected_asset_check_keys & self.check_keys

    # Early escape if all assets in AssetsDefinition are selected
    if asset_subselection == self.keys and asset_check_subselection == self.check_keys:
        return self
    elif isinstance(self.node_def, GraphDefinition):  # Node is graph-backed asset
        # Compare the *resolved* check subselection rather than the raw argument: the raw
        # argument may be None (meaning "derive from the selected assets") or may contain
        # checks that belong to other definitions, and neither should trip this invariant
        # when every check of this definition remains selected.
        check.invariant(
            asset_check_subselection == self.check_keys,
            "Subsetting graph-backed assets with checks is not yet supported",
        )

        subsetted_node = self._subset_graph_backed_asset(
            asset_subselection,
        )

        # The subsetted node should only include asset inputs that are dependencies of the
        # selected set of assets.
        subsetted_input_names = [input_def.name for input_def in subsetted_node.input_defs]
        subsetted_keys_by_input_name = {
            key: value
            for key, value in self.node_keys_by_input_name.items()
            if key in subsetted_input_names
        }

        subsetted_output_names = [output_def.name for output_def in subsetted_node.output_defs]
        subsetted_keys_by_output_name = {
            key: value
            for key, value in self.node_keys_by_output_name.items()
            if key in subsetted_output_names
        }

        # An op within the graph-backed asset that yields multiple assets will be run
        # any time any of its output assets are selected. Thus, if an op yields multiple assets
        # and only one of them is selected, the op will still run and potentially unexpectedly
        # materialize the unselected asset.
        #
        # Thus, we include unselected assets that may be accidentally materialized in
        # keys_by_output_name and asset_deps so that the webserver can populate a warning when
        # this occurs. This is the same behavior as multi-asset subsetting.

        subsetted_asset_deps = {
            out_asset_key: set(self._keys_by_input_name.values())
            for out_asset_key in subsetted_keys_by_output_name.values()
        }

        replaced_attributes = dict(
            keys_by_input_name=subsetted_keys_by_input_name,
            keys_by_output_name=subsetted_keys_by_output_name,
            node_def=subsetted_node,
            asset_deps=subsetted_asset_deps,
            selected_asset_keys=selected_asset_keys & self.keys,
        )

        return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))
    else:
        # multi_asset subsetting
        replaced_attributes = {
            "selected_asset_keys": asset_subselection,
            "selected_asset_check_keys": asset_check_subselection,
        }
        return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))
[docs] @public\n def to_source_assets(self) -> Sequence[SourceAsset]:\n """Returns a SourceAsset for each asset in this definition.\n\n Each produced SourceAsset will have the same key, metadata, io_manager_key, etc. as the\n corresponding asset\n """\n return [\n self._output_to_source_asset(output_name)\n for output_name in self.keys_by_output_name.keys()\n ]
\n\n
[docs] @public\n def to_source_asset(self, key: Optional[CoercibleToAssetKey] = None) -> SourceAsset:\n """Returns a representation of this asset as a :py:class:`SourceAsset`.\n\n If this is a multi-asset, the "key" argument allows selecting which asset to return a\n SourceAsset representation of.\n\n Args:\n key (Optional[Union[str, Sequence[str], AssetKey]]]): If this is a multi-asset, select\n which asset to return a SourceAsset representation of. If not a multi-asset, this\n can be left as None.\n\n Returns:\n SourceAsset\n """\n if len(self.keys) > 1:\n check.invariant(\n key is not None,\n "The 'key' argument is required when there are multiple assets to choose from",\n )\n\n if key is not None:\n resolved_key = AssetKey.from_coercible(key)\n check.invariant(\n resolved_key in self.keys, f"Key {resolved_key} not found in AssetsDefinition"\n )\n else:\n resolved_key = self.key\n\n output_names = [\n output_name\n for output_name, ak in self.keys_by_output_name.items()\n if ak == resolved_key\n ]\n check.invariant(len(output_names) == 1)\n return self._output_to_source_asset(output_names[0])
\n\n def _output_to_source_asset(self, output_name: str) -> SourceAsset:\n with disable_dagster_warnings():\n output_def = self.node_def.resolve_output_to_origin(\n output_name, NodeHandle(self.node_def.name, parent=None)\n )[0]\n key = self._keys_by_output_name[output_name]\n\n return SourceAsset(\n key=key,\n metadata=output_def.metadata,\n io_manager_key=output_def.io_manager_key,\n description=output_def.description,\n resource_defs=self.resource_defs,\n partitions_def=self.partitions_def,\n group_name=self.group_names_by_key[key],\n )\n\n def get_io_manager_key_for_asset_key(self, key: AssetKey) -> str:\n output_name = self.get_output_name_for_asset_key(key)\n return self.node_def.resolve_output_to_origin(\n output_name, NodeHandle(self.node_def.name, parent=None)\n )[0].io_manager_key\n\n def get_resource_requirements(self) -> Iterator[ResourceRequirement]:\n yield from self.node_def.get_resource_requirements() # type: ignore[attr-defined]\n for source_key, resource_def in self.resource_defs.items():\n yield from resource_def.get_resource_requirements(outer_context=source_key)\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this AssetsDefinition."""\n return {requirement.key for requirement in self.get_resource_requirements()}\n\n def __str__(self):\n if len(self.keys) == 1:\n return f"AssetsDefinition with key {self.key.to_string()}"\n else:\n asset_keys = ", ".join(sorted(([asset_key.to_string() for asset_key in self.keys])))\n return f"AssetsDefinition with keys {asset_keys}"\n\n @property\n def unique_id(self) -> str:\n """A unique identifier for the AssetsDefinition that's stable across processes."""\n return hashlib.md5((json.dumps(sorted(self.keys))).encode("utf-8")).hexdigest()\n\n def with_resources(self, resource_defs: Mapping[str, ResourceDefinition]) -> "AssetsDefinition":\n attributes_dict = self.get_attributes_dict()\n attributes_dict["resource_defs"] = 
merge_resource_defs(\n old_resource_defs=self.resource_defs,\n resource_defs_to_merge_in=resource_defs,\n requires_resources=self,\n )\n return self.__class__(**attributes_dict)\n\n def get_attributes_dict(self) -> Dict[str, Any]:\n return dict(\n keys_by_input_name=self._keys_by_input_name,\n keys_by_output_name=self._keys_by_output_name,\n node_def=self._node_def,\n partitions_def=self._partitions_def,\n partition_mappings=self._partition_mappings,\n asset_deps=self.asset_deps,\n selected_asset_keys=self._selected_asset_keys,\n can_subset=self._can_subset,\n resource_defs=self._resource_defs,\n group_names_by_key=self._group_names_by_key,\n metadata_by_key=self._metadata_by_key,\n freshness_policies_by_key=self._freshness_policies_by_key,\n auto_materialize_policies_by_key=self._auto_materialize_policies_by_key,\n backfill_policy=self._backfill_policy,\n descriptions_by_key=self._descriptions_by_key,\n check_specs_by_output_name=self._check_specs_by_output_name,\n selected_asset_check_keys=self._selected_asset_check_keys,\n )
\n\n\ndef _infer_keys_by_input_names(\n node_def: Union["GraphDefinition", OpDefinition], keys_by_input_name: Mapping[str, AssetKey]\n) -> Mapping[str, AssetKey]:\n all_input_names = [input_def.name for input_def in node_def.input_defs]\n if keys_by_input_name:\n check.invariant(\n set(keys_by_input_name.keys()) == set(all_input_names),\n "The set of input names keys specified in the keys_by_input_name argument must "\n f"equal the set of asset keys inputted by '{node_def.name}'. \\n"\n f"keys_by_input_name keys: {set(keys_by_input_name.keys())} \\n"\n f"expected keys: {all_input_names}",\n )\n\n # If asset key is not supplied in keys_by_input_name, create asset key\n # from input name\n inferred_input_names_by_asset_key: Dict[str, AssetKey] = {\n input_name: keys_by_input_name.get(input_name, AssetKey([input_name]))\n for input_name in all_input_names\n }\n\n return inferred_input_names_by_asset_key\n\n\ndef _infer_keys_by_output_names(\n node_def: Union["GraphDefinition", OpDefinition],\n keys_by_output_name: Mapping[str, AssetKey],\n check_specs_by_output_name: Mapping[str, AssetCheckSpec],\n) -> Mapping[str, AssetKey]:\n output_names = [output_def.name for output_def in node_def.output_defs]\n if keys_by_output_name:\n overlapping_asset_and_check_outputs = set(keys_by_output_name.keys()) & set(\n check_specs_by_output_name.keys()\n )\n check.invariant(\n not overlapping_asset_and_check_outputs,\n "The set of output names associated with asset keys and checks overlap:"\n f" {overlapping_asset_and_check_outputs}",\n )\n\n union_asset_and_check_outputs = set(keys_by_output_name.keys()) | set(\n check_specs_by_output_name.keys()\n )\n check.invariant(\n union_asset_and_check_outputs == set(output_names),\n "The union of the set of output names keys specified in the keys_by_output_name and"\n " check_specs_by_output_name arguments must equal the set of asset keys outputted by"\n f" {node_def.name}. 
union keys:"\n f" {union_asset_and_check_outputs} \\nexpected keys: {set(output_names)}",\n )\n\n inferred_keys_by_output_names: Dict[str, AssetKey] = {\n output_name: asset_key for output_name, asset_key in keys_by_output_name.items()\n }\n\n if (\n len(output_names) == 1\n and output_names[0] not in keys_by_output_name\n and output_names[0] not in check_specs_by_output_name\n and output_names[0] == "result"\n ):\n # If there is only one output and the name is the default "result", generate asset key\n # from the name of the node\n inferred_keys_by_output_names[output_names[0]] = AssetKey([node_def.name])\n\n for output_name in output_names:\n if (\n output_name not in inferred_keys_by_output_names\n and output_name not in check_specs_by_output_name\n ):\n inferred_keys_by_output_names[output_name] = AssetKey([output_name])\n return inferred_keys_by_output_names\n\n\ndef _validate_graph_def(graph_def: "GraphDefinition", prefix: Optional[Sequence[str]] = None):\n """Ensure that all leaf nodes are mapped to graph outputs."""\n from dagster._core.definitions.graph_definition import GraphDefinition, create_adjacency_lists\n\n prefix = check.opt_sequence_param(prefix, "prefix")\n\n # recursively validate any sub-graphs\n for inner_node_def in graph_def.node_defs:\n if isinstance(inner_node_def, GraphDefinition):\n _validate_graph_def(inner_node_def, prefix=[*prefix, graph_def.name])\n\n # leaf nodes have no downstream nodes\n forward_edges, _ = create_adjacency_lists(graph_def.nodes, graph_def.dependency_structure)\n leaf_nodes = {\n node_name for node_name, downstream_nodes in forward_edges.items() if not downstream_nodes\n }\n\n # set of nodes that have outputs mapped to a graph output\n mapped_output_nodes = {\n output_mapping.maps_from.node_name for output_mapping in graph_def.output_mappings\n }\n\n # leaf nodes which do not have an associated mapped output\n unmapped_leaf_nodes = {".".join([*prefix, node]) for node in leaf_nodes - mapped_output_nodes}\n\n 
check.invariant(\n not unmapped_leaf_nodes,\n f"All leaf nodes within graph '{graph_def.name}' must generate outputs which are mapped"\n " to outputs of the graph, and produce assets. The following leaf node(s) are"\n f" non-asset producing ops: {unmapped_leaf_nodes}. This behavior is not currently"\n " supported because these ops are not required for the creation of the associated"\n " asset(s).",\n )\n\n\ndef _validate_self_deps(\n input_keys: Iterable[AssetKey],\n output_keys: Iterable[AssetKey],\n partition_mappings: Mapping[AssetKey, PartitionMapping],\n partitions_def: Optional[PartitionsDefinition],\n) -> None:\n output_keys_set = set(output_keys)\n for input_key in input_keys:\n if input_key in output_keys_set:\n if input_key in partition_mappings:\n partition_mapping = partition_mappings[input_key]\n time_window_partition_mapping = get_self_dep_time_window_partition_mapping(\n partition_mapping, partitions_def\n )\n if (\n time_window_partition_mapping is not None\n and (time_window_partition_mapping.start_offset or 0) < 0\n and (time_window_partition_mapping.end_offset or 0) < 0\n ):\n continue\n\n raise DagsterInvalidDefinitionError(\n f'Asset "{input_key.to_user_string()}" depends on itself. 
Assets can only depend'\n " on themselves if they are:\\n(a) time-partitioned and each partition depends on"\n " earlier partitions\\n(b) multipartitioned, with one time dimension that depends"\n " on earlier time partitions"\n )\n\n\ndef get_self_dep_time_window_partition_mapping(\n partition_mapping: Optional[PartitionMapping], partitions_def: Optional[PartitionsDefinition]\n) -> Optional[TimeWindowPartitionMapping]:\n """Returns a time window partition mapping dimension of the provided partition mapping,\n if exists.\n """\n if isinstance(partition_mapping, TimeWindowPartitionMapping):\n return partition_mapping\n elif isinstance(partition_mapping, MultiPartitionMapping):\n if not isinstance(partitions_def, MultiPartitionsDefinition):\n return None\n\n time_partition_mapping = partition_mapping.downstream_mappings_by_upstream_dimension.get(\n partitions_def.time_window_dimension.name\n )\n\n if time_partition_mapping is None or not isinstance(\n time_partition_mapping.partition_mapping, TimeWindowPartitionMapping\n ):\n return None\n\n return time_partition_mapping.partition_mapping\n return None\n
", "current_page_name": "_modules/dagster/_core/definitions/assets", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.assets"}, "auto_materialize_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.auto_materialize_policy

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, AbstractSet, Dict, FrozenSet, NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    UnpackContext,\n    UnpackedValue,\n    whitelist_for_serdes,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.auto_materialize_rule import (\n        AutoMaterializeRule,\n        AutoMaterializeRuleSnapshot,\n    )\n\n\nclass AutoMaterializePolicySerializer(NamedTupleSerializer):\n    def before_unpack(\n        self, context: UnpackContext, unpacked_dict: Dict[str, UnpackedValue]\n    ) -> Dict[str, UnpackedValue]:\n        from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n        backcompat_map = {\n            "on_missing": AutoMaterializeRule.materialize_on_missing(),\n            "on_new_parent_data": AutoMaterializeRule.materialize_on_parent_updated(),\n            "for_freshness": AutoMaterializeRule.materialize_on_required_for_freshness(),\n        }\n\n        # determine if this namedtuple was serialized with the old format (booleans for rules)\n        if any(backcompat_key in unpacked_dict for backcompat_key in backcompat_map):\n            # all old policies had these rules by default\n            rules = {\n                AutoMaterializeRule.skip_on_parent_outdated(),\n                AutoMaterializeRule.skip_on_parent_missing(),\n            }\n            for backcompat_key, rule in backcompat_map.items():\n                if unpacked_dict.get(backcompat_key):\n                    rules.add(rule)\n            unpacked_dict["rules"] = frozenset(rules)\n\n        return unpacked_dict\n\n\nclass AutoMaterializePolicyType(Enum):\n    EAGER = "EAGER"\n    LAZY = "LAZY"\n\n\n
[docs]@experimental\n@whitelist_for_serdes(\n old_fields={"time_window_partition_scope_minutes": 1e-6},\n serializer=AutoMaterializePolicySerializer,\n)\nclass AutoMaterializePolicy(\n NamedTuple(\n "_AutoMaterializePolicy",\n [\n ("rules", FrozenSet["AutoMaterializeRule"]),\n ("max_materializations_per_minute", Optional[int]),\n ],\n )\n):\n """An AutoMaterializePolicy specifies how Dagster should attempt to keep an asset up-to-date.\n\n Each policy consists of a set of AutoMaterializeRules, which are used to determine whether an\n asset or a partition of an asset should or should not be auto-materialized.\n\n The most common policy is `AutoMaterializePolicy.eager()`, which consists of the following rules:\n\n - `AutoMaterializeRule.materialize_on_missing()`\n Materialize an asset or a partition if it has never been materialized.\n - `AutoMaterializeRule.materialize_on_parent_updated()`\n Materialize an asset or a partition if one of its parents have been updated more recently\n than it has.\n - `AutoMaterializeRule.materialize_on_required_for_freshness()`\n Materialize an asset or a partition if it is required to satisfy a freshness policy.\n - `AutoMaterializeRule.skip_on_parent_outdated()`\n Skip materializing an asset or partition if any of its parents have ancestors that have\n been materialized more recently.\n - `AutoMaterializeRule.skip_on_parent_missing()`\n Skip materializing an asset or a partition if any parent has never been materialized or\n observed.\n\n Policies can be customized by adding or removing rules. For example, if you'd like to allow\n an asset to be materialized even if some of its parent partitions are missing:\n\n .. code-block:: python\n\n from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\n my_policy = AutoMaterializePolicy.eager().without_rules(\n AutoMaterializeRule.skip_on_parent_missing(),\n )\n\n If you'd like an asset to wait for all of its parents to be updated before materializing:\n\n .. 
code-block:: python\n\n from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\n my_policy = AutoMaterializePolicy.eager().with_rules(\n AutoMaterializeRule.skip_on_all_parents_not_updated(),\n )\n\n Lastly, the `max_materializations_per_minute` parameter, which is set to 1 by default,\n rate-limits the number of auto-materializations that can occur for a particular asset within\n a short time interval. This mainly matters for partitioned assets. Its purpose is to provide a\n safeguard against "surprise backfills", where user-error causes auto-materialize to be\n accidentally triggered for large numbers of partitions at once.\n\n **Warning:**\n\n Constructing an AutoMaterializePolicy directly is not recommended as the API is subject to change.\n AutoMaterializePolicy.eager() and AutoMaterializePolicy.lazy() are the recommended API.\n\n """\n\n def __new__(\n cls,\n rules: AbstractSet["AutoMaterializeRule"],\n max_materializations_per_minute: Optional[int] = 1,\n ):\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n check.invariant(\n max_materializations_per_minute is None or max_materializations_per_minute > 0,\n "max_materializations_per_minute must be positive. To disable rate-limiting, set it"\n " to None. 
To disable auto materializing, remove the policy.",\n )\n\n return super(AutoMaterializePolicy, cls).__new__(\n cls,\n rules=frozenset(check.set_param(rules, "rules", of_type=AutoMaterializeRule)),\n max_materializations_per_minute=max_materializations_per_minute,\n )\n\n @property\n def materialize_rules(self) -> AbstractSet["AutoMaterializeRule"]:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeDecisionType\n\n return {\n rule\n for rule in self.rules\n if rule.decision_type == AutoMaterializeDecisionType.MATERIALIZE\n }\n\n @property\n def skip_rules(self) -> AbstractSet["AutoMaterializeRule"]:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeDecisionType\n\n return {\n rule for rule in self.rules if rule.decision_type == AutoMaterializeDecisionType.SKIP\n }\n\n
[docs] @public\n @staticmethod\n def eager(max_materializations_per_minute: Optional[int] = 1) -> "AutoMaterializePolicy":\n """Constructs an eager AutoMaterializePolicy.\n\n Args:\n max_materializations_per_minute (Optional[int]): The maximum number of\n auto-materializations for this asset that may be initiated per minute. If this limit\n is exceeded, the partitions which would have been materialized will be discarded,\n and will require manual materialization in order to be updated. Defaults to 1.\n """\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n return AutoMaterializePolicy(\n rules={\n AutoMaterializeRule.materialize_on_missing(),\n AutoMaterializeRule.materialize_on_parent_updated(),\n AutoMaterializeRule.materialize_on_required_for_freshness(),\n AutoMaterializeRule.skip_on_parent_outdated(),\n AutoMaterializeRule.skip_on_parent_missing(),\n },\n max_materializations_per_minute=check.opt_int_param(\n max_materializations_per_minute, "max_materializations_per_minute"\n ),\n )
\n\n
[docs] @public\n @staticmethod\n def lazy(max_materializations_per_minute: Optional[int] = 1) -> "AutoMaterializePolicy":\n """Constructs a lazy AutoMaterializePolicy.\n\n Args:\n max_materializations_per_minute (Optional[int]): The maximum number of\n auto-materializations for this asset that may be initiated per minute. If this limit\n is exceeded, the partitions which would have been materialized will be discarded,\n and will require manual materialization in order to be updated. Defaults to 1.\n """\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n return AutoMaterializePolicy(\n rules={\n AutoMaterializeRule.materialize_on_required_for_freshness(),\n AutoMaterializeRule.skip_on_parent_outdated(),\n AutoMaterializeRule.skip_on_parent_missing(),\n },\n max_materializations_per_minute=check.opt_int_param(\n max_materializations_per_minute, "max_materializations_per_minute"\n ),\n )
\n\n
[docs] @public\n def without_rules(self, *rules_to_remove: "AutoMaterializeRule") -> "AutoMaterializePolicy":\n """Constructs a copy of this policy with the specified rules removed. Raises an error\n if any of the arguments are not rules in this policy.\n """\n non_matching_rules = set(rules_to_remove).difference(self.rules)\n check.param_invariant(\n not non_matching_rules,\n "rules_to_remove",\n f"Rules {[rule for rule in rules_to_remove if rule in non_matching_rules]} do not"\n " exist in this policy.",\n )\n return self._replace(\n rules=self.rules.difference(set(rules_to_remove)),\n )
\n\n
[docs] @public\n def with_rules(self, *rules_to_add: "AutoMaterializeRule") -> "AutoMaterializePolicy":\n """Constructs a copy of this policy with the specified rules added."""\n return self._replace(rules=self.rules.union(set(rules_to_add)))
\n\n @property\n def policy_type(self) -> AutoMaterializePolicyType:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n if AutoMaterializeRule.materialize_on_parent_updated() in self.rules:\n return AutoMaterializePolicyType.EAGER\n return AutoMaterializePolicyType.LAZY\n\n @property\n def rule_snapshots(self) -> Sequence["AutoMaterializeRuleSnapshot"]:\n return [rule.to_snapshot() for rule in self.rules]
\n
", "current_page_name": "_modules/dagster/_core/definitions/auto_materialize_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.auto_materialize_policy"}, "auto_materialize_rule": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.auto_materialize_rule

\nimport datetime\nfrom abc import ABC, abstractmethod, abstractproperty\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Dict,\n    FrozenSet,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.data_time import CachingDataTimeResolver\nfrom dagster._core.definitions.events import AssetKey, AssetKeyPartitionKey\nfrom dagster._core.definitions.freshness_based_auto_materialize import (\n    freshness_evaluation_results_for_asset_key,\n)\nfrom dagster._core.definitions.partition_mapping import IdentityPartitionMapping\nfrom dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    UnpackContext,\n    UnpackedValue,\n    WhitelistMap,\n    whitelist_for_serdes,\n)\nfrom dagster._utils.caching_instance_queryer import CachingInstanceQueryer\n\nfrom .asset_graph import AssetGraph, sort_key_for_asset_partition\nfrom .partition import SerializedPartitionsSubset\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_daemon_context import AssetDaemonContext\n    from dagster._core.definitions.asset_daemon_cursor import AssetDaemonCursor\n    from dagster._core.instance import DynamicPartitionsStore\n\n\n@whitelist_for_serdes\nclass AutoMaterializeDecisionType(Enum):\n    """Represents the set of results of the auto-materialize logic.\n\n    MATERIALIZE: The asset should be materialized by a run kicked off on this tick\n    SKIP: The asset should not be materialized by a run kicked off on this tick, because future\n        ticks are expected to materialize it.\n    DISCARD: The asset should not be materialized by a run kicked off on this tick, but future\n        ticks are not expected to materialize it.\n    """\n\n    MATERIALIZE = 
"MATERIALIZE"\n    SKIP = "SKIP"\n    DISCARD = "DISCARD"\n\n\nclass AutoMaterializeRuleEvaluationData(ABC):\n    pass\n\n\n@whitelist_for_serdes\nclass TextRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple("_TextRuleEvaluationData", [("text", str)]),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass ParentUpdatedRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple(\n        "_ParentUpdatedRuleEvaluationData",\n        [\n            ("updated_asset_keys", FrozenSet[AssetKey]),\n            ("will_update_asset_keys", FrozenSet[AssetKey]),\n        ],\n    ),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass WaitingOnAssetsRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple(\n        "_WaitingOnParentRuleEvaluationData",\n        [("waiting_on_asset_keys", FrozenSet[AssetKey])],\n    ),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass AutoMaterializeRuleSnapshot(NamedTuple):\n    """A serializable snapshot of an AutoMaterializeRule for historical evaluations."""\n\n    class_name: str\n    description: str\n    decision_type: AutoMaterializeDecisionType\n\n    @staticmethod\n    def from_rule(rule: "AutoMaterializeRule") -> "AutoMaterializeRuleSnapshot":\n        return AutoMaterializeRuleSnapshot(\n            class_name=rule.__class__.__name__,\n            description=rule.description,\n            decision_type=rule.decision_type,\n        )\n\n\n@whitelist_for_serdes\nclass AutoMaterializeRuleEvaluation(NamedTuple):\n    rule_snapshot: AutoMaterializeRuleSnapshot\n    evaluation_data: Optional[AutoMaterializeRuleEvaluationData]\n\n\nclass RuleEvaluationContext(NamedTuple):\n    asset_key: AssetKey\n    cursor: "AssetDaemonCursor"\n    instance_queryer: CachingInstanceQueryer\n    data_time_resolver: CachingDataTimeResolver\n    will_materialize_mapping: Mapping[AssetKey, AbstractSet[AssetKeyPartitionKey]]\n    expected_data_time_mapping: Mapping[AssetKey, Optional[datetime.datetime]]\n    
candidates: AbstractSet[AssetKeyPartitionKey]\n    daemon_context: "AssetDaemonContext"\n\n    @property\n    def asset_graph(self) -> AssetGraph:\n        return self.instance_queryer.asset_graph\n\n    def materializable_in_same_run(self, child_key: AssetKey, parent_key: AssetKey) -> bool:\n        """Returns whether a child asset can be materialized in the same run as a parent asset."""\n        from dagster._core.definitions.external_asset_graph import ExternalAssetGraph\n\n        return (\n            # both assets must be materializable\n            child_key in self.asset_graph.materializable_asset_keys\n            and parent_key in self.asset_graph.materializable_asset_keys\n            # the parent must have the same partitioning\n            and self.asset_graph.have_same_partitioning(child_key, parent_key)\n            # the parent must have a simple partition mapping to the child\n            and (\n                not self.asset_graph.is_partitioned(parent_key)\n                or isinstance(\n                    self.asset_graph.get_partition_mapping(child_key, parent_key),\n                    (TimeWindowPartitionMapping, IdentityPartitionMapping),\n                )\n            )\n            # the parent must be in the same repository to be materialized alongside the candidate\n            and (\n                not isinstance(self.asset_graph, ExternalAssetGraph)\n                or self.asset_graph.get_repository_handle(child_key)\n                == self.asset_graph.get_repository_handle(parent_key)\n            )\n        )\n\n    def get_parents_that_will_not_be_materialized_on_current_tick(\n        self, *, asset_partition: AssetKeyPartitionKey\n    ) -> AbstractSet[AssetKeyPartitionKey]:\n        """Returns the set of parent asset partitions that will not be updated in the same run of\n        this asset partition if we launch a run of this asset partition on this tick.\n        """\n        return {\n            parent\n            for 
parent in self.asset_graph.get_parents_partitions(\n                dynamic_partitions_store=self.instance_queryer,\n                current_time=self.instance_queryer.evaluation_time,\n                asset_key=asset_partition.asset_key,\n                partition_key=asset_partition.partition_key,\n            ).parent_partitions\n            if parent not in self.will_materialize_mapping.get(parent.asset_key, set())\n            or not self.materializable_in_same_run(asset_partition.asset_key, parent.asset_key)\n        }\n\n    def get_asset_partitions_by_asset_key(\n        self,\n        asset_partitions: AbstractSet[AssetKeyPartitionKey],\n    ) -> Mapping[AssetKey, Set[AssetKeyPartitionKey]]:\n        asset_partitions_by_asset_key: Dict[AssetKey, Set[AssetKeyPartitionKey]] = defaultdict(set)\n        for parent in asset_partitions:\n            asset_partitions_by_asset_key[parent.asset_key].add(parent)\n\n        return asset_partitions_by_asset_key\n\n\nRuleEvaluationResults = Sequence[Tuple[Optional[AutoMaterializeRuleEvaluationData], AbstractSet]]\n\n\n
[docs]class AutoMaterializeRule(ABC):\n """An AutoMaterializeRule defines a bit of logic which helps determine if a materialization\n should be kicked off for a given asset partition.\n\n Each rule can have one of two decision types, `MATERIALIZE` (indicating that an asset partition\n should be materialized) or `SKIP` (indicating that the asset partition should not be\n materialized).\n\n Materialize rules are evaluated first, and skip rules operate over the set of candidates that\n are produced by the materialize rules. Other than that, there is no ordering between rules.\n """\n\n @abstractproperty\n def decision_type(self) -> AutoMaterializeDecisionType:\n """The decision type of the rule (either `MATERIALIZE` or `SKIP`)."""\n ...\n\n @abstractproperty\n def description(self) -> str:\n """A human-readable description of this rule. As a basic guideline, this string should\n complete the sentence: 'Indicates an asset should be (materialize/skipped) when ____'.\n """\n ...\n\n @abstractmethod\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """The core evaluation function for the rule. This function takes in a context object and\n returns a mapping from evaluated rules to the set of asset partitions that the rule applies\n to.\n """\n ...\n\n
[docs] @public\n @staticmethod\n def materialize_on_required_for_freshness() -> "MaterializeOnRequiredForFreshnessRule":\n """Materialize an asset partition if it is required to satisfy a freshness policy of this\n asset or one of its downstream assets.\n\n Note: This rule has no effect on partitioned assets.\n """\n return MaterializeOnRequiredForFreshnessRule()
\n\n
[docs] @public\n @staticmethod\n def materialize_on_parent_updated() -> "MaterializeOnParentUpdatedRule":\n """Materialize an asset partition if one of its parents has been updated more recently\n than it has.\n\n Note: For time-partitioned or dynamic-partitioned assets downstream of an unpartitioned\n asset, this rule will only fire for the most recent partition of the downstream.\n """\n return MaterializeOnParentUpdatedRule()
\n\n
[docs] @public\n @staticmethod\n def materialize_on_missing() -> "MaterializeOnMissingRule":\n """Materialize an asset partition if it has never been materialized before. This rule will\n not fire for non-root assets unless that asset's parents have been updated.\n """\n return MaterializeOnMissingRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_parent_missing() -> "SkipOnParentMissingRule":\n """Skip materializing an asset partition if one of its parent asset partitions has never\n been materialized (for regular assets) or observed (for observable source assets).\n """\n return SkipOnParentMissingRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_parent_outdated() -> "SkipOnParentOutdatedRule":\n """Skip materializing an asset partition if any of its parents has not incorporated the\n latest data from its ancestors.\n """\n return SkipOnParentOutdatedRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_not_all_parents_updated(\n require_update_for_all_parent_partitions: bool = False,\n ) -> "SkipOnNotAllParentsUpdatedRule":\n """Skip materializing an asset partition if any of its parents have not been updated since\n the asset's last materialization.\n\n Attributes:\n require_update_for_all_parent_partitions (Optional[bool]): Applies only to an unpartitioned\n asset or an asset partition that depends on more than one partition in any upstream asset.\n If true, requires all upstream partitions in each upstream asset to be materialized since\n the downstream asset's last materialization in order to update it. If false, requires at\n least one upstream partition in each upstream asset to be materialized since the downstream\n asset's last materialization in order to update it. Defaults to false.\n """\n return SkipOnNotAllParentsUpdatedRule(require_update_for_all_parent_partitions)
\n\n def to_snapshot(self) -> AutoMaterializeRuleSnapshot:\n """Returns a serializable snapshot of this rule for historical evaluations."""\n return AutoMaterializeRuleSnapshot.from_rule(self)\n\n def __eq__(self, other) -> bool:\n # override the default NamedTuple __eq__ method to factor in types\n return type(self) == type(other) and super().__eq__(other)\n\n def __hash__(self) -> int:\n # override the default NamedTuple __hash__ method to factor in types\n return hash(hash(type(self)) + super().__hash__())
\n\n\n@whitelist_for_serdes\nclass MaterializeOnRequiredForFreshnessRule(\n AutoMaterializeRule, NamedTuple("_MaterializeOnRequiredForFreshnessRule", [])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "required to meet this or downstream asset's freshness policy"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n freshness_conditions = freshness_evaluation_results_for_asset_key(\n asset_key=context.asset_key,\n data_time_resolver=context.data_time_resolver,\n asset_graph=context.asset_graph,\n current_time=context.instance_queryer.evaluation_time,\n will_materialize_mapping=context.will_materialize_mapping,\n expected_data_time_mapping=context.expected_data_time_mapping,\n )\n return freshness_conditions\n\n\n@whitelist_for_serdes\nclass MaterializeOnParentUpdatedRule(\n AutoMaterializeRule, NamedTuple("_MaterializeOnParentUpdatedRule", [])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "upstream data has changed since latest materialization"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """Evaluates the set of asset partitions of this asset whose parents have been updated,\n or will update on this tick.\n """\n conditions = defaultdict(set)\n has_parents_that_will_update = set()\n\n # first, get the set of parents that will be materialized this tick, and see if we\n # can materialize this asset with those parents\n will_update_parents_by_asset_partition = defaultdict(set)\n for parent_key in context.asset_graph.get_parents(context.asset_key):\n if not context.materializable_in_same_run(context.asset_key, parent_key):\n continue\n for parent_partition in context.will_materialize_mapping.get(parent_key, set()):\n 
asset_partition = AssetKeyPartitionKey(\n context.asset_key, parent_partition.partition_key\n )\n will_update_parents_by_asset_partition[asset_partition].add(parent_key)\n has_parents_that_will_update.add(asset_partition)\n\n # next, for each asset partition of this asset which has newly-updated parents, or\n # has a parent that will update, create a ParentUpdatedRuleEvaluationData\n has_or_will_update = (\n context.daemon_context.get_asset_partitions_with_newly_updated_parents_for_key(\n context.asset_key\n )\n | has_parents_that_will_update\n )\n for asset_partition in has_or_will_update:\n parent_asset_partitions = context.asset_graph.get_parents_partitions(\n dynamic_partitions_store=context.instance_queryer,\n current_time=context.instance_queryer.evaluation_time,\n asset_key=asset_partition.asset_key,\n partition_key=asset_partition.partition_key,\n ).parent_partitions\n\n updated_parent_asset_partitions = context.instance_queryer.get_parent_asset_partitions_updated_after_child(\n asset_partition,\n parent_asset_partitions,\n # do a precise check for updated parents, factoring in data versions, as long as\n # we're within reasonable limits on the number of partitions to check\n respect_materialization_data_versions=context.daemon_context.respect_materialization_data_versions\n and len(parent_asset_partitions | has_or_will_update) < 100,\n # ignore self-dependencies when checking for updated parents, to avoid historical\n # rematerializations from causing a chain of materializations to be kicked off\n ignored_parent_keys={context.asset_key},\n )\n updated_parents = {parent.asset_key for parent in updated_parent_asset_partitions}\n will_update_parents = will_update_parents_by_asset_partition[asset_partition]\n\n if updated_parents or will_update_parents:\n conditions[\n ParentUpdatedRuleEvaluationData(\n updated_asset_keys=frozenset(updated_parents),\n will_update_asset_keys=frozenset(will_update_parents),\n )\n ].add(asset_partition)\n if conditions:\n return 
[(k, v) for k, v in conditions.items()]\n return []\n\n\n@whitelist_for_serdes\nclass MaterializeOnMissingRule(AutoMaterializeRule, NamedTuple("_MaterializeOnMissingRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "materialization is missing"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """Evaluates the set of asset partitions for this asset which are missing and were not\n previously discarded. Currently only applies to root asset partitions and asset partitions\n with updated parents.\n """\n missing_asset_partitions = (\n context.daemon_context.get_never_handled_root_asset_partitions_for_key(\n context.asset_key\n )\n )\n # in addition to missing root asset partitions, check any asset partitions with updated\n # parents to see if they're missing\n for (\n candidate\n ) in context.daemon_context.get_asset_partitions_with_newly_updated_parents_for_key(\n context.asset_key\n ):\n if not context.instance_queryer.asset_partition_has_materialization_or_observation(\n candidate\n ):\n missing_asset_partitions |= {candidate}\n if missing_asset_partitions:\n return [(None, missing_asset_partitions)]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnParentOutdatedRule(AutoMaterializeRule, NamedTuple("_SkipOnParentOutdatedRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n return "waiting on upstream data to be up to date"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n outdated_ancestors = set()\n # find the root cause of why this asset partition's parents are outdated (if any)\n for parent in 
context.get_parents_that_will_not_be_materialized_on_current_tick(\n asset_partition=candidate\n ):\n outdated_ancestors.update(\n context.instance_queryer.get_outdated_ancestors(asset_partition=parent)\n )\n if outdated_ancestors:\n asset_partitions_by_waiting_on_asset_keys[frozenset(outdated_ancestors)].add(\n candidate\n )\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnParentMissingRule(AutoMaterializeRule, NamedTuple("_SkipOnParentMissingRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n return "waiting on upstream data to be present"\n\n def evaluate_for_asset(\n self,\n context: RuleEvaluationContext,\n ) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n missing_parent_asset_keys = set()\n for parent in context.get_parents_that_will_not_be_materialized_on_current_tick(\n asset_partition=candidate\n ):\n # ignore non-observable sources, which will never have a materialization or observation\n if context.asset_graph.is_source(\n parent.asset_key\n ) and not context.asset_graph.is_observable(parent.asset_key):\n continue\n if not context.instance_queryer.asset_partition_has_materialization_or_observation(\n parent\n ):\n missing_parent_asset_keys.add(parent.asset_key)\n if missing_parent_asset_keys:\n asset_partitions_by_waiting_on_asset_keys[frozenset(missing_parent_asset_keys)].add(\n candidate\n )\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnNotAllParentsUpdatedRule(\n 
AutoMaterializeRule,\n NamedTuple(\n "_SkipOnNotAllParentsUpdatedRule", [("require_update_for_all_parent_partitions", bool)]\n ),\n):\n """An auto-materialize rule that enforces that an asset can only be materialized if all parents\n have been materialized since the asset's last materialization.\n\n Attributes:\n require_update_for_all_parent_partitions (Optional[bool]): Applies only to an unpartitioned\n asset or an asset partition that depends on more than one partition in any upstream asset.\n If true, requires all upstream partitions in each upstream asset to be materialized since\n the downstream asset's last materialization in order to update it. If false, requires at\n least one upstream partition in each upstream asset to be materialized since the downstream\n asset's last materialization in order to update it. Defaults to false.\n """\n\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n if self.require_update_for_all_parent_partitions is False:\n return "waiting on upstream data to be updated"\n else:\n return "waiting until all upstream partitions are updated"\n\n def evaluate_for_asset(\n self,\n context: RuleEvaluationContext,\n ) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n parent_partitions = context.asset_graph.get_parents_partitions(\n context.instance_queryer,\n context.instance_queryer.evaluation_time,\n context.asset_key,\n candidate.partition_key,\n ).parent_partitions\n\n updated_parent_partitions = (\n context.instance_queryer.get_parent_asset_partitions_updated_after_child(\n candidate,\n parent_partitions,\n context.daemon_context.respect_materialization_data_versions,\n ignored_parent_keys=set(),\n )\n | set().union(\n *[\n context.will_materialize_mapping.get(parent, set())\n for parent in context.asset_graph.get_parents(context.asset_key)\n ]\n )\n 
)\n\n if self.require_update_for_all_parent_partitions:\n # All upstream partitions must be updated in order for the candidate to be updated\n non_updated_parent_keys = {\n parent.asset_key for parent in parent_partitions - updated_parent_partitions\n }\n else:\n # At least one upstream partition in each upstream asset must be updated in order\n # for the candidate to be updated\n parent_asset_keys = context.asset_graph.get_parents(context.asset_key)\n updated_parent_partitions_by_asset_key = context.get_asset_partitions_by_asset_key(\n updated_parent_partitions\n )\n non_updated_parent_keys = {\n parent\n for parent in parent_asset_keys\n if not updated_parent_partitions_by_asset_key.get(parent)\n }\n\n # do not require past partitions of this asset to be updated\n non_updated_parent_keys -= {context.asset_key}\n\n if non_updated_parent_keys:\n asset_partitions_by_waiting_on_asset_keys[frozenset(non_updated_parent_keys)].add(\n candidate\n )\n\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass DiscardOnMaxMaterializationsExceededRule(\n AutoMaterializeRule, NamedTuple("_DiscardOnMaxMaterializationsExceededRule", [("limit", int)])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.DISCARD\n\n @property\n def description(self) -> str:\n return f"exceeds {self.limit} materialization(s) per minute"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n # the set of asset partitions which exceed the limit\n rate_limited_asset_partitions = set(\n sorted(\n context.candidates,\n key=lambda x: sort_key_for_asset_partition(context.asset_graph, x),\n )[self.limit :]\n )\n if rate_limited_asset_partitions:\n return [(None, rate_limited_asset_partitions)]\n return 
[]\n\n\n@whitelist_for_serdes\nclass AutoMaterializeAssetEvaluation(NamedTuple):\n """Represents the results of the auto-materialize logic for a single asset.\n\n Properties:\n asset_key (AssetKey): The asset key that was evaluated.\n partition_subsets_by_condition: The rule evaluations that impact if the asset should be\n materialized, skipped, or discarded. If the asset is partitioned, this will be a list of\n tuples, where the first element is the condition and the second element is the\n serialized subset of partitions that the condition applies to. If it's not partitioned,\n the second element will be None.\n """\n\n asset_key: AssetKey\n partition_subsets_by_condition: Sequence[\n Tuple["AutoMaterializeRuleEvaluation", Optional[SerializedPartitionsSubset]]\n ]\n num_requested: int\n num_skipped: int\n num_discarded: int\n run_ids: Set[str] = set()\n rule_snapshots: Optional[Sequence[AutoMaterializeRuleSnapshot]] = None\n\n @staticmethod\n def from_rule_evaluation_results(\n asset_graph: AssetGraph,\n asset_key: AssetKey,\n asset_partitions_by_rule_evaluation: Sequence[\n Tuple[AutoMaterializeRuleEvaluation, AbstractSet[AssetKeyPartitionKey]]\n ],\n num_requested: int,\n num_skipped: int,\n num_discarded: int,\n dynamic_partitions_store: "DynamicPartitionsStore",\n ) -> "AutoMaterializeAssetEvaluation":\n auto_materialize_policy = asset_graph.auto_materialize_policies_by_key.get(asset_key)\n\n if not auto_materialize_policy:\n check.failed(f"Expected auto materialize policy on asset {asset_key}")\n\n partitions_def = asset_graph.get_partitions_def(asset_key)\n if partitions_def is None:\n return AutoMaterializeAssetEvaluation(\n asset_key=asset_key,\n partition_subsets_by_condition=[\n (rule_evaluation, None)\n for rule_evaluation, _ in asset_partitions_by_rule_evaluation\n ],\n num_requested=num_requested,\n num_skipped=num_skipped,\n num_discarded=num_discarded,\n rule_snapshots=auto_materialize_policy.rule_snapshots,\n )\n else:\n return 
AutoMaterializeAssetEvaluation(\n asset_key=asset_key,\n partition_subsets_by_condition=[\n (\n rule_evaluation,\n SerializedPartitionsSubset.from_subset(\n subset=partitions_def.empty_subset().with_partition_keys(\n check.not_none(ap.partition_key) for ap in asset_partitions\n ),\n partitions_def=partitions_def,\n dynamic_partitions_store=dynamic_partitions_store,\n ),\n )\n for rule_evaluation, asset_partitions in asset_partitions_by_rule_evaluation\n ],\n num_requested=num_requested,\n num_skipped=num_skipped,\n num_discarded=num_discarded,\n rule_snapshots=auto_materialize_policy.rule_snapshots,\n )\n\n\n# BACKCOMPAT GRAVEYARD\n\n\nclass BackcompatAutoMaterializeConditionSerializer(NamedTupleSerializer):\n """This handles backcompat for the old AutoMaterializeCondition objects, turning them into the\n proper AutoMaterializeRuleEvaluation objects. This is necessary because old\n AutoMaterializeAssetEvaluation objects will have serialized AutoMaterializeCondition objects,\n and we need to be able to deserialize them.\n\n In theory, as these serialized objects happen to be purged periodically, we can remove this\n backcompat logic at some point in the future.\n """\n\n def unpack(\n self,\n unpacked_dict: Dict[str, UnpackedValue],\n whitelist_map: WhitelistMap,\n context: UnpackContext,\n ) -> AutoMaterializeRuleEvaluation:\n if self.klass in (\n FreshnessAutoMaterializeCondition,\n DownstreamFreshnessAutoMaterializeCondition,\n ):\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_required_for_freshness().to_snapshot(),\n evaluation_data=None,\n )\n elif self.klass == MissingAutoMaterializeCondition:\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_missing().to_snapshot(),\n evaluation_data=None,\n )\n elif self.klass == ParentMaterializedAutoMaterializeCondition:\n updated_asset_keys = unpacked_dict.get("updated_asset_keys")\n if isinstance(updated_asset_keys, set):\n 
updated_asset_keys = cast(FrozenSet[AssetKey], frozenset(updated_asset_keys))\n else:\n updated_asset_keys = frozenset()\n will_update_asset_keys = unpacked_dict.get("will_update_asset_keys")\n if isinstance(will_update_asset_keys, set):\n will_update_asset_keys = cast(\n FrozenSet[AssetKey], frozenset(will_update_asset_keys)\n )\n else:\n will_update_asset_keys = frozenset()\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_parent_updated().to_snapshot(),\n evaluation_data=ParentUpdatedRuleEvaluationData(\n updated_asset_keys=updated_asset_keys,\n will_update_asset_keys=will_update_asset_keys,\n ),\n )\n elif self.klass == ParentOutdatedAutoMaterializeCondition:\n waiting_on_asset_keys = unpacked_dict.get("waiting_on_asset_keys")\n if isinstance(waiting_on_asset_keys, set):\n waiting_on_asset_keys = cast(FrozenSet[AssetKey], frozenset(waiting_on_asset_keys))\n else:\n waiting_on_asset_keys = frozenset()\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.skip_on_parent_outdated().to_snapshot(),\n evaluation_data=WaitingOnAssetsRuleEvaluationData(\n waiting_on_asset_keys=waiting_on_asset_keys\n ),\n )\n elif self.klass == MaxMaterializationsExceededAutoMaterializeCondition:\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=DiscardOnMaxMaterializationsExceededRule(limit=1).to_snapshot(),\n evaluation_data=None,\n )\n check.failed(f"Unexpected class {self.klass}")\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass FreshnessAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass DownstreamFreshnessAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass ParentMaterializedAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass 
MissingAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass ParentOutdatedAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass MaxMaterializationsExceededAutoMaterializeCondition(NamedTuple): ...\n
", "current_page_name": "_modules/dagster/_core/definitions/auto_materialize_rule", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.auto_materialize_rule"}, "backfill_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.backfill_policy

\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._serdes import whitelist_for_serdes\n\n\nclass BackfillPolicyType(Enum):\n    SINGLE_RUN = "SINGLE_RUN"\n    MULTI_RUN = "MULTI_RUN"\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass BackfillPolicy(\n NamedTuple(\n "_BackfillPolicy",\n [\n ("max_partitions_per_run", Optional[int]),\n ],\n )\n):\n """A BackfillPolicy specifies how Dagster should attempt to backfill a partitioned asset.\n\n There are two main kinds of backfill policies: single-run and multi-run.\n\n An asset with a single-run backfill policy will take a single run to backfill all of its\n partitions at once.\n\n An asset with a multi-run backfill policy will take multiple runs to backfill all of its\n partitions. Each run will backfill a subset of the partitions. The number of partitions to\n backfill in each run is controlled by the `max_partitions_per_run` parameter.\n\n For example:\n\n - If an asset has 100 partitions, and the `max_partitions_per_run` is set to 10, then it will\n be backfilled in 10 runs; each run will backfill 10 partitions.\n\n - If an asset has 100 partitions, and the `max_partitions_per_run` is set to 11, then it will\n be backfilled in 10 runs; the first 9 runs will backfill 11 partitions, and the last one run\n will backfill the remaining 9 partitions.\n\n **Warning:**\n\n Constructing an BackfillPolicy directly is not recommended as the API is subject to change.\n BackfillPolicy.single_run() and BackfillPolicy.multi_run(max_partitions_per_run=x) are the\n recommended APIs.\n """\n\n def __new__(cls, max_partitions_per_run: Optional[int] = 1):\n return super(BackfillPolicy, cls).__new__(\n cls,\n max_partitions_per_run=max_partitions_per_run,\n )\n\n
[docs] @public\n @staticmethod\n def single_run() -> "BackfillPolicy":\n """Creates a BackfillPolicy that executes the entire backfill in a single run."""\n return BackfillPolicy(max_partitions_per_run=None)
\n\n
[docs] @public\n @staticmethod\n def multi_run(max_partitions_per_run: int = 1) -> "BackfillPolicy":\n """Creates a BackfillPolicy that executes the entire backfill in multiple runs.\n Each run will backfill [max_partitions_per_run] number of partitions.\n\n Args:\n max_partitions_per_run (Optional[int]): The maximum number of partitions in each run of\n the multiple runs. Defaults to 1.\n """\n return BackfillPolicy(\n max_partitions_per_run=check.int_param(max_partitions_per_run, "max_partitions_per_run")\n )
\n\n @property\n def policy_type(self) -> BackfillPolicyType:\n if self.max_partitions_per_run:\n return BackfillPolicyType.MULTI_RUN\n else:\n return BackfillPolicyType.SINGLE_RUN
\n
", "current_page_name": "_modules/dagster/_core/definitions/backfill_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.backfill_policy"}, "config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.config

\nfrom typing import Any, Callable, Mapping, NamedTuple, Optional, Union, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import (\n    ConfigType,\n    is_supported_config_python_builtin,\n    process_config,\n    resolve_defaults,\n    validate_config,\n)\nfrom dagster._core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster._core.errors import DagsterInvalidConfigError\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\nConfigMappingFn: TypeAlias = Callable[[Any], Any]\n\n\ndef is_callable_valid_config_arg(config: Union[Callable[..., Any], Mapping[str, object]]) -> bool:\n    return BuiltinEnum.contains(config) or is_supported_config_python_builtin(config)\n\n\n
[docs]class ConfigMapping(\n NamedTuple(\n "_ConfigMapping",\n [\n ("config_fn", Callable[[Any], Any]),\n ("config_schema", IDefinitionConfigSchema),\n ("receive_processed_config_values", Optional[bool]),\n ],\n )\n):\n """Defines a config mapping for a graph (or job).\n\n By specifying a config mapping function, you can override the configuration for the child\n ops and graphs contained within a graph.\n\n Config mappings require the configuration schema to be specified as ``config_schema``, which will\n be exposed as the configuration schema for the graph, as well as a configuration mapping\n function, ``config_fn``, which maps the config provided to the graph to the config\n that will be provided to the child nodes.\n\n Args:\n config_fn (Callable[[dict], dict]): The function that will be called\n to map the graph config to a config appropriate for the child nodes.\n config_schema (ConfigSchema): The schema of the graph config.\n receive_processed_config_values (Optional[bool]): If true, config values provided to the config_fn\n will be converted to their dagster types before being passed in. 
For example, if this\n value is true, enum config passed to config_fn will be actual enums, while if false,\n then enum config passed to config_fn will be strings.\n """\n\n def __new__(\n cls,\n config_fn: ConfigMappingFn,\n config_schema: Optional[Any] = None,\n receive_processed_config_values: Optional[bool] = None,\n ):\n return super(ConfigMapping, cls).__new__(\n cls,\n config_fn=check.callable_param(config_fn, "config_fn"),\n config_schema=convert_user_facing_definition_config_schema(config_schema),\n receive_processed_config_values=check.opt_bool_param(\n receive_processed_config_values, "receive_processed_config_values"\n ),\n )\n\n def resolve_from_unvalidated_config(self, config: Any) -> Any:\n """Validates config against outer config schema, and calls mapping against validated config."""\n receive_processed_config_values = check.opt_bool_param(\n self.receive_processed_config_values, "receive_processed_config_values", default=True\n )\n if receive_processed_config_values:\n outer_evr = process_config(\n self.config_schema.config_type,\n config,\n )\n else:\n outer_evr = validate_config(\n self.config_schema.config_type,\n config,\n )\n if not outer_evr.success:\n raise DagsterInvalidConfigError(\n "Error in config mapping ",\n outer_evr.errors,\n config,\n )\n\n outer_config = outer_evr.value\n if not receive_processed_config_values:\n outer_config = resolve_defaults(\n cast(ConfigType, self.config_schema.config_type),\n outer_config,\n ).value\n\n return self.config_fn(outer_config)\n\n def resolve_from_validated_config(self, config: Any) -> Any:\n if self.receive_processed_config_values is not None:\n check.failed(\n "`receive_processed_config_values` parameter has been set, but only applies to "\n "unvalidated config."\n )\n\n return self.config_fn(config)
\n
", "current_page_name": "_modules/dagster/_core/definitions/config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.config"}, "configurable": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.configurable

\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Callable, NamedTuple, Optional, Type, TypeVar, Union, cast\n\nfrom typing_extensions import Self\n\nfrom dagster import (\n    Field,\n    _check as check,\n)\nfrom dagster._config import EvaluateValueResult\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.decorator_utils import get_function_params\n\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    ConfiguredDefinitionConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\n\nclass ConfigurableDefinition(ABC):\n    @property\n    @abstractmethod\n    def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n        raise NotImplementedError()\n\n    @property\n    def has_config_field(self) -> bool:\n        return self.config_schema is not None and bool(self.config_schema.as_field())\n\n    @property\n    def config_field(self) -> Optional[Field]:\n        return None if not self.config_schema else self.config_schema.as_field()\n\n    # getter for typed access\n    def get_config_field(self) -> Field:\n        field = self.config_field\n        if field is None:\n            check.failed("Must check has_config_Field before calling get_config_field")\n        return field\n\n    def apply_config_mapping(self, config: Any) -> EvaluateValueResult:\n        """Applies user-provided config mapping functions to the given configuration and validates the\n        results against the respective config schema.\n\n        Expects incoming config to be validated and have fully-resolved values (StringSource values\n        resolved, Enum types hydrated, etc.) 
via process_config() during ResolvedRunConfig\n        construction and Graph config mapping.\n\n        Args:\n            config (Any): A validated and resolved configuration dictionary matching this object's\n            config_schema\n\n        Returns (EvaluateValueResult):\n            If successful, the value is a validated and resolved configuration dictionary for the\n            innermost wrapped object after applying the config mapping transformation function.\n        """\n        # If schema is on a mapped schema this is the innermost resource (base case),\n        # so we aren't responsible for validating against anything farther down.\n        # Returns an EVR for type consistency with config_mapping_fn.\n        return (\n            self.config_schema.resolve_config(config)\n            if isinstance(self.config_schema, ConfiguredDefinitionConfigSchema)\n            else EvaluateValueResult.for_value(config)\n        )\n\n\nclass AnonymousConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method not accept a name argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        config_schema: CoercableToConfigSchema = None,\n        description: Optional[str] = None,\n    ) -> Self:\n        """Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Using ``configured`` may result in config values being displayed in\n        the Dagster UI, so it is not recommended to use this API with sensitive values,\n        such as secrets.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this object's\n                config schema.  
In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(description, new_config_schema)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n    ) -> Self:\n        raise NotImplementedError()\n\n\nclass NamedConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method require a positional `name` argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        name: str,\n        config_schema: Optional[UserConfigSchema] = None,\n        description: Optional[str] = None,\n    ) -> Self:\n        """Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Using ``configured`` may result in config values being displayed in\n        the Dagster UI, so it is not recommended to use this API with sensitive values,\n        such as secrets.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this 
object's\n                config schema.  In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            name (str): Name of the new definition. This is a required argument, as this definition\n                type has a name uniqueness constraint.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n        name = check.str_param(name, "name")\n\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(name, description, new_config_schema)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        name: str,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n    ) -> Self: ...\n\n\ndef _check_configurable_param(configurable: ConfigurableDefinition) -> None:\n    from dagster._core.definitions.composition import PendingNodeInvocation\n\n    check.param_invariant(\n        not isinstance(configurable, PendingNodeInvocation),\n        "configurable",\n        "You have invoked `configured` on a PendingNodeInvocation (an intermediate type), which"\n        " is produced by aliasing or tagging a node definition. To configure a node, you must"\n        " call `configured` on either an OpDefinition and GraphDefinition. To fix"\n        " this error, make sure to call `configured` on the definition object *before* using"\n        " the `tag` or `alias` methods. 
For usage examples, see"\n        " https://docs.dagster.io/concepts/configuration/configured",\n    )\n    check.inst_param(\n        configurable,\n        "configurable",\n        ConfigurableDefinition,\n        "Only the following types can be used with the `configured` method: ResourceDefinition,"\n        " ExecutorDefinition, GraphDefinition, NodeDefinition, and LoggerDefinition."\n        " For usage examples of `configured`, see"\n        " https://docs.dagster.io/concepts/configuration/configured",\n    )\n\n\nT_Configurable = TypeVar(\n    "T_Configurable", bound=Union["AnonymousConfigurableDefinition", "NamedConfigurableDefinition"]\n)\n\n\nclass FunctionAndConfigSchema(NamedTuple):\n    function: Callable[[Any], Any]\n    config_schema: Optional[UserConfigSchema]\n\n\ndef _wrap_user_fn_if_pythonic_config(\n    user_fn: Any, config_schema: Optional[UserConfigSchema]\n) -> FunctionAndConfigSchema:\n    """Helper function which allows users to provide a Pythonic config object to a @configurable\n    function. Detects if the function has a single parameter annotated with a Config class.\n    If so, wraps the function to convert the config dictionary into the appropriate Config object.\n    """\n    from dagster._config.pythonic_config import (\n        Config,\n        infer_schema_from_config_annotation,\n        safe_is_subclass,\n    )\n\n    if not isinstance(user_fn, Callable):\n        return FunctionAndConfigSchema(function=user_fn, config_schema=config_schema)\n\n    config_fn_params = get_function_params(user_fn)\n    check.invariant(\n        len(config_fn_params) == 1, "@configured function should have exactly one parameter"\n    )\n\n    param = config_fn_params[0]\n\n    # If the parameter is a subclass of Config, we can infer the config schema from the\n    # type annotation. 
We'll also wrap the config mapping function to convert the config\n    # dictionary into the appropriate Config object.\n    if not safe_is_subclass(param.annotation, Config):\n        return FunctionAndConfigSchema(function=user_fn, config_schema=config_schema)\n\n    check.invariant(\n        config_schema is None,\n        "Cannot provide config_schema to @configured function with Config-annotated param",\n    )\n\n    config_schema_from_class = infer_schema_from_config_annotation(param.annotation, param.default)\n    config_cls = cast(Type[Config], param.annotation)\n\n    param_name = param.name\n\n    def wrapped_fn(config_as_dict) -> Any:\n        config_input = config_cls(**config_as_dict)\n        output = user_fn(**{param_name: config_input})\n\n        if isinstance(output, Config):\n            return output._convert_to_config_dictionary()  # noqa: SLF001\n        else:\n            return output\n\n    return FunctionAndConfigSchema(function=wrapped_fn, config_schema=config_schema_from_class)\n\n\n
[docs]def configured(\n configurable: T_Configurable,\n config_schema: Optional[UserConfigSchema] = None,\n **kwargs: Any,\n) -> Callable[[object], T_Configurable]:\n """A decorator that makes it easy to create a function-configured version of an object.\n\n The following definition types can be configured using this function:\n\n * :py:class:`GraphDefinition`\n * :py:class:`ExecutorDefinition`\n * :py:class:`LoggerDefinition`\n * :py:class:`ResourceDefinition`\n * :py:class:`OpDefinition`\n\n Using ``configured`` may result in config values being displayed in the Dagster UI,\n so it is not recommended to use this API with sensitive values, such as\n secrets.\n\n If the config that will be supplied to the object is constant, you may alternatively invoke this\n and call the result with a dict of config values to be curried. Examples of both strategies\n below.\n\n Args:\n configurable (ConfigurableDefinition): An object that can be configured.\n config_schema (ConfigSchema): The config schema that the inputs to the decorated function\n must satisfy. Alternatively, annotate the config parameter to the decorated function\n with a subclass of :py:class:`Config` and omit this argument.\n **kwargs: Arbitrary keyword arguments that will be passed to the initializer of the returned\n object.\n\n Returns:\n (Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])\n\n **Examples:**\n\n .. code-block:: python\n\n class GreetingConfig(Config):\n message: str\n\n @op\n def greeting_op(config: GreetingConfig):\n print(config.message)\n\n class HelloConfig(Config):\n name: str\n\n @configured(greeting_op)\n def hello_op(config: HelloConfig):\n return GreetingConfig(message=f"Hello, {config.name}!")\n\n .. 
code-block:: python\n\n dev_s3 = configured(S3Resource, name="dev_s3")({'bucket': 'dev'})\n\n @configured(S3Resource)\n def dev_s3(_):\n return {'bucket': 'dev'}\n\n @configured(S3Resource, {'bucket_prefix', str})\n def dev_s3(config):\n return {'bucket': config['bucket_prefix'] + 'dev'}\n\n """\n _check_configurable_param(configurable)\n\n if isinstance(configurable, NamedConfigurableDefinition):\n\n def _configured(config_or_config_fn: object) -> T_Configurable:\n fn_name = (\n getattr(config_or_config_fn, "__name__", None)\n if callable(config_or_config_fn)\n else None\n )\n name: str = check.not_none(kwargs.get("name") or fn_name)\n\n updated_fn, new_config_schema = _wrap_user_fn_if_pythonic_config(\n config_or_config_fn, config_schema\n )\n return configurable.configured(\n config_or_config_fn=updated_fn,\n name=name,\n config_schema=new_config_schema,\n **{k: v for k, v in kwargs.items() if k != "name"},\n )\n\n return _configured\n elif isinstance(configurable, AnonymousConfigurableDefinition):\n\n def _configured(config_or_config_fn: object) -> T_Configurable:\n updated_fn, new_config_schema = _wrap_user_fn_if_pythonic_config(\n config_or_config_fn, config_schema\n )\n return configurable.configured(\n config_schema=new_config_schema, config_or_config_fn=updated_fn, **kwargs\n )\n\n return _configured\n else:\n check.failed(f"Invalid configurable definition type: {type(configurable)}")
\n
", "current_page_name": "_modules/dagster/_core/definitions/configurable", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.configurable"}, "decorators": {"asset_check_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.asset_check_decorator

\nfrom typing import Any, Callable, Mapping, Optional, Set, Tuple, Union, cast\n\nfrom dagster import _check as check\nfrom dagster._annotations import experimental\nfrom dagster._builtins import Nothing\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSpec\nfrom dagster._core.definitions.asset_checks import (\n    AssetChecksDefinition,\n    AssetChecksDefinitionInputOutputProps,\n)\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.output import Out\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import NoValueSentinel\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..input import In\nfrom .asset_decorator import (\n    get_function_params_without_context_or_config_or_resources,\n    stringify_asset_key_to_input_name,\n)\nfrom .op_decorator import _Op\n\nAssetCheckFunctionReturn = AssetCheckResult\nAssetCheckFunction = Callable[..., AssetCheckFunctionReturn]\n\n\ndef _build_asset_check_input(\n    name: str, asset_key: AssetKey, fn: Callable\n) -> Mapping[AssetKey, Tuple[str, In]]:\n    asset_params = get_function_params_without_context_or_config_or_resources(fn)\n\n    if len(asset_params) == 0:\n        input_name = stringify_asset_key_to_input_name(asset_key)\n        in_def = In(cast(type, Nothing))\n    elif len(asset_params) == 1:\n        input_name = asset_params[0].name\n        in_def = In(metadata={}, input_manager_key=None, dagster_type=NoValueSentinel)\n    else:\n        raise DagsterInvalidDefinitionError(\n            f"When defining check '{name}', multiple target assets provided as parameters:"\n            f" {[param.name for param in asset_params]}. 
Only one"\n            " is allowed."\n        )\n\n    return {\n        asset_key: (\n            input_name,\n            in_def,\n        )\n    }\n\n\n
[docs]@experimental\ndef asset_check(\n *,\n asset: Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset],\n name: Optional[str] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n compute_kind: Optional[str] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n) -> Callable[[AssetCheckFunction], AssetChecksDefinition]:\n """Create a definition for how to execute an asset check.\n\n Args:\n asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]): The\n asset that the check applies to.\n name (Optional[str]): The name of the check. If not specified, the name of the decorated\n function will be used. Checks for the same asset must have unique names.\n description (Optional[str]): The description of the check.\n required_resource_keys (Optional[Set[str]]): A set of keys for resources that are required\n by the function that execute the check. These can alternatively be specified by\n including resource-typed parameters in the function signature.\n config_schema (Optional[ConfigSchema): The configuration schema for the check's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. If not set, Dagster will accept any config provided for the op.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that executes the check.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n compute_kind (Optional[str]): A string to represent the kind of computation that executes\n the check, e.g. 
"dbt" or "spark".\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that executes the check.\n\n\n Produces an :py:class:`AssetChecksDefinition` object.\n\n\n Example:\n .. code-block:: python\n\n from dagster import asset, asset_check, AssetCheckResult\n\n @asset\n def my_asset() -> None:\n ...\n\n @asset_check(asset=my_asset, description="Check that my asset has enough rows")\n def my_asset_has_enough_rows() -> AssetCheckResult:\n num_rows = ...\n return AssetCheckResult(passed=num_rows > 5, metadata={"num_rows": num_rows})\n\n\n Example with a DataFrame Output:\n .. code-block:: python\n\n from dagster import asset, asset_check, AssetCheckResult\n from pandas import DataFrame\n\n @asset\n def my_asset() -> DataFrame:\n ...\n\n @asset_check(asset=my_asset, description="Check that my asset has enough rows")\n def my_asset_has_enough_rows(my_asset: DataFrame) -> AssetCheckResult:\n num_rows = my_asset.shape[0]\n return AssetCheckResult(passed=num_rows > 5, metadata={"num_rows": num_rows})\n """\n\n def inner(fn: AssetCheckFunction) -> AssetChecksDefinition:\n check.callable_param(fn, "fn")\n resolved_name = name or fn.__name__\n asset_key = AssetKey.from_coercible_or_definition(asset)\n\n out = Out(dagster_type=None)\n input_tuples_by_asset_key = _build_asset_check_input(resolved_name, asset_key, fn)\n if len(input_tuples_by_asset_key) == 0:\n raise DagsterInvalidDefinitionError(\n f"No target asset provided when defining check '{resolved_name}'"\n )\n\n if len(input_tuples_by_asset_key) > 1:\n raise DagsterInvalidDefinitionError(\n f"When defining check '{resolved_name}', Multiple target assets provided:"\n f" {[key.to_user_string() for key in input_tuples_by_asset_key.keys()]}. 
Only one"\n " is allowed."\n )\n\n resolved_asset_key = next(iter(input_tuples_by_asset_key.keys()))\n spec = AssetCheckSpec(\n name=resolved_name,\n description=description,\n asset=resolved_asset_key,\n )\n\n op_def = _Op(\n name=spec.get_python_identifier(),\n ins=dict(input_tuples_by_asset_key.values()),\n out=out,\n # Any resource requirements specified as arguments will be identified as\n # part of the Op definition instantiation\n required_resource_keys=required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema=config_schema,\n retry_policy=retry_policy,\n )(fn)\n\n checks_def = AssetChecksDefinition(\n node_def=op_def,\n resource_defs={},\n specs=[spec],\n input_output_props=AssetChecksDefinitionInputOutputProps(\n asset_keys_by_input_name={\n input_tuples_by_asset_key[resolved_asset_key][0]: resolved_asset_key\n },\n asset_check_keys_by_output_name={op_def.output_defs[0].name: spec.key},\n ),\n )\n\n return checks_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/asset_check_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.asset_check_decorator"}, "asset_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.asset_decorator

\nfrom collections import Counter\nfrom inspect import Parameter\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param, experimental_param\nfrom dagster._builtins import Nothing\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.decorator_utils import get_function_params, get_valid_name_permutations\nfrom dagster._core.definitions.asset_dep import AssetDep, CoercibleToAssetDep\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.config import ConfigMapping\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping, MetadataUserInput\nfrom dagster._core.definitions.partition_mapping import PartitionMapping\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.types.dagster_type import DagsterType\nfrom dagster._utils.warnings import (\n    disable_dagster_warnings,\n)\n\nfrom ..asset_check_spec import AssetCheckSpec\nfrom ..asset_in import AssetIn\nfrom ..asset_out import AssetOut\nfrom ..asset_spec import AssetSpec\nfrom ..assets import AssetsDefinition\nfrom ..backfill_policy import BackfillPolicy, BackfillPolicyType\nfrom ..decorators.graph_decorator import graph\nfrom ..decorators.op_decorator import _Op\nfrom ..events import AssetKey, CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom ..input import GraphIn, In\nfrom ..output import GraphOut, Out\nfrom ..partition import PartitionsDefinition\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..utils import 
DEFAULT_IO_MANAGER_KEY, DEFAULT_OUTPUT, NoValueSentinel\n\n\n@overload\ndef asset(\n    compute_fn: Callable,\n) -> AssetsDefinition: ...\n\n\n@overload\ndef asset(\n    *,\n    name: Optional[str] = ...,\n    key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n    ins: Optional[Mapping[str, AssetIn]] = ...,\n    deps: Optional[Iterable[CoercibleToAssetDep]] = ...,\n    metadata: Optional[Mapping[str, Any]] = ...,\n    description: Optional[str] = ...,\n    config_schema: Optional[UserConfigSchema] = None,\n    required_resource_keys: Optional[Set[str]] = ...,\n    resource_defs: Optional[Mapping[str, object]] = ...,\n    io_manager_def: Optional[object] = ...,\n    io_manager_key: Optional[str] = ...,\n    compute_kind: Optional[str] = ...,\n    dagster_type: Optional[DagsterType] = ...,\n    partitions_def: Optional[PartitionsDefinition] = ...,\n    op_tags: Optional[Mapping[str, Any]] = ...,\n    group_name: Optional[str] = ...,\n    output_required: bool = ...,\n    freshness_policy: Optional[FreshnessPolicy] = ...,\n    auto_materialize_policy: Optional[AutoMaterializePolicy] = ...,\n    backfill_policy: Optional[BackfillPolicy] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    code_version: Optional[str] = ...,\n    key: Optional[CoercibleToAssetKey] = None,\n    non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = ...,\n    check_specs: Optional[Sequence[AssetCheckSpec]] = ...,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]: ...\n\n\n
[docs]@experimental_param(param="resource_defs")\n@experimental_param(param="io_manager_def")\n@experimental_param(param="auto_materialize_policy")\n@experimental_param(param="backfill_policy")\n@deprecated_param(\n param="non_argument_deps", breaking_version="2.0.0", additional_warn_text="use `deps` instead."\n)\ndef asset(\n compute_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[CoercibleToAssetDep]] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n io_manager_def: Optional[object] = None,\n io_manager_key: Optional[str] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n output_required: bool = True,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n key: Optional[CoercibleToAssetKey] = None,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Create a definition for how to compute an asset.\n\n A software-defined asset is the combination of:\n 1. An asset key, e.g. the name of a table.\n 2. A function, which can be run to compute the contents of the asset.\n 3. 
A set of upstream assets that are provided as inputs to the function when computing the asset.\n\n Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\n about the upstream assets it depends on. The upstream assets are inferred from the arguments\n to the decorated function. The name of the argument designates the name of the upstream asset.\n\n An asset has an op inside it to represent the function that computes it. The name of the op\n will be the segments of the asset key, separated by double-underscores.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function. The asset's name must be a valid name in dagster (ie only contains\n letters, numbers, and _) and may not contain python reserved keywords.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in dagster (ie only\n contains letters, numbers, and _) and may not contain python reserved keywords.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetDep, AssetsDefinition, SourceAsset, AssetKey, str]]]):\n The assets that are upstream dependencies, but do not correspond to a parameter of the\n decorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\n all assets created by the multi_asset will be created.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. 
If not set, Dagster will accept any config provided for the op.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the op.\n io_manager_key (Optional[str]): The resource key of the IOManager used\n for storing the output of the op as an asset, and for loading it in downstream ops\n (default: "io_manager"). Only one of io_manager_key and io_manager_def can be provided.\n io_manager_def (Optional[object]): (Experimental) The IOManager used for\n storing the output of the op as an asset, and for loading it in\n downstream ops. Only one of io_manager_def and io_manager_key can be provided.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in the Dagster UI as a badge on the asset.\n dagster_type (Optional[DagsterType]): Allows specifying type validation functions that\n will be executed on the output of the decorated function after it runs.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If not provided,\n the name "default" is used.\n resource_defs (Optional[Mapping[str, object]]):\n (Experimental) A mapping of resource keys to resources. These resources\n will be initialized during execution, and can be accessed from the\n context within the body of the function.\n output_required (bool): Whether the decorated function will always materialize an asset.\n Defaults to True. 
If False, the function can return None, which will not be materialized to\n storage and will halt execution of downstream assets.\n freshness_policy (FreshnessPolicy): A constraint telling Dagster how often this asset is intended to be updated\n with respect to its root data.\n auto_materialize_policy (AutoMaterializePolicy): (Experimental) Configure Dagster to automatically materialize\n this asset according to its FreshnessPolicy and when upstream dependencies change.\n backfill_policy (BackfillPolicy): (Experimental) Configure Dagster to backfill this asset according to its\n BackfillPolicy.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n code_version (Optional[str]): (Experimental) Version of the code that generates this asset. In\n general, versions should be set only for code that deterministically produces the same\n output when given the same inputs.\n check_specs (Optional[Sequence[AssetCheckSpec]]): (Experimental) Specs for asset checks that\n execute in the decorated function after materializing the asset.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead.\n Set of asset keys that are upstream dependencies, but do not pass an input to the asset.\n key (Optional[CoeercibleToAssetKey]): The key for this asset. If provided, cannot specify key_prefix or name.\n\n Examples:\n .. 
code-block:: python\n\n @asset\n def my_asset(my_upstream_asset: int) -> int:\n return my_upstream_asset + 1\n """\n\n def create_asset():\n upstream_asset_deps = _deps_and_non_argument_deps_to_asset_deps(\n deps=deps, non_argument_deps=non_argument_deps\n )\n\n return _Asset(\n name=cast(Optional[str], name), # (mypy bug that it can't infer name is Optional[str])\n key_prefix=key_prefix,\n ins=ins,\n deps=upstream_asset_deps,\n metadata=metadata,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n io_manager_key=io_manager_key,\n io_manager_def=io_manager_def,\n compute_kind=check.opt_str_param(compute_kind, "compute_kind"),\n dagster_type=dagster_type,\n partitions_def=partitions_def,\n op_tags=op_tags,\n group_name=group_name,\n output_required=output_required,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n retry_policy=retry_policy,\n code_version=code_version,\n check_specs=check_specs,\n key=key,\n )\n\n if compute_fn is not None:\n return create_asset()(compute_fn)\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n check.invariant(\n not (io_manager_key and io_manager_def),\n "Both io_manager_key and io_manager_def were provided to `@asset` decorator. Please"\n " provide one or the other. ",\n )\n return create_asset()(fn)\n\n return inner
\n\n\ndef _resolve_key_and_name(\n *,\n key: Optional[CoercibleToAssetKey],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n name: Optional[str],\n decorator: str,\n fn: Callable[..., Any],\n) -> Tuple[AssetKey, str]:\n if (name or key_prefix) and key:\n raise DagsterInvalidDefinitionError(\n f"Cannot specify a name or key prefix for {decorator} when the key"\n " argument is provided."\n )\n key_prefix_list = [key_prefix] if isinstance(key_prefix, str) else key_prefix\n key = AssetKey.from_coercible(key) if key else None\n assigned_name = name or fn.__name__\n return (\n (\n # the filter here appears unnecessary per typing, but this exists\n # historically so keeping it here to be conservative in case users\n # can get Nones into the key_prefix_list somehow\n AssetKey(list(filter(None, [*(key_prefix_list or []), assigned_name])))\n if not key\n else key\n ),\n assigned_name,\n )\n\n\nclass _Asset:\n def __init__(\n self,\n name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[AssetDep]] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n io_manager_key: Optional[str] = None,\n io_manager_def: Optional[object] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n output_required: bool = True,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n key: 
Optional[CoercibleToAssetKey] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ):\n self.name = name\n self.key_prefix = key_prefix\n self.ins = ins or {}\n self.deps = deps or []\n self.metadata = metadata\n self.description = description\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self.io_manager_key = io_manager_key\n self.io_manager_def = io_manager_def\n self.config_schema = config_schema\n self.compute_kind = compute_kind\n self.dagster_type = dagster_type\n self.partitions_def = partitions_def\n self.op_tags = op_tags\n self.resource_defs = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n self.group_name = group_name\n self.output_required = output_required\n self.freshness_policy = freshness_policy\n self.retry_policy = retry_policy\n self.auto_materialize_policy = auto_materialize_policy\n self.backfill_policy = backfill_policy\n self.code_version = code_version\n self.check_specs = check_specs\n self.key = key\n\n def __call__(self, fn: Callable) -> AssetsDefinition:\n from dagster._config.pythonic_config import (\n validate_resource_annotated_function,\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n validate_resource_annotated_function(fn)\n\n asset_ins = build_asset_ins(fn, self.ins or {}, {dep.asset_key for dep in self.deps})\n\n out_asset_key, asset_name = _resolve_key_and_name(\n key=self.key,\n key_prefix=self.key_prefix,\n name=self.name,\n fn=fn,\n decorator="@asset",\n )\n\n with disable_dagster_warnings():\n arg_resource_keys = {arg.name for arg in get_resource_args(fn)}\n\n bare_required_resource_keys = set(self.required_resource_keys)\n\n resource_defs_dict = self.resource_defs\n resource_defs_keys = set(resource_defs_dict.keys())\n decorator_resource_keys = bare_required_resource_keys | resource_defs_keys\n\n io_manager_key = self.io_manager_key\n if self.io_manager_def:\n if not io_manager_key:\n 
io_manager_key = out_asset_key.to_python_identifier("io_manager")\n\n if (\n io_manager_key in self.resource_defs\n and self.resource_defs[io_manager_key] != self.io_manager_def\n ):\n raise DagsterInvalidDefinitionError(\n f"Provided conflicting definitions for io manager key '{io_manager_key}'."\n " Please provide only one definition per key."\n )\n\n resource_defs_dict[io_manager_key] = self.io_manager_def\n\n wrapped_resource_defs = wrap_resources_for_execution(resource_defs_dict)\n\n check.param_invariant(\n len(bare_required_resource_keys) == 0 or len(arg_resource_keys) == 0,\n "Cannot specify resource requirements in both @asset decorator and as arguments"\n " to the decorated function",\n )\n\n io_manager_key = cast(str, io_manager_key) if io_manager_key else DEFAULT_IO_MANAGER_KEY\n\n out = Out(\n metadata=self.metadata or {},\n io_manager_key=io_manager_key,\n dagster_type=self.dagster_type if self.dagster_type else NoValueSentinel,\n description=self.description,\n is_required=self.output_required,\n code_version=self.code_version,\n )\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n self.check_specs, [out_asset_key]\n )\n check_outs: Mapping[str, Out] = {\n output_name: Out(dagster_type=None)\n for output_name in check_specs_by_output_name.keys()\n }\n\n op_required_resource_keys = decorator_resource_keys - arg_resource_keys\n\n op = _Op(\n name=out_asset_key.to_python_identifier(),\n description=self.description,\n ins=dict(asset_ins.values()),\n out={DEFAULT_OUTPUT: out, **check_outs},\n # Any resource requirements specified as arguments will be identified as\n # part of the Op definition instantiation\n required_resource_keys=op_required_resource_keys,\n tags={\n **({"kind": self.compute_kind} if self.compute_kind else {}),\n **(self.op_tags or {}),\n },\n config_schema=self.config_schema,\n retry_policy=self.retry_policy,\n code_version=self.code_version,\n )(fn)\n\n # check backfill policy is 
BackfillPolicyType.SINGLE_RUN for non-partitioned asset\n if self.partitions_def is None:\n check.param_invariant(\n (\n self.backfill_policy.policy_type is BackfillPolicyType.SINGLE_RUN\n if self.backfill_policy\n else True\n ),\n "backfill_policy",\n "Non partitioned asset can only have single run backfill policy",\n )\n\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n partition_mappings = {\n keys_by_input_name[input_name]: asset_in.partition_mapping\n for input_name, asset_in in self.ins.items()\n if asset_in.partition_mapping is not None\n }\n\n partition_mappings = _get_partition_mappings_from_deps(\n partition_mappings=partition_mappings, deps=self.deps, asset_name=asset_name\n )\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name={"result": out_asset_key},\n node_def=op,\n partitions_def=self.partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n resource_defs=wrapped_resource_defs,\n group_names_by_key={out_asset_key: self.group_name} if self.group_name else None,\n freshness_policies_by_key=(\n {out_asset_key: self.freshness_policy} if self.freshness_policy else None\n ),\n auto_materialize_policies_by_key=(\n {out_asset_key: self.auto_materialize_policy}\n if self.auto_materialize_policy\n else None\n ),\n backfill_policy=self.backfill_policy,\n asset_deps=None, # no asset deps in single-asset decorator\n selected_asset_keys=None, # no subselection in decorator\n can_subset=False,\n metadata_by_key={out_asset_key: self.metadata} if self.metadata else None,\n # see comment in @multi_asset's call to dagster_internal_init for the gory details\n # this is best understood as an _override_ which @asset does not support\n descriptions_by_key=None,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None, # no subselection in decorator\n )\n\n\n
[docs]@experimental_param(param="resource_defs")\n@deprecated_param(\n param="non_argument_deps", breaking_version="2.0.0", additional_warn_text="use `deps` instead."\n)\ndef multi_asset(\n *,\n outs: Optional[Mapping[str, AssetOut]] = None,\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[CoercibleToAssetDep]] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n compute_kind: Optional[str] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, object]] = None,\n group_name: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n specs: Optional[Sequence[AssetSpec]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n # deprecated\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same op and same\n upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n You can set I/O managers keys, auto-materialize policies, freshness policies, group names, etc.\n on an individual asset within the multi-asset by attaching them to the :py:class:`AssetOut`\n corresponding to that asset in the `outs` parameter.\n\n Args:\n name (Optional[str]): The name of the op.\n outs: (Optional[Dict[str, AssetOut]]): The AssetOuts representing the assets materialized by\n this function. 
AssetOuts detail the output, IO management, and core asset properties.\n This argument is required except when AssetSpecs are used.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]):\n The assets that are upstream dependencies, but do not correspond to a parameter of the\n decorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\n all assets created by the multi_asset will be created.\n config_schema (Optional[ConfigSchema]): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. If not set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the underlying op.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in the Dagster UI as a badge on the asset.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by a multi_asset depend on all assets that are consumed by that\n multi asset. If this default is not correct, you can pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\n used as input to the asset or produced within the op.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n backfill_policy (Optional[BackfillPolicy]): The backfill policy for the op that computes the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to an op. 
Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n can_subset (bool): If this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. Defaults to False.\n resource_defs (Optional[Mapping[str, object]]):\n (Experimental) A mapping of resource keys to resources. These resources\n will be initialized during execution, and can be accessed from the\n context within the body of the function.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n code_version (Optional[str]): (Experimental) Version of the code encapsulated by the multi-asset. If set,\n this is used as a default code version for all defined assets.\n specs (Optional[Sequence[AssetSpec]]): (Experimental) The specifications for the assets materialized\n by this function.\n check_specs (Optional[Sequence[AssetCheckSpec]]): (Experimental) Specs for asset checks that\n execute in the decorated function after materializing the assets.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead. Set of asset keys that are upstream\n dependencies, but do not pass an input to the multi_asset.\n\n Examples:\n .. 
code-block:: python\n\n # Use IO managers to handle I/O:\n @multi_asset(\n outs={\n "my_string_asset": AssetOut(),\n "my_int_asset": AssetOut(),\n }\n )\n def my_function(upstream_asset: int):\n result = upstream_asset + 1\n return str(result), result\n\n # Handle I/O on your own:\n @multi_asset(\n outs={\n "asset1": AssetOut(),\n "asset2": AssetOut(),\n },\n deps=["asset0"],\n )\n def my_function():\n asset0_value = load(path="asset0")\n asset1_result, asset2_result = do_some_transformation(asset0_value)\n write(asset1_result, path="asset1")\n write(asset2_result, path="asset2")\n return None, None\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n specs = check.opt_list_param(specs, "specs", of_type=AssetSpec)\n\n upstream_asset_deps = _deps_and_non_argument_deps_to_asset_deps(\n deps=deps, non_argument_deps=non_argument_deps\n )\n\n asset_deps = check.opt_mapping_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resource_defs, "resource_defs", key_type=str)\n )\n\n _config_schema = check.opt_mapping_param(\n config_schema, # type: ignore\n "config_schema",\n additional_message="Only dicts are supported for asset config_schema.",\n )\n\n bare_required_resource_keys = set(required_resource_keys)\n resource_defs_keys = set(resource_defs.keys())\n required_resource_keys = bare_required_resource_keys | resource_defs_keys\n\n asset_out_map: Mapping[str, AssetOut] = {} if outs is None else outs\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n op_name = name or fn.__name__\n\n if asset_out_map and specs:\n raise DagsterInvalidDefinitionError("Must specify only outs or specs but not both.")\n elif specs:\n output_tuples_by_asset_key = {}\n for asset_spec in specs:\n # output names are asset keys 
joined with _\n output_name = "_".join(asset_spec.key.path)\n output_tuples_by_asset_key[asset_spec.key] = (\n output_name,\n Out(\n Nothing,\n is_required=not (can_subset or asset_spec.skippable),\n description=asset_spec.description,\n ),\n )\n if upstream_asset_deps:\n raise DagsterInvalidDefinitionError(\n "Can not pass deps and specs to @multi_asset, specify deps on the AssetSpecs"\n " directly."\n )\n if internal_asset_deps:\n raise DagsterInvalidDefinitionError(\n "Can not pass internal_asset_deps and specs to @multi_asset, specify deps on"\n " the AssetSpecs directly."\n )\n\n upstream_keys = set()\n for spec in specs:\n for dep in spec.deps:\n if dep.asset_key not in output_tuples_by_asset_key:\n upstream_keys.add(dep.asset_key)\n if (\n dep.asset_key in output_tuples_by_asset_key\n and dep.partition_mapping is not None\n ):\n # self-dependent asset also needs to be considered an upstream_key\n upstream_keys.add(dep.asset_key)\n\n explicit_ins = ins or {}\n # get which asset keys have inputs set\n loaded_upstreams = build_asset_ins(fn, explicit_ins, deps=set())\n unexpected_upstreams = {\n key for key in loaded_upstreams.keys() if key not in upstream_keys\n }\n if unexpected_upstreams:\n raise DagsterInvalidDefinitionError(\n f"Asset inputs {unexpected_upstreams} do not have dependencies on the passed"\n " AssetSpec(s). 
Set the deps on the appropriate AssetSpec(s)."\n )\n remaining_upstream_keys = {key for key in upstream_keys if key not in loaded_upstreams}\n asset_ins = build_asset_ins(fn, explicit_ins, deps=remaining_upstream_keys)\n else:\n asset_ins = build_asset_ins(\n fn,\n ins or {},\n deps=(\n {dep.asset_key for dep in upstream_asset_deps} if upstream_asset_deps else set()\n ),\n )\n output_tuples_by_asset_key = build_asset_outs(asset_out_map)\n # validate that the asset_deps make sense\n valid_asset_deps = set(asset_ins.keys()) | set(output_tuples_by_asset_key.keys())\n for out_name, asset_keys in asset_deps.items():\n if asset_out_map and out_name not in asset_out_map:\n check.failed(\n f"Invalid out key '{out_name}' supplied to `internal_asset_deps` argument"\n f" for multi-asset {op_name}. Must be one of the outs for this multi-asset"\n f" {list(asset_out_map.keys())[:20]}.",\n )\n invalid_asset_deps = asset_keys.difference(valid_asset_deps)\n check.invariant(\n not invalid_asset_deps,\n f"Invalid asset dependencies: {invalid_asset_deps} specified in"\n f" `internal_asset_deps` argument for multi-asset '{op_name}' on key"\n f" '{out_name}'. Each specified asset key must be associated with an input to"\n " the asset or produced by this asset. 
Valid keys:"\n f" {list(valid_asset_deps)[:20]}",\n )\n\n arg_resource_keys = {arg.name for arg in get_resource_args(fn)}\n check.param_invariant(\n len(bare_required_resource_keys or []) == 0 or len(arg_resource_keys) == 0,\n "Cannot specify resource requirements in both @multi_asset decorator and as"\n " arguments to the decorated function",\n )\n\n asset_outs_by_output_name: Mapping[str, Out] = dict(output_tuples_by_asset_key.values())\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(output_tuples_by_asset_key.keys())\n )\n check_outs_by_output_name: Mapping[str, Out] = {\n output_name: Out(dagster_type=None, is_required=not can_subset)\n for output_name in check_specs_by_output_name.keys()\n }\n overlapping_output_names = (\n asset_outs_by_output_name.keys() & check_outs_by_output_name.keys()\n )\n check.invariant(\n len(overlapping_output_names) == 0,\n f"Check output names overlap with asset output names: {overlapping_output_names}",\n )\n combined_outs_by_output_name: Mapping[str, Out] = {\n **asset_outs_by_output_name,\n **check_outs_by_output_name,\n }\n\n with disable_dagster_warnings():\n op_required_resource_keys = required_resource_keys - arg_resource_keys\n\n op = _Op(\n name=op_name,\n description=description,\n ins=dict(asset_ins.values()),\n out=combined_outs_by_output_name,\n required_resource_keys=op_required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema=_config_schema,\n retry_policy=retry_policy,\n code_version=code_version,\n )(fn)\n\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n keys_by_output_name = {\n output_name: asset_key\n for asset_key, (output_name, _) in output_tuples_by_asset_key.items()\n }\n partition_mappings = {\n keys_by_input_name[input_name]: asset_in.partition_mapping\n for input_name, asset_in in (ins or {}).items()\n if 
asset_in.partition_mapping is not None\n }\n\n if upstream_asset_deps:\n partition_mappings = _get_partition_mappings_from_deps(\n partition_mappings=partition_mappings, deps=upstream_asset_deps, asset_name=op_name\n )\n\n if specs:\n internal_deps = {\n spec.key: {dep.asset_key for dep in spec.deps}\n for spec in specs\n if spec.deps is not None\n }\n props_by_asset_key: Mapping[AssetKey, Union[AssetSpec, AssetOut]] = {\n spec.key: spec for spec in specs\n }\n # Add PartitionMappings specified via AssetSpec.deps to partition_mappings dictionary. Error on duplicates\n for spec in specs:\n for dep in spec.deps:\n if dep.partition_mapping is None:\n continue\n if partition_mappings.get(dep.asset_key, None) is None:\n partition_mappings[dep.asset_key] = dep.partition_mapping\n continue\n if partition_mappings[dep.asset_key] == dep.partition_mapping:\n continue\n else:\n raise DagsterInvalidDefinitionError(\n f"Two different PartitionMappings for {dep.asset_key} provided for"\n f" multi_asset {op_name}. 
Please use the same PartitionMapping for"\n f" {dep.asset_key}."\n )\n\n else:\n internal_deps = {keys_by_output_name[name]: asset_deps[name] for name in asset_deps}\n props_by_asset_key = {\n keys_by_output_name[output_name]: asset_out\n for output_name, asset_out in asset_out_map.items()\n }\n\n # handle properties defined ons AssetSpecs or AssetOuts\n group_names_by_key = {\n asset_key: props.group_name\n for asset_key, props in props_by_asset_key.items()\n if props.group_name is not None\n }\n if group_name:\n check.invariant(\n not group_names_by_key,\n "Cannot set group_name parameter on multi_asset if one or more of the"\n " AssetSpecs/AssetOuts supplied to this multi_asset have a group_name defined.",\n )\n group_names_by_key = {asset_key: group_name for asset_key in props_by_asset_key}\n\n freshness_policies_by_key = {\n asset_key: props.freshness_policy\n for asset_key, props in props_by_asset_key.items()\n if props.freshness_policy is not None\n }\n auto_materialize_policies_by_key = {\n asset_key: props.auto_materialize_policy\n for asset_key, props in props_by_asset_key.items()\n if props.auto_materialize_policy is not None\n }\n metadata_by_key = {\n asset_key: props.metadata\n for asset_key, props in props_by_asset_key.items()\n if props.metadata is not None\n }\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n node_def=op,\n asset_deps=internal_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n can_subset=can_subset,\n resource_defs=resource_defs,\n group_names_by_key=group_names_by_key,\n freshness_policies_by_key=freshness_policies_by_key,\n auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n backfill_policy=backfill_policy,\n selected_asset_keys=None, # no subselection in decorator\n # descriptions by key is more accurately understood as _overriding_ the descriptions\n # by key that 
are in the OutputDefinitions associated with the asset key.\n # This is a dangerous construction liable for bugs. Instead there should be a\n # canonical source of asset descriptions in AssetsDefinition and if we need\n # to create a memoized cached dictionary of asset keys for perf or something we do\n # that in the `__init__` or on demand.\n #\n # This is actually an override. We do not override descriptions\n # in OutputDefinitions in @multi_asset\n descriptions_by_key=None,\n metadata_by_key=metadata_by_key,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None, # no subselection in decorator\n )\n\n return inner
\n\n\ndef get_function_params_without_context_or_config_or_resources(fn: Callable) -> List[Parameter]:\n params = get_function_params(fn)\n is_context_provided = len(params) > 0 and params[0].name in get_valid_name_permutations(\n "context"\n )\n input_params = params[1:] if is_context_provided else params\n\n resource_arg_names = {arg.name for arg in get_resource_args(fn)}\n\n new_input_args = []\n for input_arg in input_params:\n if input_arg.name != "config" and input_arg.name not in resource_arg_names:\n new_input_args.append(input_arg)\n\n return new_input_args\n\n\ndef stringify_asset_key_to_input_name(asset_key: AssetKey) -> str:\n return "_".join(asset_key.path).replace("-", "_")\n\n\ndef build_asset_ins(\n fn: Callable,\n asset_ins: Mapping[str, AssetIn],\n deps: Optional[AbstractSet[AssetKey]],\n) -> Mapping[AssetKey, Tuple[str, In]]:\n """Creates a mapping from AssetKey to (name of input, In object)."""\n deps = check.opt_set_param(deps, "deps", AssetKey)\n\n new_input_args = get_function_params_without_context_or_config_or_resources(fn)\n\n non_var_input_param_names = [\n param.name for param in new_input_args if param.kind == Parameter.POSITIONAL_OR_KEYWORD\n ]\n has_kwargs = any(param.kind == Parameter.VAR_KEYWORD for param in new_input_args)\n\n all_input_names = set(non_var_input_param_names) | asset_ins.keys()\n\n if not has_kwargs:\n for in_key, asset_in in asset_ins.items():\n if in_key not in non_var_input_param_names and (\n not isinstance(asset_in.dagster_type, DagsterType)\n or not asset_in.dagster_type.is_nothing\n ):\n raise DagsterInvalidDefinitionError(\n f"Key '{in_key}' in provided ins dict does not correspond to any of the names "\n "of the arguments to the decorated function"\n )\n\n ins_by_asset_key: Dict[AssetKey, Tuple[str, In]] = {}\n for input_name in all_input_names:\n asset_key = None\n\n if input_name in asset_ins:\n asset_key = asset_ins[input_name].key\n metadata = asset_ins[input_name].metadata or {}\n key_prefix = 
asset_ins[input_name].key_prefix\n input_manager_key = asset_ins[input_name].input_manager_key\n dagster_type = asset_ins[input_name].dagster_type\n else:\n metadata = {}\n key_prefix = None\n input_manager_key = None\n dagster_type = NoValueSentinel\n\n asset_key = asset_key or AssetKey(list(filter(None, [*(key_prefix or []), input_name])))\n\n ins_by_asset_key[asset_key] = (\n input_name.replace("-", "_"),\n In(metadata=metadata, input_manager_key=input_manager_key, dagster_type=dagster_type),\n )\n\n for asset_key in deps:\n if asset_key in ins_by_asset_key:\n raise DagsterInvalidDefinitionError(\n f"deps value {asset_key} also declared as input/AssetIn"\n )\n # mypy doesn't realize that Nothing is a valid type here\n ins_by_asset_key[asset_key] = (\n stringify_asset_key_to_input_name(asset_key),\n In(cast(type, Nothing)),\n )\n\n return ins_by_asset_key\n\n\n@overload\ndef graph_asset(\n compose_fn: Callable,\n) -> AssetsDefinition: ...\n\n\n@overload\ndef graph_asset(\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n group_name: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n metadata: Optional[MetadataUserInput] = ...,\n freshness_policy: Optional[FreshnessPolicy] = ...,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = ...,\n backfill_policy: Optional[BackfillPolicy] = ...,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = ...,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n key: Optional[CoercibleToAssetKey] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]: ...\n\n\n
[docs]def graph_asset(\n compose_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n group_name: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n metadata: Optional[MetadataUserInput] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n key: Optional[CoercibleToAssetKey] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Creates a software-defined asset that's computed using a graph of ops.\n\n This decorator is meant to decorate a function that composes a set of ops or graphs to define\n the dependencies between them.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function. The asset's name must be a valid name in Dagster (i.e. only contains\n letters, numbers, and underscores) and may not contain Python reserved keywords.\n description (Optional[str]):\n A human-readable description of the asset.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n config (Optional[Union[ConfigMapping, Mapping[str, Any]]]):\n Describes how the graph underlying the asset is configured at runtime.\n\n If a :py:class:`ConfigMapping` object is provided, then the graph takes on the config\n schema of this object. The mapping will be applied at runtime to generate the config for\n the graph's constituent nodes.\n\n If a dictionary is provided, then it will be used as the default run config for the\n graph. 
This means it must conform to the config schema of the underlying nodes. Note\n that the values provided will be viewable and editable in the Dagster UI, so be careful\n with secrets.\n\n If no value is provided, then the config schema for the graph is the default (derived\n from the underlying nodes).\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in Dagster (i.e. only\n contains letters, numbers, and underscores) and may not contain Python reserved keywords.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If\n not provided, the name "default" is used.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n metadata (Optional[MetadataUserInput]): Dictionary of metadata to be associated with\n the asset.\n freshness_policy (Optional[FreshnessPolicy]): A constraint telling Dagster how often this asset is\n intended to be updated with respect to its root data.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): The AutoMaterializePolicy to use\n for this asset.\n backfill_policy (Optional[BackfillPolicy]): The BackfillPolicy to use for this asset.\n key (Optional[CoercibleToAssetKey]): The key for this asset. If provided, key_prefix and name cannot be specified.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def fetch_files_from_slack(context) -> pd.DataFrame:\n ...\n\n @op\n def store_files_in_table(files) -> None:\n files.to_sql(name="slack_files", con=create_db_connection())\n\n @graph_asset\n def slack_files_table():\n return store_files_in_table(fetch_files_from_slack())\n """\n if compose_fn is None:\n return lambda fn: graph_asset( # type: ignore # (decorator pattern)\n fn,\n name=name,\n description=description,\n ins=ins,\n config=config,\n key_prefix=key_prefix,\n group_name=group_name,\n partitions_def=partitions_def,\n metadata=metadata,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n resource_defs=resource_defs,\n check_specs=check_specs,\n key=key,\n )\n else:\n return graph_asset_no_defaults(\n compose_fn=compose_fn,\n name=name,\n description=description,\n ins=ins,\n config=config,\n key_prefix=key_prefix,\n group_name=group_name,\n partitions_def=partitions_def,\n metadata=metadata,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n resource_defs=resource_defs,\n check_specs=check_specs,\n key=key,\n )
\n\n\ndef graph_asset_no_defaults(\n *,\n compose_fn: Callable,\n name: Optional[str],\n description: Optional[str],\n ins: Optional[Mapping[str, AssetIn]],\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n group_name: Optional[str],\n partitions_def: Optional[PartitionsDefinition],\n metadata: Optional[MetadataUserInput],\n freshness_policy: Optional[FreshnessPolicy],\n auto_materialize_policy: Optional[AutoMaterializePolicy],\n backfill_policy: Optional[BackfillPolicy],\n resource_defs: Optional[Mapping[str, ResourceDefinition]],\n check_specs: Optional[Sequence[AssetCheckSpec]],\n key: Optional[CoercibleToAssetKey],\n) -> AssetsDefinition:\n ins = ins or {}\n asset_ins = build_asset_ins(compose_fn, ins or {}, set())\n out_asset_key, _asset_name = _resolve_key_and_name(\n key=key,\n key_prefix=key_prefix,\n name=name,\n decorator="@graph_asset",\n fn=compose_fn,\n )\n\n keys_by_input_name = {input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()}\n partition_mappings = {\n input_name: asset_in.partition_mapping\n for input_name, asset_in in ins.items()\n if asset_in.partition_mapping\n }\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, [out_asset_key]\n )\n check_outs_by_output_name: Mapping[str, GraphOut] = {\n output_name: GraphOut() for output_name in check_specs_by_output_name.keys()\n }\n\n combined_outs_by_output_name: Mapping = {\n "result": GraphOut(),\n **check_outs_by_output_name,\n }\n\n op_graph = graph(\n name=out_asset_key.to_python_identifier(),\n description=description,\n config=config,\n ins={input_name: GraphIn() for _, (input_name, _) in asset_ins.items()},\n out=combined_outs_by_output_name,\n )(compose_fn)\n return AssetsDefinition.from_graph(\n op_graph,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name={"result": out_asset_key},\n partitions_def=partitions_def,\n 
partition_mappings=partition_mappings if partition_mappings else None,\n group_name=group_name,\n metadata_by_output_name={"result": metadata} if metadata else None,\n freshness_policies_by_output_name=(\n {"result": freshness_policy} if freshness_policy else None\n ),\n auto_materialize_policies_by_output_name=(\n {"result": auto_materialize_policy} if auto_materialize_policy else None\n ),\n backfill_policy=backfill_policy,\n descriptions_by_output_name={"result": description} if description else None,\n resource_defs=resource_defs,\n check_specs=check_specs,\n )\n\n\n
[docs]def graph_multi_asset(\n *,\n outs: Mapping[str, AssetOut],\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n group_name: Optional[str] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same graph of\n ops, and the same upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n Args:\n name (Optional[str]): The name of the graph.\n outs: (Optional[Dict[str, AssetOut]]): The AssetOuts representing the produced assets.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n backfill_policy (Optional[BackfillPolicy]): The backfill policy for the asset.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n can_subset (bool): Whether this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. 
Defaults to False.\n """\n\n def inner(fn: Callable) -> AssetsDefinition:\n partition_mappings = {\n input_name: asset_in.partition_mapping\n for input_name, asset_in in (ins or {}).items()\n if asset_in.partition_mapping\n }\n\n asset_ins = build_asset_ins(fn, ins or {}, set())\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n asset_outs = build_asset_outs(outs)\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(asset_outs.keys())\n )\n check_outs_by_output_name: Mapping[str, GraphOut] = {\n output_name: GraphOut() for output_name in check_specs_by_output_name.keys()\n }\n\n combined_outs_by_output_name = {\n **{output_name: GraphOut() for output_name, _ in asset_outs.values()},\n **check_outs_by_output_name,\n }\n\n op_graph = graph(\n name=name or fn.__name__,\n out=combined_outs_by_output_name,\n )(fn)\n\n # source metadata from the AssetOuts (if any)\n metadata_by_output_name = {\n output_name: out.metadata\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.metadata is not None\n }\n\n # source freshness policies from the AssetOuts (if any)\n freshness_policies_by_output_name = {\n output_name: out.freshness_policy\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.freshness_policy is not None\n }\n\n # source auto materialize policies from the AssetOuts (if any)\n auto_materialize_policies_by_output_name = {\n output_name: out.auto_materialize_policy\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.auto_materialize_policy is not None\n }\n\n # source descriptions from the AssetOuts (if any)\n descriptions_by_output_name = {\n output_name: out.description\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.description is not None\n }\n\n return AssetsDefinition.from_graph(\n op_graph,\n keys_by_input_name=keys_by_input_name,\n 
keys_by_output_name={\n output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n },\n partitions_def=partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n group_name=group_name,\n can_subset=can_subset,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n descriptions_by_output_name=descriptions_by_output_name,\n resource_defs=resource_defs,\n check_specs=check_specs,\n )\n\n return inner
\n\n\ndef build_asset_outs(asset_outs: Mapping[str, AssetOut]) -> Mapping[AssetKey, Tuple[str, Out]]:\n """Creates a mapping from AssetKey to (name of output, Out object)."""\n outs_by_asset_key: Dict[AssetKey, Tuple[str, Out]] = {}\n for output_name, asset_out in asset_outs.items():\n out = asset_out.to_out()\n asset_key = asset_out.key or AssetKey(\n list(filter(None, [*(asset_out.key_prefix or []), output_name]))\n )\n\n outs_by_asset_key[asset_key] = (output_name.replace("-", "_"), out)\n\n return outs_by_asset_key\n\n\ndef _deps_and_non_argument_deps_to_asset_deps(\n deps: Optional[Iterable[CoercibleToAssetDep]],\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]],\n) -> Optional[Iterable[AssetDep]]:\n """Helper function for managing deps and non_argument_deps while non_argument_deps is still an accepted parameter.\n Ensures only one of deps and non_argument_deps is provided, then converts the deps to AssetDeps.\n """\n if non_argument_deps is not None and deps is not None:\n raise DagsterInvalidDefinitionError(\n "Cannot specify both deps and non_argument_deps to @asset. Use only deps instead."\n )\n\n if deps is not None:\n return _make_asset_deps(deps)\n\n if non_argument_deps is not None:\n check.set_param(non_argument_deps, "non_argument_deps", of_type=(AssetKey, str))\n return _make_asset_deps(non_argument_deps)\n\n\ndef _make_asset_deps(deps: Optional[Iterable[CoercibleToAssetDep]]) -> Optional[Iterable[AssetDep]]:\n if deps is None:\n return None\n\n # expand any multi_assets into a list of keys\n all_deps = []\n for dep in deps:\n if isinstance(dep, AssetsDefinition) and len(dep.keys) > 1:\n all_deps.extend(dep.keys)\n else:\n all_deps.append(dep)\n\n with disable_dagster_warnings():\n dep_dict = {}\n for dep in all_deps:\n asset_dep = AssetDep.from_coercible(dep)\n\n # we cannot do deduplication via a set because MultiPartitionMappings have an internal\n # dictionary that cannot be hashed. 
Instead deduplicate by making a dictionary and checking\n # for existing keys. If an asset is specified as a dependency more than once, only error if the\n # dependency is different (ie has a different PartitionMapping)\n if (\n asset_dep.asset_key in dep_dict.keys()\n and asset_dep != dep_dict[asset_dep.asset_key]\n ):\n raise DagsterInvariantViolationError(\n f"Cannot set a dependency on asset {asset_dep.asset_key} more than once per"\n " asset."\n )\n dep_dict[asset_dep.asset_key] = asset_dep\n\n return list(dep_dict.values())\n\n\ndef _validate_and_assign_output_names_to_check_specs(\n check_specs: Optional[Sequence[AssetCheckSpec]], valid_asset_keys: Sequence[AssetKey]\n) -> Mapping[str, AssetCheckSpec]:\n check_specs_by_output_name = {spec.get_python_identifier(): spec for spec in check_specs or []}\n if check_specs and len(check_specs_by_output_name) != len(check_specs):\n duplicates = {\n item: count\n for item, count in Counter(\n [(spec.asset_key, spec.name) for spec in check_specs]\n ).items()\n if count > 1\n }\n\n raise DagsterInvalidDefinitionError(f"Duplicate check specs: {duplicates}")\n\n for spec in check_specs_by_output_name.values():\n if spec.asset_key not in valid_asset_keys:\n raise DagsterInvalidDefinitionError(\n f"Invalid asset key {spec.asset_key} in check spec {spec.name}. Must be one of"\n f" {valid_asset_keys}"\n )\n\n return check_specs_by_output_name\n\n\ndef _get_partition_mappings_from_deps(\n partition_mappings: Dict[AssetKey, PartitionMapping], deps: Iterable[AssetDep], asset_name: str\n):\n # Add PartitionMappings specified via AssetDeps to partition_mappings dictionary. 
Error on duplicates\n for dep in deps:\n if dep.partition_mapping is None:\n continue\n if partition_mappings.get(dep.asset_key, None) is None:\n partition_mappings[dep.asset_key] = dep.partition_mapping\n continue\n if partition_mappings[dep.asset_key] == dep.partition_mapping:\n continue\n else:\n raise DagsterInvalidDefinitionError(\n f"Two different PartitionMappings for {dep.asset_key} provided for"\n f" asset {asset_name}. Please use the same PartitionMapping for"\n f" {dep.asset_key}."\n )\n\n return partition_mappings\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/asset_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.asset_decorator"}, "graph_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.graph_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Mapping, Optional, Sequence, Union, overload\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..input import GraphIn, InputDefinition\nfrom ..output import GraphOut, OutputDefinition\n\n\nclass _Graph:\n    name: Optional[str]\n    description: Optional[str]\n    input_defs: Sequence[InputDefinition]\n    output_defs: Optional[Sequence[OutputDefinition]]\n    ins: Optional[Mapping[str, GraphIn]]\n    out: Optional[Union[GraphOut, Mapping[str, GraphOut]]]\n    tags: Optional[Mapping[str, str]]\n    config_mapping: Optional[ConfigMapping]\n\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        input_defs: Optional[Sequence[InputDefinition]] = None,\n        output_defs: Optional[Sequence[OutputDefinition]] = None,\n        ins: Optional[Mapping[str, GraphIn]] = None,\n        out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        config_mapping: Optional[ConfigMapping] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.input_defs = check.opt_sequence_param(\n            input_defs, "input_defs", of_type=InputDefinition\n        )\n        self.did_pass_outputs = output_defs is not None or out is not None\n        self.output_defs = check.opt_nullable_sequence_param(\n            output_defs, "output_defs", of_type=OutputDefinition\n        )\n        self.ins = ins\n        self.out = out\n        self.tags = tags\n        self.config_mapping = check.opt_inst_param(config_mapping, "config_mapping", ConfigMapping)\n\n    def __call__(self, fn: Callable[..., Any]) -> GraphDefinition:\n        
check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        if self.ins is not None:\n            input_defs = [inp.to_definition(name) for name, inp in self.ins.items()]\n        else:\n            input_defs = check.opt_list_param(\n                self.input_defs, "input_defs", of_type=InputDefinition\n            )\n\n        if self.out is None:\n            output_defs = self.output_defs\n        elif isinstance(self.out, GraphOut):\n            output_defs = [self.out.to_definition(name=None)]\n        else:\n            check.dict_param(self.out, "out", key_type=str, value_type=GraphOut)\n            output_defs = [out.to_definition(name=name) for name, out in self.out.items()]\n\n        from dagster._core.definitions.composition import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            node_defs,\n            config_mapping,\n            positional_inputs,\n            node_input_source_assets,\n        ) = do_composition(\n            decorator_name="@graph",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=input_defs,\n            provided_output_defs=output_defs,\n            ignore_output_from_composition_fn=False,\n            config_mapping=self.config_mapping,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=node_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n            node_input_source_assets=node_input_source_assets,\n        )\n        update_wrapper(graph_def, fn)\n        return graph_def\n\n\n@overload\ndef graph(compose_fn: Callable) -> 
GraphDefinition: ...\n\n\n@overload\ndef graph(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    input_defs: Optional[Sequence[InputDefinition]] = ...,\n    output_defs: Optional[Sequence[OutputDefinition]] = ...,\n    ins: Optional[Mapping[str, GraphIn]] = ...,\n    out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = ...,\n) -> _Graph: ...\n\n\n
[docs]def graph(\n compose_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n input_defs: Optional[Sequence[InputDefinition]] = None,\n output_defs: Optional[Sequence[OutputDefinition]] = None,\n ins: Optional[Mapping[str, GraphIn]] = None,\n out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n) -> Union[GraphDefinition, _Graph]:\n """Create an op graph with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up a dependency graph by writing a\n function that invokes ops (or other graphs) and passes the output to subsequent invocations.\n\n Args:\n name (Optional[str]):\n The name of the op graph. Must be unique within any :py:class:`RepositoryDefinition` containing the graph.\n description (Optional[str]):\n A human-readable description of the graph.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit InputDefinitions taking precedence.\n\n Uses of inputs in the body of the decorated composition function will determine\n the :py:class:`InputMappings <InputMapping>` passed to the underlying\n :py:class:`GraphDefinition`.\n output_defs (Optional[List[OutputDefinition]]):\n Output definitions for the graph. 
If not provided explicitly, these will be inferred from typehints.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`GraphDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n ins (Optional[Dict[str, GraphIn]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit GraphIn taking precedence.\n out (Optional[Union[GraphOut, Dict[str, GraphOut]]]):\n Information about the outputs that this graph maps. Information provided here will be\n combined with what can be inferred from the return type signature if the function does\n not use yield.\n\n To map multiple outputs, return a dictionary from the composition function.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n config (Optional[Union[ConfigMapping], Mapping[str, Any]):\n Describes how the graph is configured at runtime.\n\n If a :py:class:`ConfigMapping` object is provided, then the graph takes on the config\n schema of this object. The mapping will be applied at runtime to generate the config for\n the graph's constituent nodes.\n\n If a dictionary is provided, then it will be used as the default run config for the\n graph. This means it must conform to the config schema of the underlying nodes. Note\n that the values provided will be viewable and editable in the Dagster UI, so be careful\n with secrets. 
its constituent nodes.\n\n If no value is provided, then the config schema for the graph is the default (derived\n from the underlying nodes).\n """\n if compose_fn is not None:\n check.invariant(description is None)\n return _Graph()(compose_fn)\n\n config_mapping = None\n # Case 1: a dictionary of config is provided, convert to config mapping.\n if config is not None and not isinstance(config, ConfigMapping):\n config = check.dict_param(config, "config", key_type=str)\n config_mapping = ConfigMapping(config_fn=lambda _: config, config_schema=None)\n # Case 2: actual config mapping is provided.\n else:\n config_mapping = config\n\n return _Graph(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n ins=ins,\n out=out,\n tags=tags,\n config_mapping=config_mapping,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/graph_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.graph_decorator"}, "hook_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.hook_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ...decorator_utils import get_function_params, validate_expected_params\nfrom ..events import HookExecutionResult\nfrom ..hook_definition import HookDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.hook import HookContext\n\n\ndef _validate_hook_fn_params(fn, expected_positionals):\n    params = get_function_params(fn)\n    missing_positional = validate_expected_params(params, expected_positionals)\n    if missing_positional:\n        raise DagsterInvalidDefinitionError(\n            f"'{fn.__name__}' decorated function does not have required positional "\n            f"parameter '{missing_positional}'. Hook functions should only have keyword arguments "\n            "that match input names and a first positional parameter named 'context' and "\n            "a second positional parameter named 'event_list'."\n        )\n\n\nclass _Hook:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        decorated_fn: Optional[Callable[..., Any]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.required_resource_keys = check.opt_set_param(\n            required_resource_keys, "required_resource_keys"\n        )\n        self.decorated_fn = check.opt_callable_param(decorated_fn, "decorated_fn")\n\n    def __call__(self, fn) -> HookDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        expected_positionals = ["context", "event_list"]\n\n        _validate_hook_fn_params(fn, expected_positionals)\n\n        
hook_def = HookDefinition(\n            name=self.name or "",\n            hook_fn=fn,\n            required_resource_keys=self.required_resource_keys,\n            decorated_fn=self.decorated_fn or fn,\n        )\n        update_wrapper(cast(Callable[..., Any], hook_def), fn)\n        return hook_def\n\n\n@overload\ndef event_list_hook(\n    hook_fn: Callable,\n) -> HookDefinition:\n    pass\n\n\n@overload\ndef event_list_hook(\n    *,\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    decorated_fn: Optional[Callable[..., Any]] = ...,\n) -> _Hook:\n    pass\n\n\ndef event_list_hook(\n    hook_fn: Optional[Callable] = None,\n    *,\n    name: Optional[str] = None,\n    required_resource_keys: Optional[AbstractSet[str]] = None,\n    decorated_fn: Optional[Callable[..., Any]] = None,\n) -> Union[HookDefinition, _Hook]:\n    """Create a generic hook with the specified parameters from the decorated function.\n\n    This decorator is currently used internally by Dagster machinery to support success_hook and\n    failure_hook.\n\n    The user-defined hook function requires two parameters:\n    - A `context` object is passed as the first parameter. The context is an instance of\n        :py:class:`context <HookContext>`, and provides access to system\n        information, such as loggers (context.log), resources (context.resources), the op\n        (context.op) and its execution step (context.step) which triggers this hook.\n    - An `event_list` object is passed as the second paramter. It provides the full event list of the\n        associated execution step.\n\n    Args:\n        name (Optional[str]): The name of this hook.\n        required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n            hook.\n\n    Examples:\n        .. 
code-block:: python\n\n            @event_list_hook(required_resource_keys={'slack'})\n            def slack_on_materializations(context, event_list):\n                for event in event_list:\n                    if event.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n                        message = f'{context.op_name} has materialized an asset {event.asset_key}.'\n                        # send a slack message every time a materialization event occurs\n                        context.resources.slack.send_message(message)\n\n\n    """\n    # This case is for when decorator is used bare, without arguments.\n    # e.g. @event_list_hook versus @event_list_hook()\n    if hook_fn is not None:\n        check.invariant(required_resource_keys is None)\n        return _Hook()(hook_fn)\n\n    return _Hook(\n        name=name, required_resource_keys=required_resource_keys, decorated_fn=decorated_fn\n    )\n\n\nSuccessOrFailureHookFn = Callable[["HookContext"], Any]\n\n\n@overload\ndef success_hook(hook_fn: SuccessOrFailureHookFn) -> HookDefinition: ...\n\n\n@overload\ndef success_hook(\n    *,\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]: ...\n\n\n
[docs]def success_hook(\n hook_fn: Optional[SuccessOrFailureHookFn] = None,\n *,\n name: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step success events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n .. code-block:: python\n\n @success_hook(required_resource_keys={'slack'})\n def slack_message_on_success(context):\n message = 'op {} succeeded'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @success_hook\n def do_something_on_success(context):\n do_something()\n\n\n """\n\n def wrapper(fn: SuccessOrFailureHookFn) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(name=_name, required_resource_keys=required_resource_keys, decorated_fn=fn)\n def _success_hook(\n context: "HookContext", event_list: Sequence["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_success:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _success_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @success_hook\n if hook_fn is not None:\n check.invariant(required_resource_keys is None)\n return wrapper(hook_fn)\n\n return wrapper
\n\n\n@overload\ndef failure_hook(name: SuccessOrFailureHookFn) -> HookDefinition: ...\n\n\n@overload\ndef failure_hook(\n name: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]: ...\n\n\n
[docs]def failure_hook(\n name: Optional[Union[SuccessOrFailureHookFn, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step failure events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n .. code-block:: python\n\n @failure_hook(required_resource_keys={'slack'})\n def slack_message_on_failure(context):\n message = 'op {} failed'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @failure_hook\n def do_something_on_failure(context):\n do_something()\n\n\n """\n\n def wrapper(fn: Callable[["HookContext"], Any]) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(name=_name, required_resource_keys=required_resource_keys, decorated_fn=fn)\n def _failure_hook(\n context: "HookContext", event_list: Sequence["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_failure:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _failure_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @failure_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/hook_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.hook_decorator"}, "job_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.job_decorator

\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Mapping, Optional, Union, overload\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..hook_definition import HookDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..metadata import RawMetadataValue\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from ..executor_definition import ExecutorDefinition\n    from ..partition import PartitionedConfig, PartitionsDefinition\n    from ..run_config import RunConfig\n\n\nclass _Job:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n        config: Optional[\n            Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"]\n        ] = None,\n        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet[HookDefinition]] = None,\n        op_retry_policy: Optional[RetryPolicy] = None,\n        version_strategy: Optional[VersionStrategy] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        input_values: Optional[Mapping[str, object]] = None,\n    ):\n        from dagster._core.definitions.run_config import convert_config_input\n\n        self.name = name\n        self.description = description\n        self.tags = tags\n        self.metadata = 
metadata\n        self.resource_defs = resource_defs\n        self.config = convert_config_input(config)\n        self.logger_defs = logger_defs\n        self.executor_def = executor_def\n        self.hooks = hooks\n        self.op_retry_policy = op_retry_policy\n        self.version_strategy = version_strategy\n        self.partitions_def = partitions_def\n        self.input_values = input_values\n\n    def __call__(self, fn: Callable[..., Any]) -> JobDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        from dagster._core.definitions.composition import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            node_defs,\n            config_mapping,\n            positional_inputs,\n            node_input_source_assets,\n        ) = do_composition(\n            decorator_name="@job",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=[],\n            provided_output_defs=[],\n            ignore_output_from_composition_fn=False,\n            config_mapping=None,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=node_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n            node_input_source_assets=node_input_source_assets,\n        )\n\n        job_def = graph_def.to_job(\n            description=self.description or format_docstring_for_description(fn),\n            resource_defs=self.resource_defs,\n            config=self.config,\n            tags=self.tags,\n            metadata=self.metadata,\n            logger_defs=self.logger_defs,\n            
executor_def=self.executor_def,\n            hooks=self.hooks,\n            op_retry_policy=self.op_retry_policy,\n            version_strategy=self.version_strategy,\n            partitions_def=self.partitions_def,\n            input_values=self.input_values,\n        )\n        update_wrapper(job_def, fn)\n        return job_def\n\n\n@overload\ndef job(compose_fn: Callable[..., Any]) -> JobDefinition: ...\n\n\n@overload\ndef job(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    resource_defs: Optional[Mapping[str, object]] = ...,\n    config: Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    metadata: Optional[Mapping[str, RawMetadataValue]] = ...,\n    logger_defs: Optional[Mapping[str, LoggerDefinition]] = ...,\n    executor_def: Optional["ExecutorDefinition"] = ...,\n    hooks: Optional[AbstractSet[HookDefinition]] = ...,\n    op_retry_policy: Optional[RetryPolicy] = ...,\n    version_strategy: Optional[VersionStrategy] = ...,\n    partitions_def: Optional["PartitionsDefinition"] = ...,\n    input_values: Optional[Mapping[str, object]] = ...,\n) -> _Job: ...\n\n\n
[docs]@deprecated_param(\n param="version_strategy",\n breaking_version="2.0",\n additional_warn_text="Use asset versioning instead.",\n)\ndef job(\n compose_fn: Optional[Callable[..., Any]] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"]\n ] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n) -> Union[JobDefinition, _Job]:\n """Creates a job with the specified parameters from the decorated graph/op invocation function.\n\n Using this decorator allows you to build an executable job by writing a function that invokes\n ops (or graphs).\n\n Args:\n compose_fn (Callable[..., Any]:\n The decorated function. The body should contain op or graph invocations. Unlike op\n functions, does not accept a context argument.\n name (Optional[str]):\n The name for the Job. 
Defaults to the name of the this graph.\n resource_defs (Optional[Mapping[str, object]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`RunConfig` object is provided, then it will be used directly as the run config\n for the job whenever the job is executed, similar to providing a dictionary.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagster UI, so be careful with secrets.\n tags (Optional[Dict[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. 
These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\n Keys must be strings, and values must be python primitive types or one of the provided\n MetadataValue types\n logger_defs (Optional[Dict[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multiprocess_executor` .\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoization will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition keys\n that can parameterize the job. If this argument is supplied, the config argument\n can't also be supplied.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(in1):\n return in1 + 1\n\n @job\n def job1():\n add_one(return_one())\n """\n if compose_fn is not None:\n check.invariant(description is None)\n return _Job()(compose_fn)\n\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return _Job(\n name=name,\n description=description,\n resource_defs=wrap_resources_for_execution(resource_defs),\n config=config,\n tags=tags,\n metadata=metadata,\n logger_defs=logger_defs,\n executor_def=executor_def,\n hooks=hooks,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n partitions_def=partitions_def,\n input_values=input_values,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/job_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.job_decorator"}, "op_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.op_decorator

\nfrom functools import lru_cache, update_wrapper\nfrom inspect import Parameter\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.decorator_utils import (\n    format_docstring_for_description,\n    get_function_params,\n    get_valid_name_permutations,\n    param_is_var_keyword,\n    positional_arg_name_list,\n)\nfrom dagster._core.definitions.inference import infer_input_props\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import DagsterTypeKind\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom ..input import In, InputDefinition\nfrom ..output import Out\nfrom ..policy import RetryPolicy\nfrom ..utils import DEFAULT_OUTPUT\n\nif TYPE_CHECKING:\n    from ..op_definition import OpDefinition\n\n\nclass _Op:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        code_version: Optional[str] = None,\n        decorator_takes_context: Optional[bool] = True,\n        retry_policy: Optional[RetryPolicy] = None,\n        ins: Optional[Mapping[str, In]] = None,\n        out: Optional[Union[Out, Mapping[str, Out]]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.decorator_takes_context = check.bool_param(\n            decorator_takes_context, "decorator_takes_context"\n        )\n\n        self.description = 
check.opt_str_param(description, "description")\n\n        # these will be checked within OpDefinition\n        self.required_resource_keys = required_resource_keys\n        self.tags = tags\n        self.code_version = code_version\n        self.retry_policy = retry_policy\n\n        # config will be checked within OpDefinition\n        self.config_schema = config_schema\n\n        self.ins = check.opt_nullable_mapping_param(ins, "ins", key_type=str, value_type=In)\n        self.out = out\n\n    def __call__(self, fn: Callable[..., Any]) -> "OpDefinition":\n        from dagster._config.pythonic_config import validate_resource_annotated_function\n\n        from ..op_definition import OpDefinition\n\n        validate_resource_annotated_function(fn)\n\n        if not self.name:\n            self.name = fn.__name__\n\n        compute_fn = (\n            DecoratedOpFunction(decorated_fn=fn)\n            if self.decorator_takes_context\n            else NoContextDecoratedOpFunction(decorated_fn=fn)\n        )\n\n        if compute_fn.has_config_arg():\n            check.param_invariant(\n                self.config_schema is None or self.config_schema == {},\n                "If the @op has a config arg, you cannot specify a config schema",\n            )\n\n            from dagster._config.pythonic_config import infer_schema_from_config_annotation\n\n            # Parse schema from the type annotation of the config arg\n            config_arg = compute_fn.get_config_arg()\n            config_arg_type = config_arg.annotation\n            config_arg_default = config_arg.default\n            self.config_schema = infer_schema_from_config_annotation(\n                config_arg_type, config_arg_default\n            )\n\n        outs: Optional[Mapping[str, Out]] = None\n        if self.out is not None and isinstance(self.out, Out):\n            outs = {DEFAULT_OUTPUT: self.out}\n        elif self.out is not None:\n            outs = check.mapping_param(self.out, "out", 
key_type=str, value_type=Out)\n\n        arg_resource_keys = {arg.name for arg in compute_fn.get_resource_args()}\n        decorator_resource_keys = set(self.required_resource_keys or [])\n        check.param_invariant(\n            len(decorator_resource_keys) == 0 or len(arg_resource_keys) == 0,\n            "Cannot specify resource requirements in both @op decorator and as arguments to the"\n            " decorated function",\n        )\n        resolved_resource_keys = decorator_resource_keys.union(arg_resource_keys)\n\n        op_def = OpDefinition.dagster_internal_init(\n            name=self.name,\n            ins=self.ins,\n            outs=outs,\n            compute_fn=compute_fn,\n            config_schema=self.config_schema,\n            description=self.description or format_docstring_for_description(fn),\n            required_resource_keys=resolved_resource_keys,\n            tags=self.tags,\n            code_version=self.code_version,\n            retry_policy=self.retry_policy,\n            version=None,  # code_version has replaced version\n        )\n        update_wrapper(op_def, compute_fn.decorated_fn)\n        return op_def\n\n\n@overload\ndef op(compute_fn: Callable[..., Any]) -> "OpDefinition": ...\n\n\n@overload\ndef op(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    ins: Optional[Mapping[str, In]] = ...,\n    out: Optional[Union[Out, Mapping[str, Out]]] = ...,\n    config_schema: Optional[UserConfigSchema] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    version: Optional[str] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    code_version: Optional[str] = ...,\n) -> _Op: ...\n\n\n
[docs]@deprecated_param(\n param="version", breaking_version="2.0", additional_warn_text="Use `code_version` instead"\n)\ndef op(\n compute_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, In]] = None,\n out: Optional[Union[Out, Mapping[str, Out]]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n) -> Union["OpDefinition", _Op]:\n """Create an op with the specified parameters from the decorated function.\n\n Ins and outs will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the op's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n @op supports ``async def`` functions as well, including async generators when yielding multiple\n events or outputs. Note that async ops will generally be run on their own unless using a custom\n :py:class:`Executor` implementation that supports running them together.\n\n Args:\n name (Optional[str]): Name of op. 
Must be unique within any :py:class:`GraphDefinition`\n using the op.\n description (Optional[str]): Human-readable description of this op. If not provided, and\n the decorated function has docstring, that docstring will be used as the description.\n ins (Optional[Dict[str, In]]):\n Information about the inputs to the op. Information provided here will be combined\n with what can be inferred from the function signature.\n out (Optional[Union[Out, Dict[str, Out]]]):\n Information about the op outputs. Information provided here will be combined with\n what can be inferred from the return type signature if the function does not use yield.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the op matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Values that are not strings\n will be json encoded and must meet the criteria that `json.loads(json.dumps(value)) == value`.\n code_version (Optional[str]): (Experimental) Version of the logic encapsulated by the op. If set,\n this is used as a default version for all outputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def hello_world():\n print('hello')\n\n @op\n def echo(msg: str) -> str:\n return msg\n\n @op(\n ins={'msg': In(str)},\n out=Out(str)\n )\n def echo_2(msg): # same as above\n return msg\n\n @op(\n out={'word': Out(), 'num': Out()}\n )\n def multi_out() -> Tuple[str, int]:\n return 'cool', 4\n """\n code_version = normalize_renamed_param(\n code_version,\n "code_version",\n version,\n "version",\n )\n\n if compute_fn is not None:\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Op()(compute_fn)\n\n return _Op(\n name=name,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n code_version=code_version,\n retry_policy=retry_policy,\n ins=ins,\n out=out,\n )
\n\n\nclass DecoratedOpFunction(NamedTuple):\n """Wrapper around the decorated op function to provide commonly used util methods."""\n\n decorated_fn: Callable[..., Any]\n\n @property\n def name(self):\n return self.decorated_fn.__name__\n\n @lru_cache(maxsize=1)\n def has_context_arg(self) -> bool:\n return is_context_provided(get_function_params(self.decorated_fn))\n\n def get_context_arg(self) -> Parameter:\n if self.has_context_arg():\n return get_function_params(self.decorated_fn)[0]\n check.failed("Requested context arg on function that does not have one")\n\n @lru_cache(maxsize=1)\n def _get_function_params(self) -> Sequence[Parameter]:\n return get_function_params(self.decorated_fn)\n\n def has_config_arg(self) -> bool:\n for param in get_function_params(self.decorated_fn):\n if param.name == "config":\n return True\n\n return False\n\n def get_config_arg(self) -> Parameter:\n for param in get_function_params(self.decorated_fn):\n if param.name == "config":\n return param\n\n check.failed("Requested config arg on function that does not have one")\n\n def get_resource_args(self) -> Sequence[Parameter]:\n return get_resource_args(self.decorated_fn)\n\n def positional_inputs(self) -> Sequence[str]:\n params = self._get_function_params()\n input_args = params[1:] if self.has_context_arg() else params\n resource_arg_names = [arg.name for arg in self.get_resource_args()]\n input_args_filtered = [\n input_arg\n for input_arg in input_args\n if input_arg.name != "config" and input_arg.name not in resource_arg_names\n ]\n return positional_arg_name_list(input_args_filtered)\n\n def has_var_kwargs(self) -> bool:\n params = self._get_function_params()\n # var keyword arg has to be the last argument\n return len(params) > 0 and param_is_var_keyword(params[-1])\n\n def get_output_annotation(self) -> Any:\n from ..inference import infer_output_props\n\n return infer_output_props(self.decorated_fn).annotation\n\n\nclass NoContextDecoratedOpFunction(DecoratedOpFunction):\n 
"""Wrapper around a decorated op function, when the decorator does not permit a context\n parameter.\n """\n\n @lru_cache(maxsize=1)\n def has_context_arg(self) -> bool:\n return False\n\n\ndef is_context_provided(params: Sequence[Parameter]) -> bool:\n if len(params) == 0:\n return False\n return params[0].name in get_valid_name_permutations("context")\n\n\ndef resolve_checked_op_fn_inputs(\n decorator_name: str,\n fn_name: str,\n compute_fn: DecoratedOpFunction,\n explicit_input_defs: Sequence[InputDefinition],\n exclude_nothing: bool,\n) -> Sequence[InputDefinition]:\n """Validate provided input definitions and infer the remaining from the type signature of the compute_fn.\n Returns the resolved set of InputDefinitions.\n\n Args:\n decorator_name (str): Name of the decorator that is wrapping the op function.\n fn_name (str): Name of the decorated function.\n compute_fn (DecoratedOpFunction): The decorated function, wrapped in the\n DecoratedOpFunction wrapper.\n explicit_input_defs (List[InputDefinition]): The input definitions that were explicitly\n provided in the decorator.\n exclude_nothing (bool): True if Nothing type inputs should be excluded from compute_fn\n arguments.\n """\n explicit_names = set()\n if exclude_nothing:\n explicit_names = set(\n inp.name\n for inp in explicit_input_defs\n if not inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n nothing_names = set(\n inp.name\n for inp in explicit_input_defs\n if inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n else:\n explicit_names = set(inp.name for inp in explicit_input_defs)\n nothing_names = set()\n\n params = get_function_params(compute_fn.decorated_fn)\n\n input_args = params[1:] if compute_fn.has_context_arg() else params\n\n # filter out config arg\n resource_arg_names = {arg.name for arg in compute_fn.get_resource_args()}\n explicit_names = explicit_names - resource_arg_names\n\n if compute_fn.has_config_arg() or resource_arg_names:\n new_input_args = []\n for input_arg in 
input_args:\n if input_arg.name != "config" and input_arg.name not in resource_arg_names:\n new_input_args.append(input_arg)\n input_args = new_input_args\n\n # Validate input arguments\n used_inputs = set()\n inputs_to_infer = set()\n has_kwargs = False\n\n for param in cast(List[Parameter], input_args):\n if param.kind == Parameter.VAR_KEYWORD:\n has_kwargs = True\n elif param.kind == Parameter.VAR_POSITIONAL:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has positional vararg parameter "\n f"'{param}'. {decorator_name} decorated functions should only have keyword "\n "arguments that match input names and, if system information is required, a first "\n "positional parameter named 'context'."\n )\n\n else:\n if param.name not in explicit_names:\n if param.name in nothing_names:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has parameter"\n f" '{param.name}' that is one of the input_defs of type 'Nothing' which"\n " should not be included since no data will be passed for it. "\n )\n else:\n inputs_to_infer.add(param.name)\n\n else:\n used_inputs.add(param.name)\n\n undeclared_inputs = explicit_names - used_inputs\n if not has_kwargs and undeclared_inputs:\n undeclared_inputs_printed = ", '".join(undeclared_inputs)\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function does not have argument(s)"\n f" '{undeclared_inputs_printed}'. {decorator_name}-decorated functions should have a"\n " keyword argument for each of their Ins, except for Ins that have the Nothing"\n " dagster_type. 
Alternatively, they can accept **kwargs."\n )\n\n inferred_props = {\n inferred.name: inferred\n for inferred in infer_input_props(compute_fn.decorated_fn, compute_fn.has_context_arg())\n }\n input_defs = []\n for input_def in explicit_input_defs:\n if input_def.name in inferred_props:\n # combine any information missing on the explicit def that can be inferred\n input_defs.append(input_def.combine_with_inferred(inferred_props[input_def.name]))\n else:\n # pass through those that don't have any inference info, such as Nothing type inputs\n input_defs.append(input_def)\n\n # build defs from the inferred props for those without explicit entries\n inferred_input_defs = [\n InputDefinition.create_from_inferred(inferred)\n for inferred in inferred_props.values()\n if inferred.name in inputs_to_infer\n ]\n\n if exclude_nothing:\n for in_def in inferred_input_defs:\n if in_def.dagster_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n f"Input parameter {in_def.name} is annotated with"\n f" {in_def.dagster_type.display_name} which is a type that represents passing"\n " no data. This type must be used via In() and no parameter should be included"\n f" in the {decorator_name} decorated function."\n )\n\n input_defs.extend(inferred_input_defs)\n\n return input_defs\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/op_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.op_decorator"}, "repository_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.repository_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.metadata import (\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..asset_checks import AssetChecksDefinition\nfrom ..executor_definition import ExecutorDefinition\nfrom ..graph_definition import GraphDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..partitioned_schedule import UnresolvedPartitionedAssetScheduleDefinition\nfrom ..repository_definition import (\n    VALID_REPOSITORY_DATA_DICT_KEYS,\n    CachingRepositoryData,\n    PendingRepositoryDefinition,\n    PendingRepositoryListDefinition,\n    RepositoryData,\n    RepositoryDefinition,\n    RepositoryListDefinition,\n)\nfrom ..schedule_definition import ScheduleDefinition\nfrom ..sensor_definition import SensorDefinition\nfrom ..unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nT = TypeVar("T")\n\nRepositoryDictSpec: TypeAlias = Dict[str, Dict[str, RepositoryListDefinition]]\n\n\ndef _flatten(items: Iterable[Union[T, List[T]]]) -> Iterator[T]:\n    for x in items:\n        if isinstance(x, List):\n            # switch to `yield from _flatten(x)` to support multiple layers of nesting\n            yield from x\n        else:\n            yield x\n\n\nclass _Repository:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        metadata: Optional[Dict[str, RawMetadataValue]] = None,\n        default_executor_def: 
Optional[ExecutorDefinition] = None,\n        default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n        resource_key_mapping: Optional[Mapping[int, str]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.metadata = normalize_metadata(\n            check.opt_mapping_param(metadata, "metadata", key_type=str)\n        )\n        self.default_executor_def = check.opt_inst_param(\n            default_executor_def, "default_executor_def", ExecutorDefinition\n        )\n        self.default_logger_defs = check.opt_mapping_param(\n            default_logger_defs, "default_logger_defs", key_type=str, value_type=LoggerDefinition\n        )\n        self.top_level_resources = check.opt_mapping_param(\n            top_level_resources, "top_level_resources", key_type=str, value_type=ResourceDefinition\n        )\n        self.resource_key_mapping = check.opt_mapping_param(\n            resource_key_mapping, "resource_key_mapping", key_type=int, value_type=str\n        )\n\n    @overload\n    def __call__(\n        self,\n        fn: Union[\n            Callable[[], Sequence[RepositoryListDefinition]],\n            Callable[[], RepositoryDictSpec],\n        ],\n    ) -> RepositoryDefinition: ...\n\n    @overload\n    def __call__(\n        self, fn: Callable[[], Sequence[PendingRepositoryListDefinition]]\n    ) -> PendingRepositoryDefinition: ...\n\n    def __call__(\n        self,\n        fn: Union[\n            Callable[[], Sequence[PendingRepositoryListDefinition]],\n            Callable[[], RepositoryDictSpec],\n        ],\n    ) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n        from dagster._core.definitions import AssetsDefinition, SourceAsset\n        from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n        
check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        repository_definitions = fn()\n\n        repository_data: Optional[Union[CachingRepositoryData, RepositoryData]]\n        if isinstance(repository_definitions, list):\n            bad_defns = []\n            repository_defns = []\n            defer_repository_data = False\n            for i, definition in enumerate(_flatten(repository_definitions)):\n                if isinstance(definition, CacheableAssetsDefinition):\n                    defer_repository_data = True\n                elif not isinstance(\n                    definition,\n                    (\n                        JobDefinition,\n                        ScheduleDefinition,\n                        UnresolvedPartitionedAssetScheduleDefinition,\n                        SensorDefinition,\n                        GraphDefinition,\n                        AssetsDefinition,\n                        SourceAsset,\n                        UnresolvedAssetJobDefinition,\n                        AssetChecksDefinition,\n                    ),\n                ):\n                    bad_defns.append((i, type(definition)))\n                else:\n                    repository_defns.append(definition)\n\n            if bad_defns:\n                bad_definitions_str = ", ".join(\n                    [f"value of type {type_} at index {i}" for i, type_ in bad_defns]\n                )\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: all elements of list "\n                    "must be of type JobDefinition, GraphDefinition, "\n                    "ScheduleDefinition, SensorDefinition, "\n                    "AssetsDefinition, SourceAsset, or AssetChecksDefinition."\n                    f"Got {bad_definitions_str}."\n                )\n\n            repository_data = (\n                None\n                if 
defer_repository_data\n                else CachingRepositoryData.from_list(\n                    repository_defns,\n                    default_executor_def=self.default_executor_def,\n                    default_logger_defs=self.default_logger_defs,\n                    top_level_resources=self.top_level_resources,\n                    resource_key_mapping=self.resource_key_mapping,\n                )\n            )\n\n        elif isinstance(repository_definitions, dict):\n            if not set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS):\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: dict must not contain "\n                    "keys other than {{'schedules', 'sensors', 'jobs'}}: found "\n                    "{bad_keys}".format(\n                        bad_keys=", ".join(\n                            [\n                                f"'{key}'"\n                                for key in repository_definitions.keys()\n                                if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n                            ]\n                        )\n                    )\n                )\n            repository_data = CachingRepositoryData.from_dict(repository_definitions)\n        elif isinstance(repository_definitions, RepositoryData):\n            repository_data = repository_definitions\n        else:\n            raise DagsterInvalidDefinitionError(\n                "Bad return value of type {type_} from repository construction function: must "\n                "return list, dict, or RepositoryData. 
See the @repository decorator docstring for "\n                "details and examples".format(type_=type(repository_definitions)),\n            )\n\n        if isinstance(repository_definitions, list) and repository_data is None:\n            return PendingRepositoryDefinition(\n                self.name,\n                repository_definitions=list(_flatten(repository_definitions)),\n                description=self.description,\n                metadata=self.metadata,\n                default_executor_def=self.default_executor_def,\n                default_logger_defs=self.default_logger_defs,\n                _top_level_resources=self.top_level_resources,\n            )\n        else:\n            repository_def = RepositoryDefinition(\n                name=self.name,\n                description=self.description,\n                metadata=self.metadata,\n                repository_data=repository_data,\n            )\n\n            update_wrapper(repository_def, fn)\n            return repository_def\n\n\n@overload\ndef repository(\n    definitions_fn: Union[\n        Callable[[], Sequence[RepositoryListDefinition]], Callable[[], RepositoryDictSpec]\n    ],\n) -> RepositoryDefinition: ...\n\n\n@overload\ndef repository(\n    definitions_fn: Callable[..., Sequence[PendingRepositoryListDefinition]]\n) -> PendingRepositoryDefinition: ...\n\n\n@overload\ndef repository(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    metadata: Optional[Dict[str, RawMetadataValue]] = ...,\n    default_executor_def: Optional[ExecutorDefinition] = ...,\n    default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = ...,\n    _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = ...,\n    _resource_key_mapping: Optional[Mapping[int, str]] = ...,\n) -> _Repository: ...\n\n\n
[docs]def repository(\n definitions_fn: Optional[\n Union[\n Callable[[], Sequence[PendingRepositoryListDefinition]],\n Callable[[], RepositoryDictSpec],\n ]\n ] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n default_executor_def: Optional[ExecutorDefinition] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n _resource_key_mapping: Optional[Mapping[int, str]] = None,\n) -> Union[RepositoryDefinition, PendingRepositoryDefinition, _Repository]:\n """Create a repository from the decorated function.\n\n The decorated function should take no arguments and its return value should one of:\n\n 1. ``List[Union[JobDefinition, ScheduleDefinition, SensorDefinition]]``.\n Use this form when you have no need to lazy load jobs or other definitions. This is the\n typical use case.\n\n 2. A dict of the form:\n\n .. code-block:: python\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n 'sensors': Dict[str, Callable[[], SensorDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n\n 3. A :py:class:`RepositoryData`. Return this object if you need fine-grained\n control over the construction and indexing of definitions within the repository, e.g., to\n create definitions dynamically from .yaml files in a directory.\n\n Args:\n name (Optional[str]): The name of the repository. 
Defaults to the name of the decorated\n function.\n description (Optional[str]): A string description of the repository.\n metadata (Optional[Dict[str, RawMetadataValue]]): Arbitrary metadata for the repository.\n top_level_resources (Optional[Mapping[str, ResourceDefinition]]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n\n Example:\n .. code-block:: python\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n ######################################################################\n\n @op(config_schema={n: Field(Int)})\n def return_n(context):\n return context.op_config['n']\n\n @job\n def simple_job():\n return_n()\n\n @job\n def some_job():\n ...\n\n @sensor(job=some_job)\n def some_sensor():\n if foo():\n yield RunRequest(\n run_key= ...,\n run_config={\n 'ops': {'return_n': {'config': {'n': bar()}}}\n }\n )\n\n @job\n def my_job():\n ...\n\n my_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n @repository\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n # and custom metadata that will be displayed in the UI\n ######################################################################\n\n ...\n\n @repository(\n name='my_repo',\n metadata={\n 'team': 'Team A',\n 'repository_version': '1.2.3',\n 'environment': 'production',\n })\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n ######################################################################\n # A lazy-loaded repository\n ######################################################################\n\n def make_expensive_job():\n @job\n def expensive_job():\n for i in range(10000):\n return_n.alias(f'return_n_{i}')()\n\n return expensive_job\n\n 
def make_expensive_schedule():\n @job\n def other_expensive_job():\n for i in range(11000):\n return_n.alias(f'my_return_n_{i}')()\n\n return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n @repository\n def lazy_loaded_repository():\n return {\n 'jobs': {'expensive_job': make_expensive_job},\n 'schedules': {'expensive_schedule': make_expensive_schedule}\n }\n\n\n ######################################################################\n # A complex repository that lazily constructs jobs from a directory\n # of files in a bespoke YAML format\n ######################################################################\n\n class ComplexRepositoryData(RepositoryData):\n def __init__(self, yaml_directory):\n self._yaml_directory = yaml_directory\n\n def get_all_jobs(self):\n return [\n self._construct_job_def_from_yaml_file(\n self._yaml_file_for_job_name(file_name)\n )\n for file_name in os.listdir(self._yaml_directory)\n ]\n\n ...\n\n @repository\n def complex_repository():\n return ComplexRepositoryData('some_directory')\n """\n if definitions_fn is not None:\n check.invariant(description is None)\n check.invariant(len(get_function_params(definitions_fn)) == 0)\n\n return _Repository()(definitions_fn)\n\n return _Repository(\n name=name,\n description=description,\n metadata=metadata,\n default_executor_def=default_executor_def,\n default_logger_defs=default_logger_defs,\n top_level_resources=_top_level_resources,\n resource_key_mapping=_resource_key_mapping,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/repository_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.repository_decorator"}, "schedule_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.schedule_decorator

\nimport copy\nfrom functools import update_wrapper\nfrom typing import (\n    Callable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.definitions.sensor_definition import get_context_param_name\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._utils import ensure_gen\n\nfrom ..run_request import RunRequest, SkipReason\nfrom ..schedule_definition import (\n    DecoratedScheduleFunction,\n    DefaultScheduleStatus,\n    RawScheduleEvaluationFunction,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n    has_at_least_one_parameter,\n    validate_and_get_schedule_resource_dict,\n)\nfrom ..target import ExecutableDefinition\nfrom ..utils import validate_tags\n\n\n
[docs]def schedule(\n cron_schedule: Union[str, Sequence[str]],\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n tags_fn: Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]] = None,\n should_execute: Optional[Callable[[ScheduleEvaluationContext], bool]] = None,\n environment_vars: Optional[Mapping[str, str]] = None,\n execution_timezone: Optional[str] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[RawScheduleEvaluationFunction], ScheduleDefinition]:\n """Creates a schedule following the provided cron schedule and requests runs for the provided job.\n\n The decorated function takes in a :py:class:`~dagster.ScheduleEvaluationContext` as its only\n argument, and does one of the following:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Return a run config dictionary.\n 6. Yield a `SkipReason` or yield one ore more `RunRequest` objects.\n\n Returns a :py:class:`~dagster.ScheduleDefinition`.\n\n Args:\n cron_schedule (Union[str, Sequence[str]]): A valid cron string or sequence of cron strings\n specifying when the schedule will run, e.g., ``'45 23 * * 6'`` for a schedule that runs\n at 11:45 PM every Saturday. 
If a sequence is provided, then the schedule will run for\n the union of all execution times for the provided cron strings, e.g.,\n ``['45 23 * * 6', '30 9 * * 0]`` for a schedule that runs at 11:45 PM every Saturday and\n 9:30 AM every Sunday.\n name (Optional[str]): The name of the schedule to create.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A function\n that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags`` and ``tags_fn``.\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs at\n schedule execution time to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n that should execute when this schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n required_resource_keys (Optional[Set[str]]): The set of resource keys required by the schedule.\n """\n\n def inner(fn: RawScheduleEvaluationFunction) -> ScheduleDefinition:\n from dagster._config.pythonic_config import validate_resource_annotated_function\n\n check.callable_param(fn, "fn")\n validate_resource_annotated_function(fn)\n\n schedule_name = name or fn.__name__\n\n validated_tags = None\n\n # perform upfront validation of schedule tags\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n validated_tags = validate_tags(tags, allow_reserved_tags=False)\n\n context_param_name = get_context_param_name(fn)\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(fn)}\n\n def _wrapped_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n if should_execute:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n "Error occurred during the execution of should_execute for schedule"\n f" {schedule_name}"\n ),\n ):\n if not should_execute(context):\n yield SkipReason(\n f"should_execute function for {schedule_name} returned false."\n )\n return\n resources = validate_and_get_schedule_resource_dict(\n context.resources, schedule_name, resource_arg_names\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the evaluation of schedule {schedule_name}",\n ):\n context_param = {context_param_name: context} if context_param_name else {}\n result = fn(**context_param, **resources)\n\n if isinstance(result, dict):\n # this is the run-config based decorated function, wrap the evaluated run config\n # and tags in a RunRequest\n evaluated_run_config = copy.deepcopy(result)\n evaluated_tags = (\n validated_tags\n or (tags_fn and validate_tags(tags_fn(context), 
allow_reserved_tags=False))\n or None\n )\n yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n elif isinstance(result, list):\n yield from cast(List[RunRequest], result)\n else:\n # this is a run-request based decorated function\n yield from cast(RunRequestIterator, ensure_gen(result))\n\n has_context_arg = has_at_least_one_parameter(fn)\n evaluation_fn = DecoratedScheduleFunction(\n decorated_fn=fn,\n wrapped_fn=_wrapped_fn,\n has_context_arg=has_context_arg,\n )\n\n schedule_def = ScheduleDefinition.dagster_internal_init(\n name=schedule_name,\n cron_schedule=cron_schedule,\n job_name=job_name,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n description=description,\n execution_fn=evaluation_fn,\n job=job,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n run_config=None, # cannot supply run_config or run_config_fn to decorator\n run_config_fn=None,\n tags=None, # cannot supply tags or tags_fn to decorator\n tags_fn=None,\n should_execute=None, # already encompassed in evaluation_fn\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n\n return schedule_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/schedule_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.schedule_decorator"}, "sensor_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.sensor_decorator

\nimport collections.abc\nimport inspect\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Optional, Sequence, Set, Union\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.asset_selection import AssetSelection\n\nfrom ...errors import DagsterInvariantViolationError\nfrom ..asset_sensor_definition import AssetSensorDefinition\nfrom ..events import AssetKey\nfrom ..multi_asset_sensor_definition import (\n    AssetMaterializationFunction,\n    MultiAssetMaterializationFunction,\n    MultiAssetSensorDefinition,\n)\nfrom ..run_request import SensorResult\nfrom ..sensor_definition import (\n    DefaultSensorStatus,\n    RawSensorEvaluationFunction,\n    RunRequest,\n    SensorDefinition,\n    SkipReason,\n)\nfrom ..target import ExecutableDefinition\n\n\n
[docs]def sensor(\n job_name: Optional[str] = None,\n *,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n asset_selection: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[RawSensorEvaluationFunction], SensorDefinition]:\n """Creates a sensor where the decorated function is used as the sensor's evaluation function.\n\n The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n Args:\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]):\n The job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n asset_selection (AssetSelection): (Experimental) an asset selection to launch a run for if\n the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: RawSensorEvaluationFunction) -> SensorDefinition:\n check.callable_param(fn, "fn")\n\n sensor_def = SensorDefinition.dagster_internal_init(\n name=name,\n job_name=job_name,\n evaluation_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=asset_selection,\n required_resource_keys=required_resource_keys,\n )\n\n update_wrapper(sensor_def, wrapped=fn)\n\n return sensor_def\n\n return inner
\n\n\n
[docs]def asset_sensor(\n asset_key: AssetKey,\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[AssetMaterializationFunction,], AssetSensorDefinition,]:\n """Creates an asset sensor where the decorated function is used as the asset sensor's evaluation\n function.\n\n If the asset has been materialized multiple times between since the last sensor tick, the\n evaluation function will only be invoked once, with the latest materialization.\n\n The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext` and an EventLogEntry corresponding to an\n AssetMaterialization event.\n\n Args:\n asset_key (AssetKey): The asset_key this sensor monitors.\n name (Optional[str]): The name of the sensor. 
Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The\n job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n\n\n Example:\n .. code-block:: python\n\n from dagster import AssetKey, EventLogEntry, SensorEvaluationContext, asset_sensor\n\n\n @asset_sensor(asset_key=AssetKey("my_table"), job=my_job)\n def my_asset_sensor(context: SensorEvaluationContext, asset_event: EventLogEntry):\n return RunRequest(\n run_key=context.cursor,\n run_config={\n "ops": {\n "read_materialization": {\n "config": {\n "asset_key": asset_event.dagster_event.asset_key.path,\n }\n }\n }\n },\n )\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: AssetMaterializationFunction) -> AssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(*args, **kwargs) -> Any:\n result = fn(*args, **kwargs)\n\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (RunRequest, SkipReason)):\n yield result\n\n elif isinstance(result, SensorResult):\n if result.cursor:\n raise DagsterInvariantViolationError(\n f"Error in asset sensor {sensor_name}: Sensor returned a SensorResult"\n " with a cursor value. 
The cursor is managed by the asset sensor and"\n " should not be modified by a user."\n )\n yield result\n\n elif result is not None:\n raise DagsterInvariantViolationError(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{result} of type {type(result)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n\n # Preserve any resource arguments from the underlying function, for when we inspect the\n # wrapped function later on\n _wrapped_fn = update_wrapper(_wrapped_fn, wrapped=fn)\n\n return AssetSensorDefinition(\n name=sensor_name,\n asset_key=asset_key,\n job_name=job_name,\n asset_materialization_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n )\n\n return inner
\n\n\n
[docs]@experimental\ndef multi_asset_sensor(\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_assets: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[MultiAssetMaterializationFunction,], MultiAssetSensorDefinition,]:\n """Creates an asset sensor that can monitor multiple assets.\n\n The decorated function is used as the asset sensor's evaluation\n function. The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.MultiAssetSensorEvaluationContext`.\n\n Args:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets this\n sensor monitors. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n name (Optional[str]): The name of the sensor. 
Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The\n job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_assets (Optional[AssetSelection]): (Experimental) an asset selection to launch a run\n for if the sensor condition is met. This can be provided instead of specifying a job.\n """\n check.opt_str_param(name, "name")\n\n if not isinstance(monitored_assets, AssetSelection) and not (\n isinstance(monitored_assets, collections.abc.Sequence)\n and all(isinstance(el, AssetKey) for el in monitored_assets)\n ):\n check.failed(\n "The value passed to monitored_assets param must be either an AssetSelection"\n f" or a Sequence of AssetKeys, but was a {type(monitored_assets)}"\n )\n\n def inner(fn: MultiAssetMaterializationFunction) -> MultiAssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n sensor_def = MultiAssetSensorDefinition(\n name=sensor_name,\n monitored_assets=monitored_assets,\n job_name=job_name,\n asset_materialization_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n request_assets=request_assets,\n required_resource_keys=required_resource_keys,\n )\n update_wrapper(sensor_def, wrapped=fn)\n return sensor_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/sensor_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.sensor_decorator"}}, "definitions_class": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.definitions_class

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Type,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental, public\nfrom dagster._config.pythonic_config import (\n    attach_resource_id_to_key_mapping,\n)\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.definitions.asset_graph import InternalAssetGraph\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.build_resources import wrap_resources_for_execution\nfrom dagster._core.execution.with_resources import with_resources\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._utils.cached_method import cached_method\n\nfrom .assets import AssetsDefinition, SourceAsset\nfrom .cacheable_assets import CacheableAssetsDefinition\nfrom .decorators import repository\nfrom .job_definition import JobDefinition, default_job_io_manager\nfrom .partitioned_schedule import UnresolvedPartitionedAssetScheduleDefinition\nfrom .repository_definition import (\n    SINGLETON_REPOSITORY_NAME,\n    PendingRepositoryDefinition,\n    RepositoryDefinition,\n)\nfrom .schedule_definition import ScheduleDefinition\nfrom .sensor_definition import SensorDefinition\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n\n
[docs]@public\n@experimental\ndef create_repository_using_definitions_args(\n name: str,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n """Create a named repository using the same arguments as :py:class:`Definitions`. In older\n versions of Dagster, repositories were the mechanism for organizing assets, schedules, sensors,\n and jobs. There could be many repositories per code location. This was a complicated ontology but\n gave users a way to organize code locations that contained large numbers of heterogenous definitions.\n\n As a stopgap for those who both want to 1) use the new :py:class:`Definitions` API and 2) but still\n want multiple logical groups of assets in the same code location, we have introduced this function.\n\n Example usage:\n\n .. code-block:: python\n\n named_repo = create_repository_using_definitions_args(\n name="a_repo",\n assets=[asset_one, asset_two],\n schedules=[a_schedule],\n sensors=[a_sensor],\n jobs=[a_job],\n resources={\n "a_resource": some_resource,\n }\n )\n\n """\n return _create_repository_using_definitions_args(\n name=name,\n assets=assets,\n schedules=schedules,\n sensors=sensors,\n jobs=jobs,\n resources=resources,\n executor=executor,\n loggers=loggers,\n asset_checks=asset_checks,\n )
\n\n\nclass _AttachedObjects(NamedTuple):\n jobs: Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]\n schedules: Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n sensors: Iterable[SensorDefinition]\n\n\ndef _io_manager_needs_replacement(job: JobDefinition, resource_defs: Mapping[str, Any]) -> bool:\n """Explicitly replace the default IO manager in jobs that don't specify one, if a top-level\n I/O manager is provided to Definitions.\n """\n return (\n job.resource_defs.get("io_manager") == default_job_io_manager\n and "io_manager" in resource_defs\n )\n\n\ndef _jobs_which_will_have_io_manager_replaced(\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]],\n resource_defs: Mapping[str, Any],\n) -> List[Union[JobDefinition, UnresolvedAssetJobDefinition]]:\n """Returns whether any jobs will have their I/O manager replaced by an `io_manager` override from\n the top-level `resource_defs` provided to `Definitions` in 1.3. We will warn users if this is\n the case.\n """\n jobs = jobs or []\n return [\n job\n for job in jobs\n if isinstance(job, JobDefinition) and _io_manager_needs_replacement(job, resource_defs)\n ]\n\n\ndef _attach_resources_to_jobs_and_instigator_jobs(\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]],\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ],\n sensors: Optional[Iterable[SensorDefinition]],\n resource_defs: Mapping[str, Any],\n) -> _AttachedObjects:\n """Given a list of jobs, schedules, and sensors along with top-level resource definitions,\n attach the resource definitions to the jobs, schedules, and sensors which require them.\n """\n jobs = jobs or []\n schedules = schedules or []\n sensors = sensors or []\n\n # Add jobs in schedules and sensors as well\n jobs = [\n *jobs,\n *[\n schedule.job\n for schedule in schedules\n if isinstance(schedule, ScheduleDefinition)\n and 
schedule.has_loadable_target()\n and isinstance(schedule.job, (JobDefinition, UnresolvedAssetJobDefinition))\n ],\n *[\n job\n for sensor in sensors\n if sensor.has_loadable_targets()\n for job in sensor.jobs\n if isinstance(job, (JobDefinition, UnresolvedAssetJobDefinition))\n ],\n ]\n # Dedupe\n jobs = list({id(job): job for job in jobs}.values())\n\n # Find unsatisfied jobs\n unsatisfied_jobs = [\n job\n for job in jobs\n if isinstance(job, JobDefinition)\n and (\n job.is_missing_required_resources() or _io_manager_needs_replacement(job, resource_defs)\n )\n ]\n\n # Create a mapping of job id to a version of the job with the resource defs bound\n unsatisfied_job_to_resource_bound_job = {\n id(job): job.with_top_level_resources(\n {\n **resource_defs,\n **job.resource_defs,\n # special case for IO manager - the job-level IO manager does not take precedence\n # if it is the default and a top-level IO manager is provided\n **(\n {"io_manager": resource_defs["io_manager"]}\n if _io_manager_needs_replacement(job, resource_defs)\n else {}\n ),\n }\n )\n for job in jobs\n if job in unsatisfied_jobs\n }\n\n # Update all jobs to use the resource bound version\n jobs_with_resources = [\n unsatisfied_job_to_resource_bound_job[id(job)] if job in unsatisfied_jobs else job\n for job in jobs\n ]\n\n # Update all schedules and sensors to use the resource bound version\n updated_schedules = [\n (\n schedule.with_updated_job(unsatisfied_job_to_resource_bound_job[id(schedule.job)])\n if (\n isinstance(schedule, ScheduleDefinition)\n and schedule.has_loadable_target()\n and schedule.job in unsatisfied_jobs\n )\n else schedule\n )\n for schedule in schedules\n ]\n updated_sensors = [\n (\n sensor.with_updated_jobs(\n [\n (\n unsatisfied_job_to_resource_bound_job[id(job)]\n if job in unsatisfied_jobs\n else job\n )\n for job in sensor.jobs\n ]\n )\n if sensor.has_loadable_targets() and any(job in unsatisfied_jobs for job in sensor.jobs)\n else sensor\n )\n for sensor in sensors\n 
]\n\n return _AttachedObjects(jobs_with_resources, updated_schedules, updated_sensors)\n\n\ndef _create_repository_using_definitions_args(\n name: str,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n):\n check.opt_iterable_param(\n assets, "assets", (AssetsDefinition, SourceAsset, CacheableAssetsDefinition)\n )\n check.opt_iterable_param(\n schedules, "schedules", (ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition)\n )\n check.opt_iterable_param(sensors, "sensors", SensorDefinition)\n check.opt_iterable_param(jobs, "jobs", (JobDefinition, UnresolvedAssetJobDefinition))\n\n check.opt_inst_param(executor, "executor", (ExecutorDefinition, Executor))\n executor_def = (\n executor\n if isinstance(executor, ExecutorDefinition) or executor is None\n else ExecutorDefinition.hardcoded_executor(executor)\n )\n\n # Generate a mapping from each top-level resource instance ID to its resource key\n resource_key_mapping = {id(v): k for k, v in resources.items()} if resources else {}\n\n # Provide this mapping to each resource instance so that it can be used to resolve\n # nested resources\n resources_with_key_mapping = (\n {\n k: attach_resource_id_to_key_mapping(v, resource_key_mapping)\n for k, v in resources.items()\n }\n if resources\n else {}\n )\n\n resource_defs = wrap_resources_for_execution(resources_with_key_mapping)\n\n check.opt_mapping_param(loggers, "loggers", key_type=str, 
value_type=LoggerDefinition)\n\n # Binds top-level resources to jobs and any jobs attached to schedules or sensors\n (\n jobs_with_resources,\n schedules_with_resources,\n sensors_with_resources,\n ) = _attach_resources_to_jobs_and_instigator_jobs(jobs, schedules, sensors, resource_defs)\n\n @repository(\n name=name,\n default_executor_def=executor_def,\n default_logger_defs=loggers,\n _top_level_resources=resource_defs,\n _resource_key_mapping=resource_key_mapping,\n )\n def created_repo():\n return [\n *with_resources(assets or [], resource_defs),\n *with_resources(asset_checks or [], resource_defs),\n *(schedules_with_resources),\n *(sensors_with_resources),\n *(jobs_with_resources),\n ]\n\n return created_repo\n\n\n@deprecated(\n breaking_version="2.0",\n additional_warn_text=(\n "Instantiations can be removed. Since it's behavior is now the default, this class is now a"\n " no-op."\n ),\n)\nclass BindResourcesToJobs(list):\n """Used to instruct Dagster to bind top-level resources to jobs and any jobs attached to schedules\n and sensors. Now deprecated since this behavior is the default.\n """\n\n\n
[docs]class Definitions:\n """A set of definitions explicitly available and loadable by Dagster tools.\n\n Parameters:\n assets (Optional[Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]]):\n A list of assets. Assets can be created by annotating\n a function with :py:func:`@asset <asset>` or\n :py:func:`@observable_source_asset <observable_source_asset>`.\n Or they can by directly instantiating :py:class:`AssetsDefinition`,\n :py:class:`SourceAsset`, or :py:class:`CacheableAssetsDefinition`.\n\n asset_checks (Optional[Iterable[AssetChecksDefinition]]):\n A list of asset checks.\n\n schedules (Optional[Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]]):\n List of schedules.\n\n sensors (Optional[Iterable[SensorDefinition]]):\n List of sensors, typically created with :py:func:`@sensor <sensor>`.\n\n jobs (Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]]):\n List of jobs. Typically created with :py:func:`define_asset_job <define_asset_job>`\n or with :py:func:`@job <job>` for jobs defined in terms of ops directly.\n Jobs created with :py:func:`@job <job>` must already have resources bound\n at job creation time. They do not respect the `resources` argument here.\n\n resources (Optional[Mapping[str, Any]]): Dictionary of resources to bind to assets.\n The resources dictionary takes raw Python objects,\n not just instances of :py:class:`ResourceDefinition`. If that raw object inherits from\n :py:class:`IOManager`, it gets coerced to an :py:class:`IOManagerDefinition`.\n Any other object is coerced to a :py:class:`ResourceDefinition`.\n These resources will be automatically bound\n to any assets passed to this Definitions instance using\n :py:func:`with_resources <with_resources>`. 
Assets passed to Definitions with\n resources already bound using :py:func:`with_resources <with_resources>` will\n override this dictionary.\n\n executor (Optional[Union[ExecutorDefinition, Executor]]):\n Default executor for jobs. Individual jobs can override this and define their own executors\n by setting the executor on :py:func:`@job <job>` or :py:func:`define_asset_job <define_asset_job>`\n explicitly. This executor will also be used for materializing assets directly\n outside of the context of jobs. If an :py:class:`Executor` is passed, it is coerced into\n an :py:class:`ExecutorDefinition`.\n\n loggers (Optional[Mapping[str, LoggerDefinition]):\n Default loggers for jobs. Individual jobs\n can define their own loggers by setting them explictly.\n\n Example usage:\n\n .. code-block:: python\n\n defs = Definitions(\n assets=[asset_one, asset_two],\n schedules=[a_schedule],\n sensors=[a_sensor],\n jobs=[a_job],\n resources={\n "a_resource": some_resource,\n },\n asset_checks=[asset_one_check_one]\n )\n\n Dagster separates user-defined code from system tools such the web server and\n the daemon. Rather than loading code directly into process, a tool such as the\n webserver interacts with user-defined code over a serialization boundary.\n\n These tools must be able to locate and load this code when they start. 
Via CLI\n arguments or config, they specify a Python module to inspect.\n\n A Python module is loadable by Dagster tools if there is a top-level variable\n that is an instance of :py:class:`Definitions`.\n\n Before the introduction of :py:class:`Definitions`,\n :py:func:`@repository <repository>` was the API for organizing defintions.\n :py:class:`Definitions` provides a few conveniences for dealing with resources\n that do not apply to old-style :py:func:`@repository <repository>` declarations:\n\n * It takes a dictionary of top-level resources which are automatically bound\n (via :py:func:`with_resources <with_resources>`) to any asset passed to it.\n If you need to apply different resources to different assets, use legacy\n :py:func:`@repository <repository>` and use\n :py:func:`with_resources <with_resources>` as before.\n * The resources dictionary takes raw Python objects, not just instances\n of :py:class:`ResourceDefinition`. If that raw object inherits from\n :py:class:`IOManager`, it gets coerced to an :py:class:`IOManagerDefinition`.\n Any other object is coerced to a :py:class:`ResourceDefinition`.\n """\n\n def __init__(\n self,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n ):\n self._created_pending_or_normal_repo = _create_repository_using_definitions_args(\n name=SINGLETON_REPOSITORY_NAME,\n assets=assets,\n schedules=schedules,\n sensors=sensors,\n jobs=jobs,\n resources=resources,\n 
executor=executor,\n loggers=loggers,\n asset_checks=asset_checks,\n )\n\n
[docs] @public\n def get_job_def(self, name: str) -> JobDefinition:\n """Get a job definition by name. If you passed in a an :py:class:`UnresolvedAssetJobDefinition`\n (return value of :py:func:`define_asset_job`) it will be resolved to a :py:class:`JobDefinition` when returned\n from this function.\n """\n check.str_param(name, "name")\n return self.get_repository_def().get_job(name)
\n\n
[docs] @public\n def get_sensor_def(self, name: str) -> SensorDefinition:\n """Get a sensor definition by name."""\n check.str_param(name, "name")\n return self.get_repository_def().get_sensor_def(name)
\n\n
[docs] @public\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n """Get a schedule definition by name."""\n check.str_param(name, "name")\n return self.get_repository_def().get_schedule_def(name)
\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n ) -> object:\n """Load the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n If you want to load the values of multiple assets, it's more efficient to use\n :py:meth:`~dagster.Definitions.get_asset_value_loader`, which avoids spinning up\n resources separately for each asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n\n Returns:\n The contents of an asset as a Python object.\n """\n return self.get_repository_def().load_asset_value(\n asset_key=asset_key,\n python_type=python_type,\n instance=instance,\n partition_key=partition_key,\n metadata=metadata,\n )
\n\n
[docs] @public\n def get_asset_value_loader(\n self, instance: Optional[DagsterInstance] = None\n ) -> "AssetValueLoader":\n """Returns an object that can load the contents of assets as Python objects.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the assets. Avoids\n spinning up resources separately for each asset.\n\n Usage:\n\n .. code-block:: python\n\n with defs.get_asset_value_loader() as loader:\n asset1 = loader.load_asset_value("asset1")\n asset2 = loader.load_asset_value("asset2")\n """\n return self.get_repository_def().get_asset_value_loader(\n instance=instance,\n )
\n\n def get_all_job_defs(self) -> Sequence[JobDefinition]:\n """Get all the Job definitions in the code location."""\n return self.get_repository_def().get_all_jobs()\n\n def has_implicit_global_asset_job_def(self) -> bool:\n return self.get_repository_def().has_implicit_global_asset_job_def()\n\n def get_implicit_global_asset_job_def(self) -> JobDefinition:\n """A useful conveninence method when there is a single defined global asset job.\n This occurs when all assets in the code location use a single partitioning scheme.\n If there are multiple partitioning schemes you must use get_implicit_job_def_for_assets\n instead to access to the correct implicit asset one.\n """\n return self.get_repository_def().get_implicit_global_asset_job_def()\n\n def get_implicit_job_def_for_assets(\n self, asset_keys: Iterable[AssetKey]\n ) -> Optional[JobDefinition]:\n return self.get_repository_def().get_implicit_job_def_for_assets(asset_keys)\n\n def get_assets_def(self, key: CoercibleToAssetKey) -> AssetsDefinition:\n asset_key = AssetKey.from_coercible(key)\n for assets_def in self.get_asset_graph().assets:\n if asset_key in assets_def.keys:\n return assets_def\n\n raise DagsterInvariantViolationError(f"Could not find asset {asset_key}")\n\n @cached_method\n def get_repository_def(self) -> RepositoryDefinition:\n """Definitions is implemented by wrapping RepositoryDefinition. Get that underlying object\n in order to access an functionality which is not exposed on Definitions. 
This method\n also resolves a PendingRepositoryDefinition to a RepositoryDefinition.\n """\n return (\n self._created_pending_or_normal_repo.compute_repository_definition()\n if isinstance(self._created_pending_or_normal_repo, PendingRepositoryDefinition)\n else self._created_pending_or_normal_repo\n )\n\n def get_inner_repository_for_loading_process(\n self,\n ) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n """This method is used internally to access the inner repository during the loading process\n at CLI entry points. We explicitly do not want to resolve the pending repo because the entire\n point is to defer that resolution until later.\n """\n return self._created_pending_or_normal_repo\n\n def get_asset_graph(self) -> InternalAssetGraph:\n """Get the AssetGraph for this set of definitions."""\n return self.get_repository_def().asset_graph
\n
", "current_page_name": "_modules/dagster/_core/definitions/definitions_class", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.definitions_class"}, "dependency": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.dependency

\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    DefaultDict,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, TypeVar\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._serdes.serdes import (\n    whitelist_for_serdes,\n)\nfrom dagster._utils import hash_collection\n\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .output import OutputDefinition\nfrom .utils import DEFAULT_OUTPUT, struct_to_string, validate_tags\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.op_definition import OpDefinition\n\n    from .asset_layer import AssetLayer\n    from .composition import MappedInputPlaceholder\n    from .graph_definition import GraphDefinition\n    from .node_definition import NodeDefinition\n    from .resource_requirement import ResourceRequirement\n\nT_DependencyKey = TypeVar("T_DependencyKey", str, "NodeInvocation")\nDependencyMapping: TypeAlias = Mapping[T_DependencyKey, Mapping[str, "IDependencyDefinition"]]\n\n\n
[docs]class NodeInvocation(\n NamedTuple(\n "Node",\n [\n ("name", PublicAttr[str]),\n ("alias", PublicAttr[Optional[str]]),\n ("tags", PublicAttr[Mapping[str, Any]]),\n ("hook_defs", PublicAttr[AbstractSet[HookDefinition]]),\n ("retry_policy", PublicAttr[Optional[RetryPolicy]]),\n ],\n )\n):\n """Identifies an instance of a node in a graph dependency structure.\n\n Args:\n name (str): Name of the node of which this is an instance.\n alias (Optional[str]): Name specific to this instance of the node. Necessary when there are\n multiple instances of the same node.\n tags (Optional[Dict[str, Any]]): Optional tags values to extend or override those\n set on the node definition.\n hook_defs (Optional[AbstractSet[HookDefinition]]): A set of hook definitions applied to the\n node instance.\n\n Examples:\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n from dagster import job\n\n @job\n def my_job():\n other_name = some_op.alias('other_name')\n some_graph(other_name(some_op))\n\n """\n\n def __new__(\n cls,\n name: str,\n alias: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n alias=check.opt_str_param(alias, "alias"),\n tags=check.opt_mapping_param(tags, "tags", value_type=str, key_type=str),\n hook_defs=check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition),\n retry_policy=check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy),\n )\n\n # Needs to be hashable because this class is used as a key in dependencies dicts\n def __hash__(self) -> int:\n if not hasattr(self, "_hash"):\n self._hash = hash_collection(self)\n return self._hash
\n\n\nclass Node(ABC):\n """Node invocation within a graph. Identified by its name inside the graph."""\n\n name: str\n definition: "NodeDefinition"\n graph_definition: "GraphDefinition"\n _additional_tags: Mapping[str, str]\n _hook_defs: AbstractSet[HookDefinition]\n _retry_policy: Optional[RetryPolicy]\n _inputs: Mapping[str, "NodeInput"]\n _outputs: Mapping[str, "NodeOutput"]\n\n def __init__(\n self,\n name: str,\n definition: "NodeDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n from .node_definition import NodeDefinition\n\n self.name = check.str_param(name, "name")\n self.definition = check.inst_param(definition, "definition", NodeDefinition)\n self.graph_definition = check.inst_param(\n graph_definition,\n "graph_definition",\n GraphDefinition,\n )\n self._additional_tags = validate_tags(tags)\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n self._inputs = {\n name: NodeInput(self, input_def)\n for name, input_def in self.definition.input_dict.items()\n }\n self._outputs = {\n name: NodeOutput(self, output_def)\n for name, output_def in self.definition.output_dict.items()\n }\n\n def inputs(self) -> Iterable["NodeInput"]:\n return self._inputs.values()\n\n def outputs(self) -> Iterable["NodeOutput"]:\n return self._outputs.values()\n\n def get_input(self, name: str) -> "NodeInput":\n check.str_param(name, "name")\n return self._inputs[name]\n\n def get_output(self, name: str) -> "NodeOutput":\n check.str_param(name, "name")\n return self._outputs[name]\n\n def has_input(self, name: str) -> bool:\n return self.definition.has_input(name)\n\n def input_def_named(self, name: str) -> InputDefinition:\n return 
self.definition.input_def_named(name)\n\n def has_output(self, name: str) -> bool:\n return self.definition.has_output(name)\n\n def output_def_named(self, name: str) -> OutputDefinition:\n return self.definition.output_def_named(name)\n\n @property\n def input_dict(self) -> Mapping[str, InputDefinition]:\n return self.definition.input_dict\n\n @property\n def output_dict(self) -> Mapping[str, OutputDefinition]:\n return self.definition.output_dict\n\n @property\n def tags(self) -> Mapping[str, str]:\n return {**self.definition.tags, **self._additional_tags}\n\n def container_maps_input(self, input_name: str) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(InputPointer(self.name, input_name))\n is not None\n )\n\n def container_mapped_input(self, input_name: str) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n InputPointer(self.name, input_name)\n )\n if mapping is None:\n check.failed(\n f"container does not map input {input_name}, check container_maps_input first"\n )\n return mapping\n\n def container_maps_fan_in_input(self, input_name: str, fan_in_index: int) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n is not None\n )\n\n def container_mapped_fan_in_input(self, input_name: str, fan_in_index: int) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n if mapping is None:\n check.failed(\n f"container does not map fan-in {input_name} idx {fan_in_index}, check "\n "container_maps_fan_in_input first"\n )\n\n return mapping\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n return self._retry_policy\n\n @abstractmethod\n def describe_node(self) -> str: ...\n\n @abstractmethod\n def get_resource_requirements(\n self,\n 
outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]: ...\n\n\nclass GraphNode(Node):\n definition: "GraphDefinition"\n\n def __init__(\n self,\n name: str,\n definition: "GraphDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n\n check.inst_param(definition, "definition", GraphDefinition)\n super().__init__(name, definition, graph_definition, tags, hook_defs, retry_policy)\n\n def get_resource_requirements(\n self,\n outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]:\n cur_node_handle = NodeHandle(self.name, parent_handle)\n\n for node in self.definition.node_dict.values():\n yield from node.get_resource_requirements(\n asset_layer=asset_layer,\n outer_container=self.definition,\n parent_handle=cur_node_handle,\n )\n\n def describe_node(self) -> str:\n return f"graph '{self.name}'"\n\n\nclass OpNode(Node):\n definition: "OpDefinition"\n\n def __init__(\n self,\n name: str,\n definition: "OpDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .op_definition import OpDefinition\n\n check.inst_param(definition, "definition", OpDefinition)\n super().__init__(name, definition, graph_definition, tags, hook_defs, retry_policy)\n\n def get_resource_requirements(\n self,\n outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]:\n from .resource_requirement import 
InputManagerRequirement\n\n cur_node_handle = NodeHandle(self.name, parent_handle)\n\n for requirement in self.definition.get_resource_requirements(\n (cur_node_handle, asset_layer)\n ):\n # If requirement is a root input manager requirement, but the corresponding node has an upstream output, then ignore the requirement.\n if (\n isinstance(requirement, InputManagerRequirement)\n and outer_container.dependency_structure.has_deps(\n NodeInput(self, self.definition.input_def_named(requirement.input_name))\n )\n and requirement.root_input\n ):\n continue\n yield requirement\n for hook_def in self.hook_defs:\n yield from hook_def.get_resource_requirements(self.describe_node())\n\n def describe_node(self) -> str:\n return f"op '{self.name}'"\n\n\n@whitelist_for_serdes(storage_name="SolidHandle")\nclass NodeHandle(NamedTuple("_NodeHandle", [("name", str), ("parent", Optional["NodeHandle"])])):\n """A structured object to identify nodes in the potentially recursive graph structure."""\n\n def __new__(cls, name: str, parent: Optional["NodeHandle"]):\n return super(NodeHandle, cls).__new__(\n cls,\n check.str_param(name, "name"),\n check.opt_inst_param(parent, "parent", NodeHandle),\n )\n\n def __str__(self):\n return self.to_string()\n\n @property\n def root(self):\n if self.parent:\n return self.parent.root\n else:\n return self\n\n @property\n def path(self) -> Sequence[str]:\n """Return a list representation of the handle.\n\n Inverse of NodeHandle.from_path.\n\n Returns:\n List[str]:\n """\n path: List[str] = []\n cur = self\n while cur:\n path.append(cur.name)\n cur = cur.parent\n path.reverse()\n return path\n\n def to_string(self) -> str:\n """Return a unique string representation of the handle.\n\n Inverse of NodeHandle.from_string.\n """\n return self.parent.to_string() + "." 
+ self.name if self.parent else self.name\n\n def is_or_descends_from(self, handle: "NodeHandle") -> bool:\n """Check if the handle is or descends from another handle.\n\n Args:\n handle (NodeHandle): The handle to check against.\n\n Returns:\n bool:\n """\n check.inst_param(handle, "handle", NodeHandle)\n\n for idx in range(len(handle.path)):\n if idx >= len(self.path):\n return False\n if self.path[idx] != handle.path[idx]:\n return False\n return True\n\n def pop(self, ancestor: "NodeHandle") -> Optional["NodeHandle"]:\n """Return a copy of the handle with some of its ancestors pruned.\n\n Args:\n ancestor (NodeHandle): Handle to an ancestor of the current handle.\n\n Returns:\n NodeHandle:\n\n Example:\n .. code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('bar', NodeHandle('foo', None))\n assert handle.pop(ancestor) == NodeHandle('baz', None)\n """\n check.inst_param(ancestor, "ancestor", NodeHandle)\n check.invariant(\n self.is_or_descends_from(ancestor),\n f"Handle {self.to_string()} does not descend from {ancestor.to_string()}",\n )\n\n return NodeHandle.from_path(self.path[len(ancestor.path) :])\n\n def with_ancestor(self, ancestor: Optional["NodeHandle"]) -> "NodeHandle":\n """Returns a copy of the handle with an ancestor grafted on.\n\n Args:\n ancestor (NodeHandle): Handle to the new ancestor.\n\n Returns:\n NodeHandle:\n\n Example:\n .. 
code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('quux' None)\n assert handle.with_ancestor(ancestor) == NodeHandle(\n 'baz', NodeHandle('bar', NodeHandle('foo', NodeHandle('quux', None)))\n )\n """\n check.opt_inst_param(ancestor, "ancestor", NodeHandle)\n\n return NodeHandle.from_path([*(ancestor.path if ancestor else []), *self.path])\n\n @staticmethod\n def from_path(path: Sequence[str]) -> "NodeHandle":\n check.sequence_param(path, "path", of_type=str)\n\n cur: Optional["NodeHandle"] = None\n _path = list(path)\n while len(_path) > 0:\n cur = NodeHandle(name=_path.pop(0), parent=cur)\n\n if cur is None:\n check.failed(f"Invalid handle path {path}")\n\n return cur\n\n @staticmethod\n def from_string(handle_str: str) -> "NodeHandle":\n check.str_param(handle_str, "handle_str")\n\n path = handle_str.split(".")\n return NodeHandle.from_path(path)\n\n @classmethod\n def from_dict(cls, dict_repr: Mapping[str, Any]) -> "NodeHandle":\n """This method makes it possible to load a potentially nested NodeHandle after a\n roundtrip through json.loads(json.dumps(NodeHandle._asdict())).\n """\n check.dict_param(dict_repr, "dict_repr", key_type=str)\n check.invariant(\n "name" in dict_repr, "Dict representation of NodeHandle must have a 'name' key"\n )\n check.invariant(\n "parent" in dict_repr, "Dict representation of NodeHandle must have a 'parent' key"\n )\n\n if isinstance(dict_repr["parent"], (list, tuple)):\n parent = NodeHandle.from_dict(\n {\n "name": dict_repr["parent"][0],\n "parent": dict_repr["parent"][1],\n }\n )\n else:\n parent = dict_repr["parent"]\n\n return NodeHandle(name=dict_repr["name"], parent=parent)\n\n\nclass NodeInputHandle(\n NamedTuple("_NodeInputHandle", [("node_handle", NodeHandle), ("input_name", str)])\n):\n """A structured object to uniquely identify inputs in the potentially recursive graph structure."""\n\n\nclass NodeOutputHandle(\n NamedTuple("_NodeOutputHandle", 
[("node_handle", NodeHandle), ("output_name", str)])\n):\n """A structured object to uniquely identify outputs in the potentially recursive graph structure."""\n\n\nclass NodeInput(NamedTuple("_NodeInput", [("node", Node), ("input_def", InputDefinition)])):\n def __new__(cls, node: Node, input_def: InputDefinition):\n return super(NodeInput, cls).__new__(\n cls,\n check.inst_param(node, "node", Node),\n check.inst_param(input_def, "input_def", InputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "NodeInput",\n node_name=self.node.name,\n input_name=self.input_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self):\n return hash((self.node.name, self.input_def.name))\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, NodeInput)\n and self.node.name == other.node.name\n and self.input_def.name == other.input_def.name\n )\n\n @property\n def node_name(self) -> str:\n return self.node.name\n\n @property\n def input_name(self) -> str:\n return self.input_def.name\n\n\nclass NodeOutput(NamedTuple("_NodeOutput", [("node", Node), ("output_def", OutputDefinition)])):\n def __new__(cls, node: Node, output_def: OutputDefinition):\n return super(NodeOutput, cls).__new__(\n cls,\n check.inst_param(node, "node", Node),\n check.inst_param(output_def, "output_def", OutputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "NodeOutput",\n node_name=self.node.name,\n output_name=self.output_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self) -> int:\n return hash((self.node.name, self.output_def.name))\n\n def __eq__(self, other: Any) -> bool:\n return self.node.name == other.node.name and self.output_def.name == other.output_def.name\n\n def describe(self) -> str:\n return f"{self.node_name}:{self.output_def.name}"\n\n @property\n def 
node_name(self) -> str:\n return self.node.name\n\n @property\n def is_dynamic(self) -> bool:\n return self.output_def.is_dynamic\n\n @property\n def output_name(self) -> str:\n return self.output_def.name\n\n\nclass DependencyType(Enum):\n DIRECT = "DIRECT"\n FAN_IN = "FAN_IN"\n DYNAMIC_COLLECT = "DYNAMIC_COLLECT"\n\n\nclass IDependencyDefinition(ABC):\n @abstractmethod\n def get_node_dependencies(self) -> Sequence["DependencyDefinition"]:\n pass\n\n @abstractmethod\n def is_fan_in(self) -> bool:\n """The result passed to the corresponding input will be a List made from different node outputs."""\n\n\n
[docs]class DependencyDefinition(\n NamedTuple(\n "_DependencyDefinition", [("node", str), ("output", str), ("description", Optional[str])]\n ),\n IDependencyDefinition,\n):\n """Represents an edge in the DAG of nodes (ops or graphs) forming a job.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent node and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_b depends on the output named 'result' of\n op_a, and the output named 'other_result' of graph_a, the structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_op', 'result')\n }\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_graph', 'result')\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n @job\n def the_job():\n node_b(node_a())\n\n\n Args:\n node (str): The name of the node (op or graph) that is depended on, that is, from which the value\n passed between the two nodes originates.\n output (Optional[str]): The name of the output that is depended on. (default: "result")\n description (Optional[str]): Human-readable description of this dependency.\n """\n\n def __new__(\n cls,\n node: str,\n output: str = DEFAULT_OUTPUT,\n description: Optional[str] = None,\n ):\n return super(DependencyDefinition, cls).__new__(\n cls,\n check.str_param(node, "node"),\n check.str_param(output, "output"),\n check.opt_str_param(description, "description"),\n )\n\n def get_node_dependencies(self) -> Sequence["DependencyDefinition"]:\n return [self]\n\n
[docs] @public\n def is_fan_in(self) -> bool:\n """Return True if the dependency is fan-in (always False for DependencyDefinition)."""\n return False
\n\n def get_op_dependencies(self) -> Sequence["DependencyDefinition"]:\n return [self]
\n\n\n
[docs]class MultiDependencyDefinition(\n NamedTuple(\n "_MultiDependencyDefinition",\n [\n (\n "dependencies",\n PublicAttr[Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]],\n )\n ],\n ),\n IDependencyDefinition,\n):\n """Represents a fan-in edge in the DAG of op instances forming a job.\n\n This object is used only when an input of type ``List[T]`` is assembled by fanning-in multiple\n upstream outputs of type ``T``.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent ops or graphs and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_c depends on the outputs named 'result' of\n op_a and op_b, this structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'op_c': {\n 'input': MultiDependencyDefinition(\n [\n DependencyDefinition('op_a', 'result'),\n DependencyDefinition('op_b', 'result')\n ]\n )\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. 
code-block:: python\n\n @job\n def the_job():\n op_c(op_a(), op_b())\n\n Args:\n dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]): List of\n upstream dependencies fanned in to this input.\n """\n\n def __new__(\n cls,\n dependencies: Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]],\n ):\n from .composition import MappedInputPlaceholder\n\n deps = check.sequence_param(dependencies, "dependencies")\n seen = {}\n for dep in deps:\n if isinstance(dep, DependencyDefinition):\n key = dep.node + ":" + dep.output\n if key in seen:\n raise DagsterInvalidDefinitionError(\n f'Duplicate dependencies on node "{dep.node}" output "{dep.output}" '\n "used in the same MultiDependencyDefinition."\n )\n seen[key] = True\n elif dep is MappedInputPlaceholder:\n pass\n else:\n check.failed(f"Unexpected dependencies entry {dep}")\n\n return super(MultiDependencyDefinition, cls).__new__(cls, deps)\n\n
[docs] @public\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n """Return the list of :py:class:`DependencyDefinition` contained by this object."""\n return [dep for dep in self.dependencies if isinstance(dep, DependencyDefinition)]
\n\n
[docs] @public\n def is_fan_in(self) -> bool:\n """Return `True` if the dependency is fan-in (always True for MultiDependencyDefinition)."""\n return True
\n\n
[docs] @public\n def get_dependencies_and_mappings(\n self,\n ) -> Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]:\n """Return the combined list of dependencies contained by this object, inculding of :py:class:`DependencyDefinition` and :py:class:`MappedInputPlaceholder` objects."""\n return self.dependencies
\n\n\nclass BlockingAssetChecksDependencyDefinition(\n IDependencyDefinition,\n NamedTuple(\n "_BlockingAssetChecksDependencyDefinition",\n [\n (\n "asset_check_dependencies",\n Sequence[DependencyDefinition],\n ),\n ("other_dependency", Optional[DependencyDefinition]),\n ],\n ),\n):\n """An input that depends on a set of outputs that correspond to upstream asset checks, and also\n optionally depends on a single upstream output that does not correspond to an asset check.\n\n We model this with a different kind of DependencyDefinition than MultiDependencyDefinition,\n because we treat the value that's passed to the input parameter differently: we ignore the asset\n check dependencies and only pass a single value, instead of a fanned-in list.\n """\n\n @public\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n """Return the list of :py:class:`DependencyDefinition` contained by this object."""\n if self.other_dependency:\n return [*self.asset_check_dependencies, self.other_dependency]\n else:\n return self.asset_check_dependencies\n\n @public\n def is_fan_in(self) -> bool:\n return False\n\n @public\n def get_dependencies_and_mappings(\n self,\n ) -> Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]:\n return self.get_node_dependencies()\n\n\nclass DynamicCollectDependencyDefinition(\n NamedTuple("_DynamicCollectDependencyDefinition", [("node_name", str), ("output_name", str)]),\n IDependencyDefinition,\n):\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n return [DependencyDefinition(self.node_name, self.output_name)]\n\n def is_fan_in(self) -> bool:\n return True\n\n\nDepTypeAndOutputs: TypeAlias = Tuple[\n DependencyType,\n Union[NodeOutput, List[Union[NodeOutput, Type["MappedInputPlaceholder"]]]],\n]\n\nInputToOutputMap: TypeAlias = Dict[NodeInput, DepTypeAndOutputs]\n\n\ndef _create_handle_dict(\n node_dict: Mapping[str, Node],\n dep_dict: DependencyMapping[str],\n) -> InputToOutputMap:\n from 
.composition import MappedInputPlaceholder\n\n check.mapping_param(node_dict, "node_dict", key_type=str, value_type=Node)\n check.two_dim_mapping_param(dep_dict, "dep_dict", value_type=IDependencyDefinition)\n\n handle_dict: InputToOutputMap = {}\n\n for node_name, input_dict in dep_dict.items():\n from_node = node_dict[node_name]\n for input_name, dep_def in input_dict.items():\n if isinstance(\n dep_def, (MultiDependencyDefinition, BlockingAssetChecksDependencyDefinition)\n ):\n handles: List[Union[NodeOutput, Type[MappedInputPlaceholder]]] = []\n for inner_dep in dep_def.get_dependencies_and_mappings():\n if isinstance(inner_dep, DependencyDefinition):\n handles.append(node_dict[inner_dep.node].get_output(inner_dep.output))\n elif inner_dep is MappedInputPlaceholder:\n handles.append(inner_dep)\n else:\n check.failed(\n f"Unexpected MultiDependencyDefinition dependencies type {inner_dep}"\n )\n\n handle_dict[from_node.get_input(input_name)] = (DependencyType.FAN_IN, handles)\n\n elif isinstance(dep_def, DependencyDefinition):\n handle_dict[from_node.get_input(input_name)] = (\n DependencyType.DIRECT,\n node_dict[dep_def.node].get_output(dep_def.output),\n )\n elif isinstance(dep_def, DynamicCollectDependencyDefinition):\n handle_dict[from_node.get_input(input_name)] = (\n DependencyType.DYNAMIC_COLLECT,\n node_dict[dep_def.node_name].get_output(dep_def.output_name),\n )\n\n else:\n check.failed(f"Unknown dependency type {dep_def}")\n\n return handle_dict\n\n\nclass DependencyStructure:\n @staticmethod\n def from_definitions(\n nodes: Mapping[str, Node], dep_dict: DependencyMapping[str]\n ) -> "DependencyStructure":\n return DependencyStructure(\n list(dep_dict.keys()),\n _create_handle_dict(nodes, dep_dict),\n dep_dict,\n )\n\n _node_input_index: DefaultDict[str, Dict[NodeInput, List[NodeOutput]]]\n _node_output_index: Dict[str, DefaultDict[NodeOutput, List[NodeInput]]]\n _dynamic_fan_out_index: Dict[str, NodeOutput]\n _collect_index: Dict[str, 
Set[NodeOutput]]\n _deps_by_node_name: DependencyMapping[str]\n\n def __init__(\n self,\n node_names: Sequence[str],\n input_to_output_map: InputToOutputMap,\n deps_by_node_name: DependencyMapping[str],\n ):\n self._node_names = node_names\n self._input_to_output_map = input_to_output_map\n self._deps_by_node_name = deps_by_node_name\n\n # Building up a couple indexes here so that one can look up all the upstream output handles\n # or downstream input handles in O(1). Without this, this can become O(N^2) where N is node\n # count during the GraphQL query in particular\n\n # node_name => input_handle => list[output_handle]\n self._node_input_index = defaultdict(dict)\n\n # node_name => output_handle => list[input_handle]\n self._node_output_index = defaultdict(lambda: defaultdict(list))\n\n # node_name => dynamic output_handle that this node will dupe for\n self._dynamic_fan_out_index = {}\n\n # node_name => set of dynamic output_handle this collects over\n self._collect_index = defaultdict(set)\n\n for node_input, (dep_type, node_output_or_list) in self._input_to_output_map.items():\n if dep_type == DependencyType.FAN_IN:\n node_output_list: List[NodeOutput] = []\n for node_output in node_output_or_list:\n if not isinstance(node_output, NodeOutput):\n continue\n\n if node_output.is_dynamic:\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of"\n " dynamic outputs. Problematic dependency on dynamic output"\n f' "{node_output.describe()}".'\n )\n if self._dynamic_fan_out_index.get(node_output.node_name):\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of"\n " dynamic outputs. 
Problematic dependency on output"\n f' "{node_output.describe()}", downstream of'\n f' "{self._dynamic_fan_out_index[node_output.node_name].describe()}".'\n )\n\n node_output_list.append(node_output)\n elif dep_type == DependencyType.DIRECT:\n node_output = cast(NodeOutput, node_output_or_list)\n\n if node_output.is_dynamic:\n self._validate_and_set_fan_out(node_input, node_output)\n\n if self._dynamic_fan_out_index.get(node_output.node_name):\n self._validate_and_set_fan_out(\n node_input, self._dynamic_fan_out_index[node_output.node_name]\n )\n\n node_output_list = [node_output]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n node_output = cast(NodeOutput, node_output_or_list)\n\n if node_output.is_dynamic:\n self._validate_and_set_collect(node_input, node_output)\n\n elif self._dynamic_fan_out_index.get(node_output.node_name):\n self._validate_and_set_collect(\n node_input,\n self._dynamic_fan_out_index[node_output.node_name],\n )\n else:\n check.failed(\n f"Unexpected dynamic fan in dep created {node_output} -> {node_input}"\n )\n\n node_output_list = [node_output]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n self._node_input_index[node_input.node.name][node_input] = node_output_list\n for node_output in node_output_list:\n self._node_output_index[node_output.node.name][node_output].append(node_input)\n\n def _validate_and_set_fan_out(self, node_input: NodeInput, node_output: NodeOutput) -> None:\n """Helper function for populating _dynamic_fan_out_index."""\n if not node_input.node.definition.input_supports_dynamic_output_dep(node_input.input_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of dynamic output"\n f' "{node_output.describe()}" since input "{node_input.input_name}" maps to a'\n " node that is already downstream of another dynamic output. 
Nodes cannot be"\n " downstream of more than one dynamic output"\n )\n\n if self._collect_index.get(node_input.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be both downstream of dynamic output "\n f"{node_output.describe()} and collect over dynamic output "\n f"{next(iter(self._collect_index[node_input.node_name])).describe()}."\n )\n\n if self._dynamic_fan_out_index.get(node_input.node_name) is None:\n self._dynamic_fan_out_index[node_input.node_name] = node_output\n return\n\n if self._dynamic_fan_out_index[node_input.node_name] != node_output:\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of more than one dynamic"\n f' output. It is downstream of both "{node_output.describe()}" and'\n f' "{self._dynamic_fan_out_index[node_input.node_name].describe()}"'\n )\n\n def _validate_and_set_collect(\n self,\n node_input: NodeInput,\n node_output: NodeOutput,\n ) -> None:\n if self._dynamic_fan_out_index.get(node_input.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot both collect over dynamic output "\n f"{node_output.describe()} and be downstream of the dynamic output "\n f"{self._dynamic_fan_out_index[node_input.node_name].describe()}."\n )\n\n self._collect_index[node_input.node_name].add(node_output)\n\n # if the output is already fanned out\n if self._dynamic_fan_out_index.get(node_output.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of more than one dynamic"\n f' output. 
It is downstream of both "{node_output.describe()}" and'\n f' "{self._dynamic_fan_out_index[node_output.node_name].describe()}"'\n )\n\n def all_upstream_outputs_from_node(self, node_name: str) -> Sequence[NodeOutput]:\n check.str_param(node_name, "node_name")\n\n # flatten out all outputs that feed into the inputs of this node\n return [\n output_handle\n for output_handle_list in self._node_input_index[node_name].values()\n for output_handle in output_handle_list\n ]\n\n def input_to_upstream_outputs_for_node(\n self, node_name: str\n ) -> Mapping[NodeInput, Sequence[NodeOutput]]:\n """Returns a Dict[NodeInput, List[NodeOutput]] that encodes\n where all the the inputs are sourced from upstream. Usually the\n List[NodeOutput] will be a list of one, except for the\n multi-dependency case.\n """\n check.str_param(node_name, "node_name")\n return self._node_input_index[node_name]\n\n def output_to_downstream_inputs_for_node(\n self, node_name: str\n ) -> Mapping[NodeOutput, Sequence[NodeInput]]:\n """Returns a Dict[NodeOutput, List[NodeInput]] that\n represents all the downstream inputs for each output in the\n dictionary.\n """\n check.str_param(node_name, "node_name")\n return self._node_output_index[node_name]\n\n def has_direct_dep(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.DIRECT\n\n def get_direct_dep(self, node_input: NodeInput) -> NodeOutput:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, dep = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.DIRECT,\n f"Cannot call get_direct_dep when dep is not singular, got {dep_type}",\n )\n return cast(NodeOutput, dep)\n\n def get_dependency_definition(self, node_input: NodeInput) -> Optional[IDependencyDefinition]:\n return 
self._deps_by_node_name[node_input.node_name].get(node_input.input_name)\n\n def has_fan_in_deps(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.FAN_IN\n\n def get_fan_in_deps(\n self, node_input: NodeInput\n ) -> Sequence[Union[NodeOutput, Type["MappedInputPlaceholder"]]]:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, deps = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.FAN_IN,\n f"Cannot call get_multi_dep when dep is not fan in, got {dep_type}",\n )\n return cast(List[Union[NodeOutput, Type["MappedInputPlaceholder"]]], deps)\n\n def has_dynamic_fan_in_dep(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.DYNAMIC_COLLECT\n\n def get_dynamic_fan_in_dep(self, node_input: NodeInput) -> NodeOutput:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, dep = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.DYNAMIC_COLLECT,\n f"Cannot call get_dynamic_fan_in_dep when dep is not, got {dep_type}",\n )\n return cast(NodeOutput, dep)\n\n def has_deps(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n return node_input in self._input_to_output_map\n\n def get_deps_list(self, node_input: NodeInput) -> Sequence[NodeOutput]:\n check.inst_param(node_input, "node_input", NodeInput)\n check.invariant(self.has_deps(node_input))\n dep_type, handle_or_list = self._input_to_output_map[node_input]\n if dep_type == DependencyType.DIRECT:\n return [cast(NodeOutput, handle_or_list)]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n 
return [cast(NodeOutput, handle_or_list)]\n elif dep_type == DependencyType.FAN_IN:\n return [handle for handle in handle_or_list if isinstance(handle, NodeOutput)]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n def inputs(self) -> Sequence[NodeInput]:\n return list(self._input_to_output_map.keys())\n\n def get_upstream_dynamic_output_for_node(self, node_name: str) -> Optional[NodeOutput]:\n return self._dynamic_fan_out_index.get(node_name)\n\n def get_dependency_type(self, node_input: NodeInput) -> Optional[DependencyType]:\n result = self._input_to_output_map.get(node_input)\n if result is None:\n return None\n dep_type, _ = result\n return dep_type\n\n def is_dynamic_mapped(self, node_name: str) -> bool:\n return node_name in self._dynamic_fan_out_index\n\n def has_dynamic_downstreams(self, node_name: str) -> bool:\n for node_output in self._dynamic_fan_out_index.values():\n if node_output.node_name == node_name:\n return True\n\n return False\n
", "current_page_name": "_modules/dagster/_core/definitions/dependency", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.dependency"}, "events": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.events

\nimport re\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import PublicAttr, deprecated, experimental_param, public\nfrom dagster._core.definitions.data_version import DATA_VERSION_TAG, DataVersion\nfrom dagster._core.storage.tags import MULTIDIMENSIONAL_PARTITION_PREFIX, SYSTEM_TAG_PREFIX\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._serdes.serdes import NamedTupleSerializer\n\nfrom .metadata import (\n    MetadataFieldSerializer,\n    MetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.source_asset import SourceAsset\n    from dagster._core.execution.context.output import OutputContext\n\n\nASSET_KEY_SPLIT_REGEX = re.compile("[^a-zA-Z0-9_]")\nASSET_KEY_DELIMITER = "/"\n\n\ndef parse_asset_key_string(s: str) -> Sequence[str]:\n    return list(filter(lambda x: x, re.split(ASSET_KEY_SPLIT_REGEX, s)))\n\n\n
[docs]@whitelist_for_serdes\nclass AssetKey(NamedTuple("_AssetKey", [("path", PublicAttr[Sequence[str]])])):\n """Object representing the structure of an asset key. Takes in a sanitized string, list of\n strings, or tuple of strings.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import op\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey('flat_asset_key'),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(['parent', 'child', 'grandchild']),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key_2(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(('parent', 'child', 'grandchild')),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n Args:\n path (Sequence[str]): String, list of strings, or tuple of strings. A list of strings\n represent the hierarchical structure of the asset_key.\n """\n\n def __new__(cls, path: Sequence[str]):\n if isinstance(path, str):\n path = [path]\n else:\n path = list(check.sequence_param(path, "path", of_type=str))\n\n return super(AssetKey, cls).__new__(cls, path=path)\n\n def __str__(self):\n return f"AssetKey({self.path})"\n\n def __repr__(self):\n return f"AssetKey({self.path})"\n\n def __hash__(self):\n return hash(tuple(self.path))\n\n def __eq__(self, other):\n if not isinstance(other, AssetKey):\n return False\n if len(self.path) != len(other.path):\n return False\n for i in range(0, len(self.path)):\n if self.path[i] != other.path[i]:\n return False\n return True\n\n def to_string(self) -> str:\n """E.g. '["first_component", "second_component"]'."""\n return seven.json.dumps(self.path)\n\n def to_user_string(self) -> str:\n """E.g. 
"first_component/second_component"."""\n return ASSET_KEY_DELIMITER.join(self.path)\n\n def to_python_identifier(self, suffix: Optional[str] = None) -> str:\n """Build a valid Python identifier based on the asset key that can be used for\n operation names or I/O manager keys.\n """\n path = list(self.path)\n\n if suffix is not None:\n path.append(suffix)\n\n return "__".join(path).replace("-", "_")\n\n @staticmethod\n def from_user_string(asset_key_string: str) -> "AssetKey":\n return AssetKey(asset_key_string.split(ASSET_KEY_DELIMITER))\n\n @staticmethod\n def from_db_string(asset_key_string: Optional[str]) -> Optional["AssetKey"]:\n if not asset_key_string:\n return None\n if asset_key_string[0] == "[":\n # is a json string\n try:\n path = seven.json.loads(asset_key_string)\n except seven.JSONDecodeError:\n path = parse_asset_key_string(asset_key_string)\n else:\n path = parse_asset_key_string(asset_key_string)\n return AssetKey(path)\n\n @staticmethod\n def get_db_prefix(path: Sequence[str]):\n check.sequence_param(path, "path", of_type=str)\n return seven.json.dumps(path)[:-2] # strip trailing '"]' from json string\n\n @staticmethod\n def from_graphql_input(graphql_input_asset_key: Mapping[str, Sequence[str]]) -> "AssetKey":\n return AssetKey(graphql_input_asset_key["path"])\n\n def to_graphql_input(self) -> Mapping[str, Sequence[str]]:\n return {"path": self.path}\n\n @staticmethod\n def from_coercible(arg: "CoercibleToAssetKey") -> "AssetKey":\n if isinstance(arg, AssetKey):\n return check.inst_param(arg, "arg", AssetKey)\n elif isinstance(arg, str):\n return AssetKey([arg])\n elif isinstance(arg, list):\n check.list_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n elif isinstance(arg, tuple):\n check.tuple_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n else:\n check.failed(f"Unexpected type for AssetKey: {type(arg)}")\n\n @staticmethod\n def from_coercible_or_definition(\n arg: Union["CoercibleToAssetKey", "AssetsDefinition", "SourceAsset"]\n 
) -> "AssetKey":\n from dagster._core.definitions.assets import AssetsDefinition\n from dagster._core.definitions.source_asset import SourceAsset\n\n if isinstance(arg, AssetsDefinition):\n return arg.key\n elif isinstance(arg, SourceAsset):\n return arg.key\n else:\n return AssetKey.from_coercible(arg)\n\n # @staticmethod\n # def from_coercible_to_asset_dep(arg: "CoercibleToAssetDep") -> "AssetKey":\n # from dagster._core.definitions.asset_dep import AssetDep\n # from dagster._core.definitions.asset_spec import AssetSpec\n # from dagster._core.definitions.assets import AssetsDefinition\n # from dagster._core.definitions.source_asset import SourceAsset\n\n # if isinstance(arg, AssetsDefinition):\n # if len(arg.keys) > 1:\n # # Only AssetsDefinition with a single asset can be passed\n # raise DagsterInvalidDefinitionError(\n # "Cannot pass a multi_asset AssetsDefinition as an argument to deps."\n # " Instead, specify dependencies on the assets created by the multi_asset"\n # f" via AssetKeys or strings. For the multi_asset {arg.node_def.name}, the"\n # f" available keys are: {arg.keys}."\n # )\n # return arg.key\n # elif isinstance(arg, SourceAsset):\n # return arg.key\n # elif isinstance(arg, AssetDep):\n # return arg.asset_key\n # elif isinstance(arg, AssetSpec):\n # return arg.asset_key\n # else:\n # return AssetKey.from_coercible(arg)\n\n def has_prefix(self, prefix: Sequence[str]) -> bool:\n return len(self.path) >= len(prefix) and self.path[: len(prefix)] == prefix\n\n def with_prefix(self, prefix: "CoercibleToAssetKeyPrefix") -> "AssetKey":\n prefix = key_prefix_from_coercible(prefix)\n return AssetKey(list(prefix) + list(self.path))
\n\n\nclass AssetKeyPartitionKey(NamedTuple):\n """An AssetKey with an (optional) partition key. Refers either to a non-partitioned asset or a\n partition of a partitioned asset.\n """\n\n asset_key: AssetKey\n partition_key: Optional[str] = None\n\n\nCoercibleToAssetKey = Union[AssetKey, str, Sequence[str]]\nCoercibleToAssetKeyPrefix = Union[str, Sequence[str]]\n\n\ndef check_opt_coercible_to_asset_key_prefix_param(\n prefix: Optional[CoercibleToAssetKeyPrefix], param_name: str\n) -> Optional[Sequence[str]]:\n try:\n return key_prefix_from_coercible(prefix) if prefix is not None else None\n except check.CheckError:\n raise check.ParameterCheckError(\n f'Param "{param_name}" is not a string or a sequence of strings'\n )\n\n\ndef key_prefix_from_coercible(key_prefix: CoercibleToAssetKeyPrefix) -> Sequence[str]:\n if isinstance(key_prefix, str):\n return [key_prefix]\n elif isinstance(key_prefix, list):\n return key_prefix\n else:\n check.failed(f"Unexpected type for key_prefix: {type(key_prefix)}")\n\n\nDynamicAssetKey = Callable[["OutputContext"], Optional[AssetKey]]\n\n\n@whitelist_for_serdes\nclass AssetLineageInfo(\n NamedTuple("_AssetLineageInfo", [("asset_key", AssetKey), ("partitions", AbstractSet[str])])\n):\n def __new__(cls, asset_key: AssetKey, partitions: Optional[AbstractSet[str]] = None):\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partitions = check.opt_set_param(partitions, "partitions", str)\n return super(AssetLineageInfo, cls).__new__(cls, asset_key=asset_key, partitions=partitions)\n\n\nT = TypeVar("T")\n\n\n
[docs]@experimental_param(param="data_version")\nclass Output(Generic[T]):\n """Event corresponding to one of a op's outputs.\n\n Op compute functions must explicitly yield events of this type when they have more than\n one output, or when they also yield events of other types, or when defining a op using the\n :py:class:`OpDefinition` API directly.\n\n Outputs are values produced by ops that will be consumed by downstream ops in a job.\n They are type-checked at op boundaries when their corresponding :py:class:`Out`\n or the downstream :py:class:`In` is typed.\n\n Args:\n value (Any): The value returned by the compute function.\n output_name (Optional[str]): Name of the corresponding out. (default:\n "result")\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n data_version (Optional[DataVersion]): (Experimental) A data version to manually set\n for the asset.\n """\n\n def __init__(\n self,\n value: T,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n data_version: Optional[DataVersion] = None,\n ):\n self._value = value\n self._output_name = check.str_param(output_name, "output_name")\n self._data_version = check.opt_inst_param(data_version, "data_version", DataVersion)\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n @property\n def metadata(self) -> MetadataMapping:\n return self._metadata\n\n @public\n @property\n def value(self) -> Any:\n """Any: The value returned by the compute function."""\n return self._value\n\n @public\n @property\n def output_name(self) -> str:\n """str: Name of the corresponding :py:class:`Out`."""\n return self._output_name\n\n @public\n 
@property\n def data_version(self) -> Optional[DataVersion]:\n """Optional[DataVersion]: A data version that was manually set on the `Output`."""\n return self._data_version\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, Output)\n and self.value == other.value\n and self.output_name == other.output_name\n and self.metadata == other.metadata\n )
\n\n\n
[docs]class DynamicOutput(Generic[T]):\n """Variant of :py:class:`Output <dagster.Output>` used to support\n dynamic mapping & collect. Each ``DynamicOutput`` produced by an op represents\n one item in a set that can be processed individually with ``map`` or gathered\n with ``collect``.\n\n Each ``DynamicOutput`` must have a unique ``mapping_key`` to distinguish it with it's set.\n\n Args:\n value (Any):\n The value returned by the compute function.\n mapping_key (str):\n The key that uniquely identifies this dynamic value relative to its peers.\n This key will be used to identify the downstream ops when mapped, ie\n ``mapped_op[example_mapping_key]``\n output_name (Optional[str]):\n Name of the corresponding :py:class:`DynamicOut` defined on the op.\n (default: "result")\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __init__(\n self,\n value: T,\n mapping_key: str,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n self._mapping_key = check_valid_name(check.str_param(mapping_key, "mapping_key"))\n self._output_name = check.str_param(output_name, "output_name")\n self._value = value\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n @property\n def metadata(self) -> Mapping[str, MetadataValue]:\n return self._metadata\n\n @public\n @property\n def mapping_key(self) -> str:\n """The mapping_key that was set for this DynamicOutput at instantiation."""\n return self._mapping_key\n\n @public\n @property\n def value(self) -> T:\n """The value that is returned by the compute function for this DynamicOut."""\n return self._value\n\n @public\n @property\n def 
output_name(self) -> str:\n """Name of the :py:class:`DynamicOut` defined on the op that this DynamicOut is associated with."""\n return self._output_name\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, DynamicOutput)\n and self.value == other.value\n and self.output_name == other.output_name\n and self.mapping_key == other.mapping_key\n and self.metadata == other.metadata\n )
\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass AssetObservation(\n NamedTuple(\n "_AssetObservation",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("partition", PublicAttr[Optional[str]]),\n ("tags", PublicAttr[Mapping[str, str]]),\n ],\n )\n):\n """Event that captures metadata about an asset at a point in time.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the asset.\n partition (Optional[str]): The name of a partition of the asset that the metadata\n corresponds to.\n tags (Optional[Mapping[str, str]]): A mapping containing system-populated tags for the\n observation. Users should not pass values into this argument.\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoercibleToAssetKey,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n ):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n if any([not tag.startswith(SYSTEM_TAG_PREFIX) for tag in tags or {}]):\n check.failed(\n "Users should not pass values into the tags argument for AssetMaterializations. 
"\n "The tags argument is reserved for system-populated tags."\n )\n\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(AssetObservation, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n tags=tags,\n partition=check.opt_str_param(partition, "partition"),\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n @property\n def data_version(self) -> Optional[str]:\n return self.tags.get(DATA_VERSION_TAG)\n\n\nUNDEFINED_ASSET_KEY_PATH = ["__undefined__"]\n\n\nclass AssetMaterializationSerializer(NamedTupleSerializer):\n # There are old `Materialization` objects in storage. We set the default value for asset key to\n # be `AssetKey(["__undefined__"])` to ensure that we can load these objects, without needing to\n # allow for the construction of new `AssetMaterialization` objects with no defined AssetKey.\n def before_unpack(self, context, unpacked_dict: Any) -> Any:\n # cover both the case where "asset_key" is not present at all and where it is None\n if unpacked_dict.get("asset_key") is None:\n unpacked_dict["asset_key"] = AssetKey(UNDEFINED_ASSET_KEY_PATH)\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n old_storage_names={"Materialization"},\n serializer=AssetMaterializationSerializer,\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass AssetMaterialization(\n NamedTuple(\n "_AssetMaterialization",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("partition", PublicAttr[Optional[str]]),\n ("tags", Optional[Mapping[str, str]]),\n ],\n )\n):\n """Event indicating that an op has materialized an asset.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. Unlike outputs, asset materializations can not be passed to other\n ops, and their persistence is controlled by op logic, rather than by the Dagster\n framework.\n\n Op authors should use these events to organize metadata about the side effects of their\n computations, enabling tooling like the Assets dashboard in the Dagster UI.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the materialized asset across\n job runs\n description (Optional[str]): A longer human-readable description of the materialized value.\n partition (Optional[str]): The name of the partition\n that was materialized.\n tags (Optional[Mapping[str, str]]): A mapping containing system-populated tags for the\n materialization. Users should not pass values into this argument.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. 
Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoercibleToAssetKey,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n ):\n from dagster._core.definitions.multi_dimensional_partitions import MultiPartitionKey\n\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n invalid_tags = [tag for tag in tags or {} if not tag.startswith(SYSTEM_TAG_PREFIX)]\n if len(invalid_tags) > 0:\n check.failed(\n f"Invalid tags: {tags} Users should not pass values into the tags argument for"\n " AssetMaterializations. 
The tags argument is reserved for system-populated tags."\n )\n\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n partition = check.opt_str_param(partition, "partition")\n\n if not isinstance(partition, MultiPartitionKey):\n # When event log records are unpacked from storage, cast the partition key as a\n # MultiPartitionKey if multi-dimensional partition tags exist\n multi_dimensional_partitions = {\n dimension[len(MULTIDIMENSIONAL_PARTITION_PREFIX) :]: partition_key\n for dimension, partition_key in (tags or {}).items()\n if dimension.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX)\n }\n if multi_dimensional_partitions:\n partition = MultiPartitionKey(multi_dimensional_partitions)\n\n return super(AssetMaterialization, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n tags=tags,\n partition=partition,\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n
[docs] @public\n @staticmethod\n def file(\n path: str,\n description: Optional[str] = None,\n asset_key: Optional[Union[str, Sequence[str], AssetKey]] = None,\n ) -> "AssetMaterialization":\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n if not asset_key:\n asset_key = path\n\n return AssetMaterialization(\n asset_key=cast(Union[str, AssetKey, List[str]], asset_key),\n description=description,\n metadata={"path": MetadataValue.path(path)},\n )
\n\n\n
[docs]@deprecated(\n breaking_version="1.7",\n additional_warn_text="Please use AssetCheckResult and @asset_check instead.",\n)\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass ExpectationResult(\n NamedTuple(\n "_ExpectationResult",\n [\n ("success", PublicAttr[bool]),\n ("label", PublicAttr[Optional[str]]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ],\n )\n):\n """Event corresponding to a data quality test.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that a data quality test has produced a (positive or\n negative) result.\n\n Args:\n success (bool): Whether the expectation passed or not.\n label (Optional[str]): Short display name for expectation. Defaults to "result".\n description (Optional[str]): A longer human-readable description of the expectation.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n label: Optional[str] = None,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(ExpectationResult, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n label=check.opt_str_param(label, "label", "result"),\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n )
\n\n\n
[docs]@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\n@whitelist_for_serdes\nclass TypeCheck(\n NamedTuple(\n "_TypeCheck",\n [\n ("success", PublicAttr[bool]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ],\n )\n):\n """Event corresponding to a successful typecheck.\n\n Events of this type should be returned by user-defined type checks when they need to encapsulate\n additional metadata about a type check's success or failure. (i.e., when using\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or the underlying\n :py:func:`PythonObjectDagsterType` API.)\n\n Op compute functions should generally avoid yielding events of this type to avoid confusion.\n\n Args:\n success (bool): ``True`` if the type check succeeded, ``False`` otherwise.\n description (Optional[str]): A human-readable description of the type check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(TypeCheck, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n )
\n\n\n
[docs]class Failure(Exception):\n """Event indicating op failure.\n\n Raise events of this type from within op compute functions or custom type checks in order to\n indicate an unrecoverable failure in user code to the Dagster machinery and return\n structured metadata about the failure.\n\n Args:\n description (Optional[str]): A human-readable description of the failure.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n allow_retries (Optional[bool]):\n Whether this Failure should respect the retry policy or bypass it and immediately fail.\n Defaults to True, respecting the retry policy and allowing retries.\n """\n\n def __init__(\n self,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n allow_retries: Optional[bool] = None,\n ):\n super(Failure, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n self.allow_retries = check.opt_bool_param(allow_retries, "allow_retries", True)
\n\n\n
[docs]class RetryRequested(Exception):\n """An exception to raise from an op to indicate that it should be retried.\n\n Args:\n max_retries (Optional[int]):\n The max number of retries this step should attempt before failing\n seconds_to_wait (Optional[Union[float,int]]):\n Seconds to wait before restarting the step after putting the step in\n to the up_for_retry state\n\n Example:\n .. code-block:: python\n\n @op\n def flakes():\n try:\n flakey_operation()\n except Exception as e:\n raise RetryRequested(max_retries=3) from e\n """\n\n def __init__(\n self, max_retries: Optional[int] = 1, seconds_to_wait: Optional[Union[float, int]] = None\n ):\n super(RetryRequested, self).__init__()\n self.max_retries = check.int_param(max_retries, "max_retries")\n self.seconds_to_wait = check.opt_numeric_param(seconds_to_wait, "seconds_to_wait")
\n\n\nclass ObjectStoreOperationType(Enum):\n SET_OBJECT = "SET_OBJECT"\n GET_OBJECT = "GET_OBJECT"\n RM_OBJECT = "RM_OBJECT"\n CP_OBJECT = "CP_OBJECT"\n\n\nclass ObjectStoreOperation(\n NamedTuple(\n "_ObjectStoreOperation",\n [\n ("op", ObjectStoreOperationType),\n ("key", str),\n ("dest_key", Optional[str]),\n ("obj", Any),\n ("serialization_strategy_name", Optional[str]),\n ("object_store_name", Optional[str]),\n ("value_name", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n """This event is used internally by Dagster machinery when values are written to and read from\n an ObjectStore.\n\n Users should not import this class or yield events of this type from user code.\n\n Args:\n op (ObjectStoreOperationType): The type of the operation on the object store.\n key (str): The key of the object on which the operation was performed.\n dest_key (Optional[str]): The destination key, if any, to which the object was copied.\n obj (Any): The object, if any, retrieved by the operation.\n serialization_strategy_name (Optional[str]): The name of the serialization strategy, if any,\n employed by the operation\n object_store_name (Optional[str]): The name of the object store that performed the\n operation.\n value_name (Optional[str]): The name of the input/output\n version (Optional[str]): (Experimental) The version of the stored data.\n mapping_key (Optional[str]): The mapping key when a dynamic output is used.\n """\n\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n key: str,\n dest_key: Optional[str] = None,\n obj: Any = None,\n serialization_strategy_name: Optional[str] = None,\n object_store_name: Optional[str] = None,\n value_name: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperation, cls).__new__(\n cls,\n op=op,\n key=check.str_param(key, "key"),\n dest_key=check.opt_str_param(dest_key, "dest_key"),\n obj=obj,\n 
serialization_strategy_name=check.opt_str_param(\n serialization_strategy_name, "serialization_strategy_name"\n ),\n object_store_name=check.opt_str_param(object_store_name, "object_store_name"),\n value_name=check.opt_str_param(value_name, "value_name"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n @classmethod\n def serializable(cls, inst, **kwargs):\n return cls(\n **dict(\n {\n "op": inst.op.value,\n "key": inst.key,\n "dest_key": inst.dest_key,\n "obj": None,\n "serialization_strategy_name": inst.serialization_strategy_name,\n "object_store_name": inst.object_store_name,\n "value_name": inst.value_name,\n "version": inst.version,\n },\n **kwargs,\n )\n )\n\n\nclass HookExecutionResult(\n NamedTuple("_HookExecutionResult", [("hook_name", str), ("is_skipped", bool)])\n):\n """This event is used internally to indicate the execution result of a hook, e.g. whether the\n user-defined hook function is skipped.\n\n Args:\n hook_name (str): The name of the hook.\n is_skipped (bool): ``False`` if the hook_fn is executed, ``True`` otheriwse.\n """\n\n def __new__(cls, hook_name: str, is_skipped: Optional[bool] = None):\n return super(HookExecutionResult, cls).__new__(\n cls,\n hook_name=check.str_param(hook_name, "hook_name"),\n is_skipped=cast(bool, check.opt_bool_param(is_skipped, "is_skipped", default=False)),\n )\n\n\nUserEvent = Union[AssetMaterialization, AssetObservation, ExpectationResult]\n
", "current_page_name": "_modules/dagster/_core/definitions/events", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.events"}, "executor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.executor_definition

\nfrom enum import Enum as PyEnum\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, Sequence, Union, overload\n\nfrom typing_extensions import Self, TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import Int\nfrom dagster._config import Field, Noneable, Selector, UserConfigSchema\nfrom dagster._core.definitions.configurable import (\n    ConfiguredDefinitionConfigSchema,\n    NamedConfigurableDefinition,\n)\nfrom dagster._core.definitions.job_base import IJob\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.executor.base import Executor\n    from dagster._core.executor.in_process import InProcessExecutor\n    from dagster._core.executor.init import InitExecutorContext\n    from dagster._core.executor.multiprocess import MultiprocessExecutor\n    from dagster._core.instance import DagsterInstance\n\n\nclass ExecutorRequirement(PyEnum):\n    """An ExecutorDefinition can include a list of requirements that the system uses to\n    check whether the executor will be able to work for a particular job execution.\n    """\n\n    # The passed in IJob must be reconstructable across process boundaries\n    RECONSTRUCTABLE_PIPELINE = (  # This needs to still exist for folks who may have written their own executor\n        "RECONSTRUCTABLE_PIPELINE"\n    )\n    RECONSTRUCTABLE_JOB = "RECONSTRUCTABLE_PIPELINE"\n\n    # The DagsterInstance must be loadable in a different process\n    NON_EPHEMERAL_INSTANCE = "NON_EPHEMERAL_INSTANCE"\n\n    # Any op 
outputs on the job must be persisted\n    PERSISTENT_OUTPUTS = "PERSISTENT_OUTPUTS"\n\n\ndef multiple_process_executor_requirements() -> Sequence[ExecutorRequirement]:\n    return [\n        ExecutorRequirement.RECONSTRUCTABLE_JOB,\n        ExecutorRequirement.NON_EPHEMERAL_INSTANCE,\n        ExecutorRequirement.PERSISTENT_OUTPUTS,\n    ]\n\n\nExecutorConfig = Mapping[str, object]\nExecutorCreationFunction: TypeAlias = Callable[["InitExecutorContext"], "Executor"]\nExecutorRequirementsFunction: TypeAlias = Callable[[ExecutorConfig], Sequence[ExecutorRequirement]]\n\n\n
[docs]class ExecutorDefinition(NamedConfigurableDefinition):\n """An executor is responsible for executing the steps of a job.\n\n Args:\n name (str): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available in `init_context.executor_config`. If not set, Dagster will accept any config\n provided.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular job execution.\n executor_creation_fn(Optional[Callable]): Should accept an :py:class:`InitExecutorContext`\n and return an instance of :py:class:`Executor`\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n executor.\n description (Optional[str]): A description of the executor.\n """\n\n def __init__(\n self,\n name: str,\n config_schema: Optional[UserConfigSchema] = None,\n requirements: Union[\n ExecutorRequirementsFunction, Optional[Sequence[ExecutorRequirement]]\n ] = None,\n executor_creation_fn: Optional[ExecutorCreationFunction] = None,\n description: Optional[str] = None,\n ):\n self._name = check.str_param(name, "name")\n self._requirements_fn: ExecutorRequirementsFunction\n if callable(requirements):\n self._requirements_fn = requirements\n else:\n requirements_lst = check.opt_list_param(\n requirements, "requirements", of_type=ExecutorRequirement\n )\n self._requirements_fn = lambda _: requirements_lst\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._executor_creation_fn = check.opt_callable_param(\n executor_creation_fn, "executor_creation_fn"\n )\n self._description = check.opt_str_param(description, "description")\n\n @public\n @property\n def name(self) -> str:\n """Name of the executor."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Description of executor, if provided."""\n return self._description\n\n @property\n def 
config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n def get_requirements(\n self, executor_config: Mapping[str, object]\n ) -> Sequence[ExecutorRequirement]:\n return self._requirements_fn(executor_config)\n\n @public\n @property\n def executor_creation_fn(self) -> Optional[ExecutorCreationFunction]:\n """Callable that takes an :py:class:`InitExecutorContext` and returns an instance of\n :py:class:`Executor`.\n """\n return self._executor_creation_fn\n\n def copy_for_configured(self, name, description, config_schema) -> "ExecutorDefinition":\n return ExecutorDefinition(\n name=name,\n config_schema=config_schema, # type: ignore\n executor_creation_fn=self.executor_creation_fn,\n description=description or self.description,\n requirements=self._requirements_fn,\n )\n\n @staticmethod\n def hardcoded_executor(executor: "Executor"):\n return ExecutorDefinition(\n # Executor name was only relevant in the pipeline/solid/mode world, so we\n # can put a dummy value\n name="__executor__",\n executor_creation_fn=lambda _init_context: executor,\n )\n\n # Backcompat: Overrides configured method to provide name as a keyword argument.\n # If no name is provided, the name is pulled off of this ExecutorDefinition.\n
[docs] @public\n def configured(\n self,\n config_or_config_fn: Any,\n name: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n description: Optional[str] = None,\n ) -> Self:\n """Wraps this object in an object of the same type that provides configuration to the inner\n object.\n\n Using ``configured`` may result in config values being displayed in\n the Dagster UI, so it is not recommended to use this API with sensitive values,\n such as secrets.\n\n Args:\n config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n that fully satisfies this object's config schema or (2) A function that accepts run\n configuration and returns run configuration that fully satisfies this object's\n config schema. In the latter case, config_schema must be specified. When\n passing a function, it's easiest to use :py:func:`configured`.\n name (Optional[str]): Name of the new definition. If not provided, the emitted\n definition will inherit the name of the `ExecutorDefinition` upon which this\n function is called.\n config_schema (Optional[ConfigSchema]): If config_or_config_fn is a function, the config\n schema that its input must satisfy. If not set, Dagster will accept any config\n provided.\n description (Optional[str]): Description of the new definition. If not specified,\n inherits the description of the definition being configured.\n\n Returns (ConfigurableDefinition): A configured version of this object.\n """\n name = check.opt_str_param(name, "name")\n\n new_config_schema = ConfiguredDefinitionConfigSchema(\n self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n )\n\n return self.copy_for_configured(name or self.name, description, new_config_schema)
\n\n\n@overload\ndef executor(name: ExecutorCreationFunction) -> ExecutorDefinition: ...\n\n\n@overload\ndef executor(\n name: Optional[str] = ...,\n config_schema: Optional[UserConfigSchema] = ...,\n requirements: Optional[\n Union[ExecutorRequirementsFunction, Sequence[ExecutorRequirement]]\n ] = ...,\n) -> "_ExecutorDecoratorCallable": ...\n\n\n
[docs]def executor(\n name: Union[ExecutorCreationFunction, Optional[str]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n requirements: Optional[\n Union[ExecutorRequirementsFunction, Sequence[ExecutorRequirement]]\n ] = None,\n) -> Union[ExecutorDefinition, "_ExecutorDecoratorCallable"]:\n """Define an executor.\n\n The decorated function should accept an :py:class:`InitExecutorContext` and return an instance\n of :py:class:`Executor`.\n\n Args:\n name (Optional[str]): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.executor_config`. If not set, Dagster will accept any config provided for.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular job execution.\n """\n if callable(name):\n check.invariant(config_schema is None)\n check.invariant(requirements is None)\n return _ExecutorDecoratorCallable()(name)\n\n return _ExecutorDecoratorCallable(\n name=name, config_schema=config_schema, requirements=requirements\n )
\n\n\nclass _ExecutorDecoratorCallable:\n def __init__(self, name=None, config_schema=None, requirements=None):\n self.name = check.opt_str_param(name, "name")\n self.config_schema = config_schema # type check in definition\n self.requirements = requirements\n\n def __call__(self, fn: ExecutorCreationFunction) -> ExecutorDefinition:\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n executor_def = ExecutorDefinition(\n name=self.name,\n config_schema=self.config_schema,\n executor_creation_fn=fn,\n requirements=self.requirements,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(executor_def, wrapped=fn) # type: ignore\n\n return executor_def\n\n\ndef _core_in_process_executor_creation(config: ExecutorConfig) -> "InProcessExecutor":\n from dagster._core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n # shouldn't need to .get() here - issue with defaults in config setup\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore # (possible none)\n marker_to_close=config.get("marker_to_close"), # type: ignore # (should be str)\n )\n\n\nIN_PROC_CONFIG = Field(\n {\n "retries": get_retries_config(),\n "marker_to_close": Field(\n str,\n is_required=False,\n description="[DEPRECATED]",\n ),\n },\n description="Execute all steps in a single process.",\n)\n\n\n
[docs]@executor(\n name="in_process",\n config_schema=IN_PROC_CONFIG,\n)\ndef in_process_executor(init_context):\n """The in-process executor executes all steps in a single process.\n\n To select it, include the following top-level fragment in config:\n\n .. code-block:: yaml\n\n execution:\n in_process:\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_in_process_executor_creation(init_context.executor_config)
\n\n\n@executor(name="execute_in_process_executor")\ndef execute_in_process_executor(_) -> "InProcessExecutor":\n """Executor used by execute_in_process.\n\n Use of this executor triggers special behavior in the config system that ignores all incoming\n executor config. This is because someone might set executor config on a job, and when we foist\n this executor onto the job for `execute_in_process`, that config becomes nonsensical.\n """\n from dagster._core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n retries=RetryMode.ENABLED,\n marker_to_close=None,\n )\n\n\ndef _core_multiprocess_executor_creation(config: ExecutorConfig) -> "MultiprocessExecutor":\n from dagster._core.executor.multiprocess import MultiprocessExecutor\n\n # unpack optional selector\n start_method = None\n start_cfg: Dict[str, object] = {}\n start_selector = check.opt_dict_elem(config, "start_method")\n if start_selector:\n start_method, start_cfg = next(iter(start_selector.items()))\n\n return MultiprocessExecutor(\n max_concurrent=check.opt_int_elem(config, "max_concurrent"),\n tag_concurrency_limits=check.opt_list_elem(config, "tag_concurrency_limits"),\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore\n start_method=start_method,\n explicit_forkserver_preload=check.opt_list_elem(start_cfg, "preload_modules", of_type=str),\n )\n\n\nMULTI_PROC_CONFIG = Field(\n {\n "max_concurrent": Field(\n Noneable(Int),\n default_value=None,\n description=(\n "The number of processes that may run concurrently. 
"\n "By default, this is set to be the return value of `multiprocessing.cpu_count()`."\n ),\n ),\n "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n "start_method": Field(\n Selector(\n fields={\n "spawn": Field(\n {},\n description=(\n "Configure the multiprocess executor to start subprocesses "\n "using `spawn`."\n ),\n ),\n "forkserver": Field(\n {\n "preload_modules": Field(\n [str],\n is_required=False,\n description=(\n "Explicitly specify the modules to preload in the forkserver."\n " Otherwise, there are two cases for default values if modules"\n " are not specified. If the Dagster job was loaded from a"\n " module, the same module will be preloaded. If not, the"\n " `dagster` module is preloaded."\n ),\n ),\n },\n description=(\n "Configure the multiprocess executor to start subprocesses "\n "using `forkserver`."\n ),\n ),\n # fork currently unsupported due to threads usage\n }\n ),\n is_required=False,\n description=(\n "Select how subprocesses are created. By default, `spawn` is selected. See "\n "https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods."\n ),\n ),\n "retries": get_retries_config(),\n },\n description="Execute each step in an individual process.",\n)\n\n\n
[docs]@executor(\n name="multiprocess",\n config_schema=MULTI_PROC_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef multiprocess_executor(init_context):\n """The multiprocess executor executes each step in an individual process.\n\n Any job that does not specify custom executors will use the multiprocess_executor by default.\n To configure the multiprocess executor, include a fragment such as the following in your run\n config:\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be None or 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_multiprocess_executor_creation(init_context.executor_config)
\n\n\ndef check_cross_process_constraints(init_context: "InitExecutorContext") -> None:\n from dagster._core.executor.init import InitExecutorContext\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n requirements_lst = init_context.executor_def.get_requirements(init_context.executor_config)\n\n if ExecutorRequirement.RECONSTRUCTABLE_JOB in requirements_lst:\n _check_intra_process_job(init_context.job)\n\n if ExecutorRequirement.NON_EPHEMERAL_INSTANCE in requirements_lst:\n _check_non_ephemeral_instance(init_context.instance)\n\n\ndef _check_intra_process_job(job: IJob) -> None:\n if not isinstance(job, ReconstructableJob):\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with the job"\n f' "{job.get_definition().name}" that is not reconstructable. Job must be loaded in a'\n " way that allows dagster to reconstruct them in a new process. This means: \\n *"\n " using the file, module, or workspace.yaml arguments of"\n " dagster-webserver/dagster-graphql/dagster\\n * loading the job through the"\n " reconstructable() function\\n"\n )\n\n\ndef _check_non_ephemeral_instance(instance: "DagsterInstance") -> None:\n if instance.is_ephemeral:\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with an ephemeral"\n " DagsterInstance. A non-ephemeral instance is needed to coordinate execution between"\n " multiple processes. You can configure your default instance via $DAGSTER_HOME or"\n " ensure a valid one is passed when invoking the python APIs. 
You can learn more about"\n " setting up a persistent DagsterInstance from the DagsterInstance docs here:"\n " https://docs.dagster.io/deployment/dagster-instance#default-local-behavior"\n )\n\n\ndef _get_default_executor_requirements(\n executor_config: ExecutorConfig,\n) -> Sequence[ExecutorRequirement]:\n return multiple_process_executor_requirements() if "multiprocess" in executor_config else []\n\n\n
[docs]@executor(\n name="multi_or_in_process_executor",\n config_schema=Field(\n Selector(\n {"multiprocess": MULTI_PROC_CONFIG, "in_process": IN_PROC_CONFIG},\n ),\n default_value={"multiprocess": {}},\n ),\n requirements=_get_default_executor_requirements,\n)\ndef multi_or_in_process_executor(init_context: "InitExecutorContext") -> "Executor":\n """The default executor for a job.\n\n This is the executor available by default on a :py:class:`JobDefinition`\n that does not provide custom executors. This executor has a multiprocessing-enabled mode, and a\n single-process mode. By default, multiprocessing mode is enabled. Switching between multiprocess\n mode and in-process mode can be achieved via config.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n\n\n execution:\n config:\n in_process:\n\n When using the multiprocess mode, ``max_concurrent`` and ``retries`` can also be configured.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n retries:\n enabled:\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n When using the in_process mode, then only retries can be configured.\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n if "multiprocess" in init_context.executor_config:\n return _core_multiprocess_executor_creation(\n check.dict_elem(init_context.executor_config, "multiprocess")\n )\n else:\n return _core_in_process_executor_creation(\n check.dict_elem(init_context.executor_config, "in_process")\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/executor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.executor_definition"}, "freshness_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.freshness_policy

\nimport datetime\nfrom typing import AbstractSet, NamedTuple, Optional\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils.schedules import (\n    is_valid_cron_schedule,\n    reverse_cron_string_iterator,\n)\n\nfrom .events import AssetKey\n\n\nclass FreshnessConstraint(NamedTuple):\n    asset_keys: AbstractSet[AssetKey]\n    required_data_time: datetime.datetime\n    required_by_time: datetime.datetime\n\n\nclass FreshnessMinutes(NamedTuple):\n    overdue_minutes: float\n    lag_minutes: float\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass FreshnessPolicy(\n NamedTuple(\n "_FreshnessPolicy",\n [\n ("maximum_lag_minutes", float),\n ("cron_schedule", Optional[str]),\n ("cron_schedule_timezone", Optional[str]),\n ],\n )\n):\n """A FreshnessPolicy specifies how up-to-date you want a given asset to be.\n\n Attaching a FreshnessPolicy to an asset definition encodes an expectation on the upstream data\n that you expect to be incorporated into the current state of that asset at certain points in time.\n How this is calculated differs depending on if the asset is unpartitioned or time-partitioned\n (other partitioning schemes are not supported).\n\n For time-partitioned assets, the current data time for the asset is simple to calculate. The\n upstream data that is incorporated into the asset is exactly the set of materialized partitions\n for that asset. Thus, the current data time for the asset is simply the time up to which all\n partitions have been materialized.\n\n For unpartitioned assets, the current data time is based on the upstream materialization records\n that were read to generate the current state of the asset. More specifically,\n imagine you have two assets, where A depends on B. If `B` has a FreshnessPolicy defined, this\n means that at time T, the most recent materialization of `B` should have come after a\n materialization of `A` which was no more than `maximum_lag_minutes` ago. This calculation is\n recursive: any given asset is expected to incorporate up-to-date data from all of its upstream\n assets.\n\n It is assumed that all asset definitions with no upstream asset definitions consume from some\n always-updating source. That is, if you materialize that asset at time T, it will incorporate\n all data up to time T.\n\n If `cron_schedule` is not defined, the given asset will be expected to incorporate upstream\n data from no more than `maximum_lag_minutes` ago at all points in time. 
For example, "The events\n table should always have data from at most 1 hour ago".\n\n If `cron_schedule` is defined, the given asset will be expected to incorporate upstream data\n from no more than `maximum_lag_minutes` ago at each cron schedule tick. For example, "By 9AM,\n the signups table should contain all of yesterday's data".\n\n The freshness status of assets with policies defined will be visible in the UI. If you are using\n an asset reconciliation sensor, this sensor will kick off runs to help keep your assets up to\n date with respect to their FreshnessPolicy.\n\n Args:\n maximum_lag_minutes (float): An upper bound for how old the data contained within this\n asset may be.\n cron_schedule (Optional[str]): A cron schedule string (e.g. ``"0 1 * * *"``) specifying a\n series of times by which the `maximum_lag_minutes` constraint must be satisfied. If\n no cron schedule is provided, then this constraint must be satisfied at all times.\n cron_schedule_timezone (Optional[str]): Timezone in which the cron schedule should be evaluated.\n If not specified, defaults to UTC. Supported strings for timezones are the ones provided\n by the `IANA time zone database <https://www.iana.org/time-zones>` - e.g.\n "America/Los_Angeles".\n\n .. 
code-block:: python\n\n # At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n @asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\n def fresh_asset():\n ...\n\n # At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n @asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\n def cron_up_to_date_asset():\n ...\n\n """\n\n def __new__(\n cls,\n *,\n maximum_lag_minutes: float,\n cron_schedule: Optional[str] = None,\n cron_schedule_timezone: Optional[str] = None,\n ):\n if cron_schedule is not None:\n if not is_valid_cron_schedule(cron_schedule):\n raise DagsterInvalidDefinitionError(f"Invalid cron schedule '{cron_schedule}'.")\n check.param_invariant(\n is_valid_cron_schedule(cron_schedule),\n "cron_schedule",\n f"Invalid cron schedule '{cron_schedule}'.",\n )\n if cron_schedule_timezone is not None:\n check.param_invariant(\n cron_schedule is not None,\n "cron_schedule_timezone",\n "Cannot specify cron_schedule_timezone without a cron_schedule.",\n )\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(cron_schedule_timezone) # type: ignore\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n "Invalid cron schedule timezone '{cron_schedule_timezone}'. "\n ) from e\n return super(FreshnessPolicy, cls).__new__(\n cls,\n maximum_lag_minutes=float(\n check.numeric_param(maximum_lag_minutes, "maximum_lag_minutes")\n ),\n cron_schedule=check.opt_str_param(cron_schedule, "cron_schedule"),\n cron_schedule_timezone=check.opt_str_param(\n cron_schedule_timezone, "cron_schedule_timezone"\n ),\n )\n\n @classmethod\n def _create(cls, *args):\n """Pickle requires a method with positional arguments to construct\n instances of a class. 
Since the constructor for this class has\n keyword arguments only, we define this method to be used by pickle.\n """\n return cls(maximum_lag_minutes=args[0], cron_schedule=args[1])\n\n def __reduce__(self):\n return (self._create, (self.maximum_lag_minutes, self.cron_schedule))\n\n @property\n def maximum_lag_delta(self) -> datetime.timedelta:\n return datetime.timedelta(minutes=self.maximum_lag_minutes)\n\n def get_evaluation_tick(\n self,\n evaluation_time: datetime.datetime,\n ) -> Optional[datetime.datetime]:\n if self.cron_schedule:\n # most recent cron schedule tick\n schedule_ticks = reverse_cron_string_iterator(\n end_timestamp=evaluation_time.timestamp(),\n cron_string=self.cron_schedule,\n execution_timezone=self.cron_schedule_timezone,\n )\n return next(schedule_ticks)\n else:\n return evaluation_time\n\n def minutes_overdue(\n self,\n data_time: Optional[datetime.datetime],\n evaluation_time: datetime.datetime,\n ) -> Optional[FreshnessMinutes]:\n """Returns a number of minutes past the specified freshness policy that this asset currently\n is. If the asset is missing upstream data, or is not materialized at all, then it is unknown\n how overdue it is, and this will return None.\n\n Args:\n data_time (Optional[datetime]): The timestamp of the data that was used to create the\n current version of this asset.\n evaluation_time (datetime): The time at which we're evaluating the overdueness of this\n asset. Generally, this is the current time.\n """\n if data_time is None:\n return None\n evaluation_tick = self.get_evaluation_tick(evaluation_time)\n if evaluation_tick is None:\n return None\n required_time = evaluation_tick - self.maximum_lag_delta\n\n return FreshnessMinutes(\n lag_minutes=max(0.0, (evaluation_tick - data_time).total_seconds() / 60),\n overdue_minutes=max(0.0, (required_time - data_time).total_seconds() / 60),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/freshness_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.freshness_policy"}, "freshness_policy_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.freshness_policy_sensor_definition

\nfrom typing import Callable, Dict, Mapping, NamedTuple, Optional, Set, cast\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_selection import AssetSelection\nfrom dagster._core.definitions.data_time import CachingDataTimeResolver\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    FreshnessPolicySensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._serdes import (\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._seven import JSONDecodeError\n\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorType,\n    SkipReason,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\n\n\n@whitelist_for_serdes\nclass FreshnessPolicySensorCursor(\n    NamedTuple(\n        "_FreshnessPolicySensorCursor",\n        [("minutes_late_by_key_str", Mapping[str, Optional[float]])],\n    )\n):\n    def __new__(cls, minutes_late_by_key_str: Mapping[str, Optional[float]]):\n        return super(FreshnessPolicySensorCursor, cls).__new__(\n            cls,\n            minutes_late_by_key_str=check.mapping_param(\n                minutes_late_by_key_str, "minutes_late_by_key_str", key_type=str\n            ),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            
deserialize_value(json_str, FreshnessPolicySensorCursor)\n            return True\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    @staticmethod\n    def from_dict(\n        minutes_late_by_key: Mapping[AssetKey, Optional[float]]\n    ) -> "FreshnessPolicySensorCursor":\n        return FreshnessPolicySensorCursor(\n            minutes_late_by_key_str={k.to_user_string(): v for k, v in minutes_late_by_key.items()}\n        )\n\n    @property\n    def minutes_late_by_key(self) -> Mapping[AssetKey, Optional[float]]:\n        return {AssetKey.from_user_string(k): v for k, v in self.minutes_late_by_key_str.items()}\n\n    def to_json(self) -> str:\n        return serialize_value(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> "FreshnessPolicySensorCursor":\n        return deserialize_value(json_str, FreshnessPolicySensorCursor)\n\n\n
[docs]class FreshnessPolicySensorContext(\n NamedTuple(\n "_FreshnessPolicySensorContext",\n [\n ("sensor_name", PublicAttr[str]),\n ("asset_key", PublicAttr[AssetKey]),\n ("freshness_policy", PublicAttr[FreshnessPolicy]),\n ("minutes_overdue", PublicAttr[Optional[float]]),\n ("previous_minutes_overdue", PublicAttr[Optional[float]]),\n ("instance", PublicAttr[DagsterInstance]),\n ("resources", Resources),\n ],\n )\n):\n """The ``context`` object available to a decorated function of ``freshness_policy_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n asset_key (AssetKey): the key of the asset being monitored\n freshness_policy (FreshnessPolicy): the freshness policy of the asset being monitored\n minutes_overdue (Optional[float])\n previous_minutes_overdue (Optional[float]): the minutes_overdue value for this asset on the\n previous sensor tick.\n instance (DagsterInstance): the current instance.\n """\n\n def __new__(\n cls,\n sensor_name: str,\n asset_key: AssetKey,\n freshness_policy: FreshnessPolicy,\n minutes_overdue: Optional[float],\n previous_minutes_overdue: Optional[float],\n instance: DagsterInstance,\n resources: Optional[Resources] = None,\n ):\n minutes_overdue = check.opt_numeric_param(minutes_overdue, "minutes_overdue")\n previous_minutes_overdue = check.opt_numeric_param(\n previous_minutes_overdue, "previous_minutes_overdue"\n )\n return super(FreshnessPolicySensorContext, cls).__new__(\n cls,\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n freshness_policy=check.inst_param(freshness_policy, "FreshnessPolicy", FreshnessPolicy),\n minutes_overdue=float(minutes_overdue) if minutes_overdue is not None else None,\n previous_minutes_overdue=(\n float(previous_minutes_overdue) if previous_minutes_overdue is not None else None\n ),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n resources=resources or 
ScopedResourcesBuilder.build_empty(),\n )
\n\n\n
[docs]@experimental\ndef build_freshness_policy_sensor_context(\n sensor_name: str,\n asset_key: AssetKey,\n freshness_policy: FreshnessPolicy,\n minutes_overdue: Optional[float],\n previous_minutes_overdue: Optional[float] = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Resources] = None,\n) -> FreshnessPolicySensorContext:\n """Builds freshness policy sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@freshness_policy_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n asset_key (AssetKey): The AssetKey for the monitored asset\n freshness_policy (FreshnessPolicy): The FreshnessPolicy for the monitored asset\n minutes_overdue (Optional[float]): How overdue the monitored asset currently is\n previous_minutes_overdue (Optional[float]): How overdue the monitored asset was on the\n previous tick.\n instance (DagsterInstance): The dagster instance configured for the context.\n\n Examples:\n .. code-block:: python\n\n context = build_freshness_policy_sensor_context(\n sensor_name="freshness_policy_sensor_to_invoke",\n asset_key=AssetKey("some_asset"),\n freshness_policy=FreshnessPolicy(maximum_lag_minutes=30)<\n minutes_overdue=10.0,\n )\n freshness_policy_sensor_to_invoke(context)\n """\n return FreshnessPolicySensorContext(\n sensor_name=sensor_name,\n asset_key=asset_key,\n freshness_policy=freshness_policy,\n minutes_overdue=minutes_overdue,\n previous_minutes_overdue=previous_minutes_overdue,\n instance=instance or DagsterInstance.ephemeral(),\n resources=resources,\n )
\n\n\n
[docs]class FreshnessPolicySensorDefinition(SensorDefinition):\n """Define a sensor that reacts to the status of a given set of asset freshness policies,\n where the decorated function will be evaluated on every sensor tick.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n asset_selection (AssetSelection): The asset selection monitored by the sensor.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_selection: AssetSelection,\n freshness_policy_sensor_fn: Callable[..., None],\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n check.str_param(name, "name")\n check.inst_param(asset_selection, "asset_selection", AssetSelection)\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n self._freshness_policy_sensor_fn = check.callable_param(\n freshness_policy_sensor_fn, "freshness_policy_sensor_fn"\n )\n\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(freshness_policy_sensor_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def 
_wrapped_fn(context: SensorEvaluationContext):\n from dagster._utils.caching_instance_queryer import (\n CachingInstanceQueryer, # expensive import\n )\n\n if context.repository_def is None:\n raise DagsterInvalidInvocationError(\n "The `repository_def` property on the `SensorEvaluationContext` passed into a "\n "`FreshnessPolicySensorDefinition` must not be None."\n )\n\n if context.cursor is None or not FreshnessPolicySensorCursor.is_valid(context.cursor):\n new_cursor = FreshnessPolicySensorCursor({})\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initializing {name}.")\n return\n\n evaluation_time = pendulum.now("UTC")\n asset_graph = context.repository_def.asset_graph\n instance_queryer = CachingInstanceQueryer(\n context.instance, asset_graph, evaluation_time\n )\n data_time_resolver = CachingDataTimeResolver(instance_queryer=instance_queryer)\n monitored_keys = asset_selection.resolve(asset_graph)\n\n # get the previous status from the cursor\n previous_minutes_late_by_key = FreshnessPolicySensorCursor.from_json(\n context.cursor\n ).minutes_late_by_key\n\n minutes_late_by_key: Dict[AssetKey, Optional[float]] = {}\n for asset_key in monitored_keys:\n freshness_policy = asset_graph.freshness_policies_by_key.get(asset_key)\n if freshness_policy is None:\n continue\n\n # get the current minutes_overdue value for this asset\n result = data_time_resolver.get_minutes_overdue(\n evaluation_time=evaluation_time,\n asset_key=asset_key,\n )\n minutes_late_by_key[asset_key] = result.overdue_minutes if result else None\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n context_param_name = get_context_param_name(freshness_policy_sensor_fn)\n freshness_context = FreshnessPolicySensorContext(\n sensor_name=name,\n asset_key=asset_key,\n freshness_policy=freshness_policy,\n minutes_overdue=minutes_late_by_key[asset_key],\n 
previous_minutes_overdue=previous_minutes_late_by_key.get(asset_key),\n instance=context.instance,\n resources=context.resources,\n )\n\n with user_code_error_boundary(\n FreshnessPolicySensorExecutionError,\n lambda: f'Error occurred during the execution of sensor "{name}".',\n ):\n context_param = (\n {context_param_name: freshness_context} if context_param_name else {}\n )\n result = freshness_policy_sensor_fn(\n **context_param,\n **resource_args_populated,\n )\n if result is not None:\n raise DagsterInvalidDefinitionError(\n "Functions decorated by `@freshness_policy_sensor` may not return or yield"\n " a value."\n )\n\n context.update_cursor(\n FreshnessPolicySensorCursor.from_dict(minutes_late_by_key).to_json()\n )\n\n super(FreshnessPolicySensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> None:\n context_param_name = get_context_param_name(self._freshness_policy_sensor_fn)\n\n sensor_context = get_sensor_context_from_args_or_kwargs(\n self._freshness_policy_sensor_fn,\n args,\n kwargs,\n context_type=FreshnessPolicySensorContext,\n )\n context_param = (\n {context_param_name: sensor_context} if context_param_name and sensor_context else {}\n )\n\n resources = validate_and_get_resource_dict(\n sensor_context.resources if sensor_context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n\n return self._freshness_policy_sensor_fn(**context_param, **resources)\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.FRESHNESS_POLICY
\n\n\n
[docs]@experimental\ndef freshness_policy_sensor(\n asset_selection: AssetSelection,\n *,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[[Callable[..., None]], FreshnessPolicySensorDefinition,]:\n """Define a sensor that reacts to the status of a given set of asset freshness policies, where the\n decorated function will be evaluated on every tick for each asset in the selection that has a\n FreshnessPolicy defined.\n\n Note: returning or yielding a value from the annotated function will result in an error.\n\n Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n\n Args:\n asset_selection (AssetSelection): The asset selection monitored by the sensor.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def inner(fn: Callable[..., None]) -> FreshnessPolicySensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n return FreshnessPolicySensorDefinition(\n name=sensor_name,\n freshness_policy_sensor_fn=fn,\n asset_selection=asset_selection,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n )\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/freshness_policy_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.freshness_policy_sensor_definition"}, "graph_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.graph_definition

\nfrom collections import OrderedDict, defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom toposort import CircularDependencyError, toposort_flatten\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.config import ConfigMapping\nfrom dagster._core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.selector.subset_selector import AssetSelectionData\nfrom dagster._core.types.dagster_type import (\n    DagsterType,\n    DagsterTypeKind,\n    construct_dagster_type_dictionary,\n)\n\nfrom .dependency import (\n    DependencyMapping,\n    DependencyStructure,\n    GraphNode,\n    Node,\n    NodeHandle,\n    NodeInput,\n    NodeInputHandle,\n    NodeInvocation,\n)\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .logger_definition import LoggerDefinition\nfrom .metadata import RawMetadataValue\nfrom .node_container import create_execution_structure, normalize_dependency_dict\nfrom .node_definition import NodeDefinition\nfrom .output import OutputDefinition, OutputMapping\nfrom .resource_requirement import ResourceRequirement\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster._core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster._core.instance import DagsterInstance\n\n    from .asset_layer import AssetLayer\n    from .composition import PendingNodeInvocation\n    from .executor_definition import ExecutorDefinition\n    from .job_definition import 
JobDefinition\n    from .op_definition import OpDefinition\n    from .partition import PartitionedConfig, PartitionsDefinition\n    from .run_config import RunConfig\n    from .source_asset import SourceAsset\n\nT = TypeVar("T")\n\n\ndef _check_node_defs_arg(\n    graph_name: str, node_defs: Optional[Sequence[NodeDefinition]]\n) -> Sequence[NodeDefinition]:\n    node_defs = node_defs or []\n\n    _node_defs = check.opt_sequence_param(node_defs, "node_defs")\n    for node_def in _node_defs:\n        if isinstance(node_def, NodeDefinition):\n            continue\n        elif callable(node_def):\n            raise DagsterInvalidDefinitionError(\n                """You have passed a lambda or function {func} into {name} that is\n                not a node. You have likely forgetten to annotate this function with\n                the @op or @graph decorators.'\n                """.format(name=graph_name, func=node_def.__name__)\n            )\n        else:\n            raise DagsterInvalidDefinitionError(f"Invalid item in node list: {node_def!r}")\n\n    return node_defs\n\n\ndef create_adjacency_lists(\n    nodes: Sequence[Node],\n    dep_structure: DependencyStructure,\n) -> Tuple[Mapping[str, Set[str]], Mapping[str, Set[str]]]:\n    visit_dict = {s.name: False for s in nodes}\n    forward_edges: Dict[str, Set[str]] = {s.name: set() for s in nodes}\n    backward_edges: Dict[str, Set[str]] = {s.name: set() for s in nodes}\n\n    def visit(node_name: str) -> None:\n        if visit_dict[node_name]:\n            return\n\n        visit_dict[node_name] = True\n\n        for node_output in dep_structure.all_upstream_outputs_from_node(node_name):\n            forward_node = node_output.node.name\n            backward_node = node_name\n            if forward_node in forward_edges:\n                forward_edges[forward_node].add(backward_node)\n                backward_edges[backward_node].add(forward_node)\n                visit(forward_node)\n\n    for s in nodes:\n      
  visit(s.name)\n\n    return (forward_edges, backward_edges)\n\n\n
[docs]class GraphDefinition(NodeDefinition):\n """Defines a Dagster op graph.\n\n An op graph is made up of\n\n - Nodes, which can either be an op (the functional unit of computation), or another graph.\n - Dependencies, which determine how the values produced by nodes as outputs flow from\n one node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n (DAG) of compute.\n\n End users should prefer the :func:`@graph <graph>` decorator. GraphDefinition is generally\n intended to be used by framework authors or for programatically generated graphs.\n\n Args:\n name (str): The name of the graph. Must be unique within any :py:class:`GraphDefinition`\n or :py:class:`JobDefinition` containing the graph.\n description (Optional[str]): A human-readable description of the job.\n node_defs (Optional[Sequence[NodeDefinition]]): The set of ops / graphs used in this graph.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the graph. 
Keys of the top level dict are either the string names of ops in the\n graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[Sequence[InputMapping]]): Defines the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[Sequence[OutputMapping]]): Defines the outputs of the nested graph,\n and how they map from the outputs of its constituent ops.\n config (Optional[ConfigMapping]): Defines the config of the graph, and how its schema maps\n to the config of its constituent ops.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(num):\n return num + 1\n\n graph_def = GraphDefinition(\n name='basic',\n node_defs=[return_one, add_one],\n dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n )\n """\n\n _node_defs: Sequence[NodeDefinition]\n _dagster_type_dict: Mapping[str, DagsterType]\n _dependencies: DependencyMapping[NodeInvocation]\n _dependency_structure: DependencyStructure\n _node_dict: Mapping[str, Node]\n _input_mappings: Sequence[InputMapping]\n _output_mappings: Sequence[OutputMapping]\n _config_mapping: Optional[ConfigMapping]\n _nodes_in_topological_order: Sequence[Node]\n\n # (node name within the graph -> (input name -> SourceAsset to load that input from))\n # Does NOT include keys for:\n # - Inputs to the graph itself\n # - Inputs to nodes within sub-graphs of the graph\n _node_input_source_assets: Mapping[str, Mapping[str, "SourceAsset"]]\n\n def __init__(\n self,\n name: str,\n *,\n description: Optional[str] = None,\n node_defs: Optional[Sequence[NodeDefinition]] = None,\n dependencies: Optional[\n Union[DependencyMapping[str], DependencyMapping[NodeInvocation]]\n ] = None,\n input_mappings: Optional[Sequence[InputMapping]] = None,\n output_mappings: Optional[Sequence[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Mapping[str, str]] = None,\n node_input_source_assets: Optional[Mapping[str, Mapping[str, "SourceAsset"]]] = None,\n **kwargs: Any,\n ):\n self._node_defs = _check_node_defs_arg(name, node_defs)\n\n # `dependencies` will be converted to `dependency_structure` and `node_dict`, which may\n # alternatively be passed directly (useful when copying)\n self._dependencies = normalize_dependency_dict(dependencies)\n self._dependency_structure, self._node_dict = create_execution_structure(\n self._node_defs, self._dependencies, graph_definition=self\n )\n\n # Sequence[InputMapping]\n self._input_mappings = 
check.opt_sequence_param(input_mappings, "input_mappings")\n input_defs = _validate_in_mappings(\n self._input_mappings,\n self._node_dict,\n self._dependency_structure,\n name,\n class_name=type(self).__name__,\n )\n\n # Sequence[OutputMapping]\n self._output_mappings, output_defs = _validate_out_mappings(\n check.opt_sequence_param(output_mappings, "output_mappings"),\n self._node_dict,\n name,\n class_name=type(self).__name__,\n )\n\n self._config_mapping = check.opt_inst_param(config, "config", ConfigMapping)\n\n super(GraphDefinition, self).__init__(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n tags=tags,\n **kwargs,\n )\n\n # must happen after base class construction as properties are assumed to be there\n # eager computation to detect cycles\n self._nodes_in_topological_order = self._get_nodes_in_topological_order()\n self._dagster_type_dict = construct_dagster_type_dictionary([self])\n self._node_input_source_assets = check.opt_mapping_param(\n node_input_source_assets, "node_input_source_assets", key_type=str, value_type=dict\n )\n\n def _get_nodes_in_topological_order(self) -> Sequence[Node]:\n _forward_edges, backward_edges = create_adjacency_lists(\n self.nodes, self.dependency_structure\n )\n\n try:\n order = toposort_flatten(backward_edges)\n except CircularDependencyError as err:\n raise DagsterInvalidDefinitionError(str(err)) from err\n\n return [self.node_named(node_name) for node_name in order]\n\n def get_inputs_must_be_resolved_top_level(\n self, asset_layer: "AssetLayer", handle: Optional[NodeHandle] = None\n ) -> Sequence[InputDefinition]:\n unresolveable_input_defs: List[InputDefinition] = []\n for node in self.node_dict.values():\n cur_handle = NodeHandle(node.name, handle)\n for input_def in node.definition.get_inputs_must_be_resolved_top_level(\n asset_layer, cur_handle\n ):\n if self.dependency_structure.has_deps(NodeInput(node, input_def)):\n continue\n elif not 
node.container_maps_input(input_def.name):\n raise DagsterInvalidDefinitionError(\n f"Input '{input_def.name}' of {node.describe_node()} "\n "has no way of being resolved. Must provide a resolution to this "\n "input via another op/graph, or via a direct input value mapped from the "\n "top-level graph. To "\n "learn more, see the docs for unconnected inputs: "\n "https://docs.dagster.io/concepts/io-management/unconnected-inputs#unconnected-inputs."\n )\n else:\n mapped_input = node.container_mapped_input(input_def.name)\n unresolveable_input_defs.append(mapped_input.get_definition())\n return unresolveable_input_defs\n\n @property\n def node_type_str(self) -> str:\n return "graph"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @property\n def nodes(self) -> Sequence[Node]:\n return list(set(self._node_dict.values()))\n\n @property\n def node_dict(self) -> Mapping[str, Node]:\n return self._node_dict\n\n @property\n def node_defs(self) -> Sequence[NodeDefinition]:\n return self._node_defs\n\n @property\n def nodes_in_topological_order(self) -> Sequence[Node]:\n return self._nodes_in_topological_order\n\n @property\n def node_input_source_assets(self) -> Mapping[str, Mapping[str, "SourceAsset"]]:\n return self._node_input_source_assets\n\n def has_node_named(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._node_dict\n\n def node_named(self, name: str) -> Node:\n check.str_param(name, "name")\n if name not in self._node_dict:\n raise DagsterInvariantViolationError(f"{self._name} has no op named {name}.")\n\n return self._node_dict[name]\n\n def get_node(self, handle: NodeHandle) -> Node:\n check.inst_param(handle, "handle", NodeHandle)\n current = handle\n lineage: List[str] = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n name = lineage.pop()\n node = self.node_named(name)\n while lineage:\n name = lineage.pop()\n # We know that this is a current node is a graph while 
ascending lineage\n definition = cast(GraphDefinition, node.definition)\n node = definition.node_named(name)\n\n return node\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_node_defs()\n\n def iterate_op_defs(self) -> Iterator["OpDefinition"]:\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_op_defs()\n\n def iterate_node_handles(\n self, parent_node_handle: Optional[NodeHandle] = None\n ) -> Iterator[NodeHandle]:\n for node in self.node_dict.values():\n cur_node_handle = NodeHandle(node.name, parent_node_handle)\n if isinstance(node, GraphNode):\n yield from node.definition.iterate_node_handles(cur_node_handle)\n yield cur_node_handle\n\n @public\n @property\n def input_mappings(self) -> Sequence[InputMapping]:\n """Input mappings for the graph.\n\n An input mapping is a mapping from an input of the graph to an input of a child node.\n """\n return self._input_mappings\n\n @public\n @property\n def output_mappings(self) -> Sequence[OutputMapping]:\n """Output mappings for the graph.\n\n An output mapping is a mapping from an output of the graph to an output of a child node.\n """\n return self._output_mappings\n\n @public\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n """The config mapping for the graph, if present.\n\n By specifying a config mapping function, you can override the configuration for the child nodes contained within a graph.\n """\n return self._config_mapping\n\n @property\n def has_config_mapping(self) -> bool:\n return self._config_mapping is not None\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._dagster_type_dict.values()\n\n def has_dagster_type(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._dagster_type_dict\n\n def dagster_type_named(self, name: str) -> DagsterType:\n check.str_param(name, "name")\n return 
self._dagster_type_dict[name]\n\n def get_input_mapping(self, input_name: str) -> InputMapping:\n check.str_param(input_name, "input_name")\n for mapping in self._input_mappings:\n if mapping.graph_input_name == input_name:\n return mapping\n check.failed(f"Could not find input mapping {input_name}")\n\n def input_mapping_for_pointer(\n self, pointer: Union[InputPointer, FanInInputPointer]\n ) -> Optional[InputMapping]:\n check.inst_param(pointer, "pointer", (InputPointer, FanInInputPointer))\n\n for mapping in self._input_mappings:\n if mapping.maps_to == pointer:\n return mapping\n return None\n\n def get_output_mapping(self, output_name: str) -> OutputMapping:\n check.str_param(output_name, "output_name")\n for mapping in self._output_mappings:\n if mapping.graph_output_name == output_name:\n return mapping\n check.failed(f"Could not find output mapping {output_name}")\n\n T_Handle = TypeVar("T_Handle", bound=Optional[NodeHandle])\n\n def resolve_output_to_origin(\n self, output_name: str, handle: Optional[NodeHandle]\n ) -> Tuple[OutputDefinition, Optional[NodeHandle]]:\n check.str_param(output_name, "output_name")\n check.opt_inst_param(handle, "handle", NodeHandle)\n\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n mapped_node = self.node_named(mapping.maps_from.node_name)\n return mapped_node.definition.resolve_output_to_origin(\n mapping.maps_from.output_name,\n NodeHandle(mapped_node.name, handle),\n )\n\n def resolve_output_to_origin_op_def(self, output_name: str) -> "OpDefinition":\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n return self.node_named(\n mapping.maps_from.node_name\n ).definition.resolve_output_to_origin_op_def(output_name)\n\n def default_value_for_input(self, input_name: str) -> object:\n check.str_param(input_name, "input_name")\n\n # base case\n if 
self.input_def_named(input_name).has_default_value:\n return self.input_def_named(input_name).default_value\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_node = self.node_named(mapping.maps_to.node_name)\n\n return mapped_node.definition.default_value_for_input(mapping.maps_to.input_name)\n\n def input_has_default(self, input_name: str) -> bool:\n check.str_param(input_name, "input_name")\n\n # base case\n if self.input_def_named(input_name).has_default_value:\n return True\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_node = self.node_named(mapping.maps_to.node_name)\n\n return mapped_node.definition.input_has_default(mapping.maps_to.input_name)\n\n @property\n def dependencies(self) -> DependencyMapping[NodeInvocation]:\n return self._dependencies\n\n @property\n def dependency_structure(self) -> DependencyStructure:\n return self._dependency_structure\n\n @property\n def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self.config_mapping.config_schema if self.config_mapping is not None else None\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n mapping = self.get_input_mapping(input_name)\n target_node = mapping.maps_to.node_name\n # check if input mapped to node which is downstream of another dynamic output within\n if self.dependency_structure.is_dynamic_mapped(target_node):\n return False\n\n # check if input mapped to node which starts new dynamic downstream\n if self.dependency_structure.has_dynamic_downstreams(target_node):\n return False\n\n return self.node_named(target_node).definition.input_supports_dynamic_output_dep(\n mapping.maps_to.input_name\n )\n\n def copy(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n input_mappings: Optional[Sequence[InputMapping]] = None,\n output_mappings: 
Optional[Sequence[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Mapping[str, str]] = None,\n node_input_source_assets: Optional[Mapping[str, Mapping[str, "SourceAsset"]]] = None,\n ) -> Self:\n return GraphDefinition(\n node_defs=self.node_defs,\n dependencies=self.dependencies,\n name=name or self.name,\n description=description or self.description,\n input_mappings=input_mappings or self._input_mappings,\n output_mappings=output_mappings or self._output_mappings,\n config=config or self.config_mapping,\n tags=tags or self.tags,\n node_input_source_assets=node_input_source_assets or self.node_input_source_assets,\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: Any,\n ) -> "GraphDefinition":\n if not self.has_config_mapping:\n raise DagsterInvalidDefinitionError(\n "Only graphs utilizing config mapping can be pre-configured. The graph "\n f'"{self.name}" does not have a config mapping, and thus has nothing to be '\n "configured."\n )\n config_mapping = cast(ConfigMapping, self.config_mapping)\n return self.copy(\n name=name,\n description=check.opt_str_param(description, "description", default=self.description),\n config=ConfigMapping(\n config_mapping.config_fn,\n config_schema=config_schema,\n receive_processed_config_values=config_mapping.receive_processed_config_values,\n ),\n )\n\n def node_names(self) -> Sequence[str]:\n return list(self._node_dict.keys())\n\n
[docs] @public\n def to_job(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config: Optional[\n Union["RunConfig", ConfigMapping, Mapping[str, object], "PartitionedConfig"]\n ] = None,\n tags: Optional[Mapping[str, str]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n op_selection: Optional[Sequence[str]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n _asset_selection_data: Optional[AssetSelectionData] = None,\n ) -> "JobDefinition":\n """Make this graph in to an executable Job by providing remaining components required for execution.\n\n Args:\n name (Optional[str]):\n The name for the Job. 
Defaults to the name of the this graph.\n resource_defs (Optional[Mapping [str, object]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagster UI, so be careful with secrets.\n tags (Optional[Mapping[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. 
These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Mapping[str, RawMetadataValue]]):\n Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\n Keys must be strings, and values must be python primitive types or one of the provided\n MetadataValue types\n logger_defs (Optional[Mapping[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoizaton will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition\n keys that can parameterize the job. If this argument is supplied, the config\n argument can't also be supplied.\n asset_layer (Optional[AssetLayer]): Top level information about the assets this job\n will produce. 
Generally should not be set manually.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Returns:\n JobDefinition\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n from .job_definition import JobDefinition\n\n wrapped_resource_defs = wrap_resources_for_execution(resource_defs)\n\n return JobDefinition.dagster_internal_init(\n name=name,\n description=description or self.description,\n graph_def=self,\n resource_defs=wrapped_resource_defs,\n logger_defs=logger_defs,\n executor_def=executor_def,\n config=config,\n partitions_def=partitions_def,\n tags=tags,\n metadata=metadata,\n hook_defs=hooks,\n version_strategy=version_strategy,\n op_retry_policy=op_retry_policy,\n asset_layer=asset_layer,\n input_values=input_values,\n _subset_selection_data=_asset_selection_data,\n _was_explicitly_provided_resources=None, # None means this is determined by whether resource_defs contains any explicitly provided resources\n ).get_subset(op_selection=op_selection)
\n\n def coerce_to_job(self) -> "JobDefinition":\n # attempt to coerce a Graph in to a Job, raising a useful error if it doesn't work\n try:\n return self.to_job()\n except DagsterInvalidDefinitionError as err:\n raise DagsterInvalidDefinitionError(\n f"Failed attempting to coerce Graph {self.name} in to a Job. "\n "Use to_job instead, passing the required information."\n ) from err\n\n
[docs] @public\n def execute_in_process(\n self,\n run_config: Any = None,\n instance: Optional["DagsterInstance"] = None,\n resources: Optional[Mapping[str, object]] = None,\n raise_on_error: bool = True,\n op_selection: Optional[Sequence[str]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """Execute this graph in-process, collecting results in-memory.\n\n Args:\n run_config (Optional[Mapping[str, Any]]):\n Run config to provide to execution. The configuration for the underlying graph\n should exist under the "ops" key.\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n resources (Optional[Mapping[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single op\n names) to execute. 
For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the graph.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n from dagster._core.instance import DagsterInstance\n\n from .executor_definition import execute_in_process_executor\n from .job_definition import JobDefinition\n\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n resource_defs = wrap_resources_for_execution(resources)\n\n ephemeral_job = JobDefinition(\n name=self._name,\n graph_def=self,\n executor_def=execute_in_process_executor,\n resource_defs=resource_defs,\n input_values=input_values,\n ).get_subset(op_selection=op_selection)\n\n run_config = run_config if run_config is not None else {}\n op_selection = check.opt_sequence_param(op_selection, "op_selection", str)\n\n return ephemeral_job.execute_in_process(\n run_config=run_config,\n instance=instance,\n raise_on_error=raise_on_error,\n run_id=run_id,\n )
\n\n @property\n def parent_graph_def(self) -> Optional["GraphDefinition"]:\n return None\n\n @property\n def is_subselected(self) -> bool:\n return False\n\n def get_resource_requirements(\n self, asset_layer: Optional["AssetLayer"] = None\n ) -> Iterator[ResourceRequirement]:\n for node in self.node_dict.values():\n yield from node.get_resource_requirements(outer_container=self, asset_layer=asset_layer)\n\n for dagster_type in self.all_dagster_types():\n yield from dagster_type.get_resource_requirements()\n\n @public\n @property\n def name(self) -> str:\n """The name of the graph."""\n return super(GraphDefinition, self).name\n\n @public\n @property\n def tags(self) -> Mapping[str, str]:\n """The tags associated with the graph."""\n return super(GraphDefinition, self).tags\n\n
[docs] @public\n def alias(self, name: str) -> "PendingNodeInvocation":\n """Aliases the graph with a new name.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.alias("my_graph_alias")\n """\n return super(GraphDefinition, self).alias(name)
\n\n
[docs] @public\n def tag(self, tags: Optional[Mapping[str, str]]) -> "PendingNodeInvocation":\n """Attaches the provided tags to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.tag({"my_tag": "my_value"})\n """\n return super(GraphDefinition, self).tag(tags)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PendingNodeInvocation":\n """Attaches the provided hooks to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.with_hooks({my_hook})\n """\n return super(GraphDefinition, self).with_hooks(hook_defs)
\n\n
[docs] @public\n def with_retry_policy(self, retry_policy: RetryPolicy) -> "PendingNodeInvocation":\n """Attaches the provided retry policy to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.with_retry_policy(RetryPolicy(max_retries=5))\n """\n return super(GraphDefinition, self).with_retry_policy(retry_policy)
\n\n def resolve_input_to_destinations(\n self, input_handle: NodeInputHandle\n ) -> Sequence[NodeInputHandle]:\n all_destinations: List[NodeInputHandle] = []\n for mapping in self.input_mappings:\n if mapping.graph_input_name != input_handle.input_name:\n continue\n # recurse into graph structure\n all_destinations += self.node_named(\n mapping.maps_to.node_name\n ).definition.resolve_input_to_destinations(\n NodeInputHandle(\n NodeHandle(mapping.maps_to.node_name, parent=input_handle.node_handle),\n mapping.maps_to.input_name,\n ),\n )\n\n return all_destinations
\n\n\nclass SubselectedGraphDefinition(GraphDefinition):\n """Defines a subselected graph.\n\n Args:\n parent_graph_def (GraphDefinition): The parent graph that this current graph is subselected\n from. This is used for tracking where the subselected graph originally comes from.\n Note that we allow subselecting a subselected graph, and this field refers to the direct\n parent graph of the current subselection, rather than the original root graph.\n node_defs (Optional[Sequence[NodeDefinition]]): A list of all top level nodes in the graph. A\n node can be an op or a graph that contains other nodes.\n dependencies (Optional[Mapping[Union[str, NodeInvocation], Mapping[str, IDependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the subselected graph. Keys of the top level dict are either the string names of\n ops in the graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[Sequence[InputMapping]]): Define the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[Sequence[OutputMapping]]): Define the outputs of the nested graph, and\n how they map from the outputs of its constituent ops.\n """\n\n def __init__(\n self,\n parent_graph_def: GraphDefinition,\n node_defs: Optional[Sequence[NodeDefinition]],\n dependencies: Optional[\n Union[\n DependencyMapping[str],\n DependencyMapping[NodeInvocation],\n ]\n ],\n input_mappings: Optional[Sequence[InputMapping]],\n output_mappings: Optional[Sequence[OutputMapping]],\n ):\n self._parent_graph_def = check.inst_param(\n parent_graph_def, "parent_graph_def", GraphDefinition\n )\n super(SubselectedGraphDefinition, self).__init__(\n name=parent_graph_def.name, # should we 
create special name for subselected graphs\n node_defs=node_defs,\n dependencies=dependencies,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config=parent_graph_def.config_mapping,\n tags=parent_graph_def.tags,\n )\n\n @property\n def parent_graph_def(self) -> GraphDefinition:\n return self._parent_graph_def\n\n def get_top_level_omitted_nodes(self) -> Sequence[Node]:\n return [node for node in self.parent_graph_def.nodes if not self.has_node_named(node.name)]\n\n @property\n def is_subselected(self) -> bool:\n return True\n\n\ndef _validate_in_mappings(\n input_mappings: Sequence[InputMapping],\n nodes_by_name: Mapping[str, Node],\n dependency_structure: DependencyStructure,\n name: str,\n class_name: str,\n) -> Sequence[InputDefinition]:\n from .composition import MappedInputPlaceholder\n\n input_defs_by_name: Dict[str, InputDefinition] = OrderedDict()\n mapping_keys: Set[str] = set()\n\n target_input_types_by_graph_input_name: Dict[str, Set[DagsterType]] = defaultdict(set)\n\n for mapping in input_mappings:\n # handle incorrect objects passed in as mappings\n if not isinstance(mapping, InputMapping):\n if isinstance(mapping, InputDefinition):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' you passed an InputDefinition "\n f"named '{mapping.name}' directly in to input_mappings. Return "\n "an InputMapping by calling mapping_to on the InputDefinition."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' received unexpected type '{type(mapping)}' in"\n " input_mappings. 
Provide an InputMapping using InputMapping(...)"\n )\n\n input_defs_by_name[mapping.graph_input_name] = mapping.get_definition()\n\n target_node = nodes_by_name.get(mapping.maps_to.node_name)\n if target_node is None:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping references node "\n f"'{mapping.maps_to.node_name}' which it does not contain."\n )\n if not target_node.has_input(mapping.maps_to.input_name):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping to node '{mapping.maps_to.node_name}' "\n f"which contains no input named '{mapping.maps_to.input_name}'"\n )\n\n target_input_def = target_node.input_def_named(mapping.maps_to.input_name)\n node_input = NodeInput(target_node, target_input_def)\n\n if mapping.maps_to_fan_in:\n maps_to = cast(FanInInputPointer, mapping.maps_to)\n if not dependency_structure.has_fan_in_deps(node_input):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target"\n f' "{maps_to.node_name}.{maps_to.input_name}" (index'\n f" {maps_to.fan_in_index} of fan-in) is not a MultiDependencyDefinition."\n )\n inner_deps = dependency_structure.get_fan_in_deps(node_input)\n if (maps_to.fan_in_index >= len(inner_deps)) or (\n inner_deps[maps_to.fan_in_index] is not MappedInputPlaceholder\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n f'"{maps_to.node_name}.{maps_to.input_name}" index {maps_to.fan_in_index} in '\n "the MultiDependencyDefinition is not a MappedInputPlaceholder"\n )\n mapping_keys.add(f"{maps_to.node_name}.{maps_to.input_name}.{maps_to.fan_in_index}")\n target_input_types_by_graph_input_name[mapping.graph_input_name].add(\n target_input_def.dagster_type.get_inner_type_for_fan_in()\n )\n else:\n if dependency_structure.has_deps(node_input):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n 
f'"{mapping.maps_to.node_name}.{mapping.maps_to.input_name}" '\n "is already satisfied by output"\n )\n\n mapping_keys.add(f"{mapping.maps_to.node_name}.{mapping.maps_to.input_name}")\n target_input_types_by_graph_input_name[mapping.graph_input_name].add(\n target_input_def.dagster_type\n )\n\n for node_input in dependency_structure.inputs():\n if dependency_structure.has_fan_in_deps(node_input):\n for idx, dep in enumerate(dependency_structure.get_fan_in_deps(node_input)):\n if dep is MappedInputPlaceholder:\n mapping_str = f"{node_input.node_name}.{node_input.input_name}.{idx}"\n if mapping_str not in mapping_keys:\n raise DagsterInvalidDefinitionError(\n f"Unsatisfied MappedInputPlaceholder at index {idx} in"\n " MultiDependencyDefinition for"\n f" '{node_input.node_name}.{node_input.input_name}'"\n )\n\n # if the dagster type on a graph input is Any and all its target inputs have the\n # same dagster type, then use that dagster type for the graph input\n for graph_input_name, graph_input_def in input_defs_by_name.items():\n if graph_input_def.dagster_type.kind == DagsterTypeKind.ANY:\n target_input_types = target_input_types_by_graph_input_name[graph_input_name]\n if len(target_input_types) == 1:\n input_defs_by_name[graph_input_name] = graph_input_def.with_dagster_type(\n next(iter(target_input_types))\n )\n\n return list(input_defs_by_name.values())\n\n\ndef _validate_out_mappings(\n output_mappings: Sequence[OutputMapping],\n node_dict: Mapping[str, Node],\n name: str,\n class_name: str,\n) -> Tuple[Sequence[OutputMapping], Sequence[OutputDefinition]]:\n output_defs: List[OutputDefinition] = []\n for mapping in output_mappings:\n if isinstance(mapping, OutputMapping):\n target_node = node_dict.get(mapping.maps_from.node_name)\n if target_node is None:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' output mapping references node "\n f"'{mapping.maps_from.node_name}' which it does not contain."\n )\n if not 
target_node.has_output(mapping.maps_from.output_name):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} {name} output mapping from {target_node.describe_node()} "\n f"which contains no output named '{mapping.maps_from.output_name}'"\n )\n\n target_output = target_node.output_def_named(mapping.maps_from.output_name)\n output_def = mapping.get_definition(is_dynamic=target_output.is_dynamic)\n output_defs.append(output_def)\n\n if (\n mapping.dagster_type\n and mapping.dagster_type.kind != DagsterTypeKind.ANY\n and (target_output.dagster_type != mapping.dagster_type)\n and class_name != "GraphDefinition"\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' output '{mapping.graph_output_name}' of type"\n f" {mapping.dagster_type.display_name} maps from"\n f" {mapping.maps_from.node_name}.{mapping.maps_from.output_name} of different"\n f" type {target_output.dagster_type.display_name}. OutputMapping source and"\n " destination must have the same type."\n )\n\n elif isinstance(mapping, OutputDefinition):\n raise DagsterInvalidDefinitionError(\n f"You passed an OutputDefinition named '{mapping.name}' directly "\n "in to output_mappings. Return an OutputMapping by calling "\n "mapping_from on the OutputDefinition."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"Received unexpected type '{type(mapping)}' in output_mappings. "\n "Provide an OutputMapping using OutputDefinition(...).mapping_from(...)"\n )\n return output_mappings, output_defs\n
", "current_page_name": "_modules/dagster/_core/definitions/graph_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.graph_definition"}, "hook_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.hook_definition

\nfrom typing import AbstractSet, Any, Callable, Iterator, NamedTuple, Optional, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\n\nfrom ..decorator_utils import get_function_params\nfrom ..errors import DagsterInvalidInvocationError\nfrom .resource_requirement import HookResourceRequirement, RequiresResources, ResourceRequirement\nfrom .utils import check_valid_name\n\n\n
[docs]class HookDefinition(\n NamedTuple(\n "_HookDefinition",\n [\n ("name", PublicAttr[str]),\n ("hook_fn", PublicAttr[Callable]),\n ("required_resource_keys", PublicAttr[AbstractSet[str]]),\n ("decorated_fn", PublicAttr[Optional[Callable]]),\n ],\n ),\n RequiresResources,\n):\n """Define a hook which can be triggered during a op execution (e.g. a callback on the step\n execution failure event during a op execution).\n\n Args:\n name (str): The name of this hook.\n hook_fn (Callable): The callback function that will be triggered.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n """\n\n def __new__(\n cls,\n *,\n name: str,\n hook_fn: Callable[..., Any],\n required_resource_keys: Optional[AbstractSet[str]] = None,\n decorated_fn: Optional[Callable[..., Any]] = None,\n ):\n return super(HookDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n hook_fn=check.callable_param(hook_fn, "hook_fn"),\n required_resource_keys=frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n ),\n decorated_fn=check.opt_callable_param(decorated_fn, "decorated_fn"),\n )\n\n def __call__(self, *args, **kwargs):\n """This is invoked when the hook is used as a decorator.\n\n We currently support hooks to decorate the following:\n\n - JobDefinition: when the hook decorates a job definition, it will be added to\n all the op invocations within the job.\n\n Example:\n .. 
code-block:: python\n\n @success_hook\n def slack_message_on_success(_):\n ...\n\n @slack_message_on_success\n @job\n def a_job():\n foo(bar())\n\n """\n from ..execution.context.hook import HookContext\n from .graph_definition import GraphDefinition\n from .hook_invocation import hook_invocation_result\n from .job_definition import JobDefinition\n\n if len(args) > 0 and isinstance(args[0], (JobDefinition, GraphDefinition)):\n # when it decorates a job, we apply this hook to all the op invocations within\n # the job.\n return args[0].with_hooks({self})\n else:\n if not self.decorated_fn:\n raise DagsterInvalidInvocationError(\n "Only hook definitions created using one of the hook decorators can be invoked."\n )\n fxn_args = get_function_params(self.decorated_fn)\n # If decorated fxn has two arguments, then this is an event list hook fxn, and parameter\n # names are always context and event_list\n if len(fxn_args) == 2:\n context_arg_name = fxn_args[0].name\n event_list_arg_name = fxn_args[1].name\n if len(args) + len(kwargs) != 2:\n raise DagsterInvalidInvocationError(\n "Decorated function expects two parameters, context and event_list, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], "context", HookContext)\n event_list = check.opt_list_param(\n args[1] if len(args) > 1 else kwargs[event_list_arg_name],\n event_list_arg_name,\n )\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n if event_list_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{event_list_arg_name}'. 
Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n event_list = check.opt_list_param(\n kwargs[event_list_arg_name], event_list_arg_name\n )\n return hook_invocation_result(self, context, event_list)\n else:\n context_arg_name = fxn_args[0].name\n if len(args) + len(kwargs) != 1:\n raise DagsterInvalidInvocationError(\n f"Decorated function expects one parameter, {context_arg_name}, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], context_arg_name, HookContext)\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n return hook_invocation_result(self, context)\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n # outer_context in this case is a string of (job, job name) or (node, node name)\n attached_to = cast(Optional[str], outer_context)\n for resource_key in sorted(list(self.required_resource_keys)):\n yield HookResourceRequirement(\n key=resource_key, attached_to=attached_to, hook_name=self.name\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/hook_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.hook_definition"}, "input": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.input

\nimport inspect\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Type,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated_param, experimental_param\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import (  # BuiltinScalarDagsterType,\n    DagsterType,\n    resolve_dagster_type,\n)\n\nfrom .inference import InferredInputProps\nfrom .utils import NoValueSentinel, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.input import InputContext\n\nT = TypeVar("T")\n\n\n# unfortunately since type_check functions need TypeCheckContext which is only available\n# at runtime, we can only check basic types before runtime\ndef _check_default_value(input_name: str, dagster_type: DagsterType, default_value: T) -> T:\n    from dagster._core.types.dagster_type import BuiltinScalarDagsterType\n\n    if default_value is not NoValueSentinel:\n        if dagster_type.is_nothing:\n            raise DagsterInvalidDefinitionError(\n                "Setting a default_value is invalid on InputDefinitions of type Nothing"\n            )\n\n        if isinstance(dagster_type, BuiltinScalarDagsterType):\n            type_check = dagster_type.type_check_scalar_value(default_value)\n            if not type_check.success:\n                raise DagsterInvalidDefinitionError(\n                    "Type check failed for the default_value of InputDefinition "\n                    f"{input_name} of type {dagster_type.display_name}. 
"\n                    f"Received value {default_value} of type {type(default_value)}",\n                )\n\n    return default_value\n\n\n@experimental_param(param="asset_key")\n@experimental_param(param="asset_partitions")\nclass InputDefinition:\n    """Defines an argument to an op's compute function.\n\n    Inputs may flow from previous op outputs, or be stubbed using config. They may optionally\n    be typed using the Dagster type system.\n\n    Args:\n        name (str): Name of the input.\n        dagster_type (Optional[Union[Type, DagsterType]]]): The type of this input.\n            Users should provide the Python type of the objects that they expect to be passed for\n            this input, or a :py:class:`DagsterType` that defines a runtime check that they want\n            to be run on this input. Defaults to :py:class:`Any`.\n        description (Optional[str]): Human-readable description of the input.\n        default_value (Optional[Any]): The default value to use if no input is provided.\n        metadata (Optional[Dict[str, Any]]): A dict of metadata for the input.\n        asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n            (or function that produces an AssetKey from the InputContext) which should be associated\n            with this InputDefinition. 
Used for tracking lineage information through Dagster.\n        asset_partitions (Optional[Union[AbstractSet[str], InputContext -> AbstractSet[str]]]): (Experimental) A\n            set of partitions of the given asset_key (or a function that produces this list of\n            partitions from the InputContext) which should be associated with this InputDefinition.\n        input_manager_key (Optional[str]): (Experimental) The resource key for the\n            :py:class:`InputManager` used for loading this input when it is not connected to an\n            upstream output.\n    """\n\n    _name: str\n    _type_not_set: bool\n    _dagster_type: DagsterType\n    _description: Optional[str]\n    _default_value: Any\n    _input_manager_key: Optional[str]\n    _raw_metadata: ArbitraryMetadataMapping\n    _metadata: Mapping[str, MetadataValue]\n    _asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]\n    _asset_partitions_fn: Optional[Callable[["InputContext"], Set[str]]]\n\n    def __init__(\n        self,\n        name: str,\n        dagster_type: object = None,\n        description: Optional[str] = None,\n        default_value: object = NoValueSentinel,\n        metadata: Optional[ArbitraryMetadataMapping] = None,\n        asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n        asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n        input_manager_key: Optional[str] = None,\n        # when adding new params, make sure to update combine_with_inferred and with_dagster_type below\n    ):\n        self._name = check_valid_name(name, allow_list=["config"])\n\n        self._type_not_set = dagster_type is None\n        self._dagster_type = check.inst(resolve_dagster_type(dagster_type), DagsterType)\n\n        self._description = check.opt_str_param(description, "description")\n\n        self._default_value = _check_default_value(self._name, self._dagster_type, 
default_value)\n\n        self._input_manager_key = check.opt_str_param(input_manager_key, "input_manager_key")\n\n        self._raw_metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n        self._metadata = normalize_metadata(self._raw_metadata, allow_invalid=True)\n\n        if not callable(asset_key):\n            check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n        self._asset_key = asset_key\n\n        if asset_partitions:\n            check.param_invariant(\n                asset_key is not None,\n                "asset_partitions",\n                'Cannot specify "asset_partitions" argument without also specifying "asset_key"',\n            )\n        if callable(asset_partitions):\n            self._asset_partitions_fn = asset_partitions\n        elif asset_partitions is not None:\n            _asset_partitions = check.set_param(asset_partitions, "asset_partitions", of_type=str)\n            self._asset_partitions_fn = lambda _: _asset_partitions\n        else:\n            self._asset_partitions_fn = None\n\n    @property\n    def name(self) -> str:\n        return self._name\n\n    @property\n    def dagster_type(self) -> DagsterType:\n        return self._dagster_type\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @property\n    def has_default_value(self) -> bool:\n        return self._default_value is not NoValueSentinel\n\n    @property\n    def default_value(self) -> Any:\n        check.invariant(self.has_default_value, "Can only fetch default_value if has_default_value")\n        return self._default_value\n\n    @property\n    def input_manager_key(self) -> Optional[str]:\n        return self._input_manager_key\n\n    @property\n    def metadata(self) -> ArbitraryMetadataMapping:\n        return self._raw_metadata\n\n    @property\n    def is_asset(self) -> bool:\n        return self._asset_key is not None\n\n    @property\n    def hardcoded_asset_key(self) 
-> Optional[AssetKey]:\n        if not callable(self._asset_key):\n            return self._asset_key\n        else:\n            return None\n\n    def get_asset_key(self, context: "InputContext") -> Optional[AssetKey]:\n        """Get the AssetKey associated with this InputDefinition for the given\n        :py:class:`InputContext` (if any).\n\n        Args:\n            context (InputContext): The InputContext that this InputDefinition is being evaluated\n                in\n        """\n        if callable(self._asset_key):\n            return self._asset_key(context)\n        else:\n            return self.hardcoded_asset_key\n\n    def get_asset_partitions(self, context: "InputContext") -> Optional[Set[str]]:\n        """Get the set of partitions that this op will read from this InputDefinition for the given\n        :py:class:`InputContext` (if any).\n\n        Args:\n            context (InputContext): The InputContext that this InputDefinition is being evaluated\n                in\n        """\n        if self._asset_partitions_fn is None:\n            return None\n\n        return self._asset_partitions_fn(context)\n\n    def mapping_to(\n        self, node_name: str, input_name: str, fan_in_index: Optional[int] = None\n    ) -> "InputMapping":\n        """Create an input mapping to an input of a child node.\n\n        In a GraphDefinition, you can use this helper function to construct\n        an :py:class:`InputMapping` to the input of a child node.\n\n        Args:\n            node_name (str): The name of the child node to which to map this input.\n            input_name (str): The name of the child node' input to which to map this input.\n            fan_in_index (Optional[int]): The index in to a fanned in input, else None\n\n        Examples:\n            .. 
code-block:: python\n\n                input_mapping = InputDefinition('composite_input', Int).mapping_to(\n                    'child_node', 'int_input'\n                )\n        """\n        check.str_param(node_name, "node_name")\n        check.str_param(input_name, "input_name")\n        check.opt_int_param(fan_in_index, "fan_in_index")\n\n        return InputMapping(\n            graph_input_name=self.name,\n            mapped_node_name=node_name,\n            mapped_node_input_name=input_name,\n            fan_in_index=fan_in_index,\n            graph_input_description=self.description,\n            dagster_type=self.dagster_type,\n        )\n\n    @staticmethod\n    def create_from_inferred(inferred: InferredInputProps) -> "InputDefinition":\n        return InputDefinition(\n            name=inferred.name,\n            dagster_type=_checked_inferred_type(inferred),\n            description=inferred.description,\n            default_value=inferred.default_value,\n        )\n\n    def combine_with_inferred(self, inferred: InferredInputProps) -> "InputDefinition":\n        """Return a new InputDefinition that merges this ones properties with those inferred from type signature.\n        This can update: dagster_type, description, and default_value if they are not set.\n        """\n        check.invariant(\n            self.name == inferred.name,\n            f"InferredInputProps name {inferred.name} did not align with InputDefinition name"\n            f" {self.name}",\n        )\n\n        dagster_type = self._dagster_type\n        if self._type_not_set:\n            dagster_type = _checked_inferred_type(inferred)\n\n        description = self._description\n        if description is None and inferred.description is not None:\n            description = inferred.description\n\n        default_value = self._default_value\n        if not self.has_default_value:\n            default_value = inferred.default_value\n\n        return InputDefinition(\n            
name=self.name,\n            dagster_type=dagster_type,\n            description=description,\n            default_value=default_value,\n            metadata=self.metadata,\n            asset_key=self._asset_key,\n            asset_partitions=self._asset_partitions_fn,\n            input_manager_key=self._input_manager_key,\n        )\n\n    def with_dagster_type(self, dagster_type: DagsterType) -> "InputDefinition":\n        return InputDefinition(\n            name=self.name,\n            dagster_type=dagster_type,\n            description=self.description,\n            default_value=self.default_value if self.has_default_value else NoValueSentinel,\n            metadata=self.metadata,\n            asset_key=self._asset_key,\n            asset_partitions=self._asset_partitions_fn,\n            input_manager_key=self._input_manager_key,\n        )\n\n\ndef _checked_inferred_type(inferred: InferredInputProps) -> DagsterType:\n    try:\n        if inferred.annotation == inspect.Parameter.empty:\n            resolved_type = resolve_dagster_type(None)\n        elif inferred.annotation is None:\n            # When inferred.annotation is None, it means someone explicitly put "None" as the\n            # annotation, so want to map it to a DagsterType that checks for the None type\n            resolved_type = resolve_dagster_type(type(None))\n        else:\n            resolved_type = resolve_dagster_type(inferred.annotation)\n\n    except DagsterError as e:\n        raise DagsterInvalidDefinitionError(\n            f"Problem using type '{inferred.annotation}' from type annotation for argument "\n            f"'{inferred.name}', correct the issue or explicitly set the dagster_type "\n            "via In()."\n        ) from e\n\n    return resolved_type\n\n\nclass InputPointer(NamedTuple("_InputPointer", [("node_name", str), ("input_name", str)])):\n    def __new__(cls, node_name: str, input_name: str):\n        return super(InputPointer, cls).__new__(\n            cls,\n  
          check.str_param(node_name, "node_name"),\n            check.str_param(input_name, "input_name"),\n        )\n\n\nclass FanInInputPointer(\n    NamedTuple(\n        "_FanInInputPointer", [("node_name", str), ("input_name", str), ("fan_in_index", int)]\n    )\n):\n    def __new__(cls, node_name: str, input_name: str, fan_in_index: int):\n        return super(FanInInputPointer, cls).__new__(\n            cls,\n            check.str_param(node_name, "node_name"),\n            check.str_param(input_name, "input_name"),\n            check.int_param(fan_in_index, "fan_in_index"),\n        )\n\n\n
[docs]@deprecated_param(\n param="dagster_type",\n breaking_version="2.0",\n additional_warn_text="Any defined `dagster_type` should come from the upstream op `Output`.",\n # Disabling warning here since we're passing this internally and I'm not sure whether it is\n # actually used or discarded.\n emit_runtime_warning=False,\n)\nclass InputMapping(NamedTuple):\n """Defines an input mapping for a graph.\n\n Args:\n graph_input_name (str): Name of the input in the graph being mapped from.\n mapped_node_name (str): Named of the node (op/graph) that the input is being mapped to.\n mapped_node_input_name (str): Name of the input in the node (op/graph) that is being mapped to.\n fan_in_index (Optional[int]): The index in to a fanned input, otherwise None.\n graph_input_description (Optional[str]): A description of the input in the graph being mapped from.\n dagster_type (Optional[DagsterType]): The dagster type of the graph's input\n being mapped from.\n\n Examples:\n .. code-block:: python\n\n from dagster import InputMapping, GraphDefinition, op, graph\n\n @op\n def needs_input(x):\n return x + 1\n\n # The following two graph definitions are equivalent\n GraphDefinition(\n name="the_graph",\n node_defs=[needs_input],\n input_mappings=[\n InputMapping(\n graph_input_name="maps_x", mapped_node_name="needs_input",\n mapped_node_input_name="x"\n )\n ]\n )\n\n @graph\n def the_graph(maps_x):\n needs_input(maps_x)\n """\n\n graph_input_name: str\n mapped_node_name: str\n mapped_node_input_name: str\n fan_in_index: Optional[int] = None\n graph_input_description: Optional[str] = None\n dagster_type: Optional[DagsterType] = None\n\n @property\n def maps_to(self) -> Union[InputPointer, FanInInputPointer]:\n if self.fan_in_index is not None:\n return FanInInputPointer(\n self.mapped_node_name, self.mapped_node_input_name, self.fan_in_index\n )\n return InputPointer(self.mapped_node_name, self.mapped_node_input_name)\n\n @property\n def maps_to_fan_in(self) -> bool:\n return 
isinstance(self.maps_to, FanInInputPointer)\n\n def describe(self) -> str:\n idx = self.maps_to.fan_in_index if isinstance(self.maps_to, FanInInputPointer) else ""\n return f"{self.graph_input_name} -> {self.maps_to.node_name}:{self.maps_to.input_name}{idx}"\n\n def get_definition(self) -> "InputDefinition":\n return InputDefinition(\n name=self.graph_input_name,\n description=self.graph_input_description,\n dagster_type=self.dagster_type,\n )
\n\n\n
[docs]class In(\n NamedTuple(\n "_In",\n [\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("description", PublicAttr[Optional[str]]),\n ("default_value", PublicAttr[Any]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n (\n "asset_key",\n PublicAttr[Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]],\n ),\n (\n "asset_partitions",\n PublicAttr[Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]]],\n ),\n ("input_manager_key", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines an argument to an op's compute function.\n\n Inputs may flow from previous op's outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this input. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n metadata (Optional[Dict[str, RawMetadataValue]]): A dict of metadata for the input.\n asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n (or function that produces an AssetKey from the InputContext) which should be associated\n with this In. 
Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the InputContext) which should be associated with this In.\n input_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`InputManager` used for loading this input when it is not connected to an\n upstream output.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n default_value: Any = NoValueSentinel,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n input_manager_key: Optional[str] = None,\n ):\n return super(In, cls).__new__(\n cls,\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=check.opt_str_param(description, "description"),\n default_value=default_value,\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n asset_key=check.opt_inst_param(asset_key, "asset_key", (AssetKey, FunctionType)),\n asset_partitions=asset_partitions,\n input_manager_key=check.opt_str_param(input_manager_key, "input_manager_key"),\n )\n\n @staticmethod\n def from_definition(input_def: InputDefinition) -> "In":\n return In(\n dagster_type=input_def.dagster_type,\n description=input_def.description,\n default_value=input_def._default_value, # noqa: SLF001\n metadata=input_def.metadata,\n asset_key=input_def._asset_key, # noqa: SLF001\n asset_partitions=input_def._asset_partitions_fn, # noqa: SLF001\n input_manager_key=input_def.input_manager_key,\n )\n\n def to_definition(self, name: str) -> InputDefinition:\n dagster_type = self.dagster_type if 
self.dagster_type is not NoValueSentinel else None\n return InputDefinition(\n name=name,\n dagster_type=dagster_type,\n description=self.description,\n default_value=self.default_value,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n input_manager_key=self.input_manager_key,\n )
\n\n\n
[docs]class GraphIn(NamedTuple("_GraphIn", [("description", PublicAttr[Optional[str]])])):\n """Represents information about an input that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the input.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphIn, cls).__new__(cls, description=description)\n\n def to_definition(self, name: str) -> InputDefinition:\n return InputDefinition(name=name, description=self.description)
\n
", "current_page_name": "_modules/dagster/_core/definitions/input", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.input"}, "job_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.job_definition

\nimport importlib\nimport os\nimport warnings\nfrom datetime import datetime\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental_param, public\nfrom dagster._config import Field, Shape, StringSource\nfrom dagster._config.config_type import ConfigType\nfrom dagster._config.validate import validate_config\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.dependency import (\n    Node,\n    NodeHandle,\n    NodeInputHandle,\n    NodeInvocation,\n)\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.node_definition import NodeDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.op_selection import OpSelection, get_graph_subset\nfrom dagster._core.definitions.partition import DynamicPartitionsDefinition\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.resource_requirement import (\n    ResourceRequirement,\n    ensure_requirements_satisfied,\n)\nfrom dagster._core.definitions.utils import check_valid_name\nfrom dagster._core.errors import (\n    DagsterInvalidConfigError,\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.selector.subset_selector import (\n    AssetSelectionData,\n    OpSelectionData,\n)\nfrom dagster._core.storage.io_manager import (\n    IOManagerDefinition,\n    dagster_maintained_io_manager,\n    io_manager,\n)\nfrom dagster._core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster._core.types.dagster_type import DagsterType\nfrom 
dagster._core.utils import str_format_set\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.merger import merge_dicts\n\nfrom .asset_layer import AssetLayer, build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .dependency import (\n    DependencyMapping,\n    DependencyStructure,\n    OpNode,\n)\nfrom .executor_definition import ExecutorDefinition, multi_or_in_process_executor\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .hook_definition import HookDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .metadata import MetadataValue, RawMetadataValue, normalize_metadata\nfrom .partition import PartitionedConfig, PartitionsDefinition\nfrom .resource_definition import ResourceDefinition\nfrom .run_request import RunRequest\nfrom .utils import DEFAULT_IO_MANAGER_KEY, validate_tags\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster._config.snap import ConfigSchemaSnapshot\n    from dagster._core.definitions.run_config import RunConfig\n    from dagster._core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster._core.execution.resources_init import InitResourceContext\n    from dagster._core.host_representation.job_index import JobIndex\n    from dagster._core.instance import DagsterInstance, DynamicPartitionsStore\n    from dagster._core.snap import JobSnapshot\n\n    from .run_config_schema import RunConfigSchema\n\nDEFAULT_EXECUTOR_DEF = multi_or_in_process_executor\n\n\n
@experimental_param(param="version_strategy")
class JobDefinition(IHasInternalInit):
    """Defines a Dagster job.

    A job wraps a :py:class:`GraphDefinition` together with the resources, executor,
    loggers, config, hooks, tags and metadata supplied at construction time.
    """

    _name: str
    _graph_def: GraphDefinition
    _description: Optional[str]
    _tags: Mapping[str, str]
    _metadata: Mapping[str, MetadataValue]
    _current_level_node_defs: Sequence[NodeDefinition]
    _hook_defs: AbstractSet[HookDefinition]
    _op_retry_policy: Optional[RetryPolicy]
    _asset_layer: AssetLayer
    _resource_requirements: Mapping[str, AbstractSet[str]]
    _all_node_defs: Mapping[str, NodeDefinition]
    _cached_run_config_schemas: Dict[str, "RunConfigSchema"]
    # NOTE(review): __init__ assigns `self.version_strategy` (no leading underscore) and the
    # value may be None; this annotation does not match that attribute -- confirm intent.
    _version_strategy: VersionStrategy
    _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]]
    input_values: Mapping[str, object]

    def __init__(
        self,
        *,
        graph_def: GraphDefinition,
        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,
        executor_def: Optional[ExecutorDefinition] = None,
        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,
        name: Optional[str] = None,
        config: Optional[
            Union[ConfigMapping, Mapping[str, object], PartitionedConfig, "RunConfig"]
        ] = None,
        description: Optional[str] = None,
        partitions_def: Optional[PartitionsDefinition] = None,
        tags: Optional[Mapping[str, Any]] = None,
        metadata: Optional[Mapping[str, RawMetadataValue]] = None,
        hook_defs: Optional[AbstractSet[HookDefinition]] = None,
        op_retry_policy: Optional[RetryPolicy] = None,
        version_strategy: Optional[VersionStrategy] = None,
        _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]] = None,
        asset_layer: Optional[AssetLayer] = None,
        input_values: Optional[Mapping[str, object]] = None,
        _was_explicitly_provided_resources: Optional[bool] = None,
    ):
        """Validate the arguments and build the job.

        All arguments are keyword-only. Notable behavior visible in this constructor:

        * ``name`` falls back to ``graph_def.name`` when falsy.
        * ``asset_layer`` falls back to one inferred from the graph's source asset deps.
        * Every key of ``resource_defs`` must be a valid Python identifier.
        * The default IO manager is always registered under ``DEFAULT_IO_MANAGER_KEY`` and
          can be overridden by an entry in ``resource_defs``.
        * When ``partitions_def`` is given, ``config`` is converted into a
          :py:class:`PartitionedConfig`; otherwise ``config`` is stored according to its
          type (config mapping, partitioned config, or plain dict run config).
        * ``_was_explicitly_provided_resources`` defaults to ``bool(resource_defs)`` when
          ``None`` and controls whether resource requirements are validated.

        Raises:
            DagsterInvalidDefinitionError: If ``input_values`` names an input the graph
                does not have.
        """
        from dagster._core.definitions.run_config import RunConfig, convert_config_input

        self._graph_def = graph_def
        self._current_level_node_defs = self._graph_def.node_defs
        # Recursively explore all nodes in this job
        self._all_node_defs = _build_all_node_defs(self._current_level_node_defs)
        self._asset_layer = check.opt_inst_param(
            asset_layer, "asset_layer", AssetLayer
        ) or _infer_asset_layer_from_source_asset_deps(graph_def)

        # validates
        self._graph_def.get_inputs_must_be_resolved_top_level(self._asset_layer)

        self._name = check_valid_name(check.str_param(name, "name")) if name else graph_def.name
        self._executor_def = check.opt_inst_param(executor_def, "executor_def", ExecutorDefinition)
        # Kept nullable on purpose: `has_specified_loggers` distinguishes None from a mapping.
        self._loggers = check.opt_nullable_mapping_param(
            logger_defs,
            "logger_defs",
            key_type=str,
            value_type=LoggerDefinition,
        )

        config = check.opt_inst_param(
            config, "config", (Mapping, ConfigMapping, PartitionedConfig, RunConfig)
        )
        config = convert_config_input(config)

        partitions_def = check.opt_inst_param(
            partitions_def, "partitions_def", PartitionsDefinition
        )
        # tags and description can exist on graph as well, but since
        # same graph may be in multiple jobs, keep separate layer
        self._description = check.opt_str_param(description, "description")
        self._tags = validate_tags(tags)
        self._metadata = normalize_metadata(
            check.opt_mapping_param(metadata, "metadata", key_type=str)
        )
        self._hook_defs = check.opt_set_param(hook_defs, "hook_defs")
        self._op_retry_policy = check.opt_inst_param(
            op_retry_policy, "op_retry_policy", RetryPolicy
        )
        self.version_strategy = check.opt_inst_param(
            version_strategy, "version_strategy", VersionStrategy
        )

        _subset_selection_data = check.opt_inst_param(
            _subset_selection_data, "_subset_selection_data", (OpSelectionData, AssetSelectionData)
        )
        input_values = check.opt_mapping_param(input_values, "input_values", key_type=str)

        resource_defs = check.opt_mapping_param(
            resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition
        )
        for key in resource_defs.keys():
            if not key.isidentifier():
                check.failed(f"Resource key '{key}' must be a valid Python identifier.")
        # An explicit flag wins; otherwise "provided" means a non-empty resource mapping.
        was_provided_resources = (
            bool(resource_defs)
            if _was_explicitly_provided_resources is None
            else _was_explicitly_provided_resources
        )
        # User-supplied entries come last so they can replace the default IO manager.
        self._resource_defs = {
            DEFAULT_IO_MANAGER_KEY: default_job_io_manager,
            **resource_defs,
        }
        # Must run after `_resource_defs`, `_hook_defs` and `_asset_layer` are set: the
        # helper reads all three through `self`.
        self._required_resource_keys = self._get_required_resource_keys(was_provided_resources)

        self._config_mapping = None
        self._partitioned_config = None
        self._run_config = None
        self._run_config_schema = None
        # Preserved verbatim so `_copy` can rebuild the job from the original argument.
        self._original_config_argument = config

        if partitions_def:
            self._partitioned_config = PartitionedConfig.from_flexible_config(
                config, partitions_def
            )
        else:
            if isinstance(config, ConfigMapping):
                self._config_mapping = config
            elif isinstance(config, PartitionedConfig):
                self._partitioned_config = config
            elif isinstance(config, dict):
                self._run_config = config
                # Using config mapping here is a trick to make it so that the preset will be used even
                # when no config is supplied for the job.
                # NOTE(review): the raw `asset_layer` argument (possibly None) is passed
                # here rather than `self._asset_layer` -- confirm this is intended.
                self._config_mapping = _config_mapping_with_default_value(
                    get_run_config_schema_for_job(
                        graph_def,
                        self.resource_defs,
                        self.executor_def,
                        self.loggers,
                        asset_layer,
                        was_explicitly_provided_resources=was_provided_resources,
                    ),
                    config,
                    self.name,
                )
            elif config is not None:
                check.failed(
                    "config param must be a ConfigMapping, a PartitionedConfig, or a dictionary,"
                    f" but is an object of type {type(config)}"
                )

        self._subset_selection_data = _subset_selection_data
        self.input_values = input_values
        # Sorted so the first offending key reported is deterministic.
        for input_name in sorted(list(self.input_values.keys())):
            if not graph_def.has_input(input_name):
                raise DagsterInvalidDefinitionError(
                    f"Error when constructing JobDefinition '{self.name}': Input value provided for"
                    f" key '{input_name}', but job has no top-level input with that name."
                )

    # NOTE(review): no @staticmethod decorator is visible on this function in this view.
    # It takes no `self` and is invoked through the class
    # (`JobDefinition.dagster_internal_init(...)`), which works for a plain function;
    # calling it on an instance would not. Confirm against the real source.
    def dagster_internal_init(
        *,
        graph_def: GraphDefinition,
        resource_defs: Optional[Mapping[str, ResourceDefinition]],
        executor_def: Optional[ExecutorDefinition],
        logger_defs: Optional[Mapping[str, LoggerDefinition]],
        name: Optional[str],
        config: Optional[
            Union[ConfigMapping, Mapping[str, object], PartitionedConfig, "RunConfig"]
        ],
        description: Optional[str],
        partitions_def: Optional[PartitionsDefinition],
        tags: Optional[Mapping[str, Any]],
        metadata: Optional[Mapping[str, RawMetadataValue]],
        hook_defs: Optional[AbstractSet[HookDefinition]],
        op_retry_policy: Optional[RetryPolicy],
        version_strategy: Optional[VersionStrategy],
        _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]],
        asset_layer: Optional[AssetLayer],
        input_values: Optional[Mapping[str, object]],
        _was_explicitly_provided_resources: Optional[bool],
    ) -> "JobDefinition":
        """Construct a :py:class:`JobDefinition`, requiring every argument explicitly.

        Mirrors ``__init__`` but without defaults, so internal callers must pass each
        keyword argument.
        """
        return JobDefinition(
            graph_def=graph_def,
            resource_defs=resource_defs,
            executor_def=executor_def,
            logger_defs=logger_defs,
            name=name,
            config=config,
            description=description,
            partitions_def=partitions_def,
            tags=tags,
            metadata=metadata,
            hook_defs=hook_defs,
            op_retry_policy=op_retry_policy,
            version_strategy=version_strategy,
            _subset_selection_data=_subset_selection_data,
            asset_layer=asset_layer,
            input_values=input_values,
            _was_explicitly_provided_resources=_was_explicitly_provided_resources,
        )

    @property
    def name(self) -> str:
        """The job's name."""
        return self._name

    @property
    def tags(self) -> Mapping[str, str]:
        """The graph's tags merged with the job's own tags via ``merge_dicts``."""
        return merge_dicts(self._graph_def.tags, self._tags)

    @property
    def metadata(self) -> Mapping[str, MetadataValue]:
        """The normalized metadata supplied at construction."""
        return self._metadata

    @property
    def description(self) -> Optional[str]:
        """The job-level description, if one was given."""
        return self._description

    @property
    def graph(self) -> GraphDefinition:
        """The underlying :py:class:`GraphDefinition`."""
        return self._graph_def

    @property
    def dependency_structure(self) -> DependencyStructure:
        """The dependency structure of the underlying graph."""
        return self._graph_def.dependency_structure

    @property
    def dependencies(self) -> DependencyMapping[NodeInvocation]:
        """The dependency mapping of the underlying graph."""
        return self._graph_def.dependencies

    @public
    @property
    def executor_def(self) -> ExecutorDefinition:
        """Returns the default :py:class:`ExecutorDefinition` for the job.

        If the user has not specified an executor definition, then this will default to the :py:func:`multi_or_in_process_executor`. If a default is specified on the :py:class:`Definitions` object the job was provided to, then that will be used instead.
        """
        return self._executor_def or DEFAULT_EXECUTOR_DEF

    @public
    @property
    def has_specified_executor(self) -> bool:
        """Returns True if this job has explicitly specified an executor, and False if the executor was inherited through defaults or the :py:class:`Definitions` object the job was provided to."""
        return self._executor_def is not None

    @public
    @property
    def resource_defs(self) -> Mapping[str, ResourceDefinition]:
        """Returns the set of ResourceDefinition objects specified on the job.

        This may not be the complete set of resources required by the job, since those can also be provided on the :py:class:`Definitions` object the job may be provided to.
        """
        return self._resource_defs

    @public
    @property
    def partitioned_config(self) -> Optional[PartitionedConfig]:
        """The partitioned config for the job, if it has one.

        A partitioned config defines a way to map partition keys to run config for the job.
        """
        return self._partitioned_config

    @public
    @property
    def config_mapping(self) -> Optional[ConfigMapping]:
        """The config mapping for the job, if it has one.

        A config mapping defines a way to map a top-level config schema to run config for the job.
        """
        return self._config_mapping

    @public
    @property
    def loggers(self) -> Mapping[str, LoggerDefinition]:
        """Returns the set of LoggerDefinition objects specified on the job.

        If the user has not specified a mapping of :py:class:`LoggerDefinition` objects, then this will default to the :py:func:`colored_console_logger` under the key `console`. If a default is specified on the :py:class:`Definitions` object the job was provided to, then that will be used instead.
        """
        from dagster._loggers import default_loggers

        return self._loggers or default_loggers()

    @public
    @property
    def has_specified_loggers(self) -> bool:
        """Returns true if the job explicitly set loggers, and False if loggers were inherited through defaults or the :py:class:`Definitions` object the job was provided to."""
        return self._loggers is not None

    @property
    def required_resource_keys(self) -> AbstractSet[str]:
        """The resource keys computed once in ``__init__``."""
        return self._required_resource_keys

    @property
    def run_config(self) -> Optional[Mapping[str, Any]]:
        """The plain-dict run config, set only when ``config`` was a dict and no partitions_def was given."""
        return self._run_config

    @property
    def run_config_schema(self) -> "RunConfigSchema":
        """The run config schema, created on first access and cached on the instance."""
        if self._run_config_schema is None:
            self._run_config_schema = _create_run_config_schema(self, self.required_resource_keys)
        return self._run_config_schema

    @public
    @property
    def partitions_def(self) -> Optional[PartitionsDefinition]:
        """Returns the :py:class:`PartitionsDefinition` for the job, if it has one.

        A partitions definition defines the set of partition keys the job operates on.
        """
        return None if not self.partitioned_config else self.partitioned_config.partitions_def

    @property
    def hook_defs(self) -> AbstractSet[HookDefinition]:
        """The hooks attached at the job level."""
        return self._hook_defs

    @property
    def asset_layer(self) -> AssetLayer:
        """The asset layer supplied to, or inferred by, ``__init__``."""
        return self._asset_layer

    @property
    def all_node_defs(self) -> Sequence[NodeDefinition]:
        """Every node definition collected by ``_build_all_node_defs``, as a new list."""
        return list(self._all_node_defs.values())

    @property
    def top_level_node_defs(self) -> Sequence[NodeDefinition]:
        """The node definitions of the top-level graph only."""
        return self._current_level_node_defs

    def node_def_named(self, name: str) -> NodeDefinition:
        """Return the node definition called ``name``; fails a check invariant if absent."""
        check.str_param(name, "name")

        check.invariant(name in self._all_node_defs, f"{name} not found")
        return self._all_node_defs[name]

    def has_node(self, name: str) -> bool:
        """Whether a node *definition* called ``name`` exists anywhere in the job."""
        check.str_param(name, "name")
        return name in self._all_node_defs

    def get_node(self, handle: NodeHandle) -> Node:
        """Resolve ``handle`` to a node by delegating to the underlying graph."""
        return self._graph_def.get_node(handle)

    def get_op(self, handle: NodeHandle) -> OpNode:
        """Resolve ``handle`` to an op node; asserts that it is not a nested graph."""
        node = self.get_node(handle)
        assert isinstance(
            node, OpNode
        ), f"Tried to retrieve node {handle} as op, but it represents a nested graph."
        return node

    def has_node_named(self, name: str) -> bool:
        """Whether the underlying graph has a node invocation called ``name``."""
        return self._graph_def.has_node_named(name)

    def get_node_named(self, name: str) -> Node:
        """Return the underlying graph's node invocation called ``name``."""
        return self._graph_def.node_named(name)

    @property
    def nodes(self) -> Sequence[Node]:
        """The nodes of the underlying graph."""
        return self._graph_def.nodes

    @property
    def nodes_in_topological_order(self) -> Sequence[Node]:
        """The nodes of the underlying graph, topologically ordered."""
        return self._graph_def.nodes_in_topological_order

    def all_dagster_types(self) -> Iterable[DagsterType]:
        """All Dagster types reported by the underlying graph."""
        return self._graph_def.all_dagster_types()

    def has_dagster_type(self, name: str) -> bool:
        """Whether the underlying graph has a Dagster type called ``name``."""
        return self._graph_def.has_dagster_type(name)

    def dagster_type_named(self, name: str) -> DagsterType:
        """Return the underlying graph's Dagster type called ``name``."""
        return self._graph_def.dagster_type_named(name)

    def describe_target(self) -> str:
        """A short human-readable label for this job, e.g. ``job 'my_job'``."""
        return f"job '{self.name}'"

    def is_using_memoization(self, run_tags: Mapping[str, str]) -> bool:
        """Decide whether memoization applies to a run with ``run_tags``.

        An explicit ``"false"`` memoized-run tag disables it; otherwise it is enabled by
        a ``"true"`` tag or by the job having a version strategy.
        """
        tags = merge_dicts(self.tags, run_tags)
        # If someone provides a false value for memoized run tag, then they are intentionally
        # switching off memoization.
        if tags.get(MEMOIZED_RUN_TAG) == "false":
            return False
        return (
            MEMOIZED_RUN_TAG in tags and tags.get(MEMOIZED_RUN_TAG) == "true"
        ) or self.version_strategy is not None

    def get_required_resource_defs(self) -> Mapping[str, ResourceDefinition]:
        """The subset of ``resource_defs`` whose keys are in ``required_resource_keys``."""
        return {
            resource_key: resource
            for resource_key, resource in self.resource_defs.items()
            if resource_key in self.required_resource_keys
        }

    def _get_required_resource_keys(self, validate_requirements: bool = False) -> AbstractSet[str]:
        """Collect the resource keys this job requires.

        When ``validate_requirements`` is true, the requirements are first checked against
        ``resource_defs`` and the result is extended with transitively required keys;
        otherwise only the directly required keys are returned.
        """
        from ..execution.resources_init import get_transitive_required_resource_keys

        requirements = self._get_resource_requirements()
        if validate_requirements:
            ensure_requirements_satisfied(self.resource_defs, requirements)
        required_keys = {req.key for req in requirements}
        if validate_requirements:
            return required_keys.union(
                get_transitive_required_resource_keys(required_keys, self.resource_defs)
            )
        else:
            return required_keys

    def _get_resource_requirements(self) -> Sequence[ResourceRequirement]:
        """Resource requirements of the graph followed by those of the job-level hooks."""
        return [
            *self._graph_def.get_resource_requirements(self.asset_layer),
            *[
                req
                for hook_def in self._hook_defs
                for req in hook_def.get_resource_requirements(outer_context=f"job '{self._name}'")
            ],
        ]

    def validate_resource_requirements_satisfied(self) -> None:
        """Check the job's resource requirements against its ``resource_defs``."""
        resource_requirements = self._get_resource_requirements()
        ensure_requirements_satisfied(self.resource_defs, resource_requirements)

    def is_missing_required_resources(self) -> bool:
        """Whether any resource requirement is not met by ``resource_defs``."""
        requirements = self._get_resource_requirements()
        for requirement in requirements:
            if not requirement.resources_contain_key(self.resource_defs):
                return True
        return False

    def get_all_hooks_for_handle(self, handle: NodeHandle) -> AbstractSet[HookDefinition]:
        """Gather all the hooks for the given node from all places possibly attached with a hook.

        A hook can be attached to any of the following objects
        * Node (node invocation)
        * JobDefinition

        Args:
            handle (NodeHandle): The node's handle

        Returns:
            FrozenSet[HookDefinition]
        """
        check.inst_param(handle, "handle", NodeHandle)
        hook_defs: Set[HookDefinition] = set()

        # Walk parent links so `lineage` holds names innermost-first; popping from the
        # end then visits them outermost-first.
        current = handle
        lineage = []
        while current:
            lineage.append(current.name)
            current = current.parent

        # hooks on top-level node
        name = lineage.pop()
        node = self._graph_def.node_named(name)
        hook_defs = hook_defs.union(node.hook_defs)

        # hooks on non-top-level nodes
        while lineage:
            name = lineage.pop()
            # While lineage is non-empty, definition is guaranteed to be a graph
            definition = cast(GraphDefinition, node.definition)
            node = definition.node_named(name)
            hook_defs = hook_defs.union(node.hook_defs)

        # hooks applied to a job definition will run on every node
        hook_defs = hook_defs.union(self.hook_defs)

        return frozenset(hook_defs)

    def get_retry_policy_for_handle(self, handle: NodeHandle) -> Optional[RetryPolicy]:
        """Pick the retry policy for a node.

        Precedence: the node invocation's policy, then the op definition's policy, then
        the job-level ``op_retry_policy``.
        """
        node = self.get_node(handle)
        definition = node.definition

        if node.retry_policy:
            return node.retry_policy
        elif isinstance(definition, OpDefinition) and definition.retry_policy:
            return definition.retry_policy

        # could be expanded to look in graph containers
        else:
            return self._op_retry_policy

    # make Callable for decorator reference updates
    def __call__(self, *args, **kwargs):
        """Always raises: jobs are not directly callable.

        Raises:
            DagsterInvariantViolationError: On every call.
        """
        raise DagsterInvariantViolationError(
            f"Attempted to call job '{self.name}' directly. Jobs should be invoked by "
            "using an execution API function (e.g. `job.execute_in_process`)."
        )
@public
def execute_in_process(
    self,
    run_config: Optional[Union[Mapping[str, Any], "RunConfig"]] = None,
    instance: Optional["DagsterInstance"] = None,
    partition_key: Optional[str] = None,
    raise_on_error: bool = True,
    op_selection: Optional[Sequence[str]] = None,
    asset_selection: Optional[Sequence[AssetKey]] = None,
    run_id: Optional[str] = None,
    input_values: Optional[Mapping[str, object]] = None,
    tags: Optional[Mapping[str, str]] = None,
    resources: Optional[Mapping[str, object]] = None,
) -> "ExecuteInProcessResult":
    """Execute the job in-process, gathering results in-memory.

    The job's ``executor_def`` is ignored and replaced with the in-process executor.
    If the default ``io_manager`` is in use, it is switched from filesystem to in-memory.

    Args:
        run_config (Optional[Union[Mapping[str, Any], RunConfig]]):
            The configuration for the run.
        instance (Optional[DagsterInstance]):
            The instance to execute against; an ephemeral one will be used if none is
            provided.
        partition_key (Optional[str]):
            The string partition key that specifies the run config to execute. Can only
            be used to select run config for jobs with partitioned config.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they
            occur. Defaults to ``True``.
        op_selection (Optional[Sequence[str]]): A list of op selection queries (including
            single op names) to execute. For example:
            * ``['some_op']``: selects ``some_op`` itself.
            * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream
              dependencies).
            * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its
              descendants (downstream dependencies) within 3 levels down.
            * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all
              its ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct
              child ops.
        asset_selection (Optional[Sequence[AssetKey]]):
            Asset keys to subset the job to. Cannot be combined with ``op_selection``.
        run_id (Optional[str]):
            Passed through unchanged to the in-process execution.
        input_values (Optional[Mapping[str, Any]]):
            A dictionary that maps python objects to the top-level inputs of the job.
            Input values provided here will override input values that have been
            provided to the job directly.
        tags (Optional[Mapping[str, str]]):
            Tags merged with the job's tags for this run. When ``partition_key`` is
            given, the partition's tags are applied on top.
        resources (Optional[Mapping[str, Any]]):
            The resources needed if any are required. Can provide resource instances
            directly, or resource definitions.

    Returns:
        :py:class:`~dagster.ExecuteInProcessResult`

    """
    from dagster._core.definitions.executor_definition import execute_in_process_executor
    from dagster._core.definitions.run_config import convert_config_input
    from dagster._core.execution.build_resources import wrap_resources_for_execution
    from dagster._core.execution.execute_in_process import core_execute_in_process

    # Normalize arguments; each Optional collection becomes an empty container.
    run_config = check.opt_mapping_param(convert_config_input(run_config), "run_config")
    op_selection = check.opt_sequence_param(op_selection, "op_selection", str)
    asset_selection = check.opt_sequence_param(asset_selection, "asset_selection", AssetKey)
    resources = check.opt_mapping_param(resources, "resources", key_type=str)

    call_time_resource_defs = wrap_resources_for_execution(resources)

    check.invariant(
        not op_selection or not asset_selection,
        "op_selection and asset_selection cannot both be provided as args to"
        " execute_in_process",
    )

    partition_key = check.opt_str_param(partition_key, "partition_key")
    input_values = check.opt_mapping_param(input_values, "input_values")

    # Values passed to this call take precedence over those stored on the definition.
    input_values = merge_dicts(self.input_values, input_values)

    # Start from a copy of the job's resources (with the default IO manager swapped),
    # then layer the call-time resources on top.
    ephemeral_resource_defs = {**_swap_default_io_man(dict(self.resource_defs), self)}
    ephemeral_resource_defs.update(call_time_resource_defs)

    full_ephemeral_job = JobDefinition.dagster_internal_init(
        name=self._name,
        graph_def=self._graph_def,
        resource_defs=ephemeral_resource_defs,
        executor_def=execute_in_process_executor,
        logger_defs=self._loggers,
        hook_defs=self.hook_defs,
        config=self.config_mapping or self.partitioned_config or self.run_config,
        tags=self.tags,
        op_retry_policy=self._op_retry_policy,
        version_strategy=self.version_strategy,
        asset_layer=self.asset_layer,
        input_values=input_values,
        description=self.description,
        partitions_def=self.partitions_def,
        metadata=self.metadata,
        _subset_selection_data=None,  # this is added below
        _was_explicitly_provided_resources=True,
    )

    selected_assets = frozenset(asset_selection) if asset_selection else None
    ephemeral_job = full_ephemeral_job.get_subset(
        op_selection=op_selection,
        asset_selection=selected_assets,
    )

    run_tags = merge_dicts(self.tags, tags or {})
    if partition_key:
        if not (self.partitions_def and self.partitioned_config):
            check.failed("Attempted to execute a partitioned run for a non-partitioned job")
        self.partitions_def.validate_partition_key(
            partition_key, dynamic_partitions_store=instance
        )

        # An empty run_config falls back to the partition's config.
        run_config = run_config or self.partitioned_config.get_run_config_for_partition_key(
            partition_key
        )
        partition_tags = self.partitioned_config.get_tags_for_partition_key(
            partition_key, job_name=self.name
        )
        run_tags.update(partition_tags)

    return core_execute_in_process(
        ephemeral_job=ephemeral_job,
        run_config=run_config,
        instance=instance,
        output_capturing_enabled=True,
        raise_on_error=raise_on_error,
        run_tags=run_tags,
        run_id=run_id,
        asset_selection=frozenset(asset_selection),
    )
@property
def op_selection_data(self) -> Optional[OpSelectionData]:
    """The subset selection data, but only if this job was subset by op selection."""
    return (
        self._subset_selection_data
        if isinstance(self._subset_selection_data, OpSelectionData)
        else None
    )

@property
def asset_selection_data(self) -> Optional[AssetSelectionData]:
    """The subset selection data, but only if this job was subset by asset selection."""
    return (
        self._subset_selection_data
        if isinstance(self._subset_selection_data, AssetSelectionData)
        else None
    )

@property
def is_subset(self) -> bool:
    """Whether this job carries any subset selection data."""
    return bool(self._subset_selection_data)

def get_subset(
    self,
    *,
    op_selection: Optional[Iterable[str]] = None,
    asset_selection: Optional[AbstractSet[AssetKey]] = None,
    asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,
) -> Self:
    """Return a job restricted to the given selection, or ``self`` if none is given.

    Args:
        op_selection: Op selection queries. Mutually exclusive with the asset arguments.
        asset_selection: Asset keys to keep.
        asset_check_selection: Asset check keys to keep.

    Returns:
        The subset job; ``self`` when every selection argument is falsy.
    """
    check.invariant(
        not (op_selection and (asset_selection or asset_check_selection)),
        "op_selection cannot be provided with asset_selection or asset_check_selection to"
        " execute_in_process",
    )
    if op_selection:
        return self._get_job_def_for_op_selection(op_selection)
    if asset_selection or asset_check_selection:
        return self._get_job_def_for_asset_selection(
            asset_selection=asset_selection, asset_check_selection=asset_check_selection
        )
    else:
        return self

def _get_job_def_for_asset_selection(
    self,
    asset_selection: Optional[AbstractSet[AssetKey]] = None,
    asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,
) -> Self:
    """Build a new job containing only the selected assets and asset checks.

    Raises:
        DagsterInvalidSubsetError: If a selected asset is neither an asset nor a source
            asset of this job's asset layer, or a selected asset check is unknown to it.
    """
    asset_selection = check.opt_set_param(asset_selection, "asset_selection", AssetKey)
    check.opt_set_param(asset_check_selection, "asset_check_selection", AssetCheckKey)

    nonexistent_assets = [
        asset
        for asset in asset_selection
        if asset not in self.asset_layer.asset_keys
        and asset not in self.asset_layer.source_assets_by_key
    ]
    if nonexistent_assets:
        # Only build the display strings on the failure path; empty renderings are dropped.
        nonexistent_asset_strings = [
            asset_str
            for asset_str in (asset.to_string() for asset in nonexistent_assets)
            if asset_str
        ]
        raise DagsterInvalidSubsetError(
            "Assets provided in asset_selection argument "
            f"{', '.join(nonexistent_asset_strings)} do not exist in parent asset group or job."
        )

    # Test that selected asset checks exist
    all_check_keys = self.asset_layer.node_output_handles_by_asset_check_key.keys()

    nonexistent_asset_checks = [
        asset_check
        for asset_check in asset_check_selection or set()
        if asset_check not in all_check_keys
    ]
    if nonexistent_asset_checks:
        nonexistent_asset_check_strings = [
            str(asset_check) for asset_check in nonexistent_asset_checks
        ]
        raise DagsterInvalidSubsetError(
            "Asset checks provided in asset_check_selection argument"
            f" {', '.join(nonexistent_asset_check_strings)} do not exist in parent asset group"
            " or job."
        )

    asset_selection_data = AssetSelectionData(
        asset_selection=asset_selection,
        asset_check_selection=asset_check_selection,
        parent_job_def=self,
    )

    check.invariant(
        self.asset_layer.assets_defs_by_key is not None,
        "Asset layer must have _asset_defs argument defined",
    )

    new_job = build_asset_selection_job(
        name=self.name,
        assets=set(self.asset_layer.assets_defs_by_key.values()),
        source_assets=self.asset_layer.source_assets_by_key.values(),
        executor_def=self.executor_def,
        resource_defs=self.resource_defs,
        description=self.description,
        tags=self.tags,
        asset_selection=asset_selection,
        asset_check_selection=asset_check_selection,
        asset_selection_data=asset_selection_data,
        config=self.config_mapping or self.partitioned_config,
        asset_checks=self.asset_layer.asset_checks_defs,
    )
    return new_job

def _get_job_def_for_op_selection(self, op_selection: Iterable[str]) -> Self:
    """Build a copy of this job restricted to the ops matched by ``op_selection``.

    Args:
        op_selection: Op selection queries. Any iterable is accepted, including a
            one-shot iterator.

    Raises:
        DagsterInvalidSubsetError: If the subset yields an invalid graph (re-raised from
            the underlying ``DagsterInvalidDefinitionError``).
    """
    # The selection is traversed several times below (subsetting, recording, resolving,
    # and again on the error path). Materialize it once so that a generator or other
    # single-pass iterable is not silently exhausted after the first use.
    op_selection = list(op_selection)
    try:
        sub_graph = get_graph_subset(self.graph, op_selection)

        # if explicit config was passed the config_mapping that resolves the defaults implicitly is
        # very unlikely to work. The job will still present the default config in the Dagster UI.
        config = (
            None
            if self.run_config is not None
            else self.config_mapping or self.partitioned_config
        )

        return self._copy(
            config=config,
            graph_def=sub_graph,
            _subset_selection_data=OpSelectionData(
                # Fresh list so the stored selection is not aliased to the local one.
                op_selection=list(op_selection),
                resolved_op_selection=OpSelection(op_selection).resolve(self.graph),
                parent_job_def=self,  # used by job snapshot lineage
            ),
            # TODO: subset this structure.
            # https://github.com/dagster-io/dagster/issues/7541
            asset_layer=self.asset_layer,
        )
    except DagsterInvalidDefinitionError as exc:
        # This handles the case when you construct a subset such that an unsatisfied
        # input cannot be loaded from config. Instead of throwing a DagsterInvalidDefinitionError,
        # we re-raise a DagsterInvalidSubsetError.
        node_paths = OpSelection(op_selection).resolve(self.graph)
        raise DagsterInvalidSubsetError(
            f"The attempted subset {str_format_set(node_paths)} for graph "
            f"{self.graph.name} results in an invalid graph."
        ) from exc
@public
@deprecated(
    breaking_version="2.0.0",
    additional_warn_text="Directly instantiate `RunRequest(partition_key=...)` instead.",
)
def run_request_for_partition(
    self,
    partition_key: str,
    run_key: Optional[str] = None,
    tags: Optional[Mapping[str, str]] = None,
    asset_selection: Optional[Sequence[AssetKey]] = None,
    run_config: Optional[Mapping[str, Any]] = None,
    current_time: Optional[datetime] = None,
    dynamic_partitions_store: Optional["DynamicPartitionsStore"] = None,
) -> RunRequest:
    """Creates a RunRequest object for a run that processes the given partition.

    Args:
        partition_key: The key of the partition to request a run for.
        run_key (Optional[str]): A string key to identify this launched run. For sensors,
            ensures that only one run is created per run key across all sensor
            evaluations. For schedules, ensures that one run is created per tick, across
            failure recoveries. Passing in a `None` value means that a run will always be
            launched per evaluation.
        tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to
            attach to the launched run. Tags derived from the partition are applied on
            top of these.
        asset_selection (Optional[Sequence[AssetKey]]): Forwarded unchanged to the
            returned RunRequest.
        run_config (Optional[Mapping[str, Any]]): Configuration for the run. If the job
            has a :py:class:`PartitionedConfig`, this value replaces the config provided
            by it.
        current_time (Optional[datetime]): Used to determine which time-partitions exist.
            Defaults to now.
        dynamic_partitions_store (Optional[DynamicPartitionsStore]): The
            DynamicPartitionsStore object that is responsible for fetching dynamic
            partitions. Required when the partitions definition is a
            DynamicPartitionsDefinition with a name defined. Users can pass the
            DagsterInstance fetched via `context.instance` to this argument.

    Returns:
        RunRequest: an object that requests a run to process the given partition.
    """
    if not (self.partitions_def and self.partitioned_config):
        check.failed("Called run_request_for_partition on a non-partitioned job")

    uses_named_dynamic_partitions = (
        isinstance(self.partitions_def, DynamicPartitionsDefinition)
        and self.partitions_def.name
    )
    if uses_named_dynamic_partitions:
        # Do not support using run_request_for_partition with dynamic partitions,
        # since this requires querying the instance once per run request for the
        # existent dynamic partitions
        check.failed(
            "run_request_for_partition is not supported for dynamic partitions. Instead, use"
            " RunRequest(partition_key=...)"
        )

    self.partitions_def.validate_partition_key(
        partition_key,
        current_time=current_time,
        dynamic_partitions_store=dynamic_partitions_store,
    )

    # Only an omitted (None) run_config falls back to the partition's config; an
    # explicit empty mapping is respected.
    if run_config is None:
        run_config = self.partitioned_config.get_run_config_for_partition_key(partition_key)

    # Caller tags first, then partition tags, so partition tags win on key clashes.
    run_request_tags = dict(tags or {})
    run_request_tags.update(
        self.partitioned_config.get_tags_for_partition_key(
            partition_key,
            job_name=self.name,
        )
    )

    return RunRequest(
        run_key=run_key,
        run_config=run_config,
        tags=run_request_tags,
        job_name=self.name,
        asset_selection=asset_selection,
        partition_key=partition_key,
    )
def get_config_schema_snapshot(self) -> "ConfigSchemaSnapshot":
    """Return the config schema snapshot taken from this job's snapshot."""
    job_snapshot = self.get_job_snapshot()
    return job_snapshot.config_schema_snapshot

def get_job_snapshot(self) -> "JobSnapshot":
    """Return the job snapshot held by a freshly built job index."""
    job_index = self.get_job_index()
    return job_index.job_snapshot

def get_job_index(self) -> "JobIndex":
    """Build a ``JobIndex`` from this job's snapshot and its parent's snapshot (if any)."""
    from dagster._core.host_representation import JobIndex
    from dagster._core.snap import JobSnapshot

    own_snapshot = JobSnapshot.from_job_def(self)
    parent_snapshot = self.get_parent_job_snapshot()
    return JobIndex(own_snapshot, parent_snapshot)

def get_job_snapshot_id(self) -> str:
    """Return the snapshot id reported by a freshly built job index."""
    job_index = self.get_job_index()
    return job_index.job_snapshot_id

def get_parent_job_snapshot(self) -> Optional["JobSnapshot"]:
    """Return the snapshot of the job this one was subset from, or ``None``.

    Op selection data is consulted first, then asset selection data.
    """
    selection_data = self.op_selection_data or self.asset_selection_data
    if not selection_data:
        return None
    return selection_data.parent_job_def.get_job_snapshot()

def has_direct_input_value(self, input_name: str) -> bool:
    """Whether a value was supplied directly for the top-level input ``input_name``."""
    return input_name in self.input_values

def get_direct_input_value(self, input_name: str) -> object:
    """Return the value supplied directly for the top-level input ``input_name``.

    Raises:
        DagsterInvalidInvocationError: If no value was supplied for that input.
    """
    if input_name in self.input_values:
        return self.input_values[input_name]

    raise DagsterInvalidInvocationError(
        f"On job '{self.name}', attempted to retrieve input value for input named"
        f" '{input_name}', but no value was provided. Provided input values:"
        f" {sorted(self.input_values.keys())}"
    )

def _copy(self, **kwargs: Any) -> "JobDefinition":
    """Create a new job from this one's constructor arguments, with overrides.

    Args:
        **kwargs: Constructor arguments that replace the values taken from ``self``.

    Returns:
        The new job, with wrapper attributes copied over from ``self`` by
        ``update_wrapper``.
    """
    # Start from this job's own state; resource_defs is shallow-copied into a new dict.
    init_kwargs = {
        "graph_def": self.graph,
        "resource_defs": dict(self.resource_defs),
        "executor_def": self._executor_def,
        "logger_defs": self._loggers,
        "config": self._original_config_argument,
        "name": self._name,
        "description": self.description,
        "tags": self.tags,
        "metadata": self._metadata,
        "hook_defs": self.hook_defs,
        "op_retry_policy": self._op_retry_policy,
        "version_strategy": self.version_strategy,
        "_subset_selection_data": self._subset_selection_data,
        "asset_layer": self.asset_layer,
        "input_values": self.input_values,
        "partitions_def": self.partitions_def,
        "_was_explicitly_provided_resources": None,
    }
    # Caller-supplied values win on conflicts.
    init_kwargs.update(kwargs)
    copied_job = JobDefinition.dagster_internal_init(**init_kwargs)
    update_wrapper(copied_job, self, updated=())
    return copied_job
[docs] @public\n def with_top_level_resources(\n self, resource_defs: Mapping[str, ResourceDefinition]\n ) -> "JobDefinition":\n """Apply a set of resources to all op instances within the job."""\n resource_defs = check.mapping_param(resource_defs, "resource_defs", key_type=str)\n return self._copy(resource_defs=resource_defs)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "JobDefinition":\n """Apply a set of hooks to all op instances within the job."""\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n return self._copy(hook_defs=(hook_defs | self.hook_defs))
\n\n def with_executor_def(self, executor_def: ExecutorDefinition) -> "JobDefinition":\n return self._copy(executor_def=executor_def)\n\n def with_logger_defs(self, logger_defs: Mapping[str, LoggerDefinition]) -> "JobDefinition":\n return self._copy(logger_defs=logger_defs)\n\n @property\n def op_selection(self) -> Optional[AbstractSet[str]]:\n return set(self.op_selection_data.op_selection) if self.op_selection_data else None\n\n @property\n def asset_selection(self) -> Optional[AbstractSet[AssetKey]]:\n return self.asset_selection_data.asset_selection if self.asset_selection_data else None\n\n @property\n def asset_check_selection(self) -> Optional[AbstractSet[AssetCheckKey]]:\n return (\n self.asset_selection_data.asset_check_selection if self.asset_selection_data else None\n )\n\n @property\n def resolved_op_selection(self) -> Optional[AbstractSet[str]]:\n return self.op_selection_data.resolved_op_selection if self.op_selection_data else None
\n\n\ndef _swap_default_io_man(resources: Mapping[str, ResourceDefinition], job: JobDefinition):\n """Used to create the user facing experience of the default io_manager\n switching to in-memory when using execute_in_process.\n """\n from dagster._core.storage.mem_io_manager import mem_io_manager\n\n if (\n resources.get(DEFAULT_IO_MANAGER_KEY) in [default_job_io_manager]\n and job.version_strategy is None\n ):\n updated_resources = dict(resources)\n updated_resources[DEFAULT_IO_MANAGER_KEY] = mem_io_manager\n return updated_resources\n\n return resources\n\n\n@dagster_maintained_io_manager\n@io_manager(\n description="Built-in filesystem IO manager that stores and retrieves values using pickling."\n)\ndef default_job_io_manager(init_context: "InitResourceContext"):\n # support overriding the default io manager via environment variables\n module_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_MODULE")\n attribute_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE")\n silence_failures = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_SILENCE_FAILURES")\n\n if module_name and attribute_name:\n from dagster._core.execution.build_resources import build_resources\n\n try:\n module = importlib.import_module(module_name)\n attr = getattr(module, attribute_name)\n check.invariant(\n isinstance(attr, IOManagerDefinition),\n "DAGSTER_DEFAULT_IO_MANAGER_MODULE and DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE"\n " must specify an IOManagerDefinition",\n )\n with build_resources({"io_manager": attr}, instance=init_context.instance) as resources:\n return resources.io_manager\n except Exception as e:\n if not silence_failures:\n raise\n else:\n warnings.warn(\n f"Failed to load io manager override with module: {module_name} attribute:"\n f" {attribute_name}: {e}\\nFalling back to default io manager."\n )\n\n # normally, default to the fs_io_manager\n from dagster._core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n instance = check.not_none(init_context.instance)\n return 
PickledObjectFilesystemIOManager(base_dir=instance.storage_directory())\n\n\n@dagster_maintained_io_manager\n@io_manager(\n description="Built-in filesystem IO manager that stores and retrieves values using pickling.",\n config_schema={"base_dir": Field(StringSource, is_required=False)},\n)\ndef default_job_io_manager_with_fs_io_manager_schema(init_context: "InitResourceContext"):\n # support overriding the default io manager via environment variables\n module_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_MODULE")\n attribute_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE")\n silence_failures = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_SILENCE_FAILURES")\n\n if module_name and attribute_name:\n from dagster._core.execution.build_resources import build_resources\n\n try:\n module = importlib.import_module(module_name)\n attr = getattr(module, attribute_name)\n check.invariant(\n isinstance(attr, IOManagerDefinition),\n "DAGSTER_DEFAULT_IO_MANAGER_MODULE and DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE"\n " must specify an IOManagerDefinition",\n )\n with build_resources({"io_manager": attr}, instance=init_context.instance) as resources:\n return resources.io_manager\n except Exception as e:\n if not silence_failures:\n raise\n else:\n warnings.warn(\n f"Failed to load io manager override with module: {module_name} attribute:"\n f" {attribute_name}: {e}\\nFalling back to default io manager."\n )\n from dagster._core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n # normally, default to the fs_io_manager\n base_dir = init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory() if init_context.instance else None\n )\n\n return PickledObjectFilesystemIOManager(base_dir=base_dir)\n\n\ndef _config_mapping_with_default_value(\n inner_schema: ConfigType,\n default_config: Mapping[str, Any],\n job_name: str,\n) -> ConfigMapping:\n if not isinstance(inner_schema, Shape):\n check.failed("Only Shape (dictionary) config_schema allowed on 
Job ConfigMapping")\n\n def config_fn(x):\n return x\n\n updated_fields = {}\n field_aliases = inner_schema.field_aliases\n for name, field in inner_schema.fields.items():\n if name in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[name],\n description=field.description,\n )\n elif name in field_aliases and field_aliases[name] in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[field_aliases[name]],\n description=field.description,\n )\n else:\n updated_fields[name] = field\n\n config_schema = Shape(\n fields=updated_fields,\n description=(\n "This run config schema was automatically populated with default values "\n "from `default_config`."\n ),\n field_aliases=inner_schema.field_aliases,\n )\n\n config_evr = validate_config(config_schema, default_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error in config when building job '{job_name}' ",\n config_evr.errors,\n default_config,\n )\n\n return ConfigMapping(\n config_fn=config_fn, config_schema=config_schema, receive_processed_config_values=False\n )\n\n\ndef get_run_config_schema_for_job(\n graph_def: GraphDefinition,\n resource_defs: Mapping[str, ResourceDefinition],\n executor_def: "ExecutorDefinition",\n logger_defs: Mapping[str, LoggerDefinition],\n asset_layer: Optional[AssetLayer],\n was_explicitly_provided_resources: bool = False,\n) -> ConfigType:\n return JobDefinition(\n name=graph_def.name,\n graph_def=graph_def,\n resource_defs=resource_defs,\n executor_def=executor_def,\n logger_defs=logger_defs,\n asset_layer=asset_layer,\n _was_explicitly_provided_resources=was_explicitly_provided_resources,\n ).run_config_schema.run_config_schema_type\n\n\ndef _infer_asset_layer_from_source_asset_deps(job_graph_def: GraphDefinition) -> AssetLayer:\n """For non-asset jobs that have some inputs that are fed from SourceAssets, constructs an\n AssetLayer that includes 
those SourceAssets.\n """\n asset_keys_by_node_input_handle: Dict[NodeInputHandle, AssetKey] = {}\n source_assets_list = []\n source_asset_keys_set = set()\n io_manager_keys_by_asset_key: Mapping[AssetKey, str] = {}\n\n # each entry is a graph definition and its handle relative to the job root\n stack: List[Tuple[GraphDefinition, Optional[NodeHandle]]] = [(job_graph_def, None)]\n\n while stack:\n graph_def, parent_node_handle = stack.pop()\n\n for node_name, input_source_assets in graph_def.node_input_source_assets.items():\n node_handle = NodeHandle(node_name, parent_node_handle)\n for input_name, source_asset in input_source_assets.items():\n if source_asset.key not in source_asset_keys_set:\n source_asset_keys_set.add(source_asset.key)\n source_assets_list.append(source_asset)\n\n input_handle = NodeInputHandle(node_handle, input_name)\n asset_keys_by_node_input_handle[input_handle] = source_asset.key\n for resolved_input_handle in graph_def.node_dict[\n node_name\n ].definition.resolve_input_to_destinations(input_handle):\n asset_keys_by_node_input_handle[resolved_input_handle] = source_asset.key\n\n if source_asset.io_manager_key:\n io_manager_keys_by_asset_key[source_asset.key] = source_asset.io_manager_key\n\n for node_name, node in graph_def.node_dict.items():\n if isinstance(node.definition, GraphDefinition):\n stack.append((node.definition, NodeHandle(node_name, parent_node_handle)))\n\n return AssetLayer(\n assets_defs_by_node_handle={},\n asset_keys_by_node_input_handle=asset_keys_by_node_input_handle,\n asset_info_by_node_output_handle={},\n asset_deps={},\n dependency_node_handles_by_asset_key={},\n assets_defs_by_key={},\n source_assets_by_key={\n source_asset.key: source_asset for source_asset in source_assets_list\n },\n io_manager_keys_by_asset_key=io_manager_keys_by_asset_key,\n dep_asset_keys_by_node_output_handle={},\n partition_mappings_by_asset_dep={},\n asset_checks_defs_by_node_handle={},\n node_output_handles_by_asset_check_key={},\n 
check_names_by_asset_key_by_node_handle={},\n check_key_by_node_output_handle={},\n )\n\n\ndef _build_all_node_defs(node_defs: Sequence[NodeDefinition]) -> Mapping[str, NodeDefinition]:\n all_defs: Dict[str, NodeDefinition] = {}\n for current_level_node_def in node_defs:\n for node_def in current_level_node_def.iterate_node_defs():\n if node_def.name in all_defs:\n if all_defs[node_def.name] != node_def:\n raise DagsterInvalidDefinitionError(\n 'Detected conflicting node definitions with the same name "{name}"'.format(\n name=node_def.name\n )\n )\n else:\n all_defs[node_def.name] = node_def\n\n return all_defs\n\n\ndef _create_run_config_schema(\n job_def: JobDefinition,\n required_resources: AbstractSet[str],\n) -> "RunConfigSchema":\n from .run_config import (\n RunConfigSchemaCreationData,\n construct_config_type_dictionary,\n define_run_config_schema_type,\n )\n from .run_config_schema import RunConfigSchema\n\n # When executing with a subset job, include the missing nodes\n # from the original job as ignored to allow execution with\n # run config that is valid for the original\n ignored_nodes: Sequence[Node] = []\n if job_def.is_subset:\n if isinstance(job_def.graph, SubselectedGraphDefinition): # op selection provided\n ignored_nodes = job_def.graph.get_top_level_omitted_nodes()\n elif job_def.asset_selection_data:\n parent_job = job_def\n while parent_job.asset_selection_data:\n parent_job = parent_job.asset_selection_data.parent_job_def\n\n ignored_nodes = [\n node for node in parent_job.graph.nodes if not job_def.has_node_named(node.name)\n ]\n else:\n ignored_nodes = []\n\n run_config_schema_type = define_run_config_schema_type(\n RunConfigSchemaCreationData(\n job_name=job_def.name,\n nodes=job_def.graph.nodes,\n graph_def=job_def.graph,\n dependency_structure=job_def.graph.dependency_structure,\n executor_def=job_def.executor_def,\n resource_defs=job_def.resource_defs,\n logger_defs=job_def.loggers,\n ignored_nodes=ignored_nodes,\n 
required_resources=required_resources,\n direct_inputs=job_def.input_values,\n asset_layer=job_def.asset_layer,\n )\n )\n\n if job_def.config_mapping:\n outer_config_type = job_def.config_mapping.config_schema.config_type\n else:\n outer_config_type = run_config_schema_type\n\n if outer_config_type is None:\n check.failed("Unexpected outer_config_type value of None")\n\n config_type_dict_by_name, config_type_dict_by_key = construct_config_type_dictionary(\n job_def.all_node_defs,\n outer_config_type,\n )\n\n return RunConfigSchema(\n run_config_schema_type=run_config_schema_type,\n config_type_dict_by_name=config_type_dict_by_name,\n config_type_dict_by_key=config_type_dict_by_key,\n config_mapping=job_def.config_mapping,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/job_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.job_definition"}, "load_assets_from_modules": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.load_assets_from_modules

\nimport inspect\nimport os\nimport pkgutil\nfrom importlib import import_module\nfrom types import ModuleType\nfrom typing import Dict, Generator, Iterable, List, Optional, Sequence, Set, Tuple, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .assets import AssetsDefinition\nfrom .cacheable_assets import CacheableAssetsDefinition\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKeyPrefix,\n    check_opt_coercible_to_asset_key_prefix_param,\n)\nfrom .source_asset import SourceAsset\n\n\ndef _find_assets_in_module(\n    module: ModuleType,\n) -> Generator[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition], None, None]:\n    """Finds assets in the given module and adds them to the given sets of assets and source assets."""\n    for attr in dir(module):\n        value = getattr(module, attr)\n        if isinstance(value, (AssetsDefinition, SourceAsset, CacheableAssetsDefinition)):\n            yield value\n        elif isinstance(value, list) and all(\n            isinstance(el, (AssetsDefinition, SourceAsset, CacheableAssetsDefinition))\n            for el in value\n        ):\n            yield from value\n\n\ndef assets_from_modules(\n    modules: Iterable[ModuleType], extra_source_assets: Optional[Sequence[SourceAsset]] = None\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n    """Constructs three lists, a list of assets, a list of source assets, and a list of cacheable\n    assets from the given modules.\n\n    Args:\n        modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n        extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n     
       group in addition to the source assets found in the modules.\n\n    Returns:\n        Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]]:\n            A tuple containing a list of assets, a list of source assets, and a list of\n            cacheable assets defined in the given modules.\n    """\n    asset_ids: Set[int] = set()\n    asset_keys: Dict[AssetKey, ModuleType] = dict()\n    source_assets: List[SourceAsset] = list(\n        check.opt_sequence_param(extra_source_assets, "extra_source_assets", of_type=SourceAsset)\n    )\n    cacheable_assets: List[CacheableAssetsDefinition] = []\n    assets: Dict[AssetKey, AssetsDefinition] = {}\n    for module in modules:\n        for asset in _find_assets_in_module(module):\n            if id(asset) not in asset_ids:\n                asset_ids.add(id(asset))\n                if isinstance(asset, CacheableAssetsDefinition):\n                    cacheable_assets.append(asset)\n                else:\n                    keys = asset.keys if isinstance(asset, AssetsDefinition) else [asset.key]\n                    for key in keys:\n                        if key in asset_keys:\n                            modules_str = ", ".join(\n                                set([asset_keys[key].__name__, module.__name__])\n                            )\n                            error_str = (\n                                f"Asset key {key} is defined multiple times. Definitions found in"\n                                f" modules: {modules_str}. 
"\n                            )\n\n                            if key in assets and isinstance(asset, AssetsDefinition):\n                                if assets[key].node_def == asset.node_def:\n                                    error_str += (\n                                        "One possible cause of this bug is a call to with_resources"\n                                        " outside of a repository definition, causing a duplicate"\n                                        " asset definition."\n                                    )\n\n                            raise DagsterInvalidDefinitionError(error_str)\n                        else:\n                            asset_keys[key] = module\n                            if isinstance(asset, AssetsDefinition):\n                                assets[key] = asset\n                    if isinstance(asset, SourceAsset):\n                        source_assets.append(asset)\n    return list(set(assets.values())), source_assets, cacheable_assets\n\n\n
[docs]def load_assets_from_modules(\n modules: Iterable[ModuleType],\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets and source assets from the given modules.\n\n Args:\n modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset]]:\n A list containing assets and source assets defined in the given modules.\n """\n group_name = check.opt_str_param(group_name, "group_name")\n key_prefix = check_opt_coercible_to_asset_key_prefix_param(key_prefix, "key_prefix")\n freshness_policy = check.opt_inst_param(freshness_policy, "freshness_policy", FreshnessPolicy)\n auto_materialize_policy = check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n )\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n (\n assets,\n source_assets,\n cacheable_assets,\n ) = assets_from_modules(modules)\n\n return assets_with_attributes(\n assets,\n source_assets,\n cacheable_assets,\n key_prefix=key_prefix,\n group_name=group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n source_key_prefix=source_key_prefix,\n )
\n\n\n
[docs]def load_assets_from_current_module(\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets, source assets, and cacheable assets from the module where\n this function is called.\n\n Args:\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CachableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n caller = inspect.stack()[1]\n module = inspect.getmodule(caller[0])\n if module is None:\n check.failed("Could not find a module for the caller")\n\n return load_assets_from_modules(\n [module],\n group_name=group_name,\n key_prefix=key_prefix,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )
\n\n\ndef assets_from_package_module(\n package_module: ModuleType,\n extra_source_assets: Optional[Sequence[SourceAsset]] = None,\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n """Constructs three lists, a list of assets, a list of source assets, and a list of cacheable assets\n from the given package module.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n group in addition to the source assets found in the modules.\n\n Returns:\n Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n A tuple containing a list of assets, a list of source assets, and a list of cacheable assets\n defined in the given modules.\n """\n return assets_from_modules(\n _find_modules_in_package(package_module), extra_source_assets=extra_source_assets\n )\n\n\n
[docs]def load_assets_from_package_module(\n package_module: ModuleType,\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets and source assets that includes all asset\n definitions, source assets, and cacheable assets in all sub-modules of the given package module.\n\n A package module is the result of importing a package.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n group_name = check.opt_str_param(group_name, "group_name")\n key_prefix = check_opt_coercible_to_asset_key_prefix_param(key_prefix, "key_prefix")\n freshness_policy = check.opt_inst_param(freshness_policy, "freshness_policy", FreshnessPolicy)\n auto_materialize_policy = check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n )\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n (\n assets,\n source_assets,\n cacheable_assets,\n ) = assets_from_package_module(package_module)\n return assets_with_attributes(\n assets,\n source_assets,\n cacheable_assets,\n key_prefix=key_prefix,\n group_name=group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n source_key_prefix=source_key_prefix,\n )
\n\n\n
[docs]def load_assets_from_package_name(\n package_name: str,\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets, source assets, and cacheable assets that includes all asset\n definitions and source assets in all sub-modules of the given package.\n\n Args:\n package_name (str): The name of a Python package to look for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n package_module = import_module(package_name)\n return load_assets_from_package_module(\n package_module,\n group_name=group_name,\n key_prefix=key_prefix,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )
\n\n\ndef _find_modules_in_package(package_module: ModuleType) -> Iterable[ModuleType]:\n yield package_module\n package_path = package_module.__file__\n if package_path:\n for _, modname, is_pkg in pkgutil.walk_packages([os.path.dirname(package_path)]):\n submodule = import_module(f"{package_module.__name__}.{modname}")\n if is_pkg:\n yield from _find_modules_in_package(submodule)\n else:\n yield submodule\n else:\n raise ValueError(\n f"Tried to find modules in package {package_module}, but its __file__ is None"\n )\n\n\ndef prefix_assets(\n assets_defs: Sequence[AssetsDefinition],\n key_prefix: CoercibleToAssetKeyPrefix,\n source_assets: Sequence[SourceAsset],\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix],\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset]]:\n """Given a list of assets, prefix the input and output asset keys with key_prefix.\n The prefix is not added to source assets.\n\n Input asset keys that reference other assets within assets_defs are "brought along" -\n i.e. prefixed as well.\n\n Example with a single asset:\n\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n result = prefixed_asset_key_replacements([asset_1], "my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n\n Example with dependencies within the list of assets:\n\n .. 
code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n result = prefixed_asset_key_replacements([asset1, asset2], "my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n assert result.assets[1].asset_key == AssetKey(["my_prefix", "asset2"])\n assert result.assets[1].dependency_keys == {AssetKey(["my_prefix", "asset1"])}\n\n """\n asset_keys = {asset_key for assets_def in assets_defs for asset_key in assets_def.keys}\n source_asset_keys = {source_asset.key for source_asset in source_assets}\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.is_list(key_prefix, of_type=str)\n\n result_assets: List[AssetsDefinition] = []\n for assets_def in assets_defs:\n output_asset_key_replacements = {\n asset_key: AssetKey([*key_prefix, *asset_key.path]) for asset_key in assets_def.keys\n }\n input_asset_key_replacements = {}\n for dep_asset_key in assets_def.dependency_keys:\n if dep_asset_key in asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n [*key_prefix, *dep_asset_key.path]\n )\n elif source_key_prefix and dep_asset_key in source_asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n [*source_key_prefix, *dep_asset_key.path]\n )\n\n result_assets.append(\n assets_def.with_attributes(\n output_asset_key_replacements=output_asset_key_replacements,\n input_asset_key_replacements=input_asset_key_replacements,\n )\n )\n\n if source_key_prefix:\n result_source_assets = [\n source_asset.with_attributes(key=AssetKey([*source_key_prefix, *source_asset.key.path]))\n for source_asset in source_assets\n ]\n else:\n result_source_assets = source_assets\n\n return result_assets, result_source_assets\n\n\ndef assets_with_attributes(\n assets_defs: Sequence[AssetsDefinition],\n source_assets: Sequence[SourceAsset],\n cacheable_assets: Sequence[CacheableAssetsDefinition],\n key_prefix: Optional[Sequence[str]],\n group_name: Optional[str],\n 
freshness_policy: Optional[FreshnessPolicy],\n auto_materialize_policy: Optional[AutoMaterializePolicy],\n backfill_policy: Optional[BackfillPolicy],\n source_key_prefix: Optional[Sequence[str]],\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n # There is a tricky edge case here where if a non-cacheable asset depends on a cacheable asset,\n # and the assets are prefixed, the non-cacheable asset's dependency will not be prefixed since\n # at prefix-time it is not known that its dependency is one of the cacheable assets.\n # https://github.com/dagster-io/dagster/pull/10389#pullrequestreview-1170913271\n if key_prefix:\n assets_defs, source_assets = prefix_assets(\n assets_defs, key_prefix, source_assets, source_key_prefix\n )\n cacheable_assets = [\n cached_asset.with_prefix_for_all(key_prefix) for cached_asset in cacheable_assets\n ]\n\n if group_name or freshness_policy or auto_materialize_policy or backfill_policy:\n assets_defs = [\n asset.with_attributes(\n group_names_by_key=(\n {asset_key: group_name for asset_key in asset.keys} if group_name else None\n ),\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )\n for asset in assets_defs\n ]\n if group_name:\n source_assets = [\n source_asset.with_attributes(group_name=group_name)\n for source_asset in source_assets\n ]\n cacheable_assets = [\n cached_asset.with_attributes_for_all(\n group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )\n for cached_asset in cacheable_assets\n ]\n\n return [*assets_defs, *source_assets, *cacheable_assets]\n
", "current_page_name": "_modules/dagster/_core/definitions/load_assets_from_modules", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.load_assets_from_modules"}, "logger_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.logger_definition

\nfrom typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidInvocationError\n\nfrom ..decorator_utils import get_function_params\nfrom .config import is_callable_valid_config_arg\nfrom .configurable import AnonymousConfigurableDefinition\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    import logging\n\n    from dagster._core.definitions import JobDefinition\n    from dagster._core.execution.context.logger import InitLoggerContext, UnboundInitLoggerContext\n\n    InitLoggerFunction = Callable[[InitLoggerContext], logging.Logger]\n\n\n
[docs]class LoggerDefinition(AnonymousConfigurableDefinition):\n """Core class for defining loggers.\n\n Loggers are job-scoped logging handlers, which will be automatically invoked whenever\n dagster messages are logged from within a job.\n\n Args:\n logger_fn (Callable[[InitLoggerContext], logging.Logger]): User-provided function to\n instantiate the logger. This logger will be automatically invoked whenever the methods\n on ``context.log`` are called from within job compute logic.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of this logger.\n """\n\n def __init__(\n self,\n logger_fn: "InitLoggerFunction",\n config_schema: Any = None,\n description: Optional[str] = None,\n ):\n self._logger_fn = check.callable_param(logger_fn, "logger_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n\n def __call__(self, *args, **kwargs):\n from dagster._core.execution.context.logger import UnboundInitLoggerContext\n\n from .logger_invocation import logger_invocation_result\n\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Logger initialization function has context argument, but no context argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of logger received multiple arguments. 
Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.logger_fn)[0].name\n\n if args:\n context = check.opt_inst_param(\n args[0],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, job_def=None),\n )\n return logger_invocation_result(self, context)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Logger initialization expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, job_def=None),\n )\n\n return logger_invocation_result(self, context)\n\n @public\n @property\n def logger_fn(self) -> "InitLoggerFunction":\n """Callable[[InitLoggerContext], logging.Logger]: The function that will be invoked to\n instantiate the logger.\n """\n return self._logger_fn\n\n @public\n @property\n def config_schema(self) -> Any:\n """Any: The schema for the logger's config. Configuration data available in `init_context.logger_config`."""\n return self._config_schema\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A human-readable description of the logger."""\n return self._description\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: Any,\n ) -> "LoggerDefinition":\n return LoggerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n logger_fn=self.logger_fn,\n )
\n\n\n@overload\ndef logger(\n config_schema: CoercableToConfigSchema, description: Optional[str] = ...\n) -> Callable[["InitLoggerFunction"], "LoggerDefinition"]: ...\n\n\n@overload\ndef logger(\n config_schema: "InitLoggerFunction", description: Optional[str] = ...\n) -> "LoggerDefinition": ...\n\n\n
[docs]def logger(\n config_schema: Union[CoercableToConfigSchema, "InitLoggerFunction"] = None,\n description: Optional[str] = None,\n) -> Union["LoggerDefinition", Callable[["InitLoggerFunction"], "LoggerDefinition"]]:\n """Define a logger.\n\n The decorated function should accept an :py:class:`InitLoggerContext` and return an instance of\n :py:class:`python:logging.Logger`. This function will become the ``logger_fn`` of an underlying\n :py:class:`LoggerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the logger.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. @logger versus @logger()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return LoggerDefinition(logger_fn=cast("InitLoggerFunction", config_schema))\n\n def _wrap(logger_fn: "InitLoggerFunction") -> "LoggerDefinition":\n return LoggerDefinition(\n logger_fn=logger_fn,\n config_schema=config_schema,\n description=description,\n )\n\n return _wrap
\n\n\n
[docs]def build_init_logger_context(\n logger_config: Any = None,\n job_def: Optional["JobDefinition"] = None,\n) -> "UnboundInitLoggerContext":\n """Builds logger initialization context from provided parameters.\n\n This function can be used to provide the context argument to the invocation of a logger\n definition.\n\n Note that you may only specify one of pipeline_def and job_def.\n\n Args:\n logger_config (Any): The config to provide during initialization of logger.\n job_def (Optional[JobDefinition]): The job definition that the logger will be used with.\n\n Examples:\n .. code-block:: python\n\n context = build_init_logger_context()\n logger_to_init(context)\n """\n from dagster._core.definitions import JobDefinition\n from dagster._core.execution.context.logger import UnboundInitLoggerContext\n\n check.opt_inst_param(job_def, "job_def", JobDefinition)\n\n return UnboundInitLoggerContext(logger_config=logger_config, job_def=job_def)
\n
", "current_page_name": "_modules/dagster/_core/definitions/logger_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.logger_definition"}, "materialize": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.materialize

\nfrom typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Set, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions.unresolved_asset_job_definition import define_asset_job\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..errors import DagsterInvariantViolationError\nfrom ..instance import DagsterInstance\nfrom ..storage.io_manager import IOManagerDefinition\nfrom ..storage.mem_io_manager import mem_io_manager\nfrom .assets import AssetsDefinition\nfrom .source_asset import SourceAsset\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_selection import CoercibleToAssetSelection\n    from dagster._core.definitions.events import AssetKey\n\n    from ..execution.execute_in_process_result import ExecuteInProcessResult\n\nEPHEMERAL_JOB_NAME = "__ephemeral_asset_job__"\n\n\n
[docs]def materialize(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]],\n run_config: Any = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n tags: Optional[Mapping[str, str]] = None,\n selection: Optional["CoercibleToAssetSelection"] = None,\n) -> "ExecuteInProcessResult":\n """Executes a single-threaded, in-process run which materializes provided assets.\n\n By default, will materialize assets to the local filesystem.\n\n Args:\n assets (Sequence[Union[AssetsDefinition, SourceAsset]]):\n The assets to materialize.\n\n Unless you're using `deps` or `non_argument_deps`, you must also include all assets that are\n upstream of the assets that you want to materialize. This is because those upstream\n asset definitions have information that is needed to load their contents while\n materializing the downstream assets.\n\n You can use the `selection` argument to distinguish between assets that you want to\n materialize and assets that are just present for loading.\n resources (Optional[Mapping[str, object]]):\n The resources needed for execution. Can provide resource instances\n directly, or resource definitions. Note that if provided resources\n conflict with resources directly on assets, an error will be thrown.\n run_config (Optional[Any]): The run config to use for the run that materializes the assets.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. 
Can only be used\n to select run config for assets with partitioned config.\n tags (Optional[Mapping[str, str]]): Tags for the run.\n selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]):\n A sub-selection of assets to materialize.\n\n If not provided, then all assets will be materialized.\n\n If providing a string or sequence of strings,\n https://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\n syntax.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n\n Examples:\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n # executes a run that materializes asset1 and then asset2\n materialize([asset1, asset2])\n\n # executes a run that materializes just asset2, loading its input from asset1\n materialize([asset1, asset2], selection=[asset2])\n """\n from dagster._core.definitions.definitions_class import Definitions\n\n assets = check.sequence_param(assets, "assets", of_type=(AssetsDefinition, SourceAsset))\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n partition_key = check.opt_str_param(partition_key, "partition_key")\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n\n all_executable_keys: Set[AssetKey] = set()\n for asset in assets:\n if isinstance(asset, AssetsDefinition):\n all_executable_keys = all_executable_keys.union(set(asset.keys))\n\n defs = Definitions(\n jobs=[define_asset_job(name=EPHEMERAL_JOB_NAME, selection=selection)],\n assets=assets,\n resources=resources,\n )\n return check.not_none(\n defs.get_job_def(EPHEMERAL_JOB_NAME),\n "This should always return a job",\n ).execute_in_process(\n run_config=run_config,\n instance=instance,\n partition_key=partition_key,\n raise_on_error=raise_on_error,\n tags=tags,\n )
\n\n\n
[docs]def materialize_to_memory(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]],\n run_config: Any = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n tags: Optional[Mapping[str, str]] = None,\n selection: Optional["CoercibleToAssetSelection"] = None,\n) -> "ExecuteInProcessResult":\n """Executes a single-threaded, in-process run which materializes provided assets in memory.\n\n Will explicitly use :py:func:`mem_io_manager` for all required io manager\n keys. If any io managers are directly provided using the `resources`\n argument, a :py:class:`DagsterInvariantViolationError` will be thrown.\n\n Args:\n assets (Sequence[Union[AssetsDefinition, SourceAsset]]):\n The assets to materialize. Can also provide :py:class:`SourceAsset` objects to fill dependencies for asset defs.\n run_config (Optional[Any]): The run config to use for the run that materializes the assets.\n resources (Optional[Mapping[str, object]]):\n The resources needed for execution. Can provide resource instances\n directly, or resource definitions. If provided resources\n conflict with resources directly on assets, an error will be thrown.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for assets with partitioned config.\n tags (Optional[Mapping[str, str]]): Tags for the run.\n selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]):\n A sub-selection of assets to materialize.\n\n If not provided, then all assets will be materialized.\n\n If providing a string or sequence of strings,\n https://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\n syntax.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n\n Examples:\n .. 
code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n # executes a run that materializes asset1 and then asset2\n materialize([asset1, asset2])\n\n # executes a run that materializes just asset1\n materialize([asset1, asset2], selection=[asset1])\n """\n assets = check.sequence_param(assets, "assets", of_type=(AssetsDefinition, SourceAsset))\n\n # Gather all resource defs for the purpose of checking io managers.\n resources_dict = resources or {}\n all_resource_keys = set(resources_dict.keys())\n for asset in assets:\n all_resource_keys = all_resource_keys.union(asset.resource_defs.keys())\n\n io_manager_keys = _get_required_io_manager_keys(assets)\n for io_manager_key in io_manager_keys:\n if io_manager_key in all_resource_keys:\n raise DagsterInvariantViolationError(\n "Attempted to call `materialize_to_memory` with a resource "\n f"provided for io manager key '{io_manager_key}'. Do not "\n "provide resources for io manager keys when calling "\n "`materialize_to_memory`, as it will override io management "\n "behavior for all keys."\n )\n\n resource_defs = merge_dicts({key: mem_io_manager for key in io_manager_keys}, resources_dict)\n\n return materialize(\n assets=assets,\n run_config=run_config,\n resources=resource_defs,\n instance=instance,\n partition_key=partition_key,\n raise_on_error=raise_on_error,\n tags=tags,\n selection=selection,\n )
\n\n\ndef _get_required_io_manager_keys(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]]\n) -> Set[str]:\n io_manager_keys = set()\n for asset in assets:\n for requirement in asset.get_resource_requirements():\n if requirement.expected_type == IOManagerDefinition:\n io_manager_keys.add(requirement.key)\n return io_manager_keys\n
", "current_page_name": "_modules/dagster/_core/definitions/materialize", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.materialize"}, "metadata": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.metadata

\nimport os\nfrom abc import ABC, abstractmethod\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import Self, TypeAlias, TypeVar\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import PublicAttr, deprecated, deprecated_param, experimental, public\nfrom dagster._core.errors import DagsterInvalidMetadata\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._serdes.serdes import (\n    FieldSerializer,\n    PackableValue,\n    UnpackContext,\n    WhitelistMap,\n    pack_value,\n)\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    normalize_renamed_param,\n)\n\nfrom .table import (  # re-exported\n    TableColumn as TableColumn,\n    TableColumnConstraints as TableColumnConstraints,\n    TableConstraints as TableConstraints,\n    TableRecord as TableRecord,\n    TableSchema as TableSchema,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.events import AssetKey\n\nArbitraryMetadataMapping: TypeAlias = Mapping[str, Any]\n\nRawMetadataValue = Union[\n    "MetadataValue",\n    TableSchema,\n    "AssetKey",\n    os.PathLike,\n    Dict[Any, Any],\n    float,\n    int,\n    List[Any],\n    str,\n    None,\n]\n\nMetadataMapping: TypeAlias = Mapping[str, "MetadataValue"]\nMetadataUserInput: TypeAlias = Mapping[str, RawMetadataValue]\n\nT_Packable = TypeVar("T_Packable", bound=PackableValue, default=PackableValue, covariant=True)\n\n# ########################\n# ##### NORMALIZATION\n# ########################\n\n\ndef normalize_metadata(\n    metadata: Mapping[str, RawMetadataValue],\n    allow_invalid: bool = False,\n) -> Mapping[str, "MetadataValue"]:\n    # This is a stopgap measure to deal with unsupported metadata values, which occur when we try\n    # to convert arbitrary metadata (on e.g. 
OutputDefinition) to a MetadataValue, which is required\n    # for serialization. This will cause unsupported values to be silently replaced with a\n    # string placeholder.\n    normalized_metadata: Dict[str, MetadataValue] = {}\n    for k, v in metadata.items():\n        try:\n            normalized_value = normalize_metadata_value(v)\n        except DagsterInvalidMetadata as e:\n            if allow_invalid:\n                deprecation_warning(\n                    "Support for arbitrary metadata values",\n                    "2.0.0",\n                    additional_warn_text=(\n                        "In the future, all user-supplied metadata values must be one of"\n                        f" {RawMetadataValue}"\n                    ),\n                    stacklevel=4,  # to get the caller of `normalize_metadata`\n                )\n                normalized_value = TextMetadataValue(f"[{v.__class__.__name__}] (unserializable)")\n            else:\n                raise DagsterInvalidMetadata(\n                    f'Could not resolve the metadata value for "{k}" to a known type. 
{e}'\n                ) from None\n        normalized_metadata[k] = normalized_value\n\n    return normalized_metadata\n\n\ndef normalize_metadata_value(raw_value: RawMetadataValue) -> "MetadataValue[Any]":\n    from dagster._core.definitions.events import AssetKey\n\n    if isinstance(raw_value, MetadataValue):\n        return raw_value\n    elif isinstance(raw_value, str):\n        return MetadataValue.text(raw_value)\n    elif isinstance(raw_value, float):\n        return MetadataValue.float(raw_value)\n    elif isinstance(raw_value, bool):\n        return MetadataValue.bool(raw_value)\n    elif isinstance(raw_value, int):\n        return MetadataValue.int(raw_value)\n    elif isinstance(raw_value, (list, dict)):\n        return MetadataValue.json(raw_value)\n    elif isinstance(raw_value, os.PathLike):\n        return MetadataValue.path(raw_value)\n    elif isinstance(raw_value, AssetKey):\n        return MetadataValue.asset(raw_value)\n    elif isinstance(raw_value, TableSchema):\n        return MetadataValue.table_schema(raw_value)\n    elif raw_value is None:\n        return MetadataValue.null()\n\n    raise DagsterInvalidMetadata(\n        f"Its type was {type(raw_value)}. Consider wrapping the value with the appropriate "\n        "MetadataValue type."\n    )\n\n\n# ########################\n# ##### METADATA VALUE\n# ########################\n\n\n
[docs]class MetadataValue(ABC, Generic[T_Packable]):\n """Utility class to wrap metadata values passed into Dagster events so that they can be\n displayed in the Dagster UI and other tooling.\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": "hello",\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n "num_rows": 0,\n },\n )\n """\n\n @public\n @property\n @abstractmethod\n def value(self) -> T_Packable:\n """The wrapped value."""\n raise NotImplementedError()\n\n
[docs] @public\n @staticmethod\n def text(text: str) -> "TextMetadataValue":\n """Static constructor for a metadata value wrapping text as\n :py:class:`TextMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": MetadataValue.text("hello")\n },\n )\n\n Args:\n text (str): The text string for a metadata entry.\n """\n return TextMetadataValue(text)
\n\n
[docs] @public\n @staticmethod\n def url(url: str) -> "UrlMetadataValue":\n """Static constructor for a metadata value wrapping a URL as\n :py:class:`UrlMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata={\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n }\n )\n\n Args:\n url (str): The URL for a metadata entry.\n """\n return UrlMetadataValue(url)
\n\n
[docs] @public\n @staticmethod\n def path(path: Union[str, os.PathLike]) -> "PathMetadataValue":\n """Static constructor for a metadata value wrapping a path as\n :py:class:`PathMetadataValue`.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "filepath": MetadataValue.path("path/to/file"),\n }\n )\n\n Args:\n path (str): The path for a metadata entry.\n """\n return PathMetadataValue(path)
\n\n
[docs] @public\n @staticmethod\n def notebook(path: Union[str, os.PathLike]) -> "NotebookMetadataValue":\n """Static constructor for a metadata value wrapping a notebook path as\n :py:class:`NotebookMetadataValue`.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "notebook_path": MetadataValue.notebook("path/to/notebook.ipynb"),\n }\n )\n\n Args:\n path (str): The path to a notebook for a metadata entry.\n """\n return NotebookMetadataValue(path)
\n\n
[docs] @public\n @staticmethod\n def json(data: Union[Sequence[Any], Mapping[str, Any]]) -> "JsonMetadataValue":\n """Static constructor for a metadata value wrapping a json-serializable list or dict\n as :py:class:`JsonMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata={\n "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n },\n )\n\n Args:\n data (Union[Sequence[Any], Mapping[str, Any]]): The JSON data for a metadata entry.\n """\n return JsonMetadataValue(data)
\n\n
[docs] @public\n @staticmethod\n def md(data: str) -> "MarkdownMetadataValue":\n """Static constructor for a metadata value wrapping markdown data as\n :py:class:`MarkdownMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata={\n 'Details': MetadataValue.md(md_str)\n },\n )\n\n Args:\n md_str (str): The markdown for a metadata entry.\n """\n return MarkdownMetadataValue(data)
\n\n
[docs] @public\n @staticmethod\n def python_artifact(python_artifact: Callable) -> "PythonArtifactMetadataValue":\n """Static constructor for a metadata value wrapping a python artifact as\n :py:class:`PythonArtifactMetadataValue`. Can be used as the value type for the\n `metadata` parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "class": MetadataValue.python_artifact(MyClass),\n "function": MetadataValue.python_artifact(my_function),\n }\n )\n\n Args:\n value (Callable): The python class or function for a metadata entry.\n """\n check.callable_param(python_artifact, "python_artifact")\n return PythonArtifactMetadataValue(python_artifact.__module__, python_artifact.__name__)
\n\n
[docs] @public\n @staticmethod\n def float(value: float) -> "FloatMetadataValue":\n """Static constructor for a metadata value wrapping a float as\n :py:class:`FloatMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n }\n )\n\n Args:\n value (float): The float value for a metadata entry.\n """\n return FloatMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def int(value: int) -> "IntMetadataValue":\n """Static constructor for a metadata value wrapping an int as\n :py:class:`IntMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "number of rows": MetadataValue.int(len(df)),\n },\n )\n\n Args:\n value (int): The int value for a metadata entry.\n """\n return IntMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def bool(value: bool) -> "BoolMetadataValue":\n """Static constructor for a metadata value wrapping a bool as\n :py:class:`BoolMetadataValuye`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n },\n )\n\n Args:\n value (bool): The bool value for a metadata entry.\n """\n return BoolMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def dagster_run(run_id: str) -> "DagsterRunMetadataValue":\n """Static constructor for a metadata value wrapping a reference to a Dagster run.\n\n Args:\n run_id (str): The ID of the run.\n """\n return DagsterRunMetadataValue(run_id)
\n\n
[docs] @public\n @staticmethod\n def asset(asset_key: "AssetKey") -> "DagsterAssetMetadataValue":\n """Static constructor for a metadata value referencing a Dagster asset, by key.\n\n For example:\n\n .. code-block:: python\n\n @op\n def validate_table(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey("my_table"),\n metadata={\n "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n },\n )\n\n Args:\n asset_key (AssetKey): The asset key referencing the asset.\n """\n from dagster._core.definitions.events import AssetKey\n\n check.inst_param(asset_key, "asset_key", AssetKey)\n return DagsterAssetMetadataValue(asset_key)
\n\n
[docs] @public\n @staticmethod\n @experimental\n def table(\n records: Sequence[TableRecord], schema: Optional[TableSchema] = None\n ) -> "TableMetadataValue":\n """Static constructor for a metadata value wrapping arbitrary tabular data as\n :py:class:`TableMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not has_errors,\n label="is_valid",\n metadata={\n "errors": MetadataValue.table(\n records=[\n TableRecord(code="invalid-data-type", row=2, col="name"),\n ],\n schema=TableSchema(\n columns=[\n TableColumn(name="code", type="string"),\n TableColumn(name="row", type="int"),\n TableColumn(name="col", type="string"),\n ]\n )\n ),\n },\n )\n """\n return TableMetadataValue(records, schema)
\n\n
[docs] @public\n @staticmethod\n def table_schema(\n schema: TableSchema,\n ) -> "TableSchemaMetadataValue":\n """Static constructor for a metadata value wrapping a table schema as\n :py:class:`TableSchemaMetadataValue`. Can be used as the value type\n for the `metadata` parameter for supported events.\n\n Example:\n .. code-block:: python\n\n schema = TableSchema(\n columns = [\n TableColumn(name="id", type="int"),\n TableColumn(name="status", type="bool"),\n ]\n )\n\n DagsterType(\n type_check_fn=some_validation_fn,\n name='MyTable',\n metadata={\n 'my_table_schema': MetadataValue.table_schema(schema),\n }\n )\n\n Args:\n schema (TableSchema): The table schema for a metadata entry.\n """\n return TableSchemaMetadataValue(schema)
\n\n
[docs] @public\n @staticmethod\n def null() -> "NullMetadataValue":\n """Static constructor for a metadata value representing null. Can be used as the value type\n for the `metadata` parameter for supported events.\n """\n return NullMetadataValue()
\n\n\n# ########################\n# ##### METADATA VALUE TYPES\n# ########################\n\n# NOTE: We have `type: ignore` in a few places below because mypy complains about an instance method\n# (e.g. `text`) overriding a static method on the superclass of the same name. This is not a concern\n# for us because these static methods should never be called on instances.\n\n# NOTE: `XMetadataValue` classes are serialized with a storage name of `XMetadataEntryData` to\n# maintain backward compatibility. See docstring of `whitelist_for_serdes` for more info.\n\n\n
[docs]@whitelist_for_serdes(storage_name="TextMetadataEntryData")\nclass TextMetadataValue(\n NamedTuple(\n "_TextMetadataValue",\n [\n ("text", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for text metadata entry data.\n\n Args:\n text (Optional[str]): The text data.\n """\n\n def __new__(cls, text: Optional[str]):\n return super(TextMetadataValue, cls).__new__(\n cls, check.opt_str_param(text, "text", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped text data."""\n return self.text
\n\n\n
[docs]@whitelist_for_serdes(storage_name="UrlMetadataEntryData")\nclass UrlMetadataValue(\n NamedTuple(\n "_UrlMetadataValue",\n [\n ("url", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for URL metadata entry data.\n\n Args:\n url (Optional[str]): The URL as a string.\n """\n\n def __new__(cls, url: Optional[str]):\n return super(UrlMetadataValue, cls).__new__(\n cls, check.opt_str_param(url, "url", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped URL."""\n return self.url
\n\n\n
[docs]@whitelist_for_serdes(storage_name="PathMetadataEntryData")\nclass PathMetadataValue(\n NamedTuple("_PathMetadataValue", [("path", PublicAttr[Optional[str]])]), MetadataValue[str]\n):\n """Container class for path metadata entry data.\n\n Args:\n path (Optional[str]): The path as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(PathMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped path."""\n return self.path
\n\n\n
[docs]@whitelist_for_serdes(storage_name="NotebookMetadataEntryData")\nclass NotebookMetadataValue(\n NamedTuple("_NotebookMetadataValue", [("path", PublicAttr[Optional[str]])]), MetadataValue[str]\n):\n """Container class for notebook metadata entry data.\n\n Args:\n path (Optional[str]): The path to the notebook as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(NotebookMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped path to the notebook as a string."""\n return self.path
\n\n\n
[docs]@whitelist_for_serdes(storage_name="JsonMetadataEntryData")\nclass JsonMetadataValue(\n NamedTuple(\n "_JsonMetadataValue",\n [\n ("data", PublicAttr[Optional[Union[Sequence[Any], Mapping[str, Any]]]]),\n ],\n ),\n MetadataValue[Union[Sequence[Any], Mapping[str, Any]]],\n):\n """Container class for JSON metadata entry data.\n\n Args:\n data (Union[Sequence[Any], Dict[str, Any]]): The JSON data.\n """\n\n def __new__(cls, data: Optional[Union[Sequence[Any], Mapping[str, Any]]]):\n data = check.opt_inst_param(data, "data", (Sequence, Mapping))\n try:\n # check that the value is JSON serializable\n seven.dumps(data)\n except TypeError:\n raise DagsterInvalidMetadata("Value is not JSON serializable.")\n return super(JsonMetadataValue, cls).__new__(cls, data)\n\n @public\n @property\n def value(self) -> Optional[Union[Sequence[Any], Mapping[str, Any]]]:\n """Optional[Union[Sequence[Any], Dict[str, Any]]]: The wrapped JSON data."""\n return self.data
\n\n\n
[docs]@whitelist_for_serdes(storage_name="MarkdownMetadataEntryData")\nclass MarkdownMetadataValue(\n NamedTuple(\n "_MarkdownMetadataValue",\n [\n ("md_str", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for markdown metadata entry data.\n\n Args:\n md_str (Optional[str]): The markdown as a string.\n """\n\n def __new__(cls, md_str: Optional[str]):\n return super(MarkdownMetadataValue, cls).__new__(\n cls, check.opt_str_param(md_str, "md_str", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped markdown as a string."""\n return self.md_str
\n\n\n# This should be deprecated or fixed so that `value` does not return itself.\n
[docs]@whitelist_for_serdes(storage_name="PythonArtifactMetadataEntryData")\nclass PythonArtifactMetadataValue(\n NamedTuple(\n "_PythonArtifactMetadataValue",\n [\n ("module", PublicAttr[str]),\n ("name", PublicAttr[str]),\n ],\n ),\n MetadataValue["PythonArtifactMetadataValue"],\n):\n """Container class for python artifact metadata entry data.\n\n Args:\n module (str): The module where the python artifact can be found\n name (str): The name of the python artifact\n """\n\n def __new__(cls, module: str, name: str):\n return super(PythonArtifactMetadataValue, cls).__new__(\n cls, check.str_param(module, "module"), check.str_param(name, "name")\n )\n\n @public\n @property\n def value(self) -> Self:\n """PythonArtifactMetadataValue: Identity function."""\n return self
\n\n\n
[docs]@whitelist_for_serdes(storage_name="FloatMetadataEntryData")\nclass FloatMetadataValue(\n NamedTuple(\n "_FloatMetadataValue",\n [\n ("value", PublicAttr[Optional[float]]),\n ],\n ),\n MetadataValue[float],\n):\n """Container class for float metadata entry data.\n\n Args:\n value (Optional[float]): The float value.\n """\n\n def __new__(cls, value: Optional[float]):\n return super(FloatMetadataValue, cls).__new__(cls, check.opt_float_param(value, "value"))
\n\n\n
[docs]@whitelist_for_serdes(storage_name="IntMetadataEntryData")\nclass IntMetadataValue(\n NamedTuple(\n "_IntMetadataValue",\n [\n ("value", PublicAttr[Optional[int]]),\n ],\n ),\n MetadataValue[int],\n):\n """Container class for int metadata entry data.\n\n Args:\n value (Optional[int]): The int value.\n """\n\n def __new__(cls, value: Optional[int]):\n return super(IntMetadataValue, cls).__new__(cls, check.opt_int_param(value, "value"))
\n\n\n@whitelist_for_serdes(storage_name="BoolMetadataEntryData")\nclass BoolMetadataValue(\n NamedTuple("_BoolMetadataValue", [("value", PublicAttr[Optional[bool]])]),\n MetadataValue[bool],\n):\n """Container class for bool metadata entry data.\n\n Args:\n value (Optional[bool]): The bool value.\n """\n\n def __new__(cls, value: Optional[bool]):\n return super(BoolMetadataValue, cls).__new__(cls, check.opt_bool_param(value, "value"))\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterPipelineRunMetadataEntryData")\nclass DagsterRunMetadataValue(\n NamedTuple(\n "_DagsterRunMetadataValue",\n [\n ("run_id", PublicAttr[str]),\n ],\n ),\n MetadataValue[str],\n):\n """Representation of a dagster run.\n\n Args:\n run_id (str): The run id\n """\n\n def __new__(cls, run_id: str):\n return super(DagsterRunMetadataValue, cls).__new__(cls, check.str_param(run_id, "run_id"))\n\n @public\n @property\n def value(self) -> str:\n """str: The wrapped run id."""\n return self.run_id
\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterAssetMetadataEntryData")\nclass DagsterAssetMetadataValue(\n NamedTuple("_DagsterAssetMetadataValue", [("asset_key", PublicAttr["AssetKey"])]),\n MetadataValue["AssetKey"],\n):\n """Representation of a dagster asset.\n\n Args:\n asset_key (AssetKey): The dagster asset key\n """\n\n def __new__(cls, asset_key: "AssetKey"):\n from dagster._core.definitions.events import AssetKey\n\n return super(DagsterAssetMetadataValue, cls).__new__(\n cls, check.inst_param(asset_key, "asset_key", AssetKey)\n )\n\n @public\n @property\n def value(self) -> "AssetKey":\n """AssetKey: The wrapped :py:class:`AssetKey`."""\n return self.asset_key
\n\n\n# This should be deprecated or fixed so that `value` does not return itself.\n
[docs]@experimental\n@whitelist_for_serdes(storage_name="TableMetadataEntryData")\nclass TableMetadataValue(\n NamedTuple(\n "_TableMetadataValue",\n [\n ("records", PublicAttr[Sequence[TableRecord]]),\n ("schema", PublicAttr[TableSchema]),\n ],\n ),\n MetadataValue["TableMetadataValue"],\n):\n """Container class for table metadata entry data.\n\n Args:\n records (TableRecord): The data as a list of records (i.e. rows).\n schema (Optional[TableSchema]): A schema for the table.\n """\n\n
[docs] @public\n @staticmethod\n def infer_column_type(value: object) -> str:\n """str: Infer the :py:class:`TableSchema` column type that will be used for a value."""\n if isinstance(value, bool):\n return "bool"\n elif isinstance(value, int):\n return "int"\n elif isinstance(value, float):\n return "float"\n else:\n return "string"
\n\n def __new__(cls, records: Sequence[TableRecord], schema: Optional[TableSchema]):\n check.sequence_param(records, "records", of_type=TableRecord)\n check.opt_inst_param(schema, "schema", TableSchema)\n\n if len(records) == 0:\n schema = check.not_none(schema, "schema must be provided if records is empty")\n else:\n columns = set(records[0].data.keys())\n for record in records[1:]:\n check.invariant(\n set(record.data.keys()) == columns, "All records must have the same fields"\n )\n schema = schema or TableSchema(\n columns=[\n TableColumn(name=k, type=TableMetadataValue.infer_column_type(v))\n for k, v in records[0].data.items()\n ]\n )\n\n return super(TableMetadataValue, cls).__new__(\n cls,\n records,\n schema,\n )\n\n @public\n @property\n def value(self) -> Self:\n """TableMetadataValue: Identity function."""\n return self
\n\n\n
[docs]@whitelist_for_serdes(storage_name="TableSchemaMetadataEntryData")\nclass TableSchemaMetadataValue(\n NamedTuple("_TableSchemaMetadataValue", [("schema", PublicAttr[TableSchema])]),\n MetadataValue[TableSchema],\n):\n """Representation of a schema for arbitrary tabular data.\n\n Args:\n schema (TableSchema): The dictionary containing the schema representation.\n """\n\n def __new__(cls, schema: TableSchema):\n return super(TableSchemaMetadataValue, cls).__new__(\n cls, check.inst_param(schema, "schema", TableSchema)\n )\n\n @public\n @property\n def value(self) -> TableSchema:\n """TableSchema: The wrapped :py:class:`TableSchema`."""\n return self.schema
\n\n\n@whitelist_for_serdes(storage_name="NullMetadataEntryData")\nclass NullMetadataValue(NamedTuple("_NullMetadataValue", []), MetadataValue[None]):\n """Representation of null."""\n\n @public\n @property\n def value(self) -> None:\n """None: The wrapped null value."""\n return None\n\n\n# ########################\n# ##### METADATA BACKCOMPAT\n# ########################\n\n# Metadata used to be represented as a `List[MetadataEntry]`, but that class has been deleted. But\n# we still serialize metadata dicts to the serialized representation of `List[MetadataEntry]` for\n# backcompat purposes.\n\n\nclass MetadataFieldSerializer(FieldSerializer):\n """Converts between metadata dict (new) and metadata entries list (old)."""\n\n storage_name = "metadata_entries"\n loaded_name = "metadata"\n\n def pack(\n self,\n metadata_dict: Mapping[str, MetadataValue],\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Sequence[Mapping[str, Any]]:\n return [\n {\n "__class__": "EventMetadataEntry",\n "label": k,\n # MetadataValue itself can't inherit from NamedTuple and so isn't a PackableValue,\n # but one of its subclasses will always be returned here.\n "entry_data": pack_value(v, whitelist_map, descent_path), # type: ignore\n "description": None,\n }\n for k, v in metadata_dict.items()\n ]\n\n def unpack(\n self,\n metadata_entries: List["MetadataEntry"],\n whitelist_map: WhitelistMap,\n context: UnpackContext,\n ) -> Mapping[str, MetadataValue]:\n return {e.label: e.entry_data for e in metadata_entries}\n\n\nT_MetadataValue = TypeVar("T_MetadataValue", bound=MetadataValue, covariant=True)\n\n\n# NOTE: MetadataEntry is no longer accessible via the public API-- all metadata APIs use metadata\n# dicts. This clas shas only been preserved to adhere strictly to our backcompat guarantees. It is\n# still instantiated in the above `MetadataFieldSerializer` but that can easily be changed.\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use a dict with `MetadataValue` values instead.",\n)\n@deprecated_param(\n param="entry_data", breaking_version="2.0", additional_warn_text="Use `value` instead."\n)\n@whitelist_for_serdes(storage_name="EventMetadataEntry")\nclass MetadataEntry(\n NamedTuple(\n "_MetadataEntry",\n [\n ("label", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("entry_data", PublicAttr[MetadataValue]),\n ],\n ),\n Generic[T_MetadataValue],\n):\n """A structure for describing metadata for Dagster events.\n\n .. note:: This class is no longer usable in any Dagster API, and will be completely removed in 2.0.\n\n Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\n in the Dagster UI and other tooling.\n\n Should be yielded from within an IO manager to append metadata for a given input/output event.\n For other event types, passing a dict with `MetadataValue` values to the `metadata` argument\n is preferred.\n\n Args:\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n value (MetadataValue): Typed metadata entry data. The different types allow\n for customized display in tools like the Dagster UI.\n """\n\n def __new__(\n cls,\n label: str,\n description: Optional[str] = None,\n entry_data: Optional["RawMetadataValue"] = None,\n value: Optional["RawMetadataValue"] = None,\n ):\n value = cast(\n RawMetadataValue,\n normalize_renamed_param(\n new_val=value,\n new_arg="value",\n old_val=entry_data,\n old_arg="entry_data",\n ),\n )\n value = normalize_metadata_value(value)\n\n return super(MetadataEntry, cls).__new__(\n cls,\n check.str_param(label, "label"),\n check.opt_str_param(description, "description"),\n check.inst_param(value, "value", MetadataValue),\n )\n\n @property\n def value(self):\n """Alias of `entry_data`."""\n return self.entry_data
\n
", "current_page_name": "_modules/dagster/_core/definitions/metadata", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "table": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.metadata.table

\nfrom typing import Mapping, NamedTuple, Optional, Sequence, Union, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental, public\nfrom dagster._serdes.serdes import (\n    whitelist_for_serdes,\n)\n\n# ########################\n# ##### TABLE RECORD\n# ########################\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass TableRecord(\n NamedTuple("TableRecord", [("data", PublicAttr[Mapping[str, Union[str, int, float, bool]]])])\n):\n """Represents one record in a table. Field keys are arbitrary strings-- field values must be\n strings, integers, floats, or bools.\n """\n\n def __new__(cls, data: Mapping[str, Union[str, int, float, bool]]):\n check.dict_param(\n data,\n "data",\n value_type=(str, float, int, bool, type(None)),\n additional_message="Record fields must be one of types: (str, float, int, bool)",\n )\n return super(TableRecord, cls).__new__(cls, data=data)
\n\n\n# ########################\n# ##### TABLE SCHEMA\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableSchema(\n NamedTuple(\n "TableSchema",\n [\n ("columns", PublicAttr[Sequence["TableColumn"]]),\n ("constraints", PublicAttr["TableConstraints"]),\n ],\n )\n):\n """Representation of a schema for tabular data.\n\n Schema is composed of two parts:\n\n - A required list of columns (`TableColumn`). Each column specifies a\n `name`, `type`, set of `constraints`, and (optional) `description`. `type`\n defaults to `string` if unspecified. Column constraints\n (`TableColumnConstraints`) consist of boolean properties `unique` and\n `nullable`, as well as a list of strings `other` containing string\n descriptions of all additional constraints (e.g. `"<= 5"`).\n - An optional list of table-level constraints (`TableConstraints`). A\n table-level constraint cannot be expressed in terms of a single column,\n e.g. col a > col b. Presently, all table-level constraints must be\n expressed as strings under the `other` attribute of a `TableConstraints`\n object.\n\n .. 
code-block:: python\n\n # example schema\n TableSchema(\n constraints = TableConstraints(\n other = [\n "foo > bar",\n ],\n ),\n columns = [\n TableColumn(\n name = "foo",\n type = "string",\n description = "Foo description",\n constraints = TableColumnConstraints(\n required = True,\n other = [\n "starts with the letter 'a'",\n ],\n ),\n ),\n TableColumn(\n name = "bar",\n type = "string",\n ),\n TableColumn(\n name = "baz",\n type = "custom_type",\n constraints = TableColumnConstraints(\n unique = True,\n )\n ),\n ],\n )\n\n Args:\n columns (List[TableColumn]): The columns of the table.\n constraints (Optional[TableConstraints]): The constraints of the table.\n """\n\n def __new__(\n cls,\n columns: Sequence["TableColumn"],\n constraints: Optional["TableConstraints"] = None,\n ):\n return super(TableSchema, cls).__new__(\n cls,\n columns=check.sequence_param(columns, "columns", of_type=TableColumn),\n constraints=check.opt_inst_param(\n constraints, "constraints", TableConstraints, default=_DEFAULT_TABLE_CONSTRAINTS\n ),\n )\n\n
[docs] @public\n @staticmethod\n def from_name_type_dict(name_type_dict: Mapping[str, str]):\n """Constructs a TableSchema from a dictionary whose keys are column names and values are the\n names of data types of those columns.\n """\n return TableSchema(\n columns=[\n TableColumn(name=name, type=type_str) for name, type_str in name_type_dict.items()\n ]\n )
\n\n\n# ########################\n# ##### TABLE CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableConstraints(\n NamedTuple(\n "TableConstraints",\n [\n ("other", PublicAttr[Sequence[str]]),\n ],\n )\n):\n """Descriptor for "table-level" constraints. Presently only one property,\n `other` is supported. This contains strings describing arbitrary\n table-level constraints. A table-level constraint is a constraint defined\n in terms of multiple columns (e.g. col_A > col_B) or in terms of rows.\n\n Args:\n other (List[str]): Descriptions of arbitrary table-level constraints.\n """\n\n def __new__(\n cls,\n other: Sequence[str],\n ):\n return super(TableConstraints, cls).__new__(\n cls,\n other=check.sequence_param(other, "other", of_type=str),\n )
\n\n\n_DEFAULT_TABLE_CONSTRAINTS = TableConstraints(other=[])\n\n# ########################\n# ##### TABLE COLUMN\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumn(\n NamedTuple(\n "TableColumn",\n [\n ("name", PublicAttr[str]),\n ("type", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("constraints", PublicAttr["TableColumnConstraints"]),\n ],\n )\n):\n """Descriptor for a table column. The only property that must be specified\n by the user is `name`. If no `type` is specified, `string` is assumed. If\n no `constraints` are specified, the column is assumed to be nullable\n (i.e. `required = False`) and have no other constraints beyond the data type.\n\n Args:\n name (List[str]): Descriptions of arbitrary table-level constraints.\n type (Optional[str]): The type of the column. Can be an arbitrary\n string. Defaults to `"string"`.\n description (Optional[str]): Description of this column. Defaults to `None`.\n constraints (Optional[TableColumnConstraints]): Column-level constraints.\n If unspecified, column is nullable with no constraints.\n """\n\n def __new__(\n cls,\n name: str,\n type: str = "string", # noqa: A002\n description: Optional[str] = None,\n constraints: Optional["TableColumnConstraints"] = None,\n ):\n return super(TableColumn, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n type=check.str_param(type, "type"),\n description=check.opt_str_param(description, "description"),\n constraints=cast(\n "TableColumnConstraints",\n check.opt_inst_param(\n constraints,\n "constraints",\n TableColumnConstraints,\n default=_DEFAULT_TABLE_COLUMN_CONSTRAINTS,\n ),\n ),\n )
\n\n\n# ########################\n# ##### TABLE COLUMN CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumnConstraints(\n NamedTuple(\n "TableColumnConstraints",\n [\n ("nullable", PublicAttr[bool]),\n ("unique", PublicAttr[bool]),\n ("other", PublicAttr[Optional[Sequence[str]]]),\n ],\n )\n):\n """Descriptor for a table column's constraints. Nullability and uniqueness are specified with\n boolean properties. All other constraints are described using arbitrary strings under the\n `other` property.\n\n Args:\n nullable (Optional[bool]): If true, this column can hold null values.\n unique (Optional[bool]): If true, all values in this column must be unique.\n other (List[str]): Descriptions of arbitrary column-level constraints\n not expressible by the predefined properties.\n """\n\n def __new__(\n cls,\n nullable: bool = True,\n unique: bool = False,\n other: Optional[Sequence[str]] = None,\n ):\n return super(TableColumnConstraints, cls).__new__(\n cls,\n nullable=check.bool_param(nullable, "nullable"),\n unique=check.bool_param(unique, "unique"),\n other=check.opt_sequence_param(other, "other"),\n )
\n\n\n_DEFAULT_TABLE_COLUMN_CONSTRAINTS = TableColumnConstraints()\n
", "current_page_name": "_modules/dagster/_core/definitions/metadata/table", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.definitions.metadata"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.metadata.table"}, "title": "dagster._core.definitions.metadata"}, "multi_asset_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.multi_asset_sensor_definition

\nimport inspect\nimport json\nfrom collections import OrderedDict, defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_selection import AssetSelection\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.scoped_resources_builder import ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._utils import normalize_to_repository\n\nfrom .events import AssetKey\nfrom .run_request import RunRequest, SensorResult, SkipReason\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorType,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\n    from dagster._core.storage.event_log.base import EventLogRecord\n\nMAX_NUM_UNCONSUMED_EVENTS = 25\n\n\nclass MultiAssetSensorAssetCursorComponent(\n    NamedTuple(\n        "_MultiAssetSensorAssetCursorComponent",\n        [\n            
("latest_consumed_event_partition", Optional[str]),\n            ("latest_consumed_event_id", Optional[int]),\n            ("trailing_unconsumed_partitioned_event_ids", Dict[str, int]),\n        ],\n    )\n):\n    """A cursor component that is used to track the cursor for a particular asset in a multi-asset\n    sensor.\n\n    Here's an illustration to help explain how this representation works:\n\n    partition_1  ---|----------a----\n    partition_2  -t-----|-x---------\n    partition_3  ----t------|---a---\n\n\n    The "|", "a", "t", and "x" characters represent materialization events.\n    The x-axis is storage_id, which is basically time. The cursor has been advanced to the "|" event\n    for each partition. latest_evaluated_event_partition would be "partition_3", and\n    "latest_evaluated_event_id" would be the storage_id of the "|" event for partition_3.\n\n    The "t" events aren't directly represented in the cursor, because they trail the event that the\n    the cursor for their partition has advanced to. The "a" events aren't directly represented\n    in the cursor, because they occurred after the "latest_evaluated_event_id".  
The "x" event is\n    included in "unevaluated_partitioned_event_ids", because it's after the event that the cursor\n    for its partition has advanced to, but trails "latest_evaluated_event_id".\n\n    Attributes:\n        latest_consumed_event_partition (Optional[str]): The partition of the latest consumed event\n            for this asset.\n        latest_consumed_event_id (Optional[int]): The event ID of the latest consumed event for\n            this asset.\n        trailing_unconsumed_partitioned_event_ids (Dict[str, int]): A mapping containing\n            the partition key mapped to the latest unconsumed materialization event for this\n            partition with an ID less than latest_consumed_event_id.\n    """\n\n    def __new__(\n        cls,\n        latest_consumed_event_partition,\n        latest_consumed_event_id,\n        trailing_unconsumed_partitioned_event_ids,\n    ):\n        return super(MultiAssetSensorAssetCursorComponent, cls).__new__(\n            cls,\n            latest_consumed_event_partition=check.opt_str_param(\n                latest_consumed_event_partition, "latest_consumed_event_partition"\n            ),\n            latest_consumed_event_id=check.opt_int_param(\n                latest_consumed_event_id, "latest_consumed_event_id"\n            ),\n            trailing_unconsumed_partitioned_event_ids=check.dict_param(\n                trailing_unconsumed_partitioned_event_ids,\n                "trailing_unconsumed_partitioned_event_ids",\n                key_type=str,\n                value_type=int,\n            ),\n        )\n\n\nclass MultiAssetSensorContextCursor:\n    # Tracks the state of the cursor within the tick, created for utility purposes.\n    # Must call MultiAssetSensorEvaluationContext._update_cursor_after_evaluation at end of tick\n    # to serialize the cursor.\n    def __init__(self, cursor: Optional[str], context: "MultiAssetSensorEvaluationContext"):\n        loaded_cursor = json.loads(cursor) if cursor else 
{}\n        self._cursor_component_by_asset_key: Dict[str, MultiAssetSensorAssetCursorComponent] = {}\n\n        # The initial latest consumed event ID at the beginning of the tick\n        self.initial_latest_consumed_event_ids_by_asset_key: Dict[str, Optional[int]] = {}\n\n        for str_asset_key, cursor_list in loaded_cursor.items():\n            if len(cursor_list) != 3:\n                # In this case, the cursor object is not a multi asset sensor asset cursor\n                # component. This cursor is maintained by the asset reconciliation sensor.\n                break\n            else:\n                partition_key, event_id, trailing_unconsumed_partitioned_event_ids = cursor_list\n                self._cursor_component_by_asset_key[str_asset_key] = (\n                    MultiAssetSensorAssetCursorComponent(\n                        latest_consumed_event_partition=partition_key,\n                        latest_consumed_event_id=event_id,\n                        trailing_unconsumed_partitioned_event_ids=trailing_unconsumed_partitioned_event_ids,\n                    )\n                )\n\n                self.initial_latest_consumed_event_ids_by_asset_key[str_asset_key] = event_id\n\n        check.dict_param(self._cursor_component_by_asset_key, "unpacked_cursor", key_type=str)\n        self._context = context\n\n    def get_cursor_for_asset(self, asset_key: AssetKey) -> MultiAssetSensorAssetCursorComponent:\n        return self._cursor_component_by_asset_key.get(\n            str(asset_key), MultiAssetSensorAssetCursorComponent(None, None, {})\n        )\n\n    def get_stringified_cursor(self) -> str:\n        return json.dumps(self._cursor_component_by_asset_key)\n\n\n
[docs]@experimental\nclass MultiAssetSensorEvaluationContext(SensorEvaluationContext):\n """The context object available as the argument to the evaluation function of a\n :py:class:`dagster.MultiAssetSensorDefinition`.\n\n Users should not instantiate this object directly. To construct a\n `MultiAssetSensorEvaluationContext` for testing purposes, use :py:func:`dagster.\n build_multi_asset_sensor_context`.\n\n The `MultiAssetSensorEvaluationContext` contains a cursor object that tracks the state of\n consumed event logs for each monitored asset. For each asset, the cursor stores the storage ID\n of the latest materialization that has been marked as "consumed" (via a call to `advance_cursor`)\n in a `latest_consumed_event_id` field.\n\n For each monitored asset, the cursor will store the latest unconsumed event ID for up to 25\n partitions. Each event ID must be before the `latest_consumed_event_id` field for the asset.\n\n Events marked as consumed via `advance_cursor` will be returned in future ticks until they\n are marked as consumed.\n\n To update the cursor to the latest materialization and clear the unconsumed events, call\n `advance_all_cursors`.\n\n Attributes:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets monitored\n by the sensor. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n repository_def (Optional[RepositoryDefinition]): The repository that the sensor belongs to.\n If needed by the sensor top-level resource definitions will be pulled from this repository.\n You can provide either this or `definitions`.\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest. 
Must be a dictionary of asset key\n strings to a stringified tuple of (latest_event_partition, latest_event_storage_id,\n trailing_unconsumed_partitioned_event_ids).\n last_completion_time (float): DEPRECATED The last time that the sensor was consumed (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n\n Example:\n .. code-block:: python\n\n from dagster import multi_asset_sensor, MultiAssetSensorEvaluationContext\n\n @multi_asset_sensor(monitored_assets=[AssetKey("asset_1), AssetKey("asset_2)])\n def the_sensor(context: MultiAssetSensorEvaluationContext):\n ...\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n repository_def: Optional["RepositoryDefinition"],\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n instance: Optional[DagsterInstance] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n definitions: Optional["Definitions"] = None,\n ):\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n )\n 
self._monitored_asset_keys: Sequence[AssetKey]\n if isinstance(monitored_assets, AssetSelection):\n repo_assets = self._repository_def.assets_defs_by_key.values()\n repo_source_assets = self._repository_def.source_assets_by_key.values()\n self._monitored_asset_keys = list(\n monitored_assets.resolve([*repo_assets, *repo_source_assets])\n )\n else:\n self._monitored_asset_keys = monitored_assets\n\n self._assets_by_key: Dict[AssetKey, Optional[AssetsDefinition]] = {}\n self._partitions_def_by_asset_key: Dict[AssetKey, Optional[PartitionsDefinition]] = {}\n for asset_key in self._monitored_asset_keys:\n assets_def = self._repository_def.assets_defs_by_key.get(asset_key)\n self._assets_by_key[asset_key] = assets_def\n\n source_asset_def = self._repository_def.source_assets_by_key.get(asset_key)\n self._partitions_def_by_asset_key[asset_key] = (\n assets_def.partitions_def\n if assets_def\n else source_asset_def.partitions_def if source_asset_def else None\n )\n\n # Cursor object with utility methods for updating and retrieving cursor information.\n # At the end of each tick, must call update_cursor_after_evaluation to update the serialized\n # cursor.\n self._unpacked_cursor = MultiAssetSensorContextCursor(cursor, self)\n self._cursor_advance_state_mutation = MultiAssetSensorCursorAdvances()\n\n self._initial_unconsumed_events_by_id: Dict[int, EventLogRecord] = {}\n self._fetched_initial_unconsumed_events = False\n\n super(MultiAssetSensorEvaluationContext, self).__init__(\n instance_ref=instance_ref,\n last_completion_time=last_completion_time,\n last_run_key=last_run_key,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n repository_def=repository_def,\n resources=resource_defs,\n )\n\n def _cache_initial_unconsumed_events(self) -> None:\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n # This method caches the initial unconsumed events for each asset key. 
To generate the\n # current unconsumed events, call get_trailing_unconsumed_events instead.\n if self._fetched_initial_unconsumed_events:\n return\n\n for asset_key in self._monitored_asset_keys:\n unconsumed_event_ids = list(\n self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values()\n )\n if unconsumed_event_ids:\n event_records = self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n storage_ids=unconsumed_event_ids,\n )\n )\n self._initial_unconsumed_events_by_id.update(\n {event_record.storage_id: event_record for event_record in event_records}\n )\n\n self._fetched_initial_unconsumed_events = True\n\n def _get_unconsumed_events_with_ids(\n self, event_ids: Sequence[int]\n ) -> Sequence["EventLogRecord"]:\n self._cache_initial_unconsumed_events()\n unconsumed_events = []\n for event_id in sorted(event_ids):\n event = self._initial_unconsumed_events_by_id.get(event_id)\n unconsumed_events.extend([event] if event else [])\n\n return unconsumed_events\n\n
@public
def get_trailing_unconsumed_events(self, asset_key: AssetKey) -> Sequence["EventLogRecord"]:
    """Return the trailing unconsumed materialization events tracked for ``asset_key``.

    The events looked up are the ones whose storage IDs are recorded in the asset's cursor
    under ``trailing_unconsumed_partitioned_event_ids``. To mark an event as consumed, pass
    it to `advance_cursor`.

    Args:
        asset_key (AssetKey): The asset key to get unconsumed events for.

    Returns:
        Sequence[EventLogRecord]: The unconsumed events for the given asset key, in
        ascending order by storage ID.
    """
    check.inst_param(asset_key, "asset_key", AssetKey)

    asset_cursor = self._get_cursor(asset_key)
    trailing_event_ids = [*asset_cursor.trailing_unconsumed_partitioned_event_ids.values()]
    return self._get_unconsumed_events_with_ids(trailing_event_ids)
def _get_partitions_after_cursor(self, asset_key: AssetKey) -> Sequence[str]:
    """Return the partition keys of ``asset_key`` that come after the cursor's partition.

    When the asset's cursor has no consumed partition yet, every partition key of the
    asset's partitions definition is returned.

    Raises:
        DagsterInvalidInvocationError: If no partitions definition is known for the asset.
        ValueError: If the cursor's partition is not among the current partition keys
            (raised by ``list.index``).
    """
    asset_key = check.inst_param(asset_key, "asset_key", AssetKey)
    cursor_partition = self._get_cursor(asset_key).latest_consumed_event_partition

    partitions_def = self._partitions_def_by_asset_key.get(asset_key)
    if not isinstance(partitions_def, PartitionsDefinition):
        raise DagsterInvalidInvocationError(f"No partitions defined for asset key {asset_key}")

    all_partition_keys = list(
        partitions_def.get_partition_keys(dynamic_partitions_store=self.instance)
    )
    if cursor_partition is None:
        return all_partition_keys

    # Slice strictly after the cursor partition: the cursor partition itself is excluded.
    first_after_cursor = all_partition_keys.index(cursor_partition) + 1
    return all_partition_keys[first_after_cursor:]


def update_cursor_after_evaluation(self) -> None:
    """Fold the advances recorded during this tick into the context's cursor.

    This method should be called at most once per evaluation, after the sensor's
    evaluation function has run.
    """
    advanced_cursor = self._cursor_advance_state_mutation.get_cursor_with_advances(
        self, self._unpacked_cursor
    )
    if advanced_cursor is None:
        # Nothing was advanced through this context object, so the cursor stays as-is.
        return

    self._cursor = advanced_cursor
    self._unpacked_cursor = MultiAssetSensorContextCursor(advanced_cursor, self)
    # Start the next evaluation from a clean slate of recorded advances, and force the
    # unconsumed-event cache to be refetched against the new cursor.
    self._cursor_advance_state_mutation = MultiAssetSensorCursorAdvances()
    self._fetched_initial_unconsumed_events = False
@public
def latest_materialization_records_by_key(
    self,
    asset_keys: Optional[Sequence[AssetKey]] = None,
) -> Mapping[AssetKey, Optional["EventLogRecord"]]:
    """Fetches the most recent materialization event record for each asset in asset_keys.

    A record is only reported when its storage ID is greater than the latest consumed
    event ID in that asset's cursor; trailing unconsumed events are not evaluated here.

    Args:
        asset_keys (Optional[Sequence[AssetKey]]): list of asset keys to fetch events for. If
            not specified, the latest materialization will be fetched for all assets the
            multi_asset_sensor monitors.

    Returns: Mapping of AssetKey to EventLogRecord where the EventLogRecord is the latest
        materialization event for the asset. If there is no materialization event for the
        asset newer than its cursor, the value in the mapping will be None.
    """
    if asset_keys is None:
        asset_keys = self._monitored_asset_keys
    else:
        asset_keys = check.opt_sequence_param(asset_keys, "asset_keys", of_type=AssetKey)

    fetched_asset_records = self.instance.get_asset_records(asset_keys)

    # Every requested key is present in the result, defaulting to None.
    latest_by_key: Dict[AssetKey, Optional[EventLogRecord]] = dict.fromkeys(asset_keys)
    for asset_record in fetched_asset_records:
        entry = asset_record.asset_entry
        latest_record = entry.last_materialization_record
        if not latest_record:
            continue

        consumed_event_id = self._get_cursor(entry.asset_key).latest_consumed_event_id or 0
        if latest_record.storage_id > consumed_event_id:
            latest_by_key[entry.asset_key] = latest_record

    return latest_by_key
\n\n
@public
def materialization_records_for_key(
    self, asset_key: AssetKey, limit: Optional[int] = None
) -> Iterable["EventLogRecord"]:
    """Fetches asset materialization event records for asset_key, with the earliest event first.

    Only events after the latest consumed event ID in the asset's cursor are fetched.

    Args:
        asset_key (AssetKey): The asset to fetch materialization events for
        limit (Optional[int]): The number of events to fetch

    Raises:
        DagsterInvalidInvocationError: If ``asset_key`` is not monitored by the sensor.
    """
    from dagster._core.events import DagsterEventType
    from dagster._core.storage.event_log.base import EventRecordsFilter

    asset_key = check.inst_param(asset_key, "asset_key", AssetKey)
    if asset_key not in self._assets_by_key:
        raise DagsterInvalidInvocationError(f"Asset key {asset_key} not monitored by sensor.")

    instance = self.instance
    records_filter = EventRecordsFilter(
        event_type=DagsterEventType.ASSET_MATERIALIZATION,
        asset_key=asset_key,
        after_cursor=self._get_cursor(asset_key).latest_consumed_event_id,
    )
    return list(instance.get_event_records(records_filter, ascending=True, limit=limit))
def _get_cursor(self, asset_key: AssetKey) -> MultiAssetSensorAssetCursorComponent:
    """Look up the per-asset cursor component for ``asset_key`` in the unpacked cursor.

    For more information, view the docstring for the MultiAssetSensorAssetCursorComponent class.
    """
    validated_key = check.inst_param(asset_key, "asset_key", AssetKey)
    unpacked_cursor = self._unpacked_cursor
    return unpacked_cursor.get_cursor_for_asset(validated_key)
@public
def latest_materialization_records_by_partition(
    self,
    asset_key: AssetKey,
    after_cursor_partition: Optional[bool] = False,
) -> Mapping[str, "EventLogRecord"]:
    """Given an asset, returns a mapping of partition key to the latest materialization event
    for that partition. Fetches only materializations that have not been marked as "consumed"
    via a call to `advance_cursor`.

    Args:
        asset_key (AssetKey): The asset to fetch events for.
        after_cursor_partition (Optional[bool]): If True, only materializations with partitions
            after the cursor's current partition will be returned. By default, set to False.

    Returns:
        Mapping[str, EventLogRecord]:
            Mapping of partition key to the most recent unconsumed materialization
            EventLogRecord for that partition. The mapping preserves the order in which
            the materializations occurred.

    Raises:
        DagsterInvalidInvocationError: If ``asset_key`` is not monitored by the sensor.
        DagsterInvariantViolationError: If the asset has no partitions definition.

    Example:
        .. code-block:: python

            @asset(partitions_def=DailyPartitionsDefinition("2022-07-01"))
            def july_asset():
                return 1

            @multi_asset_sensor(monitored_assets=[july_asset.key])
            def my_sensor(context):
                context.latest_materialization_records_by_partition(july_asset.key)

            # After materializing july_asset for 2022-07-05,
            # latest_materialization_records_by_partition returns
            # {"2022-07-05": EventLogRecord(...)}

    """
    from dagster._core.events import DagsterEventType
    from dagster._core.storage.event_log.base import EventLogRecord, EventRecordsFilter

    asset_key = check.inst_param(asset_key, "asset_key", AssetKey)

    if asset_key not in self._assets_by_key:
        raise DagsterInvalidInvocationError(
            f"Asset key {asset_key} not monitored in sensor definition"
        )

    partitions_def = self._partitions_def_by_asset_key.get(asset_key)
    if not isinstance(partitions_def, PartitionsDefinition):
        raise DagsterInvariantViolationError(
            "Cannot get latest materialization by partition for assets with no partitions"
        )

    partitions_to_fetch = (
        self._get_partitions_after_cursor(asset_key)
        if after_cursor_partition
        else list(partitions_def.get_partition_keys(dynamic_partitions_store=self.instance))
    )
    # Set view for O(1) membership tests; the list itself is still what gets queried.
    partitions_to_fetch_set = set(partitions_to_fetch)

    # Retain ordering of materializations
    materialization_by_partition: Dict[str, EventLogRecord] = OrderedDict()

    def _record_latest(partition: str, record: "EventLogRecord") -> None:
        # Re-inserting (rather than overwriting in place) moves the partition to the end,
        # so the mapping stays ordered by when each partition was last materialized.
        materialization_by_partition.pop(partition, None)
        materialization_by_partition[partition] = record

    # Trailing unconsumed events came before the cursor, so they go in first.
    # _get_unconsumed_events_with_ids already yields them in ascending storage ID order,
    # so no additional sort of the records is needed.
    trailing_unconsumed_events = self._get_unconsumed_events_with_ids(
        list(self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values())
    )
    for unconsumed_event in trailing_unconsumed_events:
        partition = unconsumed_event.partition_key
        if isinstance(partition, str) and partition in partitions_to_fetch_set:
            _record_latest(partition, unconsumed_event)

    # Then layer on everything materialized after the cursor, oldest first.
    partition_materializations = self.instance.get_event_records(
        EventRecordsFilter(
            event_type=DagsterEventType.ASSET_MATERIALIZATION,
            asset_key=asset_key,
            asset_partitions=partitions_to_fetch,
            after_cursor=self._get_cursor(asset_key).latest_consumed_event_id,
        ),
        ascending=True,
    )
    for materialization in partition_materializations:
        partition = materialization.partition_key
        if isinstance(partition, str):
            _record_latest(partition, materialization)

    return materialization_by_partition
\n\n
@public
def latest_materialization_records_by_partition_and_asset(
    self,
) -> Mapping[str, Mapping[AssetKey, "EventLogRecord"]]:
    """Group the latest unconsumed materialization of every monitored asset by partition.

    For each monitored asset, the per-partition result of
    `latest_materialization_records_by_partition` is merged into a mapping of partition key
    to a mapping of asset key to the materialization event for that partition.

    For example, if the sensor monitors two partitioned assets A and B that are materialized
    for partition_x after the cursor, this function returns:

        .. code-block:: python

            {
                "partition_x": {asset_a.key: EventLogRecord(...), asset_b.key: EventLogRecord(...)}
            }

    This method can only be called when all monitored assets are partitioned and share
    the same partition definition.
    """
    known_partitions_defs = list(self._partitions_def_by_asset_key.values())
    all_share_one_def = bool(known_partitions_defs) and all(
        candidate == known_partitions_defs[0] for candidate in known_partitions_defs
    )
    if not all_share_one_def:
        raise DagsterInvalidInvocationError(
            "All assets must be partitioned and share the same partitions definition"
        )

    records_by_partition: Dict[str, Dict[AssetKey, "EventLogRecord"]] = defaultdict(dict)
    for monitored_key in self._monitored_asset_keys:
        per_partition = self.latest_materialization_records_by_partition(monitored_key)
        for partition, record in per_partition.items():
            records_by_partition[partition][monitored_key] = record

    return records_by_partition
\n\n
@public
def get_cursor_partition(self, asset_key: Optional[AssetKey]) -> Optional[str]:
    """A utility method to get the current partition the cursor is on.

    Args:
        asset_key (Optional[AssetKey]): The monitored asset whose cursor partition should be
            returned. May be None only when the sensor monitors exactly one asset, in which
            case that asset is used.

    Returns:
        Optional[str]: The partition key of the latest consumed event recorded in the
        asset's cursor, or None if the cursor has no consumed partition.

    Raises:
        DagsterInvalidInvocationError: If ``asset_key`` is provided but not monitored by the
            sensor, or if it is omitted while the sensor does not monitor exactly one asset.
    """
    asset_key = check.opt_inst_param(asset_key, "asset_key", AssetKey)

    if asset_key is not None:
        # Only validate membership for an explicitly provided key. Checking None against
        # the monitored keys would always fail and make the single-asset fallback below
        # unreachable.
        if asset_key not in self._monitored_asset_keys:
            raise DagsterInvalidInvocationError(
                "Provided asset key must correspond to a provided asset"
            )
        return self._get_cursor(asset_key).latest_consumed_event_partition

    if self._monitored_asset_keys is not None and len(self._monitored_asset_keys) == 1:
        # No key given, but there is only one candidate: use it.
        return self._get_cursor(self._monitored_asset_keys[0]).latest_consumed_event_partition

    raise DagsterInvalidInvocationError(
        "Asset key must be provided when multiple assets are defined"
    )
\n\n
@public
def all_partitions_materialized(
    self, asset_key: AssetKey, partitions: Optional[Sequence[str]] = None
) -> bool:
    """Report whether every selected partition of an asset has been materialized.

    This method ignores the cursor and checks all materializations for the asset.

    Args:
        asset_key (AssetKey): The asset to check partitions for.
        partitions (Optional[Sequence[str]]): A list of partitions to check. If not provided,
            all partitions for the asset will be checked.

    Returns:
        bool: True if all selected partitions have been materialized, False otherwise.

    Raises:
        DagsterInvalidInvocationError: If ``partitions`` is an empty list.
        DagsterInvariantViolationError: If ``partitions`` is omitted and the asset is either
            not monitored by the sensor or not partitioned.
    """
    check.inst_param(asset_key, "asset_key", AssetKey)

    if partitions is not None:
        check.sequence_param(partitions, "partitions", of_type=str)
        if len(partitions) == 0:
            raise DagsterInvalidInvocationError("Must provide at least one partition in list")

    already_materialized = self.instance.get_materialized_partitions(asset_key)

    if not partitions:
        # No explicit selection: fall back to every partition key of the monitored asset.
        if asset_key not in self._monitored_asset_keys:
            raise DagsterInvariantViolationError(
                f"Asset key {asset_key} not monitored by sensor"
            )

        asset_partitions_def = self._partitions_def_by_asset_key.get(asset_key)
        if not asset_partitions_def:
            raise DagsterInvariantViolationError(
                f"Asset key {asset_key} is not partitioned. Cannot check if partitions have"
                " been materialized."
            )
        partitions = asset_partitions_def.get_partition_keys(
            dynamic_partitions_store=self.instance
        )

    return all(partition in already_materialized for partition in partitions)
def _get_asset(self, asset_key: AssetKey, fn_name: str) -> AssetsDefinition:
    """Resolve ``asset_key`` to its AssetsDefinition.

    Monitored assets are looked up first; any other key is looked up in the repository's
    assets definitions.

    Args:
        asset_key (AssetKey): The asset to resolve.
        fn_name (str): Name of the calling context method, used only in the error message.

    Raises:
        DagsterInvalidInvocationError: If the key is monitored but has no AssetsDefinition,
            or if it is neither monitored nor present in the repository.
    """
    from dagster._core.definitions.repository_definition import RepositoryDefinition

    repository = cast(RepositoryDefinition, self._repository_def)
    defs_in_repository = repository.assets_defs_by_key

    if asset_key in self._assets_by_key:
        monitored_def = self._assets_by_key[asset_key]
        if monitored_def is not None:
            return monitored_def
        raise DagsterInvalidInvocationError(
            f"Asset key {asset_key} does not have an AssetDefinition in this repository"
            f" (likely because it is a SourceAsset). fn context.{fn_name} can only be"
            " called for assets with AssetDefinitions in the repository."
        )

    if asset_key in defs_in_repository:
        return defs_in_repository[asset_key]

    raise DagsterInvalidInvocationError(
        f"Asset key {asset_key} not monitored in sensor and does not exist in target jobs"
    )
@public
def get_downstream_partition_keys(
    self, partition_key: str, from_asset_key: AssetKey, to_asset_key: AssetKey
) -> Sequence[str]:
    """Map a partition key of an upstream asset onto the partitions of a downstream asset.

    The partition mapping is obtained from the downstream asset via
    ``infer_partition_mapping`` for the upstream asset key and partitions definition.

    Args:
        partition_key (str): The partition key to convert.
        from_asset_key (AssetKey): The asset key of the upstream asset, which the provided
            partition key belongs to.
        to_asset_key (AssetKey): The asset key of the downstream asset. The provided partition
            key will be mapped to partitions within this asset.

    Returns:
        Sequence[str]: A list of the corresponding downstream partitions in to_asset_key that
            partition_key maps to.

    Raises:
        DagsterInvalidInvocationError: If either asset is not partitioned.
    """
    partition_key = check.str_param(partition_key, "partition_key")

    downstream_asset = self._get_asset(to_asset_key, fn_name="get_downstream_partition_keys")
    upstream_asset = self._get_asset(from_asset_key, fn_name="get_downstream_partition_keys")

    downstream_partitions_def = downstream_asset.partitions_def
    if not isinstance(downstream_partitions_def, PartitionsDefinition):
        raise DagsterInvalidInvocationError(
            f"Asset key {to_asset_key} is not partitioned. Cannot get partition keys."
        )

    upstream_partitions_def = upstream_asset.partitions_def
    if not isinstance(upstream_partitions_def, PartitionsDefinition):
        raise DagsterInvalidInvocationError(
            f"Asset key {from_asset_key} is not partitioned. Cannot get partition keys."
        )

    mapping = downstream_asset.infer_partition_mapping(from_asset_key, upstream_partitions_def)
    # Wrap the single upstream key in a subset, which is what the mapping API consumes.
    upstream_subset = upstream_partitions_def.empty_subset().with_partition_keys(
        [partition_key]
    )
    downstream_subset = mapping.get_downstream_partitions_for_partitions(
        upstream_subset,
        downstream_partitions_def=downstream_partitions_def,
        dynamic_partitions_store=self.instance,
    )

    return list(downstream_subset.get_partition_keys())
\n\n
@public
def advance_cursor(
    self, materialization_records_by_key: Mapping[AssetKey, Optional["EventLogRecord"]]
):
    """Marks the provided materialization records as having been consumed by the sensor.

    The records are only recorded here; at the end of the tick, the cursor will be updated
    to advance past all materialization records provided via `advance_cursor`. In the next
    tick, records that have been consumed will no longer be returned.

    Passing a partitioned materialization record into this function will mark prior
    materializations with the same asset key and partition as having been consumed.

    Args:
        materialization_records_by_key (Mapping[AssetKey, Optional[EventLogRecord]]): Mapping of
            AssetKeys to EventLogRecord or None. If an EventLogRecord is provided, the cursor
            for the AssetKey will be updated and future calls to fetch asset materialization
            events will not fetch this event again. If None is provided, the cursor for the
            AssetKey will not be updated.
    """
    pending_advances = self._cursor_advance_state_mutation
    pending_advances.add_advanced_records(materialization_records_by_key)
    # Flag that the user explicitly advanced the cursor during this tick.
    self._cursor_updated = True
\n\n
@public
def advance_all_cursors(self):
    """Updates the cursor to the most recent materialization event for all assets monitored by
    the multi_asset_sensor.

    Marks all materialization events as consumed by the sensor, including unconsumed events.
    """
    latest_records_by_key = self.latest_materialization_records_by_key()

    pending_advances = self._cursor_advance_state_mutation
    pending_advances.add_advanced_records(latest_records_by_key)
    # Recorded so that trailing unconsumed events are dropped when the cursor is rebuilt.
    pending_advances.advance_all_cursors_called = True
    self._cursor_updated = True
@public
@property
def assets_defs_by_key(self) -> Mapping[AssetKey, Optional[AssetsDefinition]]:
    """Mapping[AssetKey, Optional[AssetsDefinition]]: A mapping from AssetKey to the
    AssetsDefinition object which produces it. If a given asset is monitored by this sensor, but
    is not produced within the same code location as this sensor, then the value will be None.

    The mapping has one entry for every monitored asset key.
    """
    return self._assets_by_key


@public
@property
def asset_keys(self) -> Sequence[AssetKey]:
    """Sequence[AssetKey]: The asset keys which are monitored by this sensor."""
    return self._monitored_asset_keys
class MultiAssetSensorCursorAdvances:
    """Accumulates the records a sensor marks as consumed during a single tick.

    The accumulated state is turned into the next tick's serialized cursor by
    `get_cursor_with_advances`.
    """

    # Storage IDs of the records marked as consumed, grouped by asset key.
    _advanced_record_ids_by_key: Dict[AssetKey, Set[int]]
    # Partition key (or None for unpartitioned records) of each advanced record.
    _partition_key_by_record_id: Dict[int, Optional[str]]
    # Set by the context when `advance_all_cursors` was used in this tick.
    advance_all_cursors_called: bool

    def __init__(self):
        self._advanced_record_ids_by_key = defaultdict(set)
        self._partition_key_by_record_id = {}
        self.advance_all_cursors_called = False

    def add_advanced_records(
        self, materialization_records_by_key: Mapping[AssetKey, Optional["EventLogRecord"]]
    ):
        """Record the given materialization records as consumed.

        Entries whose record is falsy (e.g. None) are ignored.
        """
        for asset_key, materialization in materialization_records_by_key.items():
            if not materialization:
                continue
            record_id = materialization.storage_id
            self._advanced_record_ids_by_key[asset_key].add(record_id)
            self._partition_key_by_record_id[record_id] = materialization.partition_key

    def get_cursor_with_advances(
        self,
        context: MultiAssetSensorEvaluationContext,
        initial_cursor: MultiAssetSensorContextCursor,
    ) -> Optional[str]:
        """Given the multi asset sensor context and the cursor at the start of the tick,
        returns the cursor that should be used in the next tick.

        If the cursor has not been updated, returns None
        """
        if not self._advanced_record_ids_by_key:
            # No events marked as advanced
            return None

        return json.dumps(
            {
                str(asset_key): self.get_asset_cursor_with_advances(
                    asset_key, context, initial_cursor
                )
                for asset_key in context.asset_keys
            }
        )

    def get_asset_cursor_with_advances(
        self,
        asset_key: AssetKey,
        context: MultiAssetSensorEvaluationContext,
        initial_cursor: MultiAssetSensorContextCursor,
    ) -> MultiAssetSensorAssetCursorComponent:
        """Compute the next-tick cursor component for a single asset.

        Args:
            asset_key: The asset whose cursor component is computed.
            context: The evaluation context, used to fetch unconsumed and intervening events.
            initial_cursor: The cursor as it was at the start of the tick.

        Returns:
            The initial component unchanged if nothing was advanced for this asset; otherwise
            a component whose consumed event only moves forward and whose trailing
            unconsumed events reflect the advances made in this tick.

        Raises:
            DagsterInvariantViolationError: If the number of trailing unconsumed events
                reaches MAX_NUM_UNCONSUMED_EVENTS.
        """
        from dagster._core.events import DagsterEventType
        from dagster._core.storage.event_log.base import EventRecordsFilter

        # .get() rather than indexing: the defaultdict must not grow an entry for this key.
        advanced_records: Set[int] = self._advanced_record_ids_by_key.get(asset_key, set())
        if not advanced_records:
            # No events marked as advanced for this asset key
            return initial_cursor.get_cursor_for_asset(asset_key)

        initial_asset_cursor = initial_cursor.get_cursor_for_asset(asset_key)

        latest_consumed_event_id_at_tick_start = initial_asset_cursor.latest_consumed_event_id

        greatest_consumed_event_id_in_tick = max(advanced_records)
        latest_consumed_partition_in_tick = self._partition_key_by_record_id[
            greatest_consumed_event_id_in_tick
        ]
        # The cursor only ever moves forward; advancing an older (trailing) record must not
        # move the consumed-event pointer backwards.
        cursor_moved_forward = greatest_consumed_event_id_in_tick > (
            latest_consumed_event_id_at_tick_start or 0
        )
        latest_unconsumed_record_by_partition: Dict[str, int] = {}

        if not self.advance_all_cursors_called:
            # Work on a copy so the initial cursor's mapping is never mutated in place
            # (it would otherwise be left half-updated if the limit check below raises).
            latest_unconsumed_record_by_partition = dict(
                initial_asset_cursor.trailing_unconsumed_partitioned_event_ids
            )
            unconsumed_events = list(context.get_trailing_unconsumed_events(asset_key)) + list(
                context.instance.get_event_records(
                    EventRecordsFilter(
                        event_type=DagsterEventType.ASSET_MATERIALIZATION,
                        asset_key=asset_key,
                        after_cursor=latest_consumed_event_id_at_tick_start,
                        before_cursor=greatest_consumed_event_id_in_tick,
                    ),
                    ascending=True,
                )
                if cursor_moved_forward
                else []
            )

            # Iterate through events in ascending order, storing the latest unconsumed
            # event for each partition. If an advanced event exists for a partition, clear
            # the prior unconsumed event for that partition.
            for event in unconsumed_events:
                partition = event.partition_key
                if partition is None:  # Ignore unpartitioned events
                    continue
                if event.storage_id not in advanced_records:
                    latest_unconsumed_record_by_partition[partition] = event.storage_id
                else:
                    latest_unconsumed_record_by_partition.pop(partition, None)

            if latest_consumed_partition_in_tick is not None:
                latest_unconsumed_record_by_partition.pop(latest_consumed_partition_in_tick, None)

            if len(latest_unconsumed_record_by_partition) >= MAX_NUM_UNCONSUMED_EVENTS:
                raise DagsterInvariantViolationError(f"""
                    You have reached the maximum number of trailing unconsumed events
                    ({MAX_NUM_UNCONSUMED_EVENTS}) for asset {asset_key} and no more events can be
                    added. You can access the unconsumed events by calling the
                    `get_trailing_unconsumed_events` method on the sensor context, and
                    mark events as consumed by passing them to `advance_cursor`.

                    Otherwise, you can clear all unconsumed events and reset the cursor to the latest
                    materialization for each asset by calling `advance_all_cursors`.
                    """)

        return MultiAssetSensorAssetCursorComponent(
            latest_consumed_event_partition=(
                latest_consumed_partition_in_tick
                if cursor_moved_forward
                else initial_asset_cursor.latest_consumed_event_partition
            ),
            latest_consumed_event_id=(
                greatest_consumed_event_id_in_tick
                if cursor_moved_forward
                else latest_consumed_event_id_at_tick_start
            ),
            trailing_unconsumed_partitioned_event_ids=latest_unconsumed_record_by_partition,
        )


def get_cursor_from_latest_materializations(
    asset_keys: Sequence[AssetKey], instance: DagsterInstance
) -> str:
    """Build a serialized cursor pointing at the latest materialization of each asset.

    Assets for which no materialization record is found get no entry in the cursor.

    Args:
        asset_keys: The asset keys to build cursor entries for.
        instance: The instance whose event log is queried.

    Returns:
        The cursor as a JSON string keyed by ``str(asset_key)``.
    """
    from dagster._core.events import DagsterEventType
    from dagster._core.storage.event_log.base import EventRecordsFilter

    cursor_dict: Dict[str, MultiAssetSensorAssetCursorComponent] = {}

    for asset_key in asset_keys:
        # Materialize before truth-testing so an empty lazy iterable is treated as empty.
        materializations = list(
            instance.get_event_records(
                EventRecordsFilter(
                    DagsterEventType.ASSET_MATERIALIZATION,
                    asset_key=asset_key,
                ),
                limit=1,
            )
        )
        if materializations:
            last_materialization = materializations[-1]

            cursor_dict[str(asset_key)] = MultiAssetSensorAssetCursorComponent(
                last_materialization.partition_key,
                last_materialization.storage_id,
                {},
            )

    return json.dumps(cursor_dict)
@experimental
def build_multi_asset_sensor_context(
    *,
    monitored_assets: Union[Sequence[AssetKey], AssetSelection],
    repository_def: Optional["RepositoryDefinition"] = None,
    instance: Optional[DagsterInstance] = None,
    cursor: Optional[str] = None,
    repository_name: Optional[str] = None,
    cursor_from_latest_materializations: bool = False,
    resources: Optional[Mapping[str, object]] = None,
    definitions: Optional["Definitions"] = None,
) -> MultiAssetSensorEvaluationContext:
    """Builds multi asset sensor execution context for testing purposes using the provided parameters.

    This function can be used to provide a context to the invocation of a multi asset sensor
    definition. If provided, the dagster instance must be persistent;
    DagsterInstance.ephemeral() will result in an error.

    Args:
        monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets monitored
            by the sensor. If an AssetSelection object is provided, it will only apply to assets
            within the Definitions that this sensor is part of.
        repository_def (RepositoryDefinition): `RepositoryDefinition` object that
            the sensor is defined in. Must provide `definitions` if this is not provided.
        instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.
        cursor (Optional[str]): A serialized (JSON string) cursor to provide to the evaluation
            of the sensor. Cannot be combined with `cursor_from_latest_materializations`.
        repository_name (Optional[str]): The name of the repository that the sensor belongs to.
        cursor_from_latest_materializations (bool): If True, the cursor will be set to the latest
            materialization for each monitored asset; this requires `instance`. By default,
            set to False.
        resources (Optional[Mapping[str, object]]): The resource definitions
            to provide to the sensor.
        definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.
            Must provide `repository_def` if this is not provided.

    Examples:
        .. code-block:: python

            with instance_for_test() as instance:
                context = build_multi_asset_sensor_context(
                    monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")],
                    instance=instance,
                )
                my_asset_sensor(context)

    """
    from dagster._core.definitions import RepositoryDefinition
    from dagster._core.definitions.definitions_class import Definitions
    from dagster._core.execution.build_resources import wrap_resources_for_execution

    check.opt_inst_param(instance, "instance", DagsterInstance)
    check.opt_str_param(cursor, "cursor")
    check.opt_str_param(repository_name, "repository_name")
    checked_definitions = check.opt_inst_param(definitions, "definitions", Definitions)
    checked_repository_def = check.opt_inst_param(
        repository_def, "repository_def", RepositoryDefinition
    )
    repository_def = normalize_to_repository(checked_definitions, checked_repository_def)

    check.bool_param(cursor_from_latest_materializations, "cursor_from_latest_materializations")

    if cursor_from_latest_materializations:
        if cursor:
            raise DagsterInvalidInvocationError(
                "Cannot provide both cursor and cursor_from_latest_materializations objects."
                " Dagster will override the provided cursor based on the"
                " cursor_from_latest_materializations object."
            )
        if not instance:
            raise DagsterInvalidInvocationError(
                "Cannot provide cursor_from_latest_materializations object without a Dagster"
                " instance."
            )

        keys_for_cursor: Sequence[AssetKey]
        if not isinstance(monitored_assets, AssetSelection):
            keys_for_cursor = monitored_assets
        else:
            # Resolve the selection against the de-duplicated assets definitions of the
            # repository to get concrete asset keys.
            unique_assets_defs = list(set(repository_def.assets_defs_by_key.values()))
            keys_for_cursor = cast(
                List[AssetKey],
                list(monitored_assets.resolve(unique_assets_defs)),
            )

        cursor = get_cursor_from_latest_materializations(keys_for_cursor, instance)

    return MultiAssetSensorEvaluationContext(
        instance_ref=None,
        last_completion_time=None,
        last_run_key=None,
        cursor=cursor,
        repository_name=repository_name,
        instance=instance,
        monitored_assets=monitored_assets,
        repository_def=repository_def,
        resource_defs=wrap_resources_for_execution(resources),
    )
# Everything an asset-sensor evaluation function may produce: an iterator of
# RunRequest / SkipReason / SensorResult, a sequence of RunRequests, a single
# RunRequest, SkipReason or SensorResult, or None.
AssetMaterializationFunctionReturn = Union[
    Iterator[Union[RunRequest, SkipReason, SensorResult]],
    Sequence[RunRequest],
    RunRequest,
    SkipReason,
    None,
    SensorResult,
]
# Evaluation function type for asset sensors; the parameter list is left open (`...`).
AssetMaterializationFunction = Callable[
    ...,
    AssetMaterializationFunctionReturn,
]

# Evaluation function type for multi-asset sensors; structurally identical to
# AssetMaterializationFunction.
MultiAssetMaterializationFunction = Callable[
    ...,
    AssetMaterializationFunctionReturn,
]
[docs]@experimental\nclass MultiAssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a list of\n assets.\n\n Users should not instantiate this object directly. To construct a\n `MultiAssetSensorDefinition`, use :py:func:`dagster.\n multi_asset_sensor`.\n\n Args:\n name (str): The name of the sensor to create.\n asset_keys (Sequence[AssetKey]): The asset_keys this sensor monitors.\n asset_materialization_fn (Callable[[MultiAssetSensorEvaluationContext], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.MultiAssetSensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_assets (Optional[AssetSelection]): (Experimental) an asset selection to launch a run\n for if the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n\n def __init__(\n self,\n name: str,\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n job_name: Optional[str],\n asset_materialization_fn: MultiAssetMaterializationFunction,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_assets: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(asset_materialization_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def _wrap_asset_fn(materialization_fn):\n def _fn(context):\n def _check_cursor_not_set(sensor_result: SensorResult):\n if sensor_result.cursor:\n raise DagsterInvariantViolationError(\n "Cannot set cursor in a multi_asset_sensor. 
Cursor is set automatically"\n " based on the latest materialization for each monitored asset."\n )\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n with MultiAssetSensorEvaluationContext(\n instance_ref=context.instance_ref,\n last_completion_time=context.last_completion_time,\n last_run_key=context.last_run_key,\n cursor=context.cursor,\n repository_name=context.repository_def.name,\n repository_def=context.repository_def,\n monitored_assets=monitored_assets,\n instance=context.instance,\n resource_defs=context.resource_defs,\n ) as multi_asset_sensor_context:\n context_param_name = get_context_param_name(materialization_fn)\n context_param = (\n {context_param_name: multi_asset_sensor_context}\n if context_param_name\n else {}\n )\n result = materialization_fn(\n **context_param,\n **resource_args_populated,\n )\n if result is None:\n return\n\n # because the materialization_fn can yield results (see _wrapped_fn in multi_asset_sensor decorator),\n # even if you return None in a sensor, it will still cause in inspect.isgenerator(result) to be True.\n # So keep track to see if we actually return any values and should update the cursor\n runs_yielded = False\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n if isinstance(item, RunRequest):\n runs_yielded = True\n if isinstance(item, SensorResult):\n raise DagsterInvariantViolationError(\n "Cannot yield a SensorResult from a multi_asset_sensor. 
Instead"\n " return the SensorResult."\n )\n yield item\n elif isinstance(result, RunRequest):\n runs_yielded = True\n yield result\n elif isinstance(result, SkipReason):\n # if result is a SkipReason, we don't update the cursor, so don't set runs_yielded = True\n yield result\n elif isinstance(result, SensorResult):\n _check_cursor_not_set(result)\n if result.run_requests:\n runs_yielded = True\n yield result\n\n if runs_yielded and not multi_asset_sensor_context.cursor_updated:\n raise DagsterInvalidDefinitionError(\n "Asset materializations have been handled in this sensor, but the cursor"\n " was not updated. This means the same materialization events will be"\n " handled in the next sensor tick. Use context.advance_cursor or"\n " context.advance_all_cursors to update the cursor."\n )\n\n multi_asset_sensor_context.update_cursor_after_evaluation()\n context.update_cursor(multi_asset_sensor_context.cursor)\n\n return _fn\n\n self._raw_asset_materialization_fn = asset_materialization_fn\n\n super(MultiAssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n job_name=job_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn")\n ),\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=request_assets,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> AssetMaterializationFunctionReturn:\n context_param_name = get_context_param_name(self._raw_asset_materialization_fn)\n context = get_sensor_context_from_args_or_kwargs(\n self._raw_asset_materialization_fn,\n args,\n kwargs,\n context_type=MultiAssetSensorEvaluationContext,\n )\n\n resources = validate_and_get_resource_dict(\n context.resources if context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n\n context_param = {context_param_name: 
context} if context_param_name and context else {}\n result = self._raw_asset_materialization_fn(**context_param, **resources)\n\n if context:\n context.update_cursor_after_evaluation()\n return result\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.MULTI_ASSET
\n
", "current_page_name": "_modules/dagster/_core/definitions/multi_asset_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.multi_asset_sensor_definition"}, "multi_dimensional_partitions": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.multi_dimensional_partitions

\nimport hashlib\nimport itertools\nfrom datetime import datetime\nfrom functools import lru_cache, reduce\nfrom typing import (\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownPartitionError,\n)\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._core.storage.tags import (\n    MULTIDIMENSIONAL_PARTITION_PREFIX,\n    get_multidimensional_partition_tag,\n)\n\nfrom .partition import (\n    DefaultPartitionsSubset,\n    DynamicPartitionsDefinition,\n    PartitionsDefinition,\n    PartitionsSubset,\n    StaticPartitionsDefinition,\n)\nfrom .time_window_partitions import TimeWindow, TimeWindowPartitionsDefinition\n\nINVALID_STATIC_PARTITIONS_KEY_CHARACTERS = set(["|", ",", "[", "]"])\n\nMULTIPARTITION_KEY_DELIMITER = "|"\n\n\nclass PartitionDimensionKey(\n    NamedTuple("_PartitionDimensionKey", [("dimension_name", str), ("partition_key", str)])\n):\n    """Representation of a single dimension of a multi-dimensional partition key."""\n\n    def __new__(cls, dimension_name: str, partition_key: str):\n        return super(PartitionDimensionKey, cls).__new__(\n            cls,\n            dimension_name=check.str_param(dimension_name, "dimension_name"),\n            partition_key=check.str_param(partition_key, "partition_key"),\n        )\n\n\n
[docs]class MultiPartitionKey(str):\n """A multi-dimensional partition key stores the partition key for each dimension.\n Subclasses the string class to keep partition key type as a string.\n\n Contains additional methods to access the partition key for each dimension.\n Creates a string representation of the partition key for each dimension, separated by a pipe (|).\n Orders the dimensions by name, to ensure consistent string representation.\n """\n\n dimension_keys: List[PartitionDimensionKey] = []\n\n def __new__(cls, keys_by_dimension: Mapping[str, str]):\n check.mapping_param(\n keys_by_dimension, "partitions_by_dimension", key_type=str, value_type=str\n )\n\n dimension_keys: List[PartitionDimensionKey] = [\n PartitionDimensionKey(dimension, keys_by_dimension[dimension])\n for dimension in sorted(list(keys_by_dimension.keys()))\n ]\n\n str_key = super(MultiPartitionKey, cls).__new__(\n cls,\n MULTIPARTITION_KEY_DELIMITER.join(\n [dim_key.partition_key for dim_key in dimension_keys]\n ),\n )\n\n str_key.dimension_keys = dimension_keys\n\n return str_key\n\n def __getnewargs__(self):\n # When this instance is pickled, replace the argument to __new__ with the\n # dimension key mapping instead of the string representation.\n return ({dim_key.dimension_name: dim_key.partition_key for dim_key in self.dimension_keys},)\n\n @property\n def keys_by_dimension(self) -> Mapping[str, str]:\n return {dim_key.dimension_name: dim_key.partition_key for dim_key in self.dimension_keys}
\n\n\nclass PartitionDimensionDefinition(\n NamedTuple(\n "_PartitionDimensionDefinition",\n [\n ("name", str),\n ("partitions_def", PartitionsDefinition),\n ],\n )\n):\n def __new__(\n cls,\n name: str,\n partitions_def: PartitionsDefinition,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n partitions_def=check.inst_param(partitions_def, "partitions_def", PartitionsDefinition),\n )\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, PartitionDimensionDefinition)\n and self.name == other.name\n and self.partitions_def == other.partitions_def\n )\n\n\nALLOWED_PARTITION_DIMENSION_TYPES = (\n StaticPartitionsDefinition,\n TimeWindowPartitionsDefinition,\n DynamicPartitionsDefinition,\n)\n\n\ndef _check_valid_partitions_dimensions(\n partitions_dimensions: Mapping[str, PartitionsDefinition]\n) -> None:\n for dim_name, partitions_def in partitions_dimensions.items():\n if not any(isinstance(partitions_def, t) for t in ALLOWED_PARTITION_DIMENSION_TYPES):\n raise DagsterInvalidDefinitionError(\n f"Invalid partitions definition type {type(partitions_def)}. "\n "Only the following partitions definition types are supported: "\n f"{ALLOWED_PARTITION_DIMENSION_TYPES}."\n )\n if isinstance(partitions_def, DynamicPartitionsDefinition) and partitions_def.name is None:\n raise DagsterInvalidDefinitionError(\n "DynamicPartitionsDefinition must have a name to be used in a"\n " MultiPartitionsDefinition."\n )\n\n if isinstance(partitions_def, StaticPartitionsDefinition):\n if any(\n [\n INVALID_STATIC_PARTITIONS_KEY_CHARACTERS & set(key)\n for key in partitions_def.get_partition_keys()\n ]\n ):\n raise DagsterInvalidDefinitionError(\n f"Invalid character in partition key for dimension {dim_name}. "\n "A multi-partitions definition cannot contain partition keys with "\n "the following characters: |, [, ], ,"\n )\n\n\n
[docs]class MultiPartitionsDefinition(PartitionsDefinition[MultiPartitionKey]):\n """Takes the cross-product of partitions from two partitions definitions.\n\n For example, with a static partitions definition where the partitions are ["a", "b", "c"]\n and a daily partitions definition, this partitions definition will have the following\n partitions:\n\n 2020-01-01|a\n 2020-01-01|b\n 2020-01-01|c\n 2020-01-02|a\n 2020-01-02|b\n ...\n\n Args:\n partitions_defs (Mapping[str, PartitionsDefinition]):\n A mapping of dimension name to partitions definition. The total set of partitions will\n be the cross-product of the partitions from each PartitionsDefinition.\n\n Attributes:\n partitions_defs (Sequence[PartitionDimensionDefinition]):\n A sequence of PartitionDimensionDefinition objects, each of which contains a dimension\n name and a PartitionsDefinition. The total set of partitions will be the cross-product\n of the partitions from each PartitionsDefinition. This sequence is ordered by\n dimension name, to ensure consistent ordering of the partitions.\n """\n\n def __init__(self, partitions_defs: Mapping[str, PartitionsDefinition]):\n if not len(partitions_defs.keys()) == 2:\n raise DagsterInvalidInvocationError(\n "Dagster currently only supports multi-partitions definitions with 2 partitions"\n " definitions. 
Your multi-partitions definition has"\n f" {len(partitions_defs.keys())} partitions definitions."\n )\n check.mapping_param(\n partitions_defs, "partitions_defs", key_type=str, value_type=PartitionsDefinition\n )\n\n _check_valid_partitions_dimensions(partitions_defs)\n\n self._partitions_defs: List[PartitionDimensionDefinition] = sorted(\n [\n PartitionDimensionDefinition(name, partitions_def)\n for name, partitions_def in partitions_defs.items()\n ],\n key=lambda x: x.name,\n )\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset"]:\n return MultiPartitionsSubset\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(\n str(\n {\n dim_def.name: dim_def.partitions_def.get_serializable_unique_identifier(\n dynamic_partitions_store\n )\n for dim_def in self.partitions_defs\n }\n ).encode("utf-8")\n ).hexdigest()\n\n @property\n def partition_dimension_names(self) -> List[str]:\n return [dim_def.name for dim_def in self._partitions_defs]\n\n @property\n def partitions_defs(self) -> Sequence[PartitionDimensionDefinition]:\n return self._partitions_defs\n\n def get_partitions_def_for_dimension(self, dimension_name: str) -> PartitionsDefinition:\n for dim_def in self._partitions_defs:\n if dim_def.name == dimension_name:\n return dim_def.partitions_def\n check.failed(f"Invalid dimension name {dimension_name}")\n\n # We override the default implementation of `has_partition_key` for performance.\n def has_partition_key(\n self,\n partition_key: Union[MultiPartitionKey, str],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n partition_key = (\n partition_key\n if isinstance(partition_key, MultiPartitionKey)\n else self.get_partition_key_from_str(partition_key)\n )\n if partition_key.keys_by_dimension.keys() != set(self.partition_dimension_names):\n raise 
DagsterUnknownPartitionError(\n f"Invalid partition key {partition_key}. The dimensions of the partition key are"\n " not the dimensions of the partitions definition."\n )\n\n for dimension in self.partitions_defs:\n if not dimension.partitions_def.has_partition_key(\n partition_key.keys_by_dimension[dimension.name],\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n ):\n return False\n return True\n\n # store results for repeated calls with the same current_time\n @lru_cache(maxsize=1)\n def _get_partition_keys(\n self, current_time: datetime, dynamic_partitions_store: Optional[DynamicPartitionsStore]\n ) -> Sequence[MultiPartitionKey]:\n partition_key_sequences = [\n partition_dim.partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for partition_dim in self._partitions_defs\n ]\n\n return [\n MultiPartitionKey(\n {self._partitions_defs[i].name: key for i, key in enumerate(partition_key_tuple)}\n )\n for partition_key_tuple in itertools.product(*partition_key_sequences)\n ]\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[MultiPartitionKey]:\n """Returns a list of MultiPartitionKeys representing the partition keys of the\n PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partition dimensions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when a\n dimension is a DynamicPartitionsDefinition with a name defined. Users can pass the\n DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[MultiPartitionKey]\n """\n return self._get_partition_keys(\n current_time or pendulum.now("UTC"), dynamic_partitions_store\n )
\n\n def filter_valid_partition_keys(\n self, partition_keys: Set[str], dynamic_partitions_store: DynamicPartitionsStore\n ) -> Set[MultiPartitionKey]:\n partition_keys_by_dimension = {\n dim.name: dim.partitions_def.get_partition_keys(\n dynamic_partitions_store=dynamic_partitions_store\n )\n for dim in self.partitions_defs\n }\n validated_partitions = set()\n for partition_key in partition_keys:\n partition_key_strs = partition_key.split(MULTIPARTITION_KEY_DELIMITER)\n if len(partition_key_strs) != len(self.partitions_defs):\n continue\n\n multipartition_key = MultiPartitionKey(\n {dim.name: partition_key_strs[i] for i, dim in enumerate(self._partitions_defs)}\n )\n\n if all(\n key in partition_keys_by_dimension.get(dim, [])\n for dim, key in multipartition_key.keys_by_dimension.items()\n ):\n validated_partitions.add(partition_key)\n\n return validated_partitions\n\n def __eq__(self, other):\n return (\n isinstance(other, MultiPartitionsDefinition)\n and self.partitions_defs == other.partitions_defs\n )\n\n def __hash__(self):\n return hash(\n tuple(\n [\n (partitions_def.name, partitions_def.__repr__())\n for partitions_def in self.partitions_defs\n ]\n )\n )\n\n def __str__(self) -> str:\n dimension_1 = self._partitions_defs[0]\n dimension_2 = self._partitions_defs[1]\n partition_str = (\n "Multi-partitioned, with dimensions: \\n"\n f"{dimension_1.name.capitalize()}: {dimension_1.partitions_def} \\n"\n f"{dimension_2.name.capitalize()}: {dimension_2.partitions_def}"\n )\n return partition_str\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(dimensions={[str(dim) for dim in self.partitions_defs]}"\n\n def get_partition_key_from_str(self, partition_key_str: str) -> MultiPartitionKey:\n """Given a string representation of a partition key, returns a MultiPartitionKey object."""\n check.str_param(partition_key_str, "partition_key_str")\n\n partition_key_strs = partition_key_str.split(MULTIPARTITION_KEY_DELIMITER)\n check.invariant(\n 
len(partition_key_strs) == len(self.partitions_defs),\n f"Expected {len(self.partitions_defs)} partition keys in partition key string"\n f" {partition_key_str}, but got {len(partition_key_strs)}",\n )\n\n return MultiPartitionKey(\n {dim.name: partition_key_strs[i] for i, dim in enumerate(self._partitions_defs)}\n )\n\n def _get_primary_and_secondary_dimension(\n self,\n ) -> Tuple[PartitionDimensionDefinition, PartitionDimensionDefinition]:\n # Multipartitions subsets are serialized by primary dimension. If changing\n # the selection of primary/secondary dimension, will need to also update the\n # serialization of MultiPartitionsSubsets\n\n time_dimensions = [\n dim\n for dim in self.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n if len(time_dimensions) == 1:\n primary_dimension, secondary_dimension = time_dimensions[0], next(\n iter([dim for dim in self.partitions_defs if dim != time_dimensions[0]])\n )\n else:\n primary_dimension, secondary_dimension = (\n self.partitions_defs[0],\n self.partitions_defs[1],\n )\n\n return primary_dimension, secondary_dimension\n\n @property\n def primary_dimension(self) -> PartitionDimensionDefinition:\n return self._get_primary_and_secondary_dimension()[0]\n\n @property\n def secondary_dimension(self) -> PartitionDimensionDefinition:\n return self._get_primary_and_secondary_dimension()[1]\n\n def get_tags_for_partition_key(self, partition_key: str) -> Mapping[str, str]:\n partition_key = cast(MultiPartitionKey, self.get_partition_key_from_str(partition_key))\n tags = {**super().get_tags_for_partition_key(partition_key)}\n tags.update(get_tags_from_multi_partition_key(partition_key))\n return tags\n\n @property\n def time_window_dimension(self) -> PartitionDimensionDefinition:\n time_window_dims = [\n dim\n for dim in self.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n check.invariant(\n len(time_window_dims) == 1, "Expected exactly one time 
window partitioned dimension"\n )\n return next(iter(time_window_dims))\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n if not isinstance(partition_key, MultiPartitionKey):\n partition_key = self.get_partition_key_from_str(partition_key)\n\n time_window_dimension = self.time_window_dimension\n return cast(\n TimeWindowPartitionsDefinition, time_window_dimension.partitions_def\n ).time_window_for_partition_key(\n cast(MultiPartitionKey, partition_key).keys_by_dimension[time_window_dimension.name]\n )\n\n def get_multipartition_keys_with_dimension_value(\n self,\n dimension_name: str,\n dimension_partition_key: str,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n current_time: Optional[datetime] = None,\n ) -> Sequence[MultiPartitionKey]:\n check.str_param(dimension_name, "dimension_name")\n check.str_param(dimension_partition_key, "dimension_partition_key")\n\n matching_dimensions = [\n dimension for dimension in self.partitions_defs if dimension.name == dimension_name\n ]\n other_dimensions = [\n dimension for dimension in self.partitions_defs if dimension.name != dimension_name\n ]\n\n check.invariant(\n len(matching_dimensions) == 1,\n f"Dimension {dimension_name} not found in MultiPartitionsDefinition with dimensions"\n f" {[dim.name for dim in self.partitions_defs]}",\n )\n\n partition_sequences = [\n partition_dim.partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for partition_dim in other_dimensions\n ] + [[dimension_partition_key]]\n\n # Names of partitions dimensions in the same order as partition_sequences\n partition_dim_names = [dim.name for dim in other_dimensions] + [dimension_name]\n\n return [\n MultiPartitionKey(\n {\n partition_dim_names[i]: partition_key\n for i, partition_key in enumerate(partitions_tuple)\n }\n )\n for partitions_tuple in itertools.product(*partition_sequences)\n ]\n\n def get_num_partitions(\n self,\n 
current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # Static partitions definitions can contain duplicate keys (will throw error in 1.3.0)\n # In the meantime, relying on get_num_partitions to handle duplicates to display\n # correct counts in the Dagster UI.\n dimension_counts = [\n dim.partitions_def.get_num_partitions(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for dim in self.partitions_defs\n ]\n return reduce(lambda x, y: x * y, dimension_counts, 1)
\n\n\nclass MultiPartitionsSubset(DefaultPartitionsSubset):\n def __init__(\n self,\n partitions_def: MultiPartitionsDefinition,\n subset: Optional[Set[str]] = None,\n ):\n check.inst_param(partitions_def, "partitions_def", MultiPartitionsDefinition)\n subset = (\n set(\n [\n partitions_def.get_partition_key_from_str(key)\n for key in subset\n if MULTIPARTITION_KEY_DELIMITER in key\n ]\n )\n if subset\n else set()\n )\n super(MultiPartitionsSubset, self).__init__(partitions_def, subset)\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "MultiPartitionsSubset":\n return MultiPartitionsSubset(\n cast(MultiPartitionsDefinition, self._partitions_def),\n self._subset | set(partition_keys),\n )\n\n\ndef get_tags_from_multi_partition_key(multi_partition_key: MultiPartitionKey) -> Mapping[str, str]:\n check.inst_param(multi_partition_key, "multi_partition_key", MultiPartitionKey)\n\n return {\n get_multidimensional_partition_tag(dimension.dimension_name): dimension.partition_key\n for dimension in multi_partition_key.dimension_keys\n }\n\n\ndef get_multipartition_key_from_tags(tags: Mapping[str, str]) -> str:\n partitions_by_dimension: Dict[str, str] = {}\n for tag in tags:\n if tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX):\n dimension = tag[len(MULTIDIMENSIONAL_PARTITION_PREFIX) :]\n partitions_by_dimension[dimension] = tags[tag]\n\n return MultiPartitionKey(partitions_by_dimension)\n
", "current_page_name": "_modules/dagster/_core/definitions/multi_dimensional_partitions", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.multi_dimensional_partitions"}, "op_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.op_definition

\nimport inspect\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, get_args, get_origin\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, deprecated_param, public\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.definitions.dependency import NodeHandle, NodeInputHandle\nfrom dagster._core.definitions.node_definition import NodeDefinition\nfrom dagster._core.definitions.op_invocation import direct_invocation_result\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.resource_requirement import (\n    InputManagerRequirement,\n    OpDefinitionResourceRequirement,\n    OutputManagerRequirement,\n    ResourceRequirement,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.types.dagster_type import DagsterType, DagsterTypeKind\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .hook_definition import HookDefinition\nfrom .inference import infer_output_props\nfrom .input import In, InputDefinition\nfrom .output import Out, OutputDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_layer import AssetLayer\n\n    from .composition import PendingNodeInvocation\n    from .decorators.op_decorator import DecoratedOpFunction\n\nOpComputeFunction: TypeAlias = Callable[..., Any]\n\n\n
[docs]@deprecated_param(\n param="version", breaking_version="2.0", additional_warn_text="Use `code_version` instead."\n)\nclass OpDefinition(NodeDefinition, IHasInternalInit):\n """Defines an op, the functional unit of user-defined computation.\n\n For more details on what a op is, refer to the\n `Ops Overview <../../concepts/ops-jobs-graphs/ops>`_ .\n\n End users should prefer the :func:`@op <op>` decorator. OpDefinition is generally intended to be\n used by framework authors or for programatically generated ops.\n\n Args:\n name (str): Name of the op. Must be unique within any :py:class:`GraphDefinition` or\n :py:class:`JobDefinition` that contains the op.\n input_defs (List[InputDefinition]): Inputs of the op.\n compute_fn (Callable): The core of the op, the function that performs the actual\n computation. The signature of this function is determined by ``input_defs``, and\n optionally, an injected first argument, ``context``, a collection of information\n provided by the system.\n\n This function will be coerced into a generator or an async generator, which must yield\n one :py:class:`Output` for each of the op's ``output_defs``, and additionally may\n yield other types of Dagster events, including :py:class:`AssetMaterialization` and\n :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the op.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that the config provided for the op matches this schema and will fail if it does not. If\n not set, Dagster will accept any config provided for the op.\n description (Optional[str]): Human-readable description of the op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resources handles required by this op.\n code_version (Optional[str]): (Experimental) Version of the code encapsulated by the op. If set,\n this is used as a default code version for all outputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n\n Examples:\n .. code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n OpDefinition(\n name="add_one",\n ins={"num": In(int)},\n outs={"result": Out(int)},\n compute_fn=_add_one,\n )\n """\n\n _compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"]\n _config_schema: IDefinitionConfigSchema\n _required_resource_keys: AbstractSet[str]\n _version: Optional[str]\n _retry_policy: Optional[RetryPolicy]\n\n def __init__(\n self,\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n name: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n description: Optional[str] = None,\n config_schema: Optional[Union[UserConfigSchema, IDefinitionConfigSchema]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n ):\n from .decorators.op_decorator import DecoratedOpFunction, resolve_checked_op_fn_inputs\n\n ins = check.opt_mapping_param(ins, "ins")\n input_defs = [\n inp.to_definition(name) for name, inp in sorted(ins.items(), key=lambda inp: inp[0])\n ] # sort so that input definition order is deterministic\n\n if isinstance(compute_fn, DecoratedOpFunction):\n resolved_input_defs: Sequence[InputDefinition] = resolve_checked_op_fn_inputs(\n decorator_name="@op",\n fn_name=name,\n compute_fn=cast(DecoratedOpFunction, compute_fn),\n explicit_input_defs=input_defs,\n 
exclude_nothing=True,\n )\n self._compute_fn = compute_fn\n _validate_context_type_hint(self._compute_fn.decorated_fn)\n else:\n resolved_input_defs = input_defs\n self._compute_fn = check.callable_param(compute_fn, "compute_fn")\n _validate_context_type_hint(self._compute_fn)\n\n code_version = normalize_renamed_param(\n code_version,\n "code_version",\n version,\n "version",\n )\n self._version = code_version\n\n check.opt_mapping_param(outs, "outs")\n output_defs = _resolve_output_defs_from_outs(\n compute_fn=compute_fn, outs=outs, default_code_version=code_version\n )\n\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._required_resource_keys = frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n positional_inputs = (\n self._compute_fn.positional_inputs()\n if isinstance(self._compute_fn, DecoratedOpFunction)\n else None\n )\n\n super(OpDefinition, self).__init__(\n name=name,\n input_defs=check.sequence_param(resolved_input_defs, "input_defs", InputDefinition),\n output_defs=check.sequence_param(output_defs, "output_defs", OutputDefinition),\n description=description,\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n positional_inputs=positional_inputs,\n )\n\n def dagster_internal_init(\n *,\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n name: str,\n ins: Optional[Mapping[str, In]],\n outs: Optional[Mapping[str, Out]],\n description: Optional[str],\n config_schema: Optional[Union[UserConfigSchema, IDefinitionConfigSchema]],\n required_resource_keys: Optional[AbstractSet[str]],\n tags: Optional[Mapping[str, Any]],\n version: Optional[str],\n retry_policy: Optional[RetryPolicy],\n code_version: Optional[str],\n ) -> "OpDefinition":\n return OpDefinition(\n compute_fn=compute_fn,\n name=name,\n ins=ins,\n outs=outs,\n description=description,\n 
config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n retry_policy=retry_policy,\n code_version=code_version,\n )\n\n @property\n def node_type_str(self) -> str:\n return "op"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @public\n @property\n def name(self) -> str:\n """str: The name of this op."""\n return super(OpDefinition, self).name\n\n @public\n @property\n def ins(self) -> Mapping[str, In]:\n """Mapping[str, In]: A mapping from input name to the In object that represents that input."""\n return {input_def.name: In.from_definition(input_def) for input_def in self.input_defs}\n\n @public\n @property\n def outs(self) -> Mapping[str, Out]:\n """Mapping[str, Out]: A mapping from output name to the Out object that represents that output."""\n return {output_def.name: Out.from_definition(output_def) for output_def in self.output_defs}\n\n @property\n def compute_fn(self) -> Union[Callable[..., Any], "DecoratedOpFunction"]:\n return self._compute_fn\n\n @public\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n """IDefinitionConfigSchema: The config schema for this op."""\n return self._config_schema\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """AbstractSet[str]: A set of keys for resources that must be provided to this OpDefinition."""\n return frozenset(self._required_resource_keys)\n\n @public\n @deprecated(breaking_version="2.0", additional_warn_text="Use `code_version` instead.")\n @property\n def version(self) -> Optional[str]:\n """str: Version of the code encapsulated by the op. 
If set, this is used as a\n default code version for all outputs.\n """\n return self._version\n\n @public\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n """Optional[RetryPolicy]: The RetryPolicy for this op."""\n return self._retry_policy\n\n @public\n @property\n def tags(self) -> Mapping[str, str]:\n """Mapping[str, str]: The tags for this op."""\n return super(OpDefinition, self).tags\n\n
[docs] @public\n def alias(self, name: str) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given name."""\n return super(OpDefinition, self).alias(name)
\n\n
[docs] @public\n def tag(self, tags: Optional[Mapping[str, str]]) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given tags."""\n return super(OpDefinition, self).tag(tags)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given hook definitions."""\n return super(OpDefinition, self).with_hooks(hook_defs)
\n\n
[docs] @public\n def with_retry_policy(self, retry_policy: RetryPolicy) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given retry policy."""\n return super(OpDefinition, self).with_retry_policy(retry_policy)
\n\n def is_from_decorator(self) -> bool:\n from .decorators.op_decorator import DecoratedOpFunction\n\n return isinstance(self._compute_fn, DecoratedOpFunction)\n\n def get_output_annotation(self) -> Any:\n if not self.is_from_decorator():\n raise DagsterInvalidInvocationError(\n f"Attempted to get output annotation for {self.node_type_str} '{self.name}', "\n "which was not constructed from a decorated function."\n )\n return cast("DecoratedOpFunction", self.compute_fn).get_output_annotation()\n\n def all_dagster_types(self) -> Iterator[DagsterType]:\n yield from self.all_input_output_types()\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n\n def iterate_op_defs(self) -> Iterator["OpDefinition"]:\n yield self\n\n T_Handle = TypeVar("T_Handle", bound=Optional[NodeHandle])\n\n def resolve_output_to_origin(\n self, output_name: str, handle: T_Handle\n ) -> Tuple[OutputDefinition, T_Handle]:\n return self.output_def_named(output_name), handle\n\n def resolve_output_to_origin_op_def(self, output_name: str) -> "OpDefinition":\n return self\n\n def get_inputs_must_be_resolved_top_level(\n self, asset_layer: "AssetLayer", handle: Optional[NodeHandle] = None\n ) -> Sequence[InputDefinition]:\n handle = cast(NodeHandle, check.inst_param(handle, "handle", NodeHandle))\n unresolveable_input_defs = []\n for input_def in self.input_defs:\n if (\n not input_def.dagster_type.loader\n and not input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n and not input_def.has_default_value\n and not input_def.input_manager_key\n ):\n input_asset_key = asset_layer.asset_key_for_input(handle, input_def.name)\n # If input_asset_key is present, this input can be resolved\n # by a source asset, so input does not need to be resolved\n # at the top level.\n if input_asset_key:\n continue\n unresolveable_input_defs.append(input_def)\n return unresolveable_input_defs\n\n def input_has_default(self, input_name: str) -> bool:\n return 
self.input_def_named(input_name).has_default_value\n\n def default_value_for_input(self, input_name: str) -> InputDefinition:\n return self.input_def_named(input_name).default_value\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n return True\n\n def with_replaced_properties(\n self,\n name: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n config_schema: Optional[IDefinitionConfigSchema] = None,\n description: Optional[str] = None,\n ) -> "OpDefinition":\n return OpDefinition.dagster_internal_init(\n name=name,\n ins=ins\n or {input_def.name: In.from_definition(input_def) for input_def in self.input_defs},\n outs=outs\n or {\n output_def.name: Out.from_definition(output_def) for output_def in self.output_defs\n },\n compute_fn=self.compute_fn,\n config_schema=config_schema or self.config_schema,\n description=description or self.description,\n tags=self.tags,\n required_resource_keys=self.required_resource_keys,\n code_version=self._version,\n retry_policy=self.retry_policy,\n version=None, # code_version replaces version\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n ) -> "OpDefinition":\n return self.with_replaced_properties(\n name=name,\n description=description,\n config_schema=config_schema,\n )\n\n def get_resource_requirements(\n self,\n outer_context: Optional[object] = None,\n ) -> Iterator[ResourceRequirement]:\n # Outer requiree in this context is the outer-calling node handle. 
If not provided, then\n # just use the op name.\n outer_context = cast(Optional[Tuple[NodeHandle, Optional["AssetLayer"]]], outer_context)\n if not outer_context:\n handle = None\n asset_layer = None\n else:\n handle, asset_layer = outer_context\n node_description = f"{self.node_type_str} '{handle or self.name}'"\n for resource_key in sorted(list(self.required_resource_keys)):\n yield OpDefinitionResourceRequirement(\n key=resource_key, node_description=node_description\n )\n for input_def in self.input_defs:\n if input_def.input_manager_key:\n yield InputManagerRequirement(\n key=input_def.input_manager_key,\n node_description=node_description,\n input_name=input_def.name,\n root_input=False,\n )\n elif asset_layer and handle:\n input_asset_key = asset_layer.asset_key_for_input(handle, input_def.name)\n if input_asset_key:\n io_manager_key = asset_layer.io_manager_key_for_asset(input_asset_key)\n yield InputManagerRequirement(\n key=io_manager_key,\n node_description=node_description,\n input_name=input_def.name,\n root_input=False,\n )\n\n for output_def in self.output_defs:\n yield OutputManagerRequirement(\n key=output_def.io_manager_key,\n node_description=node_description,\n output_name=output_def.name,\n )\n\n def resolve_input_to_destinations(\n self, input_handle: NodeInputHandle\n ) -> Sequence[NodeInputHandle]:\n return [input_handle]\n\n def __call__(self, *args, **kwargs) -> Any:\n from .composition import is_in_composition\n\n if is_in_composition():\n return super(OpDefinition, self).__call__(*args, **kwargs)\n\n return direct_invocation_result(self, *args, **kwargs)
\n\n\ndef _resolve_output_defs_from_outs(\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n outs: Optional[Mapping[str, Out]],\n default_code_version: Optional[str],\n) -> Sequence[OutputDefinition]:\n from .decorators.op_decorator import DecoratedOpFunction\n\n if isinstance(compute_fn, DecoratedOpFunction):\n inferred_output_props = infer_output_props(compute_fn.decorated_fn)\n annotation = inferred_output_props.annotation\n description = inferred_output_props.description\n else:\n inferred_output_props = None\n annotation = inspect.Parameter.empty\n description = None\n\n if outs is None:\n return [OutputDefinition.create_from_inferred(inferred_output_props, default_code_version)]\n\n # If only a single entry has been provided to the out dict, then slurp the\n # annotation into the entry.\n if len(outs) == 1:\n name = next(iter(outs.keys()))\n only_out = outs[name]\n return [only_out.to_definition(annotation, name, description, default_code_version)]\n\n output_defs: List[OutputDefinition] = []\n\n # Introspection on type annotations is experimental, so checking\n # metaclass is the best we can do.\n if annotation != inspect.Parameter.empty and not get_origin(annotation) == tuple:\n raise DagsterInvariantViolationError(\n "Expected Tuple annotation for multiple outputs, but received non-tuple annotation."\n )\n if annotation != inspect.Parameter.empty and not len(get_args(annotation)) == len(outs):\n raise DagsterInvariantViolationError(\n "Expected Tuple annotation to have number of entries matching the "\n f"number of outputs for more than one output. Expected {len(outs)} "\n f"outputs but annotation has {len(get_args(annotation))}."\n )\n for idx, (name, cur_out) in enumerate(outs.items()):\n annotation_type = (\n get_args(annotation)[idx]\n if annotation != inspect.Parameter.empty\n else inspect.Parameter.empty\n )\n # Don't provide description when using multiple outputs. 
Introspection\n # is challenging when faced with multiple inputs.\n output_defs.append(\n cur_out.to_definition(\n annotation_type, name=name, description=None, code_version=default_code_version\n )\n )\n\n return output_defs\n\n\ndef _validate_context_type_hint(fn):\n from inspect import _empty as EmptyAnnotation\n\n from dagster._core.decorator_utils import get_function_params\n from dagster._core.definitions.decorators.op_decorator import is_context_provided\n from dagster._core.execution.context.compute import AssetExecutionContext, OpExecutionContext\n\n params = get_function_params(fn)\n if is_context_provided(params):\n if (\n params[0].annotation is not AssetExecutionContext\n and params[0].annotation is not OpExecutionContext\n and params[0].annotation is not EmptyAnnotation\n ):\n raise DagsterInvalidDefinitionError(\n f"Cannot annotate `context` parameter with type {params[0].annotation}. `context`"\n " must be annotated with AssetExecutionContext, OpExecutionContext, or left blank."\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/op_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.op_definition"}, "output": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.output

\nimport inspect\nfrom typing import (\n    Any,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated_param\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataUserInput,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import (\n    DagsterType,\n    is_dynamic_output_annotation,\n    resolve_dagster_type,\n)\n\nfrom .inference import InferredOutputProps\nfrom .input import NoValueSentinel\nfrom .utils import DEFAULT_IO_MANAGER_KEY, DEFAULT_OUTPUT, check_valid_name\n\nTOutputDefinition = TypeVar("TOutputDefinition", bound="OutputDefinition")\nTOut = TypeVar("TOut", bound="Out")\n\n\nclass OutputDefinition:\n    """Defines an output from an op's compute function.\n\n    Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n    Many ops have only one output, in which case the user can provide a single output definition\n    that will be given the default name, "result".\n\n    Output definitions may be typed using the Dagster type system.\n\n    Args:\n        dagster_type (Optional[Union[Type, DagsterType]]]): The type of this output.\n            Users should provide the Python type of the objects that they expect the op to yield\n            for this output, or a :py:class:`DagsterType` that defines a runtime check that they\n            want to be run on this output. Defaults to :py:class:`Any`.\n        name (Optional[str]): Name of the output. (default: "result")\n        description (Optional[str]): Human-readable description of the output.\n        is_required (Optional[bool]): Whether the presence of this field is required. 
(default: True)\n        io_manager_key (Optional[str]): The resource key of the IOManager used for storing this\n            output and loading it in downstream steps (default: "io_manager").\n        metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n            For example, users can provide a file path if the data object will be stored in a\n            filesystem, or provide information of a database table when it is going to load the data\n            into the table.\n        code_version (Optional[str]): (Experimental) Version of the code that generates this output. In\n            general, versions should be set only for code that deterministically produces the same\n            output when given the same inputs.\n\n    """\n\n    def __init__(\n        self,\n        dagster_type=None,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        is_required: bool = True,\n        io_manager_key: Optional[str] = None,\n        metadata: Optional[ArbitraryMetadataMapping] = None,\n        code_version: Optional[str] = None,\n        # make sure new parameters are updated in combine_with_inferred below\n    ):\n        self._name = check_valid_name(check.opt_str_param(name, "name", DEFAULT_OUTPUT))\n        self._type_not_set = dagster_type is None\n        self._dagster_type = resolve_dagster_type(dagster_type)\n        self._description = check.opt_str_param(description, "description")\n        self._is_required = check.bool_param(is_required, "is_required")\n        self._io_manager_key = check.opt_str_param(\n            io_manager_key,\n            "io_manager_key",\n            default=DEFAULT_IO_MANAGER_KEY,\n        )\n        self._code_version = check.opt_str_param(code_version, "code_version")\n        self._raw_metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n        self._metadata = normalize_metadata(self._raw_metadata, allow_invalid=True)\n\n    @property\n    def 
name(self) -> str:\n        return self._name\n\n    @property\n    def dagster_type(self) -> DagsterType:\n        return self._dagster_type\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @property\n    def is_required(self) -> bool:\n        return self._is_required\n\n    @property\n    def io_manager_key(self) -> str:\n        return self._io_manager_key\n\n    @property\n    def code_version(self) -> Optional[str]:\n        return self._code_version\n\n    @property\n    def optional(self) -> bool:\n        return not self.is_required\n\n    @property\n    def metadata(self) -> ArbitraryMetadataMapping:\n        return self._raw_metadata\n\n    @property\n    def is_dynamic(self) -> bool:\n        return False\n\n    def mapping_from(\n        self, node_name: str, output_name: Optional[str] = None, from_dynamic_mapping: bool = False\n    ) -> "OutputMapping":\n        """Create an output mapping from an output of a child node.\n\n        In a GraphDefinition, you can use this helper function to construct\n        an :py:class:`OutputMapping` from the output of a child node.\n\n        Args:\n            node_name (str): The name of the child node from which to map this output.\n            output_name (str): The name of the child node's output from which to map this output.\n\n        Examples:\n            .. 
code-block:: python\n\n                output_mapping = OutputDefinition(Int).mapping_from('child_node')\n        """\n        return OutputMapping(\n            graph_output_name=self.name,\n            mapped_node_name=node_name,\n            mapped_node_output_name=output_name or DEFAULT_OUTPUT,\n            graph_output_description=self.description,\n            dagster_type=self.dagster_type,\n            from_dynamic_mapping=from_dynamic_mapping or self.is_dynamic,\n        )\n\n    @staticmethod\n    def create_from_inferred(\n        inferred: Optional[InferredOutputProps], code_version: Optional[str] = None\n    ) -> "OutputDefinition":\n        if not inferred:\n            return OutputDefinition(code_version=code_version)\n        if is_dynamic_output_annotation(inferred.annotation):\n            return DynamicOutputDefinition(\n                dagster_type=_checked_inferred_type(inferred.annotation),\n                description=inferred.description,\n                code_version=code_version,\n            )\n        else:\n            return OutputDefinition(\n                dagster_type=_checked_inferred_type(inferred.annotation),\n                description=inferred.description,\n                code_version=code_version,\n            )\n\n    def combine_with_inferred(\n        self: TOutputDefinition, inferred: InferredOutputProps\n    ) -> TOutputDefinition:\n        dagster_type = self.dagster_type\n        if self._type_not_set:\n            dagster_type = _checked_inferred_type(inferred.annotation)\n        if self.description is None:\n            description = inferred.description\n        else:\n            description = self.description\n\n        return self.__class__(\n            name=self.name,\n            dagster_type=dagster_type,\n            description=description,\n            is_required=self.is_required,\n            io_manager_key=self.io_manager_key,\n            metadata=self._metadata,\n        )\n\n\ndef 
_checked_inferred_type(inferred: Any) -> DagsterType:\n    try:\n        if inferred == inspect.Parameter.empty:\n            return resolve_dagster_type(None)\n        elif inferred is None:\n            # When inferred.annotation is None, it means someone explicitly put "None" as the\n            # annotation, so want to map it to a DagsterType that checks for the None type\n            return resolve_dagster_type(type(None))\n        else:\n            return resolve_dagster_type(inferred)\n\n    except DagsterError as e:\n        raise DagsterInvalidDefinitionError(\n            f"Problem using type '{inferred}' from return type annotation, correct the issue "\n            "or explicitly set the dagster_type via Out()."\n        ) from e\n\n\nclass DynamicOutputDefinition(OutputDefinition):\n    """Variant of :py:class:`OutputDefinition <dagster.OutputDefinition>` for an\n    output that will dynamically alter the graph at runtime.\n\n    When using in a composition function such as :py:func:`@job <dagster.job>`,\n    dynamic outputs must be used with either:\n\n    * ``map`` - clone downstream nodes for each separate :py:class:`DynamicOutput`\n    * ``collect`` - gather across all :py:class:`DynamicOutput` in to a list\n\n    Uses the same constructor as :py:class:`OutputDefinition <dagster.OutputDefinition>`\n\n        .. 
code-block:: python\n\n            @op(\n                config_schema={\n                    "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n                },\n                output_defs=[DynamicOutputDefinition(str)],\n            )\n            def files_in_directory(context):\n                path = context.op_config["path"]\n                dirname, _, filenames = next(os.walk(path))\n                for file in filenames:\n                    yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n            @job\n            def process_directory():\n                files = files_in_directory()\n\n                # use map to invoke an op on each dynamic output\n                file_results = files.map(process_file)\n\n                # use collect to gather the results in to a list\n                summarize_directory(file_results.collect())\n    """\n\n    @property\n    def is_dynamic(self) -> bool:\n        return True\n\n\nclass OutputPointer(NamedTuple("_OutputPointer", [("node_name", str), ("output_name", str)])):\n    def __new__(cls, node_name: str, output_name: Optional[str] = None):\n        return super(OutputPointer, cls).__new__(\n            cls,\n            check.str_param(node_name, "node_name"),\n            check.opt_str_param(output_name, "output_name", DEFAULT_OUTPUT),\n        )\n\n\n
[docs]@deprecated_param(\n param="dagster_type",\n breaking_version="2.0",\n additional_warn_text="Any defined `dagster_type` should come from the underlying op `Output`.",\n # Disabling warning here since we're passing this internally and I'm not sure whether it is\n # actually used or discarded.\n emit_runtime_warning=False,\n)\nclass OutputMapping(NamedTuple):\n """Defines an output mapping for a graph.\n\n Args:\n graph_output_name (str): Name of the output in the graph being mapped to.\n mapped_node_name (str): Named of the node (op/graph) that the output is being mapped from.\n mapped_node_output_name (str): Name of the output in the node (op/graph) that is being mapped from.\n graph_output_description (Optional[str]): A description of the output in the graph being mapped from.\n from_dynamic_mapping (bool): Set to true if the node being mapped to is a mapped dynamic node.\n dagster_type (Optional[DagsterType]): The dagster type of the graph's output being mapped to.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import OutputMapping, GraphDefinition, op, graph, GraphOut\n\n @op\n def emit_five(x):\n return 5\n\n # The following two graph definitions are equivalent\n GraphDefinition(\n name="the_graph",\n node_defs=[emit_five],\n output_mappings=[\n OutputMapping(\n graph_output_name="result", # Default output name\n mapped_node_name="emit_five",\n mapped_node_output_name="result"\n )\n ]\n )\n\n @graph(out=GraphOut())\n def the_graph:\n return emit_five()\n """\n\n graph_output_name: str\n mapped_node_name: str\n mapped_node_output_name: str\n graph_output_description: Optional[str] = None\n dagster_type: Optional[DagsterType] = None\n from_dynamic_mapping: bool = False\n\n @property\n def maps_from(self) -> OutputPointer:\n return OutputPointer(self.mapped_node_name, self.mapped_node_output_name)\n\n def get_definition(self, is_dynamic: bool) -> "OutputDefinition":\n check.invariant(not is_dynamic or self.from_dynamic_mapping)\n is_dynamic = is_dynamic or self.from_dynamic_mapping\n klass = DynamicOutputDefinition if is_dynamic else OutputDefinition\n return klass(\n name=self.graph_output_name,\n description=self.graph_output_description,\n dagster_type=self.dagster_type,\n )
\n\n\n
[docs]class Out(\n NamedTuple(\n "_Out",\n [\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("description", PublicAttr[Optional[str]]),\n ("is_required", PublicAttr[bool]),\n ("io_manager_key", PublicAttr[str]),\n ("metadata", PublicAttr[Optional[MetadataUserInput]]),\n ("code_version", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines an output from an op's compute function.\n\n Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many ops have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Outs may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the output manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n code_version (Optional[str]): (Experimental) Version of the code that generates this output. 
In\n general, versions should be set only for code that deterministically produces the same\n output when given the same inputs.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n code_version: Optional[str] = None,\n # make sure new parameters are updated in combine_with_inferred below\n ):\n return super(Out, cls).__new__(\n cls,\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=description,\n is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default=DEFAULT_IO_MANAGER_KEY\n ),\n metadata=metadata,\n code_version=code_version,\n )\n\n @classmethod\n def from_definition(cls, output_def: "OutputDefinition"):\n klass = Out if not output_def.is_dynamic else DynamicOut\n return klass(\n dagster_type=output_def.dagster_type,\n description=output_def.description,\n is_required=output_def.is_required,\n io_manager_key=output_def.io_manager_key,\n metadata=output_def.metadata,\n code_version=output_def.code_version,\n )\n\n def to_definition(\n self,\n annotation_type: type,\n name: Optional[str],\n description: Optional[str],\n code_version: Optional[str],\n ) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type\n if self.dagster_type is not NoValueSentinel\n else _checked_inferred_type(annotation_type)\n )\n\n klass = OutputDefinition if not self.is_dynamic else DynamicOutputDefinition\n\n return klass(\n dagster_type=dagster_type,\n name=name,\n description=self.description or description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n code_version=self.code_version or code_version,\n )\n\n @property\n def is_dynamic(self) -> bool:\n return False
\n\n\n
[docs]class DynamicOut(Out):\n """Variant of :py:class:`Out <dagster.Out>` for an output that will dynamically alter the graph at\n runtime.\n\n When using in a composition function such as :py:func:`@graph <dagster.graph>`,\n dynamic outputs must be used with either\n\n * ``map`` - clone downstream ops for each separate :py:class:`DynamicOut`\n * ``collect`` - gather across all :py:class:`DynamicOut` in to a list\n\n Uses the same constructor as :py:class:`Out <dagster.Out>`\n\n .. code-block:: python\n\n @op(\n config_schema={\n "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n },\n out=DynamicOut(str),\n )\n def files_in_directory(context):\n path = context.op_config["path"]\n dirname, _, filenames = next(os.walk(path))\n for file in filenames:\n yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n @job\n def process_directory():\n files = files_in_directory()\n\n # use map to invoke an op on each dynamic output\n file_results = files.map(process_file)\n\n # use collect to gather the results in to a list\n summarize_directory(file_results.collect())\n """\n\n def to_definition(\n self,\n annotation_type: type,\n name: Optional[str],\n description: Optional[str],\n code_version: Optional[str],\n ) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type\n if self.dagster_type is not NoValueSentinel\n else _checked_inferred_type(annotation_type)\n )\n\n return DynamicOutputDefinition(\n dagster_type=dagster_type,\n name=name,\n description=self.description or description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n code_version=self.code_version or code_version,\n )\n\n @property\n def is_dynamic(self) -> bool:\n return True
\n\n\n
[docs]class GraphOut(NamedTuple("_GraphOut", [("description", PublicAttr[Optional[str]])])):\n """Represents information about the outputs that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the output.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphOut, cls).__new__(cls, description=description)\n\n def to_definition(self, name: Optional[str]) -> "OutputDefinition":\n return OutputDefinition(name=name, description=self.description)
\n
", "current_page_name": "_modules/dagster/_core/definitions/output", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.output"}, "partition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition

\nimport copy\nimport hashlib\nimport json\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom datetime import (\n    datetime,\n    timedelta,\n)\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Type,\n    Union,\n    cast,\n)\n\nfrom dateutil.relativedelta import relativedelta\nfrom typing_extensions import TypeVar\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated, deprecated_param, public\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.run_request import (\n    AddDynamicPartitionsRequest,\n    DeleteDynamicPartitionsRequest,\n)\nfrom dagster._core.instance import DagsterInstance, DynamicPartitionsStore\nfrom dagster._core.storage.tags import PARTITION_NAME_TAG, PARTITION_SET_TAG\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import xor\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.warnings import (\n    normalize_renamed_param,\n)\n\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidDeserializationVersionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownPartitionError,\n)\nfrom .config import ConfigMapping\nfrom .utils import validate_tags\n\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\n\nT_cov = TypeVar("T_cov", default=Any, covariant=True)\nT_str = TypeVar("T_str", bound=str, default=str, covariant=True)\nT_PartitionsDefinition = TypeVar(\n    "T_PartitionsDefinition",\n    bound="PartitionsDefinition",\n    default="PartitionsDefinition",\n    covariant=True,\n)\n\n# In the Dagster UI users can select partition ranges following the format '2022-01-13...2022-01-14'\n# "..." 
is an invalid substring in partition keys\n# The other escape characters are characters that may not display in the Dagster UI.\nINVALID_PARTITION_SUBSTRINGS = ["...", "\\a", "\\b", "\\f", "\\n", "\\r", "\\t", "\\v", "\\0"]\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use string partition keys instead.")\nclass Partition(Generic[T_cov]):\n    """A Partition represents a single slice of the entire set of a job's possible work. It consists\n    of a value, which is an object that represents that partition, and an optional name, which is\n    used to label the partition in a human-readable way.\n\n    Args:\n        value (Any): The object for this partition\n        name (str): Name for this partition\n    """\n\n    def __init__(self, value: Any, name: Optional[str] = None):\n        self._value = value\n        self._name = check.str_param(name or str(value), "name")\n\n    @property\n    def value(self) -> T_cov:\n        return self._value\n\n    @property\n    def name(self) -> str:\n        return self._name\n\n    def __eq__(self, other: object) -> bool:\n        if not isinstance(other, Partition):\n            return False\n        else:\n            return self.value == other.value and self.name == other.name\n\n\n@whitelist_for_serdes\nclass ScheduleType(Enum):\n    HOURLY = "HOURLY"\n    DAILY = "DAILY"\n    WEEKLY = "WEEKLY"\n    MONTHLY = "MONTHLY"\n\n    @property\n    def ordinal(self):\n        return {"HOURLY": 1, "DAILY": 2, "WEEKLY": 3, "MONTHLY": 4}[self.value]\n\n    @property\n    def delta(self):\n        if self == ScheduleType.HOURLY:\n            return timedelta(hours=1)\n        elif self == ScheduleType.DAILY:\n            return timedelta(days=1)\n        elif self == ScheduleType.WEEKLY:\n            return timedelta(weeks=1)\n        elif self == ScheduleType.MONTHLY:\n            return relativedelta(months=1)\n        else:\n            check.failed(f"Unexpected ScheduleType {self}")\n\n    def __gt__(self, other: 
"ScheduleType") -> bool:\n        check.inst(other, ScheduleType, "Cannot compare ScheduleType with non-ScheduleType")\n        return self.ordinal > other.ordinal\n\n    def __lt__(self, other: "ScheduleType") -> bool:\n        check.inst(other, ScheduleType, "Cannot compare ScheduleType with non-ScheduleType")\n        return self.ordinal < other.ordinal\n\n\n
[docs]class PartitionsDefinition(ABC, Generic[T_str]):\n """Defines a set of partitions, which can be attached to a software-defined asset or job.\n\n Abstract class with implementations for different kinds of partitions.\n """\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset[T_str]"]:\n return DefaultPartitionsSubset[T_str]\n\n
[docs] @abstractmethod\n @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[T_str]:\n """Returns a list of strings representing the partition keys of the PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[str]\n """\n ...
\n\n def __str__(self) -> str:\n joined_keys = ", ".join([f"'{key}'" for key in self.get_partition_keys()])\n return joined_keys\n\n def get_last_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[T_str]:\n partition_keys = self.get_partition_keys(current_time, dynamic_partitions_store)\n return partition_keys[-1] if partition_keys else None\n\n def get_first_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[T_str]:\n partition_keys = self.get_partition_keys(current_time, dynamic_partitions_store)\n return partition_keys[0] if partition_keys else None\n\n def get_partition_keys_in_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[T_str]:\n keys_exist = {\n partition_key_range.start: self.has_partition_key(\n partition_key_range.start, dynamic_partitions_store=dynamic_partitions_store\n ),\n partition_key_range.end: self.has_partition_key(\n partition_key_range.end, dynamic_partitions_store=dynamic_partitions_store\n ),\n }\n if not all(keys_exist.values()):\n raise DagsterInvalidInvocationError(\n f"""Partition range {partition_key_range.start} to {partition_key_range.end} is\n not a valid range. 
Nonexistent partition keys:\n {list(key for key in keys_exist if keys_exist[key] is False)}"""\n )\n\n # in the simple case, simply return the single key in the range\n if partition_key_range.start == partition_key_range.end:\n return [cast(T_str, partition_key_range.start)]\n\n # defer this call as it is potentially expensive\n partition_keys = self.get_partition_keys(dynamic_partitions_store=dynamic_partitions_store)\n return partition_keys[\n partition_keys.index(partition_key_range.start) : partition_keys.index(\n partition_key_range.end\n )\n + 1\n ]\n\n def empty_subset(self) -> "PartitionsSubset[T_str]":\n return self.partitions_subset_class.empty_subset(self)\n\n def subset_with_partition_keys(\n self, partition_keys: Iterable[str]\n ) -> "PartitionsSubset[T_str]":\n return self.empty_subset().with_partition_keys(partition_keys)\n\n def subset_with_all_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset[T_str]":\n return self.subset_with_partition_keys(\n self.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n def deserialize_subset(self, serialized: str) -> "PartitionsSubset[T_str]":\n return self.partitions_subset_class.from_serialized(self, serialized)\n\n def can_deserialize_subset(\n self,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n return self.partitions_subset_class.can_deserialize(\n self,\n serialized,\n serialized_partitions_def_unique_id,\n serialized_partitions_def_class_name,\n )\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(\n json.dumps(\n self.get_partition_keys(dynamic_partitions_store=dynamic_partitions_store)\n ).encode("utf-8")\n ).hexdigest()\n\n def 
get_tags_for_partition_key(self, partition_key: str) -> Mapping[str, str]:\n tags = {PARTITION_NAME_TAG: partition_key}\n return tags\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n return len(self.get_partition_keys(current_time, dynamic_partitions_store))\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n return partition_key in self.get_partition_keys(\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n def validate_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> None:\n if not self.has_partition_key(partition_key, current_time, dynamic_partitions_store):\n raise DagsterUnknownPartitionError(\n f"Could not find a partition with key `{partition_key}`."\n )
\n\n\ndef raise_error_on_invalid_partition_key_substring(partition_keys: Sequence[str]) -> None:\n for partition_key in partition_keys:\n found_invalid_substrs = [\n invalid_substr\n for invalid_substr in INVALID_PARTITION_SUBSTRINGS\n if invalid_substr in partition_key\n ]\n if found_invalid_substrs:\n raise DagsterInvalidDefinitionError(\n f"{found_invalid_substrs} are invalid substrings in a partition key"\n )\n\n\ndef raise_error_on_duplicate_partition_keys(partition_keys: Sequence[str]) -> None:\n counts: Dict[str, int] = defaultdict(lambda: 0)\n for partition_key in partition_keys:\n counts[partition_key] += 1\n found_duplicates = [key for key in counts.keys() if counts[key] > 1]\n if found_duplicates:\n raise DagsterInvalidDefinitionError(\n "Partition keys must be unique. Duplicate instances of partition keys:"\n f" {found_duplicates}."\n )\n\n\n
[docs]class StaticPartitionsDefinition(PartitionsDefinition[str]):\n """A statically-defined set of partitions.\n\n Example:\n .. code-block:: python\n\n from dagster import StaticPartitionsDefinition, asset\n\n oceans_partitions_def = StaticPartitionsDefinition(\n ["arctic", "atlantic", "indian", "pacific", "southern"]\n )\n\n @asset(partitions_def=oceans_partitions_defs)\n def ml_model_for_each_ocean():\n ...\n """\n\n def __init__(self, partition_keys: Sequence[str]):\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n\n raise_error_on_invalid_partition_key_substring(partition_keys)\n raise_error_on_duplicate_partition_keys(partition_keys)\n\n self._partition_keys = partition_keys\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n """Returns a list of strings representing the partition keys of the PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Only applicable to\n DynamicPartitionsDefinitions.\n\n Returns:\n Sequence[str]\n\n """\n return self._partition_keys
\n\n def __hash__(self):\n return hash(self.__repr__())\n\n def __eq__(self, other) -> bool:\n return isinstance(other, StaticPartitionsDefinition) and (\n self is other or self._partition_keys == other.get_partition_keys()\n )\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(partition_keys={self._partition_keys})"\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # We don't currently throw an error when a duplicate partition key is defined\n # in a static partitions definition, though we will at 1.3.0.\n # This ensures that partition counts are correct in the Dagster UI.\n return len(set(self.get_partition_keys(current_time, dynamic_partitions_store)))
\n\n\nclass CachingDynamicPartitionsLoader(DynamicPartitionsStore):\n """A batch loader that caches the partition keys for a given dynamic partitions definition,\n to avoid repeated calls to the database for the same partitions definition.\n """\n\n def __init__(self, instance: DagsterInstance):\n self._instance = instance\n\n @cached_method\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n return self._instance.get_dynamic_partitions(partitions_def_name)\n\n @cached_method\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n return self._instance.has_dynamic_partition(partitions_def_name, partition_key)\n\n\n
[docs]@deprecated_param(\n param="partition_fn",\n breaking_version="2.0",\n additional_warn_text="Provide partition definition name instead.",\n)\nclass DynamicPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_DynamicPartitionsDefinition",\n [\n (\n "partition_fn",\n PublicAttr[\n Optional[\n Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]\n ]\n ],\n ),\n ("name", PublicAttr[Optional[str]]),\n ],\n ),\n):\n """A partitions definition whose partition keys can be dynamically added and removed.\n\n This is useful for cases where the set of partitions is not known at definition time,\n but is instead determined at runtime.\n\n Partitions can be added and removed using `instance.add_dynamic_partitions` and\n `instance.delete_dynamic_partition` methods.\n\n Args:\n name (Optional[str]): The name of the partitions definition.\n partition_fn (Optional[Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]]):\n A function that returns the current set of partitions. This argument is deprecated and\n will be removed in 2.0.0.\n\n Examples:\n .. 
code-block:: python\n\n fruits = DynamicPartitionsDefinition(name="fruits")\n\n @sensor(job=my_job)\n def my_sensor(context):\n return SensorResult(\n run_requests=[RunRequest(partition_key="apple")],\n dynamic_partitions_requests=[fruits.build_add_request(["apple"])]\n )\n """\n\n def __new__(\n cls,\n partition_fn: Optional[\n Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]\n ] = None,\n name: Optional[str] = None,\n ):\n partition_fn = check.opt_callable_param(partition_fn, "partition_fn")\n name = check.opt_str_param(name, "name")\n\n if partition_fn is None and name is None:\n raise DagsterInvalidDefinitionError(\n "Must provide either partition_fn or name to DynamicPartitionsDefinition."\n )\n\n if partition_fn and name:\n raise DagsterInvalidDefinitionError(\n "Cannot provide both partition_fn and name to DynamicPartitionsDefinition."\n )\n\n return super(DynamicPartitionsDefinition, cls).__new__(\n cls,\n partition_fn=check.opt_callable_param(partition_fn, "partition_fn"),\n name=check.opt_str_param(name, "name"),\n )\n\n def _validated_name(self) -> str:\n if self.name is None:\n check.failed(\n "Dynamic partitions definition must have a name to fetch dynamic partitions"\n )\n return self.name\n\n def __eq__(self, other):\n return (\n isinstance(other, DynamicPartitionsDefinition)\n and self.name == other.name\n and self.partition_fn == other.partition_fn\n )\n\n def __hash__(self):\n return hash(tuple(self.__repr__()))\n\n def __str__(self) -> str:\n if self.name:\n return f'Dynamic partitions: "{self._validated_name()}"'\n else:\n return super().__str__()\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n """Returns a list of strings representing the partition keys of the\n PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[str]\n """\n if self.partition_fn:\n partitions = self.partition_fn(current_time)\n if all(isinstance(partition, Partition) for partition in partitions):\n return [partition.name for partition in partitions] # type: ignore # (illegible conditional)\n else:\n return partitions # type: ignore # (illegible conditional)\n else:\n check.opt_inst_param(\n dynamic_partitions_store, "dynamic_partitions_store", DynamicPartitionsStore\n )\n\n if dynamic_partitions_store is None:\n check.failed(\n "The instance is not available to load partitions. You may be seeing this error"\n " when using dynamic partitions with a version of dagster-webserver or"\n " dagster-cloud that is older than 1.1.18."\n )\n\n return dynamic_partitions_store.get_dynamic_partitions(\n partitions_def_name=self._validated_name()\n )
\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n if self.partition_fn:\n return partition_key in self.get_partition_keys(current_time)\n else:\n if dynamic_partitions_store is None:\n check.failed(\n "The instance is not available to load partitions. You may be seeing this error"\n " when using dynamic partitions with a version of dagster-webserver or"\n " dagster-cloud that is older than 1.1.18."\n )\n\n return dynamic_partitions_store.has_dynamic_partition(\n partitions_def_name=self._validated_name(), partition_key=partition_key\n )\n\n def build_add_request(self, partition_keys: Sequence[str]) -> AddDynamicPartitionsRequest:\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n validated_name = self._validated_name()\n return AddDynamicPartitionsRequest(validated_name, partition_keys)\n\n def build_delete_request(self, partition_keys: Sequence[str]) -> DeleteDynamicPartitionsRequest:\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n validated_name = self._validated_name()\n return DeleteDynamicPartitionsRequest(validated_name, partition_keys)
\n\n\n
[docs]@deprecated_param(\n param="run_config_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use `run_config_for_partition_key_fn` instead.",\n)\n@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use `tags_for_partition_key_fn` instead.",\n)\nclass PartitionedConfig(Generic[T_PartitionsDefinition]):\n """Defines a way of configuring a job where the job can be run on one of a discrete set of\n partitions, and each partition corresponds to run configuration for the job.\n\n Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\n and view the run history across partitions.\n """\n\n def __init__(\n self,\n partitions_def: T_PartitionsDefinition,\n run_config_for_partition_fn: Optional[Callable[[Partition], Mapping[str, Any]]] = None,\n decorated_fn: Optional[Callable[..., Mapping[str, Any]]] = None,\n tags_for_partition_fn: Optional[Callable[[Partition[Any]], Mapping[str, str]]] = None,\n run_config_for_partition_key_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n ):\n self._partitions = check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n self._decorated_fn = decorated_fn\n\n check.invariant(\n xor(run_config_for_partition_fn, run_config_for_partition_key_fn),\n "Must provide exactly one of run_config_for_partition_fn or"\n " run_config_for_partition_key_fn",\n )\n check.invariant(\n not (tags_for_partition_fn and tags_for_partition_key_fn),\n "Cannot provide both of tags_for_partition_fn or tags_for_partition_key_fn",\n )\n\n self._run_config_for_partition_fn = check.opt_callable_param(\n run_config_for_partition_fn, "run_config_for_partition_fn"\n )\n self._run_config_for_partition_key_fn = check.opt_callable_param(\n run_config_for_partition_key_fn, "run_config_for_partition_key_fn"\n )\n self._tags_for_partition_fn = 
check.opt_callable_param(\n tags_for_partition_fn, "tags_for_partition_fn"\n )\n self._tags_for_partition_key_fn = check.opt_callable_param(\n tags_for_partition_key_fn, "tags_for_partition_key_fn"\n )\n\n @public\n @property\n def partitions_def(\n self,\n ) -> T_PartitionsDefinition:\n """T_PartitionsDefinition: The partitions definition associated with this PartitionedConfig."""\n return self._partitions\n\n @deprecated(\n breaking_version="2.0",\n additional_warn_text="Use `run_config_for_partition_key_fn` instead.",\n )\n @public\n @property\n def run_config_for_partition_fn(\n self,\n ) -> Optional[Callable[[Partition], Mapping[str, Any]]]:\n """Optional[Callable[[Partition], Mapping[str, Any]]]: A function that accepts a partition\n and returns a dictionary representing the config to attach to runs for that partition.\n Deprecated as of 1.3.3.\n """\n return self._run_config_for_partition_fn\n\n @public\n @property\n def run_config_for_partition_key_fn(\n self,\n ) -> Optional[Callable[[str], Mapping[str, Any]]]:\n """Optional[Callable[[str], Mapping[str, Any]]]: A function that accepts a partition key\n and returns a dictionary representing the config to attach to runs for that partition.\n """\n\n @deprecated(\n breaking_version="2.0", additional_warn_text="Use `tags_for_partition_key_fn` instead."\n )\n @public\n @property\n def tags_for_partition_fn(self) -> Optional[Callable[[Partition], Mapping[str, str]]]:\n """Optional[Callable[[Partition], Mapping[str, str]]]: A function that\n accepts a partition and returns a dictionary of tags to attach to runs for\n that partition. 
Deprecated as of 1.3.3.\n """\n return self._tags_for_partition_fn\n\n @public\n @property\n def tags_for_partition_key_fn(\n self,\n ) -> Optional[Callable[[str], Mapping[str, str]]]:\n """Optional[Callable[[str], Mapping[str, str]]]: A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for\n that partition.\n """\n return self._tags_for_partition_key_fn\n\n
[docs] @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Sequence[str]:\n """Returns a list of partition keys, representing the full set of partitions that\n config can be applied to.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time. Only\n applicable to time-based partitions definitions.\n\n Returns:\n Sequence[str]\n """\n return self.partitions_def.get_partition_keys(current_time)
\n\n # Assumes partition key already validated\n def get_run_config_for_partition_key(\n self,\n partition_key: str,\n ) -> Mapping[str, Any]:\n """Generates the run config corresponding to a partition key.\n\n Args:\n partition_key (str): the key for a partition that should be used to generate a run config.\n """\n # _run_config_for_partition_fn is deprecated, we can remove this branching logic in 2.0\n if self._run_config_for_partition_fn:\n run_config = self._run_config_for_partition_fn(Partition(partition_key))\n elif self._run_config_for_partition_key_fn:\n run_config = self._run_config_for_partition_key_fn(partition_key)\n else:\n check.failed("Unreachable.") # one of the above funcs always defined\n return copy.deepcopy(run_config)\n\n # Assumes partition key already validated\n def get_tags_for_partition_key(\n self,\n partition_key: str,\n job_name: Optional[str] = None,\n ) -> Mapping[str, str]:\n from dagster._core.host_representation.external_data import (\n external_partition_set_name_for_job_name,\n )\n\n # _tags_for_partition_fn is deprecated, we can remove this branching logic in 2.0\n if self._tags_for_partition_fn:\n user_tags = self._tags_for_partition_fn(Partition(partition_key))\n elif self._tags_for_partition_key_fn:\n user_tags = self._tags_for_partition_key_fn(partition_key)\n else:\n user_tags = {}\n user_tags = validate_tags(user_tags, allow_reserved_tags=False)\n\n system_tags = {\n **self.partitions_def.get_tags_for_partition_key(partition_key),\n **(\n # `PartitionSetDefinition` has been deleted but we still need to attach this special tag in\n # order for reexecution against partitions to work properly.\n {PARTITION_SET_TAG: external_partition_set_name_for_job_name(job_name)}\n if job_name\n else {}\n ),\n }\n\n return {**user_tags, **system_tags}\n\n @classmethod\n def from_flexible_config(\n cls,\n config: Optional[Union[ConfigMapping, Mapping[str, object], "PartitionedConfig"]],\n partitions_def: PartitionsDefinition,\n ) -> 
"PartitionedConfig":\n check.invariant(\n not isinstance(config, ConfigMapping),\n "Can't supply a ConfigMapping for 'config' when 'partitions_def' is supplied.",\n )\n\n if isinstance(config, PartitionedConfig):\n check.invariant(\n config.partitions_def == partitions_def,\n "Can't supply a PartitionedConfig for 'config' with a different "\n "PartitionsDefinition than supplied for 'partitions_def'.",\n )\n return config\n else:\n hardcoded_config = config if config else {}\n return cls(\n partitions_def,\n run_config_for_partition_key_fn=lambda _: cast(Mapping, hardcoded_config),\n )\n\n def __call__(self, *args, **kwargs):\n if self._decorated_fn is None:\n raise DagsterInvalidInvocationError(\n "Only PartitionedConfig objects created using one of the partitioned config "\n "decorators can be directly invoked."\n )\n else:\n return self._decorated_fn(*args, **kwargs)
\n\n\n
[docs]@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use tags_for_partition_key_fn instead.",\n)\ndef static_partitioned_config(\n partition_keys: Sequence[str],\n tags_for_partition_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig[StaticPartitionsDefinition]]:\n """Creates a static partitioned config for a job.\n\n The provided partition_keys is a static list of strings identifying the set of partitions. The\n list of partitions is static, so while the run config returned by the decorated function may\n change over time, the list of valid partition keys does not.\n\n This has performance advantages over `dynamic_partitioned_config` in terms of loading different\n partition views in the Dagster UI.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_keys (Sequence[str]): A list of valid partition keys, which serve as the range of\n values that can be provided to the decorated run config function.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.sequence_param(partition_keys, "partition_keys", str)\n\n tags_for_partition_key_fn = normalize_renamed_param(\n tags_for_partition_key_fn,\n "tags_for_partition_key_fn",\n tags_for_partition_fn,\n "tags_for_partition_fn",\n )\n\n def inner(\n fn: Callable[[str], Mapping[str, Any]]\n ) -> PartitionedConfig[StaticPartitionsDefinition]:\n return 
PartitionedConfig(\n partitions_def=StaticPartitionsDefinition(partition_keys),\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner
\n\n\ndef partitioned_config(\n partitions_def: PartitionsDefinition,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig]:\n """Creates a partitioned config for a job given a PartitionsDefinition.\n\n The partitions_def provides the set of partitions, which may change over time\n (for example, when using a DynamicPartitionsDefinition).\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partitions_def: (Optional[DynamicPartitionsDefinition]): PartitionsDefinition for the job\n tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.opt_callable_param(tags_for_partition_key_fn, "tags_for_partition_key_fn")\n\n def inner(fn: Callable[[str], Mapping[str, Any]]) -> PartitionedConfig:\n return PartitionedConfig(\n partitions_def=partitions_def,\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner\n\n\n
[docs]@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use tags_for_partition_key_fn instead.",\n)\ndef dynamic_partitioned_config(\n partition_fn: Callable[[Optional[datetime]], Sequence[str]],\n tags_for_partition_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig]:\n """Creates a dynamic partitioned config for a job.\n\n The provided partition_fn returns a list of strings identifying the set of partitions, given\n an optional datetime argument (representing the current time). The list of partitions returned\n may change over time.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_fn (Callable[[datetime.datetime], Sequence[str]]): A function that generates a\n list of valid partition keys, which serve as the range of values that can be provided\n to the decorated run config function.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.callable_param(partition_fn, "partition_fn")\n\n tags_for_partition_key_fn = normalize_renamed_param(\n tags_for_partition_key_fn,\n "tags_for_partition_key_fn",\n tags_for_partition_fn,\n "tags_for_partition_fn",\n )\n\n def inner(fn: Callable[[str], Mapping[str, Any]]) -> PartitionedConfig:\n return PartitionedConfig(\n partitions_def=DynamicPartitionsDefinition(partition_fn),\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner
\n\n\ndef cron_schedule_from_schedule_type_and_offsets(\n schedule_type: ScheduleType,\n minute_offset: int,\n hour_offset: int,\n day_offset: Optional[int],\n) -> str:\n if schedule_type is ScheduleType.HOURLY:\n return f"{minute_offset} * * * *"\n elif schedule_type is ScheduleType.DAILY:\n return f"{minute_offset} {hour_offset} * * *"\n elif schedule_type is ScheduleType.WEEKLY:\n return f"{minute_offset} {hour_offset} * * {day_offset if day_offset is not None else 0}"\n elif schedule_type is ScheduleType.MONTHLY:\n return f"{minute_offset} {hour_offset} {day_offset if day_offset is not None else 1} * *"\n else:\n check.assert_never(schedule_type)\n\n\nclass PartitionsSubset(ABC, Generic[T_str]):\n """Represents a subset of the partitions within a PartitionsDefinition."""\n\n @abstractmethod\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[T_str]: ...\n\n @abstractmethod\n @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[T_str]: ...\n\n @abstractmethod\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]: ...\n\n @abstractmethod\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "PartitionsSubset[T_str]": ...\n\n def with_partition_key_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset[T_str]":\n return self.with_partition_keys(\n self.partitions_def.get_partition_keys_in_range(\n partition_key_range, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n def __or__(self, other: "PartitionsSubset") -> "PartitionsSubset[T_str]":\n if self is other:\n return self\n return self.with_partition_keys(other.get_partition_keys())\n\n def __sub__(self, 
other: "PartitionsSubset") -> "PartitionsSubset[T_str]":\n if self is other:\n return self.partitions_def.empty_subset()\n return self.partitions_def.empty_subset().with_partition_keys(\n set(self.get_partition_keys()).difference(set(other.get_partition_keys()))\n )\n\n def __and__(self, other: "PartitionsSubset") -> "PartitionsSubset[T_str]":\n if self is other:\n return self\n return self.partitions_def.empty_subset().with_partition_keys(\n set(self.get_partition_keys()) & set(other.get_partition_keys())\n )\n\n @abstractmethod\n def serialize(self) -> str: ...\n\n @classmethod\n @abstractmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition[T_str], serialized: str\n ) -> "PartitionsSubset[T_str]": ...\n\n @classmethod\n @abstractmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool: ...\n\n @property\n @abstractmethod\n def partitions_def(self) -> PartitionsDefinition[T_str]: ...\n\n @abstractmethod\n def __len__(self) -> int: ...\n\n @abstractmethod\n def __contains__(self, value) -> bool: ...\n\n @classmethod\n @abstractmethod\n def empty_subset(\n cls, partitions_def: PartitionsDefinition[T_str]\n ) -> "PartitionsSubset[T_str]": ...\n\n\n@whitelist_for_serdes\nclass SerializedPartitionsSubset(NamedTuple):\n serialized_subset: str\n serialized_partitions_def_unique_id: str\n serialized_partitions_def_class_name: str\n\n @classmethod\n def from_subset(\n cls,\n subset: PartitionsSubset,\n partitions_def: PartitionsDefinition,\n dynamic_partitions_store: DynamicPartitionsStore,\n ):\n return cls(\n serialized_subset=subset.serialize(),\n serialized_partitions_def_unique_id=partitions_def.get_serializable_unique_identifier(\n dynamic_partitions_store\n ),\n serialized_partitions_def_class_name=partitions_def.__class__.__name__,\n )\n\n def can_deserialize(self, partitions_def: 
Optional[PartitionsDefinition]) -> bool:\n if not partitions_def:\n # Asset had a partitions definition at storage time, but no longer does\n return False\n\n return partitions_def.can_deserialize_subset(\n self.serialized_subset,\n serialized_partitions_def_unique_id=self.serialized_partitions_def_unique_id,\n serialized_partitions_def_class_name=self.serialized_partitions_def_class_name,\n )\n\n def deserialize(self, partitions_def: PartitionsDefinition) -> PartitionsSubset:\n return partitions_def.deserialize_subset(self.serialized_subset)\n\n\nclass DefaultPartitionsSubset(PartitionsSubset[T_str]):\n # Every time we change the serialization format, we should increment the version number.\n # This will ensure that we can gracefully degrade when deserializing old data.\n SERIALIZATION_VERSION = 1\n\n def __init__(\n self, partitions_def: PartitionsDefinition[T_str], subset: Optional[Set[T_str]] = None\n ):\n check.opt_set_param(subset, "subset")\n self._partitions_def = partitions_def\n self._subset = subset or set()\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n return (\n set(\n self._partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n - self._subset\n )\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n return self._subset\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n partition_keys = self._partitions_def.get_partition_keys(\n current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n cur_range_start = None\n cur_range_end = None\n result = []\n for partition_key in partition_keys:\n if partition_key in self._subset:\n if cur_range_start is None:\n 
cur_range_start = partition_key\n cur_range_end = partition_key\n else:\n if cur_range_start is not None and cur_range_end is not None:\n result.append(PartitionKeyRange(cur_range_start, cur_range_end))\n cur_range_start = cur_range_end = None\n\n if cur_range_start is not None and cur_range_end is not None:\n result.append(PartitionKeyRange(cur_range_start, cur_range_end))\n\n return result\n\n def with_partition_keys(\n self, partition_keys: Iterable[T_str]\n ) -> "DefaultPartitionsSubset[T_str]":\n return DefaultPartitionsSubset(\n self._partitions_def,\n self._subset | set(partition_keys),\n )\n\n def serialize(self) -> str:\n # Serialize version number, so attempting to deserialize old versions can be handled gracefully.\n # Any time the serialization format changes, we should increment the version number.\n return json.dumps({"version": self.SERIALIZATION_VERSION, "subset": list(self._subset)})\n\n @classmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition[T_str], serialized: str\n ) -> "PartitionsSubset[T_str]":\n # Check the version number, so only valid versions can be deserialized.\n data = json.loads(serialized)\n\n if isinstance(data, list):\n # backwards compatibility\n return cls(subset=set(data), partitions_def=partitions_def)\n else:\n if data.get("version") != cls.SERIALIZATION_VERSION:\n raise DagsterInvalidDeserializationVersionError(\n f"Attempted to deserialize partition subset with version {data.get('version')},"\n f" but only version {cls.SERIALIZATION_VERSION} is supported."\n )\n return cls(subset=set(data.get("subset")), partitions_def=partitions_def)\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition[T_str],\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n if serialized_partitions_def_class_name is not None:\n return serialized_partitions_def_class_name == 
partitions_def.__class__.__name__\n\n data = json.loads(serialized)\n return isinstance(data, list) or (\n data.get("subset") is not None and data.get("version") == cls.SERIALIZATION_VERSION\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition[T_str]:\n return self._partitions_def\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, DefaultPartitionsSubset)\n and self._partitions_def == other._partitions_def\n and self._subset == other._subset\n )\n\n def __len__(self) -> int:\n return len(self._subset)\n\n def __contains__(self, value) -> bool:\n return value in self._subset\n\n def __repr__(self) -> str:\n return (\n f"DefaultPartitionsSubset(subset={self._subset}, partitions_def={self._partitions_def})"\n )\n\n @classmethod\n def empty_subset(cls, partitions_def: PartitionsDefinition[T_str]) -> "PartitionsSubset[T_str]":\n return cls(partitions_def=partitions_def)\n
", "current_page_name": "_modules/dagster/_core/definitions/partition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition"}, "partition_key_range": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition_key_range

\nfrom typing import NamedTuple\n\nfrom dagster._annotations import PublicAttr\n\n\n
[docs]class PartitionKeyRange(NamedTuple):\n """Defines a range of partitions.\n\n Attributes:\n start (str): The starting partition key in the range (inclusive).\n end (str): The ending partition key in the range (inclusive).\n\n Examples:\n .. code-block:: python\n\n partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])\n partition_key_range = PartitionKeyRange(start="a", end="c") # Represents ["a", "b", "c"]\n """\n\n # Inclusive on both sides\n start: PublicAttr[str]\n end: PublicAttr[str]
\n
", "current_page_name": "_modules/dagster/_core/definitions/partition_key_range", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition_key_range"}, "partition_mapping": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition_mapping

\nimport collections.abc\nimport itertools\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import (\n    Collection,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental, public\nfrom dagster._core.definitions.multi_dimensional_partitions import (\n    MultiPartitionKey,\n    MultiPartitionsDefinition,\n)\nfrom dagster._core.definitions.partition import (\n    PartitionsDefinition,\n    PartitionsSubset,\n    StaticPartitionsDefinition,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindowPartitionsDefinition\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.warnings import disable_dagster_warnings\n\n\nclass UpstreamPartitionsResult(NamedTuple):\n    """Represents the result of mapping a PartitionsSubset to the corresponding\n    partitions in another PartitionsDefinition.\n\n    partitions_subset (PartitionsSubset): The resulting partitions subset that was\n        mapped to. Only contains partitions for existent partitions, filtering out nonexistent partitions.\n    required_but_nonexistent_partition_keys (Sequence[str]): A list containing invalid partition keys in to_partitions_def\n        that partitions in from_partitions_subset were mapped to.\n    """\n\n    partitions_subset: PartitionsSubset\n    required_but_nonexistent_partition_keys: Sequence[str]\n\n\n
[docs]class PartitionMapping(ABC):\n """Defines a correspondence between the partitions in an asset and the partitions in an asset\n that it depends on.\n\n Overriding PartitionMapping outside of Dagster is not supported. The abstract methods of this\n class may change at any time.\n """\n\n
[docs] @public\n @abstractmethod\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n """Returns the subset of partition keys in the downstream asset that use the data in the given\n partition key subset of the upstream asset.\n\n Args:\n upstream_partitions_subset (Union[PartitionKeyRange, PartitionsSubset]): The\n subset of partition keys in the upstream asset.\n downstream_partitions_def (PartitionsDefinition): The partitions definition for the\n downstream asset.\n """
\n\n
[docs] @public\n @abstractmethod\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n """Returns a UpstreamPartitionsResult object containing the partition keys the downstream\n partitions subset was mapped to in the upstream partitions definition.\n\n Valid upstream partitions will be included in UpstreamPartitionsResult.partitions_subset.\n Invalid upstream partitions will be included in UpstreamPartitionsResult.required_but_nonexistent_partition_keys.\n\n For example, if an upstream asset is time-partitioned and starts in June 2023, and the\n downstream asset is time-partitioned and starts in May 2023, this function would return a\n UpstreamPartitionsResult(PartitionsSubset("2023-06-01"), required_but_nonexistent_partition_keys=["2023-05-01"])\n when downstream_partitions_subset contains 2023-05-01 and 2023-06-01.\n """
\n\n\n
[docs]@whitelist_for_serdes\nclass IdentityPartitionMapping(PartitionMapping, NamedTuple("_IdentityPartitionMapping", [])):\n """Expects that the upstream and downstream assets are partitioned in the same way, and maps\n partitions in the downstream asset to the same partition in the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n if downstream_partitions_subset.partitions_def == upstream_partitions_def:\n return UpstreamPartitionsResult(downstream_partitions_subset, [])\n\n upstream_partition_keys = set(\n upstream_partitions_def.get_partition_keys(\n dynamic_partitions_store=dynamic_partitions_store\n )\n )\n downstream_partition_keys = set(downstream_partitions_subset.get_partition_keys())\n\n return UpstreamPartitionsResult(\n upstream_partitions_def.subset_with_partition_keys(\n list(upstream_partition_keys & downstream_partition_keys)\n ),\n list(downstream_partition_keys - upstream_partition_keys),\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if upstream_partitions_subset is None:\n check.failed("upstream asset is not partitioned")\n\n if upstream_partitions_subset.partitions_def == downstream_partitions_def:\n return upstream_partitions_subset\n\n upstream_partition_keys = set(upstream_partitions_subset.get_partition_keys())\n downstream_partition_keys = set(\n downstream_partitions_def.get_partition_keys(\n 
dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n return downstream_partitions_def.empty_subset().with_partition_keys(\n list(downstream_partition_keys & upstream_partition_keys)\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass AllPartitionMapping(PartitionMapping, NamedTuple("_AllPartitionMapping", [])):\n """Maps every partition in the downstream asset to every partition in the upstream asset.\n\n Commonly used in the case when the downstream asset is not partitioned, in which the entire\n downstream asset depends on all partitions of the usptream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n upstream_subset = upstream_partitions_def.subset_with_all_partitions(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n return UpstreamPartitionsResult(upstream_subset, [])\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n raise NotImplementedError()
\n\n\n
[docs]@whitelist_for_serdes\nclass LastPartitionMapping(PartitionMapping, NamedTuple("_LastPartitionMapping", [])):\n """Maps all dependencies to the last partition in the upstream asset.\n\n Commonly used in the case when the downstream asset is not partitioned, in which the entire\n downstream asset depends on the last partition of the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n last = upstream_partitions_def.get_last_partition_key(\n current_time=None, dynamic_partitions_store=dynamic_partitions_store\n )\n\n upstream_subset = upstream_partitions_def.empty_subset()\n if last is not None:\n upstream_subset = upstream_subset.with_partition_keys([last])\n\n return UpstreamPartitionsResult(upstream_subset, [])\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n raise NotImplementedError()
\n\n\n
[docs]@whitelist_for_serdes\nclass SpecificPartitionsPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_SpecificPartitionsPartitionMapping", [("partition_keys", PublicAttr[Sequence[str]])]\n ),\n):\n """Maps to a specific subset of partitions in the upstream asset.\n\n Example:\n .. code-block:: python\n\n from dagster import SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset\n\n @asset(partitions_def=StaticPartitionsDefinition(["a", "b", "c"]))\n def upstream():\n ...\n\n @asset(\n ins={\n "upstream": AssetIn(partition_mapping=SpecificPartitionsPartitionMapping(["a"]))\n }\n )\n def a_downstream(upstream):\n ...\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n return UpstreamPartitionsResult(\n upstream_partitions_def.subset_with_partition_keys(self.partition_keys), []\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n # if any of the partition keys in this partition mapping are contained within the upstream\n # partitions subset, then all partitions of the downstream asset are dependencies\n if any(key in upstream_partitions_subset for key in self.partition_keys):\n return downstream_partitions_def.subset_with_all_partitions(\n dynamic_partitions_store=dynamic_partitions_store\n )\n return downstream_partitions_def.empty_subset()
\n\n\nclass DimensionDependency(NamedTuple):\n partition_mapping: PartitionMapping\n upstream_dimension_name: Optional[str] = None\n downstream_dimension_name: Optional[str] = None\n\n\nclass BaseMultiPartitionMapping(ABC):\n @abstractmethod\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]: ...\n\n def get_partitions_def(\n self, partitions_def: PartitionsDefinition, dimension_name: Optional[str]\n ) -> PartitionsDefinition:\n if isinstance(partitions_def, MultiPartitionsDefinition):\n if not isinstance(dimension_name, str):\n check.failed("Expected dimension_name to be a string")\n return partitions_def.get_partitions_def_for_dimension(dimension_name)\n return partitions_def\n\n def _get_dependency_partitions_subset(\n self,\n a_partitions_def: PartitionsDefinition,\n a_partitions_subset: PartitionsSubset,\n b_partitions_def: PartitionsDefinition,\n a_upstream_of_b: bool,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n current_time: Optional[datetime] = None,\n ) -> Union[UpstreamPartitionsResult, PartitionsSubset]:\n """Given two partitions definitions a_partitions_def and b_partitions_def that have a dependency\n relationship (a_upstream_of_b is True if a_partitions_def is upstream of b_partitions_def),\n and a_partition_keys, a list of partition keys in a_partitions_def, returns a list of\n partition keys in the partitions definition b_partitions_def that are\n dependencies of the partition keys in a_partition_keys.\n """\n a_partition_keys_by_dimension = defaultdict(set)\n if isinstance(a_partitions_def, MultiPartitionsDefinition):\n for partition_key in a_partitions_subset.get_partition_keys():\n for dimension_name, key in cast(\n MultiPartitionKey, partition_key\n ).keys_by_dimension.items():\n a_partition_keys_by_dimension[dimension_name].add(key)\n else:\n for partition_key in 
a_partitions_subset.get_partition_keys():\n a_partition_keys_by_dimension[None].add(partition_key)\n\n # Maps the dimension name and key of a partition in a_partitions_def to the list of\n # partition keys in b_partitions_def that are dependencies of that partition\n dep_b_keys_by_a_dim_and_key: Dict[Optional[str], Dict[Optional[str], List[str]]] = (\n defaultdict(lambda: defaultdict(list))\n )\n required_but_nonexistent_upstream_partitions = set()\n\n b_dimension_partitions_def_by_name: Dict[Optional[str], PartitionsDefinition] = (\n {\n dimension.name: dimension.partitions_def\n for dimension in b_partitions_def.partitions_defs\n }\n if isinstance(b_partitions_def, MultiPartitionsDefinition)\n else {None: b_partitions_def}\n )\n\n if a_upstream_of_b:\n # a_partitions_def is upstream of b_partitions_def, so we need to map the\n # dimension names of a_partitions_def to the corresponding dependent dimensions of\n # b_partitions_def\n a_dim_to_dependency_b_dim = {\n dimension_mapping.upstream_dimension_name: (\n dimension_mapping.downstream_dimension_name,\n dimension_mapping.partition_mapping,\n )\n for dimension_mapping in self.get_dimension_dependencies(\n a_partitions_def, b_partitions_def\n )\n }\n\n for a_dim_name, keys in a_partition_keys_by_dimension.items():\n if a_dim_name in a_dim_to_dependency_b_dim:\n (\n b_dim_name,\n dimension_mapping,\n ) = a_dim_to_dependency_b_dim[a_dim_name]\n a_dimension_partitions_def = self.get_partitions_def(\n a_partitions_def, a_dim_name\n )\n b_dimension_partitions_def = self.get_partitions_def(\n b_partitions_def, b_dim_name\n )\n for key in keys:\n # if downstream dimension mapping exists, for a given key, get the list of\n # downstream partition keys that are dependencies of that key\n dep_b_keys_by_a_dim_and_key[a_dim_name][key] = list(\n dimension_mapping.get_downstream_partitions_for_partitions(\n a_dimension_partitions_def.empty_subset().with_partition_keys(\n [key]\n ),\n b_dimension_partitions_def,\n 
current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n ).get_partition_keys()\n )\n\n else:\n # a_partitions_def is downstream of b_partitions_def, so we need to map the\n # dimension names of a_partitions_def to the corresponding dependency dimensions of\n # b_partitions_def\n a_dim_to_dependency_b_dim = {\n dimension_mapping.downstream_dimension_name: (\n dimension_mapping.upstream_dimension_name,\n dimension_mapping.partition_mapping,\n )\n for dimension_mapping in self.get_dimension_dependencies(\n b_partitions_def, a_partitions_def\n )\n }\n\n for a_dim_name, keys in a_partition_keys_by_dimension.items():\n if a_dim_name in a_dim_to_dependency_b_dim:\n (\n b_dim_name,\n partition_mapping,\n ) = a_dim_to_dependency_b_dim[a_dim_name]\n a_dimension_partitions_def = self.get_partitions_def(\n a_partitions_def, a_dim_name\n )\n b_dimension_partitions_def = self.get_partitions_def(\n b_partitions_def, b_dim_name\n )\n for key in keys:\n mapped_partitions_result = (\n partition_mapping.get_upstream_mapped_partitions_result_for_partitions(\n a_dimension_partitions_def.empty_subset().with_partition_keys(\n [key]\n ),\n b_dimension_partitions_def,\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n )\n dep_b_keys_by_a_dim_and_key[a_dim_name][key] = list(\n mapped_partitions_result.partitions_subset.get_partition_keys()\n )\n required_but_nonexistent_upstream_partitions.update(\n set(mapped_partitions_result.required_but_nonexistent_partition_keys)\n )\n\n b_partition_keys = set()\n\n mapped_a_dim_names = a_dim_to_dependency_b_dim.keys()\n mapped_b_dim_names = [mapping[0] for mapping in a_dim_to_dependency_b_dim.values()]\n unmapped_b_dim_names = list(\n set(b_dimension_partitions_def_by_name.keys()) - set(mapped_b_dim_names)\n )\n\n for key in a_partitions_subset.get_partition_keys():\n for b_key_values in itertools.product(\n *(\n [\n dep_b_keys_by_a_dim_and_key[dim_name][\n (\n cast(MultiPartitionKey, 
key).keys_by_dimension[dim_name]\n if dim_name\n else key\n )\n ]\n for dim_name in mapped_a_dim_names\n ]\n ),\n *[\n b_dimension_partitions_def_by_name[dim_name].get_partition_keys()\n for dim_name in unmapped_b_dim_names\n ],\n ):\n b_partition_keys.add(\n MultiPartitionKey(\n {\n cast(str, (mapped_b_dim_names + unmapped_b_dim_names)[i]): key\n for i, key in enumerate(b_key_values)\n }\n )\n if len(b_key_values) > 1\n else b_key_values[0]\n )\n\n mapped_subset = b_partitions_def.empty_subset().with_partition_keys(b_partition_keys)\n if a_upstream_of_b:\n return mapped_subset\n else:\n return UpstreamPartitionsResult(\n mapped_subset,\n required_but_nonexistent_partition_keys=list(\n required_but_nonexistent_upstream_partitions\n ),\n )\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n result = self._get_dependency_partitions_subset(\n cast(MultiPartitionsDefinition, downstream_partitions_subset.partitions_def),\n downstream_partitions_subset,\n cast(MultiPartitionsDefinition, upstream_partitions_def),\n a_upstream_of_b=False,\n dynamic_partitions_store=dynamic_partitions_store,\n current_time=current_time,\n )\n\n if not isinstance(result, UpstreamPartitionsResult):\n check.failed("Expected UpstreamPartitionsResult")\n\n return result\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if upstream_partitions_subset is None:\n check.failed("upstream asset 
is not partitioned")\n\n result = self._get_dependency_partitions_subset(\n cast(MultiPartitionsDefinition, upstream_partitions_subset.partitions_def),\n upstream_partitions_subset,\n cast(MultiPartitionsDefinition, downstream_partitions_def),\n a_upstream_of_b=True,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n if isinstance(result, UpstreamPartitionsResult):\n check.failed("Expected PartitionsSubset")\n\n return result\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass MultiToSingleDimensionPartitionMapping(\n BaseMultiPartitionMapping,\n PartitionMapping,\n NamedTuple(\n "_MultiToSingleDimensionPartitionMapping", [("partition_dimension_name", Optional[str])]\n ),\n):\n """Defines a correspondence between an single-dimensional partitions definition\n and a MultiPartitionsDefinition. The single-dimensional partitions definition must be\n a dimension of the MultiPartitionsDefinition.\n\n This class handles the case where the upstream asset is multipartitioned and the\n downstream asset is single dimensional, and vice versa.\n\n For a partition key X, this partition mapping assumes that any multi-partition key with\n X in the selected dimension is a dependency.\n\n Args:\n partition_dimension_name (Optional[str]): The name of the partition dimension in the\n MultiPartitionsDefinition that matches the single-dimension partitions definition.\n """\n\n def __new__(cls, partition_dimension_name: Optional[str] = None):\n return super(MultiToSingleDimensionPartitionMapping, cls).__new__(\n cls,\n partition_dimension_name=check.opt_str_param(\n partition_dimension_name, "partition_dimension_name"\n ),\n )\n\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]:\n infer_mapping_result = _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def, downstream_partitions_def\n )\n\n if not infer_mapping_result.can_infer:\n check.invariant(isinstance(infer_mapping_result.inference_failure_reason, str))\n check.failed(cast(str, infer_mapping_result.inference_failure_reason))\n\n return [cast(DimensionDependency, infer_mapping_result.dimension_dependency)]
\n\n\n@whitelist_for_serdes\nclass DimensionPartitionMapping(\n NamedTuple(\n "_DimensionPartitionMapping",\n [\n ("dimension_name", str),\n ("partition_mapping", PartitionMapping),\n ],\n )\n):\n """A helper class for MultiPartitionMapping that defines a partition mapping used to calculate\n the dependent partition keys in the selected downstream MultiPartitions definition dimension.\n\n Args:\n dimension_name (str): The name of the dimension in the downstream MultiPartitionsDefinition.\n partition_mapping (PartitionMapping): The partition mapping object used to calculate\n the downstream dimension partitions from the upstream dimension partitions and vice versa.\n """\n\n def __new__(\n cls,\n dimension_name: str,\n partition_mapping: PartitionMapping,\n ):\n return super(DimensionPartitionMapping, cls).__new__(\n cls,\n dimension_name=check.str_param(dimension_name, "dimension_name"),\n partition_mapping=check.inst_param(\n partition_mapping, "partition_mapping", PartitionMapping\n ),\n )\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass MultiPartitionMapping(\n BaseMultiPartitionMapping,\n PartitionMapping,\n NamedTuple(\n "_MultiPartitionMapping",\n [("downstream_mappings_by_upstream_dimension", Mapping[str, DimensionPartitionMapping])],\n ),\n):\n """Defines a correspondence between two MultiPartitionsDefinitions.\n\n Accepts a mapping of upstream dimension name to downstream DimensionPartitionMapping, representing\n the explicit correspondence between the upstream and downstream MultiPartitions dimensions\n and the partition mapping used to calculate the downstream partitions.\n\n Examples:\n .. code-block:: python\n\n weekly_abc = MultiPartitionsDefinition(\n {\n "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n "weekly": WeeklyPartitionsDefinition("2023-01-01"),\n }\n )\n daily_123 = MultiPartitionsDefinition(\n {\n "123": StaticPartitionsDefinition(["1", "2", "3"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n\n MultiPartitionsMapping(\n {\n "abc": DimensionPartitionMapping(\n dimension_name="123",\n partition_mapping=StaticPartitionMapping({"a": "1", "b": "2", "c": "3"}),\n ),\n "weekly": DimensionPartitionMapping(\n dimension_name="daily",\n partition_mapping=TimeWindowPartitionMapping(),\n )\n }\n )\n\n For upstream or downstream dimensions not explicitly defined in the mapping, Dagster will\n assume an `AllPartitionsMapping`, meaning that all upstream partitions in those dimensions\n will be mapped to all downstream partitions in those dimensions.\n\n Examples:\n .. 
code-block:: python\n\n weekly_abc = MultiPartitionsDefinition(\n {\n "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n daily_123 = MultiPartitionsDefinition(\n {\n "123": StaticPartitionsDefinition(["1", "2", "3"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n\n MultiPartitionsMapping(\n {\n "daily": DimensionPartitionMapping(\n dimension_name="daily",\n partition_mapping=IdentityPartitionMapping(),\n )\n }\n )\n\n # Will map `daily_123` partition key {"123": "1", "daily": "2023-01-01"} to the upstream:\n # {"abc": "a", "daily": "2023-01-01"}\n # {"abc": "b", "daily": "2023-01-01"}\n # {"abc": "c", "daily": "2023-01-01"}\n\n Args:\n downstream_mappings_by_upstream_dimension (Mapping[str, DimensionPartitionMapping]): A\n mapping that defines an explicit correspondence between one dimension of the upstream\n MultiPartitionsDefinition and one dimension of the downstream MultiPartitionsDefinition.\n Maps a string representing upstream dimension name to downstream DimensionPartitionMapping,\n containing the downstream dimension name and partition mapping.\n """\n\n def __new__(\n cls, downstream_mappings_by_upstream_dimension: Mapping[str, DimensionPartitionMapping]\n ):\n return super(MultiPartitionMapping, cls).__new__(\n cls,\n downstream_mappings_by_upstream_dimension=check.mapping_param(\n downstream_mappings_by_upstream_dimension,\n "downstream_mappings_by_upstream_dimension",\n key_type=str,\n value_type=DimensionPartitionMapping,\n ),\n )\n\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]:\n self._check_all_dimensions_accounted_for(\n upstream_partitions_def,\n downstream_partitions_def,\n )\n\n return [\n DimensionDependency(\n mapping.partition_mapping,\n upstream_dimension_name=upstream_dimension,\n downstream_dimension_name=mapping.dimension_name,\n 
)\n for upstream_dimension, mapping in self.downstream_mappings_by_upstream_dimension.items()\n ]\n\n def _check_all_dimensions_accounted_for(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> None:\n if any(\n not isinstance(partitions_def, MultiPartitionsDefinition)\n for partitions_def in (upstream_partitions_def, downstream_partitions_def)\n ):\n check.failed(\n "Both partitions defs provided to a MultiPartitionMapping must be multi-partitioned"\n )\n\n upstream_dimension_names = {\n dim.name\n for dim in cast(MultiPartitionsDefinition, upstream_partitions_def).partitions_defs\n }\n dimension_names = {\n dim.name\n for dim in cast(MultiPartitionsDefinition, downstream_partitions_def).partitions_defs\n }\n\n for (\n upstream_dimension_name,\n dimension_mapping,\n ) in self.downstream_mappings_by_upstream_dimension.items():\n if upstream_dimension_name not in upstream_dimension_names:\n check.failed(\n "Dimension mapping has an upstream dimension name that is not in the upstream "\n "partitions def"\n )\n if dimension_mapping.dimension_name not in dimension_names:\n check.failed(\n "Dimension mapping has a downstream dimension name that is not in the"\n " downstream partitions def"\n )\n\n upstream_dimension_names.remove(upstream_dimension_name)\n dimension_names.remove(dimension_mapping.dimension_name)
\n\n\n
[docs]@whitelist_for_serdes\nclass StaticPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_StaticPartitionMapping",\n [\n (\n "downstream_partition_keys_by_upstream_partition_key",\n PublicAttr[Mapping[str, Union[str, Collection[str]]]],\n )\n ],\n ),\n):\n """Define an explicit correspondence between two StaticPartitionsDefinitions.\n\n Args:\n downstream_partition_keys_by_upstream_partition_key (Dict[str, str | Collection[str]]):\n The single or multi-valued correspondence from upstream keys to downstream keys.\n """\n\n def __init__(\n self,\n downstream_partition_keys_by_upstream_partition_key: Mapping[\n str, Union[str, Collection[str]]\n ],\n ):\n check.mapping_param(\n downstream_partition_keys_by_upstream_partition_key,\n "downstream_partition_keys_by_upstream_partition_key",\n key_type=str,\n value_type=(str, collections.abc.Collection),\n )\n\n # cache forward and reverse mappings\n self._mapping = defaultdict(set)\n for (\n upstream_key,\n downstream_keys,\n ) in downstream_partition_keys_by_upstream_partition_key.items():\n self._mapping[upstream_key] = (\n {downstream_keys} if isinstance(downstream_keys, str) else set(downstream_keys)\n )\n\n self._inverse_mapping = defaultdict(set)\n for upstream_key, downstream_keys in self._mapping.items():\n for downstream_key in downstream_keys:\n self._inverse_mapping[downstream_key].add(upstream_key)\n\n @cached_method\n def _check_upstream(self, *, upstream_partitions_def: PartitionsDefinition):\n """Validate that the mapping from upstream to downstream is only defined on upstream keys."""\n check.inst(\n upstream_partitions_def,\n StaticPartitionsDefinition,\n "StaticPartitionMapping can only be defined between two StaticPartitionsDefinitions",\n )\n upstream_keys = upstream_partitions_def.get_partition_keys()\n extra_keys = set(self._mapping.keys()).difference(upstream_keys)\n if extra_keys:\n raise ValueError(\n f"mapping source partitions not in the upstream partitions definition: {extra_keys}"\n 
)\n\n @cached_method\n def _check_downstream(self, *, downstream_partitions_def: PartitionsDefinition):\n """Validate that the mapping from upstream to downstream only maps to downstream keys."""\n check.inst(\n downstream_partitions_def,\n StaticPartitionsDefinition,\n "StaticPartitionMapping can only be defined between two StaticPartitionsDefinitions",\n )\n downstream_keys = downstream_partitions_def.get_partition_keys()\n extra_keys = set(self._inverse_mapping.keys()).difference(downstream_keys)\n if extra_keys:\n raise ValueError(\n "mapping target partitions not in the downstream partitions definition:"\n f" {extra_keys}"\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n self._check_downstream(downstream_partitions_def=downstream_partitions_def)\n\n downstream_subset = downstream_partitions_def.empty_subset()\n downstream_keys = set()\n for key in upstream_partitions_subset.get_partition_keys():\n downstream_keys.update(self._mapping[key])\n return downstream_subset.with_partition_keys(downstream_keys)\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n self._check_upstream(upstream_partitions_def=upstream_partitions_def)\n\n upstream_subset = upstream_partitions_def.empty_subset()\n if downstream_partitions_subset is None:\n return UpstreamPartitionsResult(upstream_subset, [])\n\n upstream_keys = set()\n for key in downstream_partitions_subset.get_partition_keys():\n upstream_keys.update(self._inverse_mapping[key])\n\n return 
UpstreamPartitionsResult(upstream_subset.with_partition_keys(upstream_keys), [])
\n\n\nclass InferSingleToMultiDimensionDepsResult(\n NamedTuple(\n "_InferSingleToMultiDimensionDepsResult",\n [\n ("can_infer", bool),\n ("inference_failure_reason", Optional[str]),\n ("dimension_dependency", Optional[DimensionDependency]),\n ],\n )\n):\n def __new__(\n cls,\n can_infer: bool,\n inference_failure_reason: Optional[str] = None,\n dimension_dependency: Optional[DimensionDependency] = None,\n ):\n if can_infer and dimension_dependency is None:\n check.failed("dimension_dependency must be provided if can_infer is True")\n if not can_infer and inference_failure_reason is None:\n check.failed("inference_failure_reason must be provided if can_infer is False")\n\n return super(InferSingleToMultiDimensionDepsResult, cls).__new__(\n cls,\n can_infer,\n inference_failure_reason,\n dimension_dependency,\n )\n\n\ndef _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n partition_dimension_name: Optional[str] = None,\n) -> InferSingleToMultiDimensionDepsResult:\n from dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\n\n upstream_is_multipartitioned = isinstance(upstream_partitions_def, MultiPartitionsDefinition)\n\n multipartitions_defs = [\n partitions_def\n for partitions_def in [upstream_partitions_def, downstream_partitions_def]\n if isinstance(partitions_def, MultiPartitionsDefinition)\n ]\n if len(multipartitions_defs) != 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n "Can only use MultiToSingleDimensionPartitionMapping when upstream asset is"\n " multipartitioned and the downstream asset is single dimensional, or vice versa."\n f" Instead received {len(multipartitions_defs)} multi-partitioned assets.",\n )\n\n multipartitions_def = cast(MultiPartitionsDefinition, next(iter(multipartitions_defs)))\n\n single_dimension_partitions_def = next(\n iter(\n {\n upstream_partitions_def,\n 
downstream_partitions_def,\n }\n - set(multipartitions_defs)\n )\n )\n\n filtered_multipartition_dims = (\n multipartitions_def.partitions_defs\n if partition_dimension_name is None\n else [\n dim\n for dim in multipartitions_def.partitions_defs\n if dim.name == partition_dimension_name\n ]\n )\n\n if partition_dimension_name:\n if len(filtered_multipartition_dims) != 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n f"Provided partition dimension name {partition_dimension_name} not found in"\n f" multipartitions definition {multipartitions_def}.",\n )\n\n matching_dimension_defs = [\n dimension_def\n for dimension_def in filtered_multipartition_dims\n if dimension_def.partitions_def == single_dimension_partitions_def\n ]\n\n if len(matching_dimension_defs) == 1:\n return InferSingleToMultiDimensionDepsResult(\n True,\n dimension_dependency=DimensionDependency(\n IdentityPartitionMapping(),\n upstream_dimension_name=(\n matching_dimension_defs[0].name if upstream_is_multipartitioned else None\n ),\n downstream_dimension_name=(\n matching_dimension_defs[0].name if not upstream_is_multipartitioned else None\n ),\n ),\n )\n elif len(matching_dimension_defs) > 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n "partition dimension name must be specified when multiple dimensions of the"\n " MultiPartitionsDefinition match the single dimension partitions def",\n )\n\n time_dimensions = [\n dimension_def\n for dimension_def in filtered_multipartition_dims\n if isinstance(dimension_def.partitions_def, TimeWindowPartitionsDefinition)\n ]\n\n if len(time_dimensions) == 1 and isinstance(\n single_dimension_partitions_def, TimeWindowPartitionsDefinition\n ):\n return InferSingleToMultiDimensionDepsResult(\n True,\n dimension_dependency=DimensionDependency(\n TimeWindowPartitionMapping(),\n upstream_dimension_name=(\n time_dimensions[0].name if upstream_is_multipartitioned else None\n ),\n downstream_dimension_name=(\n time_dimensions[0].name if not 
upstream_is_multipartitioned else None\n ),\n ),\n )\n\n return InferSingleToMultiDimensionDepsResult(\n False,\n "MultiToSingleDimensionPartitionMapping can only be used when: \\n(a) The single dimensional"\n " partitions definition is a dimension of the MultiPartitionsDefinition.\\n(b) The single"\n " dimensional partitions definition is a TimeWindowPartitionsDefinition and the"\n " MultiPartitionsDefinition has a single time dimension.",\n )\n\n\ndef infer_partition_mapping(\n partition_mapping: Optional[PartitionMapping],\n downstream_partitions_def: Optional[PartitionsDefinition],\n upstream_partitions_def: Optional[PartitionsDefinition],\n) -> PartitionMapping:\n from .time_window_partition_mapping import TimeWindowPartitionMapping\n\n if partition_mapping is not None:\n return partition_mapping\n elif upstream_partitions_def and downstream_partitions_def:\n if _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def, downstream_partitions_def\n ).can_infer:\n with disable_dagster_warnings():\n return MultiToSingleDimensionPartitionMapping()\n elif isinstance(upstream_partitions_def, TimeWindowPartitionsDefinition) and isinstance(\n downstream_partitions_def, TimeWindowPartitionsDefinition\n ):\n return TimeWindowPartitionMapping()\n else:\n return IdentityPartitionMapping()\n else:\n return AllPartitionMapping()\n\n\ndef get_builtin_partition_mapping_types() -> Tuple[Type[PartitionMapping], ...]:\n from dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\n\n return (\n AllPartitionMapping,\n IdentityPartitionMapping,\n LastPartitionMapping,\n SpecificPartitionsPartitionMapping,\n StaticPartitionMapping,\n TimeWindowPartitionMapping,\n MultiToSingleDimensionPartitionMapping,\n MultiPartitionMapping,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/partition_mapping", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition_mapping"}, "partitioned_schedule": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partitioned_schedule

\nfrom typing import Callable, Mapping, NamedTuple, Optional, Union, cast\n\nimport dagster._check as check\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .decorators.schedule_decorator import schedule\nfrom .job_definition import JobDefinition\nfrom .multi_dimensional_partitions import MultiPartitionsDefinition\nfrom .partition import PartitionsDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .schedule_definition import (\n    DefaultScheduleStatus,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n)\nfrom .time_window_partitions import (\n    TimeWindowPartitionsDefinition,\n    get_time_partitions_def,\n    has_one_dimension_time_window_partitioning,\n)\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\n\nclass UnresolvedPartitionedAssetScheduleDefinition(NamedTuple):\n    """Points to an unresolved asset job. The asset selection isn't resolved yet, so we can't resolve\n    the PartitionsDefinition, so we can't resolve the schedule cadence.\n    """\n\n    name: str\n    job: UnresolvedAssetJobDefinition\n    description: Optional[str]\n    default_status: DefaultScheduleStatus\n    minute_of_hour: Optional[int]\n    hour_of_day: Optional[int]\n    day_of_week: Optional[int]\n    day_of_month: Optional[int]\n    tags: Optional[Mapping[str, str]]\n\n    def resolve(self, resolved_job: JobDefinition) -> ScheduleDefinition:\n        partitions_def = resolved_job.partitions_def\n        if partitions_def is None:\n            check.failed(\n                f"Job '{resolved_job.name}' provided to build_schedule_from_partitioned_job must"\n                " contain partitioned assets or a partitions definition."\n            )\n\n        partitions_def = _check_valid_schedule_partitions_def(partitions_def)\n        time_partitions_def = check.not_none(get_time_partitions_def(partitions_def))\n\n        return ScheduleDefinition(\n            job=resolved_job,\n        
    name=self.name,\n            execution_fn=_get_schedule_evaluation_fn(partitions_def, resolved_job, self.tags),\n            execution_timezone=time_partitions_def.timezone,\n            cron_schedule=time_partitions_def.get_cron_schedule(\n                self.minute_of_hour, self.hour_of_day, self.day_of_week, self.day_of_month\n            ),\n        )\n\n\n
[docs]def build_schedule_from_partitioned_job(\n job: Union[JobDefinition, UnresolvedAssetJobDefinition],\n description: Optional[str] = None,\n name: Optional[str] = None,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n tags: Optional[Mapping[str, str]] = None,\n) -> Union[UnresolvedPartitionedAssetScheduleDefinition, ScheduleDefinition]:\n """Creates a schedule from a time window-partitioned job or a job that targets\n time window-partitioned assets. The job can also be multipartitioned, as long as one\n of the partitions dimensions is time-partitioned.\n\n The schedule executes at the cadence specified by the time partitioning of the job or assets.\n\n Examples:\n .. code-block:: python\n\n ######################################\n # Job that targets partitioned assets\n ######################################\n\n from dagster import (\n DailyPartitionsDefinition,\n asset,\n build_schedule_from_partitioned_job,\n define_asset_job,\n )\n\n @asset(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\n def asset1():\n ...\n\n asset1_job = define_asset_job("asset1_job", selection=[asset1])\n\n # The created schedule will fire daily\n asset1_job_schedule = build_schedule_from_partitioned_job(asset1_job)\n\n defs = Definitions(assets=[asset1], schedules=[asset1_job_schedule])\n\n ################\n # Non-asset job\n ################\n\n from dagster import DailyPartitionsDefinition, build_schedule_from_partitioned_job, jog\n\n\n @job(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\n def do_stuff_partitioned():\n ...\n\n # The created schedule will fire daily\n do_stuff_partitioned_schedule = build_schedule_from_partitioned_job(\n do_stuff_partitioned,\n )\n\n defs = Definitions(schedules=[do_stuff_partitioned_schedule])\n """\n check.invariant(\n not 
(day_of_week and day_of_month),\n "Cannot provide both day_of_month and day_of_week parameter to"\n " build_schedule_from_partitioned_job.",\n )\n\n if isinstance(job, UnresolvedAssetJobDefinition) and job.partitions_def is None:\n return UnresolvedPartitionedAssetScheduleDefinition(\n job=job,\n default_status=default_status,\n name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n description=check.opt_str_param(description, "description"),\n minute_of_hour=minute_of_hour,\n hour_of_day=hour_of_day,\n day_of_week=day_of_week,\n day_of_month=day_of_month,\n tags=tags,\n )\n else:\n partitions_def = job.partitions_def\n if partitions_def is None:\n check.failed("The provided job is not partitioned")\n\n partitions_def = _check_valid_schedule_partitions_def(partitions_def)\n time_partitions_def = check.not_none(get_time_partitions_def(partitions_def))\n\n return schedule(\n cron_schedule=time_partitions_def.get_cron_schedule(\n minute_of_hour, hour_of_day, day_of_week, day_of_month\n ),\n job=job,\n default_status=default_status,\n execution_timezone=time_partitions_def.timezone,\n name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n description=check.opt_str_param(description, "description"),\n )(_get_schedule_evaluation_fn(partitions_def, job, tags))
\n\n\ndef _get_schedule_evaluation_fn(\n partitions_def: PartitionsDefinition,\n job: Union[JobDefinition, UnresolvedAssetJobDefinition],\n tags: Optional[Mapping[str, str]] = None,\n) -> Callable[[ScheduleEvaluationContext], Union[SkipReason, RunRequest, RunRequestIterator]]:\n def schedule_fn(context):\n # Run for the latest partition. Prior partitions will have been handled by prior ticks.\n if isinstance(partitions_def, TimeWindowPartitionsDefinition):\n partition_key = partitions_def.get_last_partition_key(context.scheduled_execution_time)\n if partition_key is None:\n return SkipReason("The job's PartitionsDefinition has no partitions")\n\n return job.run_request_for_partition(\n partition_key=partition_key,\n run_key=partition_key,\n tags=tags,\n current_time=context.scheduled_execution_time,\n )\n else:\n check.invariant(isinstance(partitions_def, MultiPartitionsDefinition))\n time_window_dimension = partitions_def.time_window_dimension\n partition_key = time_window_dimension.partitions_def.get_last_partition_key(\n context.scheduled_execution_time\n )\n if partition_key is None:\n return SkipReason("The job's PartitionsDefinition has no partitions")\n\n return [\n job.run_request_for_partition(\n partition_key=key,\n run_key=key,\n tags=tags,\n current_time=context.scheduled_execution_time,\n dynamic_partitions_store=context.instance if context.instance_ref else None,\n )\n for key in partitions_def.get_multipartition_keys_with_dimension_value(\n time_window_dimension.name,\n partition_key,\n dynamic_partitions_store=context.instance if context.instance_ref else None,\n )\n ]\n\n return schedule_fn\n\n\ndef _check_valid_schedule_partitions_def(\n partitions_def: PartitionsDefinition,\n) -> Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition]:\n if not has_one_dimension_time_window_partitioning(partitions_def):\n raise DagsterInvalidDefinitionError(\n "Tried to build a partitioned schedule from an asset job, but received an invalid"\n " 
partitions definition. The permitted partitions definitions are: \\n1."\n " TimeWindowPartitionsDefinition\\n2. MultiPartitionsDefinition with a single"\n " TimeWindowPartitionsDefinition dimension"\n )\n\n return cast(Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def)\n\n\nschedule_from_partitions = build_schedule_from_partitioned_job\n
", "current_page_name": "_modules/dagster/_core/definitions/partitioned_schedule", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partitioned_schedule"}, "policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.policy

\nfrom enum import Enum\nfrom random import random\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\n\n
[docs]class Backoff(Enum):\n """A modifier for delay as a function of attempt number.\n\n LINEAR: `attempt_num * delay`\n EXPONENTIAL: `((2 ^ attempt_num) - 1) * delay`\n """\n\n LINEAR = "LINEAR"\n EXPONENTIAL = "EXPONENTIAL"
\n\n\n
[docs]class Jitter(Enum):\n """A randomizing modifier for delay, applied after backoff calculation.\n\n FULL: between 0 and the calculated delay based on backoff: `random() * backoff_delay`\n PLUS_MINUS: +/- the delay: `backoff_delay + ((2 * (random() * delay)) - delay)`\n """\n\n FULL = "FULL"\n PLUS_MINUS = "PLUS_MINUS"
\n\n\n
[docs]class RetryPolicy(\n NamedTuple(\n "_RetryPolicy",\n [\n ("max_retries", PublicAttr[int]),\n ("delay", PublicAttr[Optional[check.Numeric]]),\n # declarative time modulation to allow calc witout running user function\n ("backoff", PublicAttr[Optional[Backoff]]),\n ("jitter", PublicAttr[Optional[Jitter]]),\n ],\n ),\n):\n """A declarative policy for when to request retries when an exception occurs during op execution.\n\n Args:\n max_retries (int):\n The maximum number of retries to attempt. Defaults to 1.\n delay (Optional[Union[int,float]]):\n The time in seconds to wait between the retry being requested and the next attempt\n being started. This unit of time can be modulated as a function of attempt number\n with backoff and randomly with jitter.\n backoff (Optional[Backoff]):\n A modifier for delay as a function of retry attempt number.\n jitter (Optional[Jitter]):\n A randomizing modifier for delay, applied after backoff calculation.\n """\n\n def __new__(\n cls,\n max_retries: int = 1,\n delay: Optional[check.Numeric] = None,\n backoff: Optional[Backoff] = None,\n jitter: Optional[Jitter] = None,\n ):\n if backoff is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set jitter on RetryPolicy without also setting delay"\n )\n\n if jitter is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set backoff on RetryPolicy without also setting delay"\n )\n\n return super().__new__(\n cls,\n max_retries=check.int_param(max_retries, "max_retries"),\n delay=check.opt_numeric_param(delay, "delay"),\n backoff=check.opt_inst_param(backoff, "backoff", Backoff),\n jitter=check.opt_inst_param(jitter, "jitter", Jitter),\n )\n\n def calculate_delay(self, attempt_num: int) -> check.Numeric:\n return calculate_delay(\n attempt_num=attempt_num,\n backoff=self.backoff,\n jitter=self.jitter,\n base_delay=self.delay or 0,\n )
\n\n\ndef calculate_delay(\n attempt_num: int, backoff: Optional[Backoff], jitter: Optional[Jitter], base_delay: float\n) -> float:\n if backoff is Backoff.EXPONENTIAL:\n calc_delay = ((2**attempt_num) - 1) * base_delay\n elif backoff is Backoff.LINEAR:\n calc_delay = base_delay * attempt_num\n elif backoff is None:\n calc_delay = base_delay\n else:\n check.assert_never(backoff)\n\n if jitter is Jitter.FULL:\n calc_delay = random() * calc_delay\n elif jitter is Jitter.PLUS_MINUS:\n calc_delay = calc_delay + ((2 * (random() * base_delay)) - base_delay)\n elif jitter is None:\n pass\n else:\n check.assert_never(jitter)\n\n return calc_delay\n
", "current_page_name": "_modules/dagster/_core/definitions/policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.policy"}, "reconstruct": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.reconstruct

\nimport inspect\nimport json\nimport os\nimport sys\nfrom functools import lru_cache\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import Self, TypeAlias\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import experimental\nfrom dagster._core.code_pointer import (\n    CodePointer,\n    CustomPointer,\n    FileCodePointer,\n    ModuleCodePointer,\n    get_python_file_from_target,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.origin import (\n    DEFAULT_DAGSTER_ENTRY_POINT,\n    JobPythonOrigin,\n    RepositoryPythonOrigin,\n)\nfrom dagster._serdes import pack_value, unpack_value, whitelist_for_serdes\nfrom dagster._serdes.serdes import NamedTupleSerializer\nfrom dagster._utils import hash_collection\n\nfrom .events import AssetKey\nfrom .job_base import IJob\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.repository_definition import (\n        PendingRepositoryDefinition,\n        RepositoryLoadData,\n    )\n    from dagster._core.definitions.source_asset import SourceAsset\n\n    from .graph_definition import GraphDefinition\n    from .repository_definition import RepositoryDefinition\n\n\ndef get_ephemeral_repository_name(job_name: str) -> str:\n    check.str_param(job_name, "job_name")\n    return f"__repository__{job_name}"\n\n\n@whitelist_for_serdes\nclass ReconstructableRepository(\n    NamedTuple(\n        "_ReconstructableRepository",\n        [\n            ("pointer", CodePointer),\n            ("container_image", Optional[str]),\n         
   ("executable_path", Optional[str]),\n            ("entry_point", Sequence[str]),\n            ("container_context", Optional[Mapping[str, Any]]),\n            ("repository_load_data", Optional["RepositoryLoadData"]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        pointer: CodePointer,\n        container_image: Optional[str] = None,\n        executable_path: Optional[str] = None,\n        entry_point: Optional[Sequence[str]] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n        repository_load_data: Optional["RepositoryLoadData"] = None,\n    ):\n        from dagster._core.definitions.repository_definition import RepositoryLoadData\n\n        return super(ReconstructableRepository, cls).__new__(\n            cls,\n            pointer=check.inst_param(pointer, "pointer", CodePointer),\n            container_image=check.opt_str_param(container_image, "container_image"),\n            executable_path=check.opt_str_param(executable_path, "executable_path"),\n            entry_point=(\n                check.sequence_param(entry_point, "entry_point", of_type=str)\n                if entry_point is not None\n                else DEFAULT_DAGSTER_ENTRY_POINT\n            ),\n            container_context=(\n                check.mapping_param(container_context, "container_context")\n                if container_context is not None\n                else None\n            ),\n            repository_load_data=check.opt_inst_param(\n                repository_load_data, "repository_load_data", RepositoryLoadData\n            ),\n        )\n\n    def with_repository_load_data(\n        self, metadata: Optional["RepositoryLoadData"]\n    ) -> "ReconstructableRepository":\n        return self._replace(repository_load_data=metadata)\n\n    def get_definition(self) -> "RepositoryDefinition":\n        return repository_def_from_pointer(self.pointer, self.repository_load_data)\n\n    def get_reconstructable_job(self, name: str) -> 
"ReconstructableJob":\n        return ReconstructableJob(self, name)\n\n    @classmethod\n    def for_file(\n        cls,\n        file: str,\n        fn_name: str,\n        working_directory: Optional[str] = None,\n        container_image: Optional[str] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n    ) -> "ReconstructableRepository":\n        if not working_directory:\n            working_directory = os.getcwd()\n        return cls(\n            FileCodePointer(file, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    @classmethod\n    def for_module(\n        cls,\n        module: str,\n        fn_name: str,\n        working_directory: Optional[str] = None,\n        container_image: Optional[str] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n    ) -> "ReconstructableRepository":\n        return cls(\n            ModuleCodePointer(module, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    def get_python_origin(self) -> RepositoryPythonOrigin:\n        return RepositoryPythonOrigin(\n            executable_path=self.executable_path if self.executable_path else sys.executable,\n            code_pointer=self.pointer,\n            container_image=self.container_image,\n            entry_point=self.entry_point,\n            container_context=self.container_context,\n        )\n\n    def get_python_origin_id(self) -> str:\n        return self.get_python_origin().get_id()\n\n    # Allow this to be hashed for use in `lru_cache`. 
This is needed because:\n    # - `ReconstructableJob` uses `lru_cache`\n    # - `ReconstructableJob` has a `ReconstructableRepository` attribute\n    # - `ReconstructableRepository` has `Sequence` attributes that are unhashable by default\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\nclass ReconstructableJobSerializer(NamedTupleSerializer):\n    def before_unpack(self, _, unpacked_dict: Dict[str, Any]) -> Dict[str, Any]:\n        solid_selection_str = unpacked_dict.get("solid_selection_str")\n        solids_to_execute = unpacked_dict.get("solids_to_execute")\n        if solid_selection_str:\n            unpacked_dict["op_selection"] = json.loads(solid_selection_str)\n        elif solids_to_execute:\n            unpacked_dict["op_selection"] = solids_to_execute\n        return unpacked_dict\n\n    def after_pack(self, **packed_dict: Any) -> Dict[str, Any]:\n        if packed_dict["op_selection"]:\n            packed_dict["solid_selection_str"] = json.dumps(packed_dict["op_selection"]["__set__"])\n        else:\n            packed_dict["solid_selection_str"] = None\n        del packed_dict["op_selection"]\n        return packed_dict\n\n\n@whitelist_for_serdes(\n    serializer=ReconstructableJobSerializer,\n    storage_name="ReconstructablePipeline",\n    storage_field_names={\n        "job_name": "pipeline_name",\n    },\n)\nclass ReconstructableJob(\n    NamedTuple(\n        "_ReconstructableJob",\n        [\n            ("repository", ReconstructableRepository),\n            ("job_name", str),\n            ("op_selection", Optional[AbstractSet[str]]),\n            ("asset_selection", Optional[AbstractSet[AssetKey]]),\n            ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n        ],\n    ),\n    IJob,\n):\n    """Defines a reconstructable job. 
When your job must cross process boundaries, Dagster must know\n    how to reconstruct the job on the other side of the process boundary.\n\n    Args:\n        repository (ReconstructableRepository): The reconstructable representation of the repository\n            the job belongs to.\n        job_name (str): The name of the job.\n        op_selection (Optional[AbstractSet[str]]): A set of op query strings. Ops matching any of\n            these queries will be selected. None if no selection is specified.\n        asset_selection (Optional[AbstractSet[AssetKey]]) A set of assets to execute. None if no selection\n            is specified, i.e. the entire job will be run.\n    """\n\n    def __new__(\n        cls,\n        repository: ReconstructableRepository,\n        job_name: str,\n        op_selection: Optional[Iterable[str]] = None,\n        asset_selection: Optional[AbstractSet[AssetKey]] = None,\n        asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n    ):\n        op_selection = set(op_selection) if op_selection else None\n        return super(ReconstructableJob, cls).__new__(\n            cls,\n            repository=check.inst_param(repository, "repository", ReconstructableRepository),\n            job_name=check.str_param(job_name, "job_name"),\n            op_selection=check.opt_nullable_set_param(op_selection, "op_selection", of_type=str),\n            asset_selection=check.opt_nullable_set_param(\n                asset_selection, "asset_selection", AssetKey\n            ),\n            asset_check_selection=check.opt_nullable_set_param(\n                asset_check_selection, "asset_check_selection", AssetCheckKey\n            ),\n        )\n\n    def with_repository_load_data(\n        self, metadata: Optional["RepositoryLoadData"]\n    ) -> "ReconstructableJob":\n        return self._replace(repository=self.repository.with_repository_load_data(metadata))\n\n    # Keep the most recent 1 definition (globally since this is a 
NamedTuple method)\n    # This allows repeated calls to get_definition in execution paths to not reload the job\n    @lru_cache(maxsize=1)\n    def get_definition(self) -> "JobDefinition":\n        return self.repository.get_definition().get_maybe_subset_job_def(\n            self.job_name,\n            self.op_selection,\n            self.asset_selection,\n            self.asset_check_selection,\n        )\n\n    def get_reconstructable_repository(self) -> ReconstructableRepository:\n        return self.repository\n\n    def get_subset(\n        self,\n        *,\n        op_selection: Optional[Iterable[str]] = None,\n        asset_selection: Optional[AbstractSet[AssetKey]] = None,\n        asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n    ) -> Self:\n        if op_selection and (asset_selection or asset_check_selection):\n            check.failed(\n                "op_selection and asset_selection or asset_check_selection cannot both be provided"\n                " as arguments",\n            )\n        op_selection = set(op_selection) if op_selection else None\n        return ReconstructableJob(\n            repository=self.repository,\n            job_name=self.job_name,\n            op_selection=op_selection,\n            asset_selection=asset_selection,\n            asset_check_selection=asset_check_selection,\n        )\n\n    def describe(self) -> str:\n        return f'"{self.job_name}" in repository ({self.repository.pointer.describe})'\n\n    @staticmethod\n    def for_file(python_file: str, fn_name: str) -> "ReconstructableJob":\n        return bootstrap_standalone_recon_job(FileCodePointer(python_file, fn_name, os.getcwd()))\n\n    @staticmethod\n    def for_module(module: str, fn_name: str) -> "ReconstructableJob":\n        return bootstrap_standalone_recon_job(ModuleCodePointer(module, fn_name, os.getcwd()))\n\n    def to_dict(self) -> Mapping[str, object]:\n        return pack_value(self)\n\n    @staticmethod\n    def 
from_dict(val: Mapping[str, Any]) -> "ReconstructableJob":\n        check.mapping_param(val, "val")\n\n        inst = unpack_value(val)\n        check.invariant(\n            isinstance(inst, ReconstructableJob),\n            f"Deserialized object is not instance of ReconstructableJob, got {type(inst)}",\n        )\n        return inst  # type: ignore  # (illegible runtime check)\n\n    def get_python_origin(self) -> JobPythonOrigin:\n        return JobPythonOrigin(self.job_name, self.repository.get_python_origin())\n\n    def get_python_origin_id(self) -> str:\n        return self.get_python_origin().get_id()\n\n    def get_module(self) -> Optional[str]:\n        """Return the module the job is found in, the origin is a module code pointer."""\n        pointer = self.get_python_origin().get_repo_pointer()\n        if isinstance(pointer, ModuleCodePointer):\n            return pointer.module\n\n        return None\n\n    # Allow this to be hashed for `lru_cache` in `get_definition`\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\n
[docs]def reconstructable(target: Callable[..., "JobDefinition"]) -> ReconstructableJob:\n """Create a :py:class:`~dagster._core.definitions.reconstructable.ReconstructableJob` from a\n function that returns a :py:class:`~dagster.JobDefinition`/:py:class:`~dagster.JobDefinition`,\n or a function decorated with :py:func:`@job <dagster.job>`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or\n in different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n Passing a job created with ``~dagster.GraphDefinition.to_job`` to ``reconstructable()``,\n requires you to wrap that job's definition in a module-scoped function, and pass that function\n instead:\n\n .. code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def my_graph():\n ...\n\n def define_my_job():\n return my_graph.to_job()\n\n reconstructable(define_my_job)\n\n This function implements a very conservative strategy for reconstruction, so that its behavior\n is easy to predict, but as a consequence it is not able to reconstruct certain kinds of jobs\n or jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\n call), or in interactive environments such as the Python REPL or Jupyter notebooks.\n\n If you need to reconstruct objects constructed in these ways, you should use\n :py:func:`~dagster.reconstructable.build_reconstructable_job` instead, which allows you to\n specify your own reconstruction strategy.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import job, reconstructable\n\n @job\n def foo_job():\n ...\n\n reconstructable_foo_job = reconstructable(foo_job)\n\n\n @graph\n def foo():\n ...\n\n def make_bar_job():\n return foo.to_job()\n\n reconstructable_bar_job = reconstructable(make_bar_job)\n """\n from dagster._core.definitions import JobDefinition\n\n if not seven.is_function_or_decorator_instance_of(target, JobDefinition):\n if isinstance(target, JobDefinition):\n raise DagsterInvariantViolationError(\n "Reconstructable target was not a function returning a job definition, or a job "\n "definition produced by a decorated function. If your job was constructed using "\n "``GraphDefinition.to_job``, you must wrap the ``to_job`` call in a function at "\n "module scope, ie not within any other functions. "\n "To learn more, check out the docs on ``reconstructable``: "\n "https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n )\n raise DagsterInvariantViolationError(\n "Reconstructable target should be a function or definition produced "\n f"by a decorated function, got {type(target)}.",\n )\n\n if seven.is_lambda(target):\n raise DagsterInvariantViolationError(\n "Reconstructable target can not be a lambda. Use a function or "\n "decorated function defined at module scope instead, or use "\n "build_reconstructable_job."\n )\n\n if seven.qualname_differs(target):\n raise DagsterInvariantViolationError(\n f'Reconstructable target "{target.__name__}" has a different '\n f'__qualname__ "{target.__qualname__}" indicating it is not '\n "defined at module scope. 
Use a function or decorated function "\n "defined at module scope instead, or use build_reconstructable_job."\n )\n\n try:\n if (\n hasattr(target, "__module__")\n and hasattr(target, "__name__")\n and getattr(inspect.getmodule(target), "__name__", None) != "__main__"\n ):\n return ReconstructableJob.for_module(target.__module__, target.__name__)\n except:\n pass\n\n python_file = get_python_file_from_target(target)\n if not python_file:\n raise DagsterInvariantViolationError(\n "reconstructable() can not reconstruct jobs defined in interactive "\n "environments like <stdin>, IPython, or Jupyter notebooks. "\n "Use a job defined in a module or file instead, or use build_reconstructable_job."\n )\n\n pointer = FileCodePointer(\n python_file=python_file, fn_name=target.__name__, working_directory=os.getcwd()\n )\n\n return bootstrap_standalone_recon_job(pointer)
\n\n\n
[docs]@experimental\ndef build_reconstructable_job(\n reconstructor_module_name: str,\n reconstructor_function_name: str,\n reconstructable_args: Optional[Tuple[object]] = None,\n reconstructable_kwargs: Optional[Mapping[str, object]] = None,\n reconstructor_working_directory: Optional[str] = None,\n) -> ReconstructableJob:\n """Create a :py:class:`dagster._core.definitions.reconstructable.ReconstructableJob`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or in\n different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n This function allows you to use the strategy of your choice for reconstructing jobs, so\n that you can reconstruct certain kinds of jobs that are not supported by\n :py:func:`~dagster.reconstructable`, such as those defined by lambdas, in nested scopes (e.g.,\n dynamically within a method call), or in interactive environments such as the Python REPL or\n Jupyter notebooks.\n\n If you need to reconstruct jobs constructed in these ways, use this function instead of\n :py:func:`~dagster.reconstructable`.\n\n Args:\n reconstructor_module_name (str): The name of the module containing the function to use to\n reconstruct the job.\n reconstructor_function_name (str): The name of the function to use to reconstruct the\n job.\n reconstructable_args (Tuple): Args to the function to use to reconstruct the job.\n Values of the tuple must be JSON serializable.\n reconstructable_kwargs (Dict[str, Any]): Kwargs to the function to use to reconstruct the\n job. Values of the dict must be JSON serializable.\n\n Examples:\n .. 
code-block:: python\n\n # module: mymodule\n\n from dagster import JobDefinition, job, build_reconstructable_job\n\n class JobFactory:\n def make_job(*args, **kwargs):\n\n @job\n def _job(...):\n ...\n\n return _job\n\n def reconstruct_job(*args):\n factory = JobFactory()\n return factory.make_job(*args)\n\n factory = JobFactory()\n\n foo_job_args = (...,...)\n\n foo_job_kwargs = {...:...}\n\n foo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\n reconstructable_foo_job = build_reconstructable_job(\n 'mymodule',\n 'reconstruct_job',\n foo_job_args,\n foo_job_kwargs,\n )\n """\n check.str_param(reconstructor_module_name, "reconstructor_module_name")\n check.str_param(reconstructor_function_name, "reconstructor_function_name")\n check.opt_str_param(\n reconstructor_working_directory, "reconstructor_working_directory", os.getcwd()\n )\n\n _reconstructable_args: List[object] = list(\n check.opt_tuple_param(reconstructable_args, "reconstructable_args")\n )\n _reconstructable_kwargs: List[List[Union[str, object]]] = list(\n (\n [key, value]\n for key, value in check.opt_mapping_param(\n reconstructable_kwargs, "reconstructable_kwargs", key_type=str\n ).items()\n )\n )\n\n reconstructor_pointer = ModuleCodePointer(\n reconstructor_module_name,\n reconstructor_function_name,\n working_directory=reconstructor_working_directory,\n )\n\n pointer = CustomPointer(reconstructor_pointer, _reconstructable_args, _reconstructable_kwargs)\n\n job_def = job_def_from_pointer(pointer)\n\n return ReconstructableJob(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n job_name=job_def.name,\n )
\n\n\ndef bootstrap_standalone_recon_job(pointer: CodePointer) -> ReconstructableJob:\n # So this actually straps the the job for the sole\n # purpose of getting the job name. If we changed ReconstructableJob\n # to get the job on demand in order to get name, we could avoid this.\n job_def = job_def_from_pointer(pointer)\n return ReconstructableJob(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n job_name=job_def.name,\n )\n\n\nLoadableDefinition: TypeAlias = Union[\n "JobDefinition",\n "RepositoryDefinition",\n "PendingRepositoryDefinition",\n "GraphDefinition",\n "Sequence[Union[AssetsDefinition, SourceAsset]]",\n]\n\nT_LoadableDefinition = TypeVar("T_LoadableDefinition", bound=LoadableDefinition)\n\n\ndef _is_list_of_assets(\n definition: LoadableDefinition,\n) -> bool:\n from dagster._core.definitions.assets import AssetsDefinition\n from dagster._core.definitions.source_asset import SourceAsset\n\n return isinstance(definition, list) and all(\n isinstance(item, (AssetsDefinition, SourceAsset)) for item in definition\n )\n\n\ndef _check_is_loadable(definition: T_LoadableDefinition) -> T_LoadableDefinition:\n from .definitions_class import Definitions\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import PendingRepositoryDefinition, RepositoryDefinition\n\n if not (\n isinstance(\n definition,\n (\n JobDefinition,\n RepositoryDefinition,\n PendingRepositoryDefinition,\n GraphDefinition,\n Definitions,\n ),\n )\n or _is_list_of_assets(definition)\n ):\n raise DagsterInvariantViolationError(\n "Loadable attributes must be either a JobDefinition, GraphDefinition, "\n f"or RepositoryDefinition. 
Got {definition!r}."\n )\n return definition\n\n\ndef load_def_in_module(\n module_name: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(CodePointer.from_module(module_name, attribute, working_directory))\n\n\ndef load_def_in_package(\n package_name: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(\n CodePointer.from_python_package(package_name, attribute, working_directory)\n )\n\n\ndef load_def_in_python_file(\n python_file: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(CodePointer.from_python_file(python_file, attribute, working_directory))\n\n\ndef def_from_pointer(\n pointer: CodePointer,\n) -> LoadableDefinition:\n target = pointer.load_target()\n\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import PendingRepositoryDefinition, RepositoryDefinition\n\n if isinstance(\n target,\n (\n GraphDefinition,\n JobDefinition,\n PendingRepositoryDefinition,\n RepositoryDefinition,\n ),\n ) or not callable(target):\n return _check_is_loadable(target) # type: ignore\n\n # if its a function invoke it - otherwise we are pointing to a\n # artifact in module scope, likely decorator output\n\n if seven.get_arg_names(target):\n raise DagsterInvariantViolationError(\n f"Error invoking function at {pointer.describe()} with no arguments. "\n "Reconstructable target must be callable with no arguments"\n )\n\n return _check_is_loadable(target())\n\n\ndef job_def_from_pointer(pointer: CodePointer) -> "JobDefinition":\n from .job_definition import JobDefinition\n\n target = def_from_pointer(pointer)\n\n if isinstance(target, JobDefinition):\n return target\n\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a JobDefinition (or JobDefinition for legacy"\n " code). 
Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n\n\n@overload\ndef repository_def_from_target_def(\n target: Union["RepositoryDefinition", "JobDefinition", "GraphDefinition"],\n repository_load_data: Optional["RepositoryLoadData"] = None,\n) -> "RepositoryDefinition": ...\n\n\n@overload\ndef repository_def_from_target_def(\n target: object, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> None: ...\n\n\ndef repository_def_from_target_def(\n target: object, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> Optional["RepositoryDefinition"]:\n from .assets import AssetsDefinition\n from .definitions_class import Definitions\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import (\n SINGLETON_REPOSITORY_NAME,\n CachingRepositoryData,\n PendingRepositoryDefinition,\n RepositoryDefinition,\n )\n from .source_asset import SourceAsset\n\n if isinstance(target, Definitions):\n # reassign to handle both repository and pending repo case\n target = target.get_inner_repository_for_loading_process()\n\n # special case - we can wrap a single job in a repository\n if isinstance(target, (JobDefinition, GraphDefinition)):\n # consider including job name in generated repo name\n return RepositoryDefinition(\n name=get_ephemeral_repository_name(target.name),\n repository_data=CachingRepositoryData.from_list([target]),\n )\n elif isinstance(target, list) and all(\n isinstance(item, (AssetsDefinition, SourceAsset)) for item in target\n ):\n return RepositoryDefinition(\n name=SINGLETON_REPOSITORY_NAME,\n repository_data=CachingRepositoryData.from_list(target),\n )\n elif isinstance(target, RepositoryDefinition):\n return target\n elif isinstance(target, PendingRepositoryDefinition):\n # must load repository from scratch\n if repository_load_data is None:\n return target.compute_repository_definition()\n # can use the cached data to more efficiently load 
data\n return target.reconstruct_repository_definition(repository_load_data)\n else:\n return None\n\n\ndef repository_def_from_pointer(\n pointer: CodePointer, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> "RepositoryDefinition":\n target = def_from_pointer(pointer)\n repo_def = repository_def_from_target_def(target, repository_load_data)\n if not repo_def:\n raise DagsterInvariantViolationError(\n f"CodePointer ({pointer.describe()}) must resolve to a "\n "RepositoryDefinition, JobDefinition, or JobDefinition. "\n f"Received a {type(target)}"\n )\n return repo_def\n
", "current_page_name": "_modules/dagster/_core/definitions/reconstruct", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.reconstruct"}, "repository_definition": {"repository_data": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.repository_definition.repository_data

\nfrom abc import ABC, abstractmethod\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Mapping,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.graph_definition import SubselectedGraphDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.schedule_definition import ScheduleDefinition\nfrom dagster._core.definitions.sensor_definition import SensorDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\n\nfrom .caching_index import CacheingDefinitionIndex\nfrom .valid_definitions import RepositoryListDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import AssetsDefinition\n\n\nT = TypeVar("T")\nResolvable = Callable[[], T]\n\n\n
[docs]class RepositoryData(ABC):\n """Users should usually rely on the :py:func:`@repository <repository>` decorator to create new\n repositories, which will in turn call the static constructors on this class. However, users may\n subclass :py:class:`RepositoryData` for fine-grained control over access to and lazy creation\n of repository members.\n """\n\n @abstractmethod\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n pass\n\n @abstractmethod\n def get_top_level_resources(self) -> Mapping[str, ResourceDefinition]:\n """Return all top-level resources in the repository as a list,\n such as those provided to the Definitions constructor.\n\n Returns:\n List[ResourceDefinition]: All top-level resources in the repository.\n """\n\n @abstractmethod\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n pass\n\n
[docs] @abstractmethod\n @public\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """
\n\n
[docs] @public\n def get_job_names(self) -> Sequence[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return [job_def.name for job_def in self.get_all_jobs()]
\n\n
[docs] @public\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n return job_name in self.get_job_names()
\n\n
[docs] @public\n def get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n match = next(job for job in self.get_all_jobs() if job.name == job_name)\n if match is None:\n raise DagsterInvariantViolationError(f"Could not find job {job_name} in repository")\n return match
\n\n
[docs] @public\n def get_schedule_names(self) -> Sequence[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return [schedule.name for schedule in self.get_all_schedules()]
\n\n
[docs] @public\n def get_all_schedules(self) -> Sequence[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Returns:\n List[ScheduleDefinition]: All jobs in the repository.\n """\n return []
\n\n
[docs] @public\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n Args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n schedules_with_name = [\n schedule for schedule in self.get_all_schedules() if schedule.name == schedule_name\n ]\n if not schedules_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find schedule {schedule_name} in repository"\n )\n return schedules_with_name[0]
\n\n
[docs] @public\n def has_schedule(self, schedule_name: str) -> bool:\n """Check if a schedule with a given name is present in the repository."""\n return schedule_name in self.get_schedule_names()
\n\n
[docs] @public\n def get_all_sensors(self) -> Sequence[SensorDefinition]:\n """Sequence[SensorDefinition]: Return all sensors in the repository as a list."""\n return []
\n\n
[docs] @public\n def get_sensor_names(self) -> Sequence[str]:\n """Sequence[str]: Get the names of all sensors in the repository."""\n return [sensor.name for sensor in self.get_all_sensors()]
\n\n
[docs] @public\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n """Get a sensor by name.\n\n Args:\n sensor_name (str): name of the sensor to retrieve.\n\n Returns:\n SensorDefinition: The sensor definition corresponding to the given name.\n """\n sensors_with_name = [\n sensor for sensor in self.get_all_sensors() if sensor.name == sensor_name\n ]\n if not sensors_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find sensor {sensor_name} in repository"\n )\n return sensors_with_name[0]
\n\n
[docs] @public\n def has_sensor(self, sensor_name: str) -> bool:\n """Check if a sensor with a given name is present in the repository."""\n return sensor_name in self.get_sensor_names()
\n\n
[docs] @public\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n """Mapping[AssetKey, SourceAsset]: Get the source assets for the repository."""\n return {}
\n\n
[docs] @public\n def get_assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n """Mapping[AssetKey, AssetsDefinition]: Get the asset definitions for the repository."""\n return {}
\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self.get_all_jobs()\n self.get_all_schedules()\n self.get_all_sensors()\n self.get_source_assets_by_key()
\n\n\nclass CachingRepositoryData(RepositoryData):\n """Default implementation of RepositoryData used by the :py:func:`@repository <repository>` decorator."""\n\n _all_jobs: Optional[Sequence[JobDefinition]]\n _all_pipelines: Optional[Sequence[JobDefinition]]\n\n def __init__(\n self,\n jobs: Mapping[str, Union[JobDefinition, Resolvable[JobDefinition]]],\n schedules: Mapping[str, Union[ScheduleDefinition, Resolvable[ScheduleDefinition]]],\n sensors: Mapping[str, Union[SensorDefinition, Resolvable[SensorDefinition]]],\n source_assets_by_key: Mapping[AssetKey, SourceAsset],\n assets_defs_by_key: Mapping[AssetKey, "AssetsDefinition"],\n top_level_resources: Mapping[str, ResourceDefinition],\n utilized_env_vars: Mapping[str, AbstractSet[str]],\n resource_key_mapping: Mapping[int, str],\n ):\n """Constructs a new CachingRepositoryData object.\n\n You may pass pipeline, job, and schedule definitions directly, or you may pass callables\n with no arguments that will be invoked to lazily construct definitions when accessed by\n name. 
This can be helpful for performance when there are many definitions in a repository,\n or when constructing the definitions is costly.\n\n Note that when lazily constructing a definition, the name of the definition must match its\n key in its dictionary index, or a :py:class:`DagsterInvariantViolationError` will be thrown\n at retrieval time.\n\n Args:\n jobs (Mapping[str, Union[JobDefinition, Callable[[], JobDefinition]]]):\n The job definitions belonging to the repository.\n schedules (Mapping[str, Union[ScheduleDefinition, Callable[[], ScheduleDefinition]]]):\n The schedules belonging to the repository.\n sensors (Mapping[str, Union[SensorDefinition, Callable[[], SensorDefinition]]]):\n The sensors belonging to a repository.\n source_assets_by_key (Mapping[AssetKey, SourceAsset]): The source assets belonging to a repository.\n assets_defs_by_key (Mapping[AssetKey, AssetsDefinition]): The assets definitions\n belonging to a repository.\n top_level_resources (Mapping[str, ResourceDefinition]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n """\n from dagster._core.definitions import AssetsDefinition\n\n check.mapping_param(jobs, "jobs", key_type=str, value_type=(JobDefinition, FunctionType))\n check.mapping_param(\n schedules, "schedules", key_type=str, value_type=(ScheduleDefinition, FunctionType)\n )\n check.mapping_param(\n sensors, "sensors", key_type=str, value_type=(SensorDefinition, FunctionType)\n )\n check.mapping_param(\n source_assets_by_key, "source_assets_by_key", key_type=AssetKey, value_type=SourceAsset\n )\n check.mapping_param(\n assets_defs_by_key, "assets_defs_by_key", key_type=AssetKey, value_type=AssetsDefinition\n )\n check.mapping_param(\n top_level_resources, "top_level_resources", key_type=str, value_type=ResourceDefinition\n )\n check.mapping_param(\n utilized_env_vars,\n "utilized_resources",\n key_type=str,\n )\n check.mapping_param(\n resource_key_mapping, "resource_key_mapping", 
key_type=int, value_type=str\n )\n\n self._jobs = CacheingDefinitionIndex(\n JobDefinition,\n "JobDefinition",\n "job",\n jobs,\n self._validate_job,\n )\n\n self._schedules = CacheingDefinitionIndex(\n ScheduleDefinition,\n "ScheduleDefinition",\n "schedule",\n schedules,\n self._validate_schedule,\n )\n # load all schedules to force validation\n self._schedules.get_all_definitions()\n\n self._source_assets_by_key = source_assets_by_key\n self._assets_defs_by_key = assets_defs_by_key\n self._top_level_resources = top_level_resources\n self._utilized_env_vars = utilized_env_vars\n self._resource_key_mapping = resource_key_mapping\n\n self._sensors = CacheingDefinitionIndex(\n SensorDefinition,\n "SensorDefinition",\n "sensor",\n sensors,\n self._validate_sensor,\n )\n # load all sensors to force validation\n self._sensors.get_all_definitions()\n\n self._all_jobs = None\n\n @staticmethod\n def from_dict(repository_definitions: Dict[str, Dict[str, Any]]) -> "CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definition (Dict[str, Dict[str, ...]]): A dict of the form:\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n """\n from .repository_data_builder import build_caching_repository_data_from_dict\n\n return build_caching_repository_data_from_dict(repository_definitions)\n\n @classmethod\n def from_list(\n cls,\n repository_definitions: Sequence[RepositoryListDefinition],\n default_executor_def: Optional[ExecutorDefinition] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n resource_key_mapping: Optional[Mapping[int, str]] = None,\n ) -> 
"CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definitions (List[Union[JobDefinition, ScheduleDefinition, SensorDefinition, GraphDefinition]]):\n Use this constructor when you have no need to lazy load jobs or other definitions.\n top_level_resources (Optional[Mapping[str, ResourceDefinition]]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n """\n from .repository_data_builder import build_caching_repository_data_from_list\n\n return build_caching_repository_data_from_list(\n repository_definitions=repository_definitions,\n default_executor_def=default_executor_def,\n default_logger_defs=default_logger_defs,\n top_level_resources=top_level_resources,\n resource_key_mapping=resource_key_mapping,\n )\n\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n return self._utilized_env_vars\n\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n return self._resource_key_mapping\n\n def get_job_names(self) -> Sequence[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return self._jobs.get_definition_names()\n\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n check.str_param(job_name, "job_name")\n return self._jobs.has_definition(job_name)\n\n def get_top_level_resources(self) -> Mapping[str, ResourceDefinition]:\n return self._top_level_resources\n\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job that has not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n if self._all_jobs is not None:\n return self._all_jobs\n\n self._all_jobs = self._jobs.get_all_definitions()\n self._check_node_defs(self._all_jobs)\n return self._all_jobs\n\n def 
get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job has not yet been constructed, only this job is constructed, and will\n be cached for future calls.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n check.str_param(job_name, "job_name")\n return self._jobs.get_definition(job_name)\n\n def get_schedule_names(self) -> Sequence[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return self._schedules.get_definition_names()\n\n def get_all_schedules(self) -> Sequence[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Note that this will construct any schedule that has not yet been constructed.\n\n Returns:\n List[ScheduleDefinition]: All schedules in the repository.\n """\n return self._schedules.get_all_definitions()\n\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n if this schedule has not yet been constructed, only this schedule is constructed, and will\n be cached for future calls.\n\n Args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.get_definition(schedule_name)\n\n def has_schedule(self, schedule_name: str) -> bool:\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.has_definition(schedule_name)\n\n def get_all_sensors(self) -> Sequence[SensorDefinition]:\n return self._sensors.get_all_definitions()\n\n def get_sensor_names(self) -> Sequence[str]:\n return self._sensors.get_definition_names()\n\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n return self._sensors.get_definition(sensor_name)\n\n def has_sensor(self, sensor_name: str) -> bool:\n return 
self._sensors.has_definition(sensor_name)\n\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._source_assets_by_key\n\n def get_assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n return self._assets_defs_by_key\n\n def _check_node_defs(self, job_defs: Sequence[JobDefinition]) -> None:\n node_defs = {}\n node_to_job = {}\n for job_def in job_defs:\n for node_def in [*job_def.all_node_defs, job_def.graph]:\n # skip checks for subselected graphs because they don't have their own names\n if isinstance(node_def, SubselectedGraphDefinition):\n break\n\n if node_def.name not in node_defs:\n node_defs[node_def.name] = node_def\n node_to_job[node_def.name] = job_def.name\n\n if node_defs[node_def.name] is not node_def:\n first_name, second_name = sorted([node_to_job[node_def.name], job_def.name])\n raise DagsterInvalidDefinitionError(\n f"Conflicting definitions found in repository with name '{node_def.name}'."\n " Op/Graph definition names must be unique within a repository."\n f" {node_def.__class__.__name__} is defined in"\n f" job '{first_name}' and in"\n f" job '{second_name}'."\n )\n\n def _validate_job(self, job: JobDefinition) -> JobDefinition:\n return job\n\n def _validate_schedule(self, schedule: ScheduleDefinition) -> ScheduleDefinition:\n job_names = self.get_job_names()\n\n if schedule.job_name not in job_names:\n raise DagsterInvalidDefinitionError(\n f'ScheduleDefinition "{schedule.name}" targets job "{schedule.job_name}" '\n "which was not found in this repository."\n )\n\n return schedule\n\n def _validate_sensor(self, sensor: SensorDefinition) -> SensorDefinition:\n job_names = self.get_job_names()\n if len(sensor.targets) == 0:\n # skip validation when the sensor does not target a job\n return sensor\n\n for target in sensor.targets:\n if target.job_name not in job_names:\n raise DagsterInvalidDefinitionError(\n f'SensorDefinition "{sensor.name}" targets job "{sensor.job_name}" '\n "which was not 
found in this repository."\n )\n\n return sensor\n
", "current_page_name": "_modules/dagster/_core/definitions/repository_definition/repository_data", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.repository_definition.repository_data"}, "repository_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.repository_definition.repository_definition

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Type,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.asset_graph import AssetGraph, InternalAssetGraph\nfrom dagster._core.definitions.assets_job import (\n    ASSET_BASE_JOB_PREFIX,\n)\nfrom dagster._core.definitions.cacheable_assets import AssetsDefinitionCacheableData\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.definitions.metadata import MetadataMapping\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.schedule_definition import ScheduleDefinition\nfrom dagster._core.definitions.sensor_definition import SensorDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import check_valid_name\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import hash_collection\n\nfrom .repository_data import CachingRepositoryData, RepositoryData\nfrom .valid_definitions import (\n    RepositoryListDefinition as RepositoryListDefinition,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import AssetsDefinition\n    from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n    from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n\n@whitelist_for_serdes\nclass RepositoryLoadData(\n    NamedTuple(\n  
      "_RepositoryLoadData",\n        [\n            ("cached_data_by_key", Mapping[str, Sequence[AssetsDefinitionCacheableData]]),\n        ],\n    )\n):\n    def __new__(cls, cached_data_by_key: Mapping[str, Sequence[AssetsDefinitionCacheableData]]):\n        return super(RepositoryLoadData, cls).__new__(\n            cls,\n            cached_data_by_key=(\n                check.mapping_param(\n                    cached_data_by_key,\n                    "cached_data_by_key",\n                    key_type=str,\n                    value_type=list,\n                )\n            ),\n        )\n\n    # Allow this to be hashed for use in `lru_cache`. This is needed because:\n    # - `ReconstructableJob` uses `lru_cache`\n    # - `ReconstructableJob` has a `ReconstructableRepository` attribute\n    # - `ReconstructableRepository` has a `RepositoryLoadData` attribute\n    # - `RepositoryLoadData` has collection attributes that are unhashable by default\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\n
[docs]class RepositoryDefinition:\n """Define a repository that contains a group of definitions.\n\n Users should typically not create objects of this class directly. Instead, use the\n :py:func:`@repository` decorator.\n\n Args:\n name (str): The name of the repository.\n repository_data (RepositoryData): Contains the definitions making up the repository.\n description (Optional[str]): A string description of the repository.\n metadata (Optional[MetadataMapping]): A map of arbitrary metadata for the repository.\n """\n\n def __init__(\n self,\n name,\n *,\n repository_data,\n description=None,\n metadata=None,\n repository_load_data=None,\n ):\n self._name = check_valid_name(name)\n self._description = check.opt_str_param(description, "description")\n self._repository_data: RepositoryData = check.inst_param(\n repository_data, "repository_data", RepositoryData\n )\n self._metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n self._repository_load_data = check.opt_inst_param(\n repository_load_data, "repository_load_data", RepositoryLoadData\n )\n\n @property\n def repository_load_data(self) -> Optional[RepositoryLoadData]:\n return self._repository_load_data\n\n @public\n @property\n def name(self) -> str:\n """str: The name of the repository."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A human-readable description of the repository."""\n return self._description\n\n @public\n @property\n def metadata(self) -> Optional[MetadataMapping]:\n """Optional[MetadataMapping]: Arbitrary metadata for the repository."""\n return self._metadata\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self._repository_data.load_all_definitions()\n\n @public\n @property\n def job_names(self) -> Sequence[str]:\n """List[str]: Names of all jobs in the repository."""\n return self._repository_data.get_job_names()\n\n def get_top_level_resources(self) -> 
Mapping[str, ResourceDefinition]:\n return self._repository_data.get_top_level_resources()\n\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n return self._repository_data.get_env_vars_by_top_level_resource()\n\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n return self._repository_data.get_resource_key_mapping()\n\n
[docs] @public\n def has_job(self, name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n name (str): The name of the job.\n\n Returns:\n bool\n """\n return self._repository_data.has_job(name)
\n\n
[docs] @public\n def get_job(self, name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job is present in the lazily evaluated dictionary passed to the\n constructor, but has not yet been constructed, only this job is constructed, and\n will be cached for future calls.\n\n Args:\n name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to\n the given name.\n """\n return self._repository_data.get_job(name)
\n\n
[docs] @public\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job in the lazily evaluated dictionary that has\n not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n return self._repository_data.get_all_jobs()
\n\n @public\n @property\n def schedule_defs(self) -> Sequence[ScheduleDefinition]:\n """List[ScheduleDefinition]: All schedules in the repository."""\n return self._repository_data.get_all_schedules()\n\n
[docs] @public\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n """Get a schedule definition by name.\n\n Args:\n name (str): The name of the schedule.\n\n Returns:\n ScheduleDefinition: The schedule definition.\n """\n return self._repository_data.get_schedule(name)
\n\n
[docs] @public\n def has_schedule_def(self, name: str) -> bool:\n """bool: Check if a schedule with a given name is present in the repository."""\n return self._repository_data.has_schedule(name)
\n\n @public\n @property\n def sensor_defs(self) -> Sequence[SensorDefinition]:\n """Sequence[SensorDefinition]: All sensors in the repository."""\n return self._repository_data.get_all_sensors()\n\n
[docs] @public\n def get_sensor_def(self, name: str) -> SensorDefinition:\n """Get a sensor definition by name.\n\n Args:\n name (str): The name of the sensor.\n\n Returns:\n SensorDefinition: The sensor definition.\n """\n return self._repository_data.get_sensor(name)
\n\n
[docs] @public\n def has_sensor_def(self, name: str) -> bool:\n """bool: Check if a sensor with a given name is present in the repository."""\n return self._repository_data.has_sensor(name)
\n\n @property\n def source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._repository_data.get_source_assets_by_key()\n\n @property\n def assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n return self._repository_data.get_assets_defs_by_key()\n\n def has_implicit_global_asset_job_def(self) -> bool:\n """Returns true is there is a single implicit asset job for all asset keys in a repository."""\n return self.has_job(ASSET_BASE_JOB_PREFIX)\n\n def get_implicit_global_asset_job_def(self) -> JobDefinition:\n """A useful conveninence method for repositories where there are a set of assets with\n the same partitioning schema and one wants to access their corresponding implicit job\n easily.\n """\n if not self.has_job(ASSET_BASE_JOB_PREFIX):\n raise DagsterInvariantViolationError(\n "There is no single global asset job, likely due to assets using "\n "different partitioning schemes via their partitions_def parameter. You must "\n "use get_implicit_job_def_for_assets in order to access the correct implicit job."\n )\n\n return self.get_job(ASSET_BASE_JOB_PREFIX)\n\n def get_implicit_asset_job_names(self) -> Sequence[str]:\n return [\n job_name for job_name in self.job_names if job_name.startswith(ASSET_BASE_JOB_PREFIX)\n ]\n\n def get_implicit_job_def_for_assets(\n self, asset_keys: Iterable[AssetKey]\n ) -> Optional[JobDefinition]:\n """Returns the asset base job that contains all the given assets, or None if there is no such\n job.\n """\n if self.has_job(ASSET_BASE_JOB_PREFIX):\n base_job = self.get_job(ASSET_BASE_JOB_PREFIX)\n if all(\n key in base_job.asset_layer.assets_defs_by_key\n or base_job.asset_layer.is_observable_for_asset(key)\n for key in asset_keys\n ):\n return base_job\n else:\n i = 0\n while self.has_job(f"{ASSET_BASE_JOB_PREFIX}_{i}"):\n base_job = self.get_job(f"{ASSET_BASE_JOB_PREFIX}_{i}")\n\n if all(\n key in base_job.asset_layer.assets_defs_by_key\n or base_job.asset_layer.is_observable_for_asset(key)\n for 
key in asset_keys\n ):\n return base_job\n\n i += 1\n\n return None\n\n def get_maybe_subset_job_def(\n self,\n job_name: str,\n op_selection: Optional[Iterable[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ):\n defn = self.get_job(job_name)\n return defn.get_subset(\n op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n )\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n resource_config: Optional[Any] = None,\n ) -> object:\n """Load the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n If you want to load the values of multiple assets, it's more efficient to use\n :py:meth:`~dagster.RepositoryDefinition.get_asset_value_loader`, which avoids spinning up\n resources separately for each asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n resource_config (Optional[Any]): A dictionary of resource configurations to be passed\n to the :py:class:`IOManager`.\n\n Returns:\n The contents of an asset as a Python object.\n """\n from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n with AssetValueLoader(\n self.assets_defs_by_key, self.source_assets_by_key, instance=instance\n ) as loader:\n return loader.load_asset_value(\n asset_key,\n python_type=python_type,\n partition_key=partition_key,\n metadata=metadata,\n resource_config=resource_config,\n )
\n\n
[docs] @public\n def get_asset_value_loader(\n self, instance: Optional[DagsterInstance] = None\n ) -> "AssetValueLoader":\n """Returns an object that can load the contents of assets as Python objects.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the assets. Avoids\n spinning up resources separately for each asset.\n\n Usage:\n\n .. code-block:: python\n\n with my_repo.get_asset_value_loader() as loader:\n asset1 = loader.load_asset_value("asset1")\n asset2 = loader.load_asset_value("asset2")\n\n """\n from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n return AssetValueLoader(\n self.assets_defs_by_key, self.source_assets_by_key, instance=instance\n )
\n\n @property\n def asset_graph(self) -> InternalAssetGraph:\n return AssetGraph.from_assets(\n [*set(self.assets_defs_by_key.values()), *self.source_assets_by_key.values()]\n )\n\n # If definition comes from the @repository decorator, then the __call__ method will be\n # overwritten. Therefore, we want to maintain the call-ability of repository definitions.\n def __call__(self, *args, **kwargs):\n return self
\n\n\nclass PendingRepositoryDefinition:\n def __init__(\n self,\n name: str,\n repository_definitions: Sequence[\n Union[RepositoryListDefinition, "CacheableAssetsDefinition"]\n ],\n description: Optional[str] = None,\n metadata: Optional[MetadataMapping] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n default_executor_def: Optional[ExecutorDefinition] = None,\n _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n _resource_key_mapping: Optional[Mapping[int, str]] = None,\n ):\n self._repository_definitions = check.list_param(\n repository_definitions,\n "repository_definition",\n additional_message=(\n "PendingRepositoryDefinition supports only list-based repository data at this time."\n ),\n )\n self._name = name\n self._description = description\n self._metadata = metadata\n self._default_logger_defs = default_logger_defs\n self._default_executor_def = default_executor_def\n self._top_level_resources = _top_level_resources\n self._resource_key_mapping = _resource_key_mapping\n\n @property\n def name(self) -> str:\n return self._name\n\n def _compute_repository_load_data(self) -> RepositoryLoadData:\n from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n return RepositoryLoadData(\n cached_data_by_key={\n defn.unique_id: defn.compute_cacheable_data()\n for defn in self._repository_definitions\n if isinstance(defn, CacheableAssetsDefinition)\n }\n )\n\n def _get_repository_definition(\n self, repository_load_data: RepositoryLoadData\n ) -> RepositoryDefinition:\n from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n resolved_definitions: List[RepositoryListDefinition] = []\n for defn in self._repository_definitions:\n if isinstance(defn, CacheableAssetsDefinition):\n # should always have metadata for each cached defn at this point\n check.invariant(\n defn.unique_id in repository_load_data.cached_data_by_key,\n "No metadata found for 
CacheableAssetsDefinition with unique_id"\n f" {defn.unique_id}.",\n )\n # use the emtadata to generate definitions\n resolved_definitions.extend(\n defn.build_definitions(\n data=repository_load_data.cached_data_by_key[defn.unique_id]\n )\n )\n else:\n resolved_definitions.append(defn)\n\n repository_data = CachingRepositoryData.from_list(\n resolved_definitions,\n default_executor_def=self._default_executor_def,\n default_logger_defs=self._default_logger_defs,\n top_level_resources=self._top_level_resources,\n resource_key_mapping=self._resource_key_mapping,\n )\n\n return RepositoryDefinition(\n self._name,\n repository_data=repository_data,\n description=self._description,\n metadata=self._metadata,\n repository_load_data=repository_load_data,\n )\n\n def reconstruct_repository_definition(\n self, repository_load_data: RepositoryLoadData\n ) -> RepositoryDefinition:\n """Use the provided RepositoryLoadData to construct and return a RepositoryDefinition."""\n check.inst_param(repository_load_data, "repository_load_data", RepositoryLoadData)\n return self._get_repository_definition(repository_load_data)\n\n def compute_repository_definition(self) -> RepositoryDefinition:\n """Compute the required RepositoryLoadData and use it to construct and return a RepositoryDefinition."""\n repository_load_data = self._compute_repository_load_data()\n return self._get_repository_definition(repository_load_data)\n
", "current_page_name": "_modules/dagster/_core/definitions/repository_definition/repository_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.repository_definition.repository_definition"}}, "resource_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.resource_definition

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    Union,\n    cast,\n    overload,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param, public\nfrom dagster._core.decorator_utils import format_docstring_for_description\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.configurable import AnonymousConfigurableDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster._utils import IHasInternalInit\n\nfrom ..decorator_utils import (\n    get_function_params,\n    has_at_least_one_parameter,\n    is_required_param,\n    positional_arg_name_list,\n    validate_expected_params,\n)\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .resource_invocation import resource_invocation_result\nfrom .resource_requirement import (\n    RequiresResources,\n    ResourceDependencyRequirement,\n    ResourceRequirement,\n)\nfrom .scoped_resources_builder import (  # re-exported\n    IContainsGenerator as IContainsGenerator,\n    Resources as Resources,\n    ScopedResourcesBuilder as ScopedResourcesBuilder,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.execution.resources_init import InitResourceContext\n\nResourceFunctionWithContext: TypeAlias = Callable[["InitResourceContext"], Any]\nResourceFunctionWithoutContext: TypeAlias = Callable[[], Any]\nResourceFunction: TypeAlias = Union[\n    ResourceFunctionWithContext,\n    ResourceFunctionWithoutContext,\n]\n\n\n
[docs]@experimental_param(param="version")\nclass ResourceDefinition(AnonymousConfigurableDefinition, RequiresResources, IHasInternalInit):\n """Core class for defining resources.\n\n Resources are scoped ways to make external resources (like database connections) available to\n ops and assets during job execution and to clean up after execution resolves.\n\n If resource_fn yields once rather than returning (in the manner of functions decorable with\n :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then the body of the\n function after the yield will be run after execution resolves, allowing users to write their\n own teardown/cleanup logic.\n\n Depending on your executor, resources may be instantiated and cleaned up more than once in a\n job execution.\n\n Args:\n resource_fn (Callable[[InitResourceContext], Any]): User-provided function to instantiate\n the resource, which will be made available to executions keyed on the\n ``context.resources`` object.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the resource matches this schema and fail if it does not. If\n not set, Dagster will accept any config provided for the resource.\n description (Optional[str]): A human-readable description of the resource.\n required_resource_keys: (Optional[Set[str]]) Keys for the resources required by this\n resource. A DagsterInvariantViolationError will be raised during initialization if\n dependencies are cyclic.\n version (Optional[str]): (Experimental) The version of the resource's definition fn. 
Two\n wrapped resource functions should only have the same version if they produce the same\n resource definition when provided with the same inputs.\n """\n\n def __init__(\n self,\n resource_fn: ResourceFunction,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._resource_fn = check.callable_param(resource_fn, "resource_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self._version = check.opt_str_param(version, "version")\n\n # this attribute will be updated by the @dagster_maintained_resource and @dagster_maintained_io_manager decorators\n self._dagster_maintained = False\n self._hardcoded_resource_type = None\n\n @staticmethod\n def dagster_internal_init(\n *,\n resource_fn: ResourceFunction,\n config_schema: CoercableToConfigSchema,\n description: Optional[str],\n required_resource_keys: Optional[AbstractSet[str]],\n version: Optional[str],\n ) -> "ResourceDefinition":\n return ResourceDefinition(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def resource_fn(self) -> ResourceFunction:\n return self._resource_fn\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n @public\n @property\n def description(self) -> Optional[str]:\n """A human-readable description of the resource."""\n return self._description\n\n @public\n @property\n def version(self) -> Optional[str]:\n """A string which can be used to identify a particular code version of a resource definition."""\n return self._version\n\n @public\n @property\n def 
required_resource_keys(self) -> AbstractSet[str]:\n """A set of the resource keys that this resource depends on. These keys will be made available\n to the resource's init context during execution, and the resource will not be instantiated\n until all required resources are available.\n """\n return self._required_resource_keys\n\n def _is_dagster_maintained(self) -> bool:\n return self._dagster_maintained\n\n
[docs] @public\n @staticmethod\n def none_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that returns a none resource.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that does nothing.\n """\n return ResourceDefinition.hardcoded_resource(value=None, description=description)
\n\n
[docs] @public\n @staticmethod\n def hardcoded_resource(value: Any, description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` with a hardcoded object.\n\n Args:\n value (Any): The value that will be accessible via context.resources.resource_name.\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A hardcoded resource.\n """\n resource_def = ResourceDefinition(\n resource_fn=lambda _init_context: value, description=description\n )\n # Make sure telemetry info gets passed in to hardcoded resources\n if hasattr(value, "_is_dagster_maintained"):\n resource_def._dagster_maintained = value._is_dagster_maintained() # noqa: SLF001\n resource_def._hardcoded_resource_type = type(value) # noqa: SLF001\n\n return resource_def
\n\n
[docs] @public\n @staticmethod\n def mock_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` which wraps a ``mock.MagicMock``.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that creates the magic methods automatically and helps\n you mock existing resources.\n """\n from unittest import mock\n\n return ResourceDefinition(\n resource_fn=lambda _init_context: mock.MagicMock(), description=description\n )
\n\n
[docs] @public\n @staticmethod\n def string_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """Creates a ``ResourceDefinition`` which takes in a single string as configuration\n and returns this configured string to any ops or assets which depend on it.\n\n Args:\n description ([Optional[str]]): The description of the string resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that takes in a single string as configuration and\n returns that string.\n """\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=str,\n description=description,\n )
\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "ResourceDefinition":\n resource_def = ResourceDefinition.dagster_internal_init(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n )\n\n resource_def._dagster_maintained = self._is_dagster_maintained() # noqa: SLF001\n\n return resource_def\n\n def __call__(self, *args, **kwargs):\n from dagster._core.execution.context.init import UnboundInitResourceContext\n\n if has_at_least_one_parameter(self.resource_fn):\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Resource initialization function has context argument, but no context was"\n " provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of resource received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.resource_fn)[0].name\n\n if args:\n check.opt_inst_param(args[0], context_param_name, UnboundInitResourceContext)\n return resource_invocation_result(\n self, cast(Optional[UnboundInitResourceContext], args[0])\n )\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Resource initialization expected argument '{context_param_name}'."\n )\n check.opt_inst_param(\n kwargs[context_param_name], context_param_name, UnboundInitResourceContext\n )\n\n return resource_invocation_result(\n self, cast(Optional[UnboundInitResourceContext], kwargs[context_param_name])\n )\n elif len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke resource with argument, but underlying function has no context"\n " argument. 
Either specify a context argument on the resource function, or remove"\n " the passed-in argument."\n )\n else:\n return resource_invocation_result(self, None)\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n source_key = cast(str, outer_context)\n for resource_key in sorted(list(self.required_resource_keys)):\n yield ResourceDependencyRequirement(key=resource_key, source_key=source_key)
\n\n\ndef dagster_maintained_resource(\n resource_def: ResourceDefinition,\n) -> ResourceDefinition:\n resource_def._dagster_maintained = True # noqa: SLF001\n return resource_def\n\n\nclass _ResourceDecoratorCallable:\n def __init__(\n self,\n config_schema: Optional[Mapping[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self.config_schema = config_schema # checked by underlying definition\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n def __call__(self, resource_fn: ResourceFunction) -> ResourceDefinition:\n check.callable_param(resource_fn, "resource_fn")\n\n any_name = ["*"] if has_at_least_one_parameter(resource_fn) else []\n\n params = get_function_params(resource_fn)\n\n missing_positional = validate_expected_params(params, any_name)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects a single "\n "positional argument."\n )\n\n extras = params[len(any_name) :]\n\n required_extras = list(filter(is_required_param, extras))\n if required_extras:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects only a single"\n " positional required argument. 
Got required extra params"\n f" {', '.join(positional_arg_name_list(required_extras))}"\n )\n\n resource_def = ResourceDefinition.dagster_internal_init(\n resource_fn=resource_fn,\n config_schema=self.config_schema,\n description=self.description or format_docstring_for_description(resource_fn),\n version=self.version,\n required_resource_keys=self.required_resource_keys,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(resource_def, wrapped=resource_fn) # type: ignore\n\n return resource_def\n\n\n@overload\ndef resource(config_schema: ResourceFunction) -> ResourceDefinition: ...\n\n\n@overload\ndef resource(\n config_schema: CoercableToConfigSchema = ...,\n description: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n version: Optional[str] = ...,\n) -> Callable[[ResourceFunction], "ResourceDefinition"]: ...\n\n\n
[docs]def resource(\n config_schema: Union[ResourceFunction, CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[Callable[[ResourceFunction], "ResourceDefinition"], "ResourceDefinition"]:\n """Define a resource.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an instance of\n the resource. This function will become the ``resource_fn`` of an underlying\n :py:class:`ResourceDefinition`.\n\n If the decorated function yields once rather than returning (in the manner of functions\n decorable with :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then\n the body of the function after the yield will be run after execution resolves, allowing users\n to write their own teardown/cleanup logic.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.resource_config`. If not set, Dagster will accept any config provided.\n description(Optional[str]): A human-readable description of the resource.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by this resource.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. 
@resource versus @resource()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _ResourceDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: ResourceFunction) -> "ResourceDefinition":\n return _ResourceDecoratorCallable(\n config_schema=cast(Optional[Dict[str, Any]], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )(resource_fn)\n\n return _wrap
\n\n\n
[docs]def make_values_resource(**kwargs: Any) -> ResourceDefinition:\n """A helper function that creates a ``ResourceDefinition`` to take in user-defined values.\n\n This is useful for sharing values between ops.\n\n Args:\n **kwargs: Arbitrary keyword arguments that will be passed to the config schema of the\n returned resource definition. If not set, Dagster will accept any config provided for\n the resource.\n\n For example:\n\n .. code-block:: python\n\n @op(required_resource_keys={"globals"})\n def my_op(context):\n print(context.resources.globals["my_str_var"])\n\n @job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\n def my_job():\n my_op()\n\n Returns:\n ResourceDefinition: A resource that passes in user-defined values.\n """\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=kwargs or Any,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/resource_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.resource_definition"}, "result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.result

\nfrom typing import NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.data_version import DataVersion\n\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\nfrom .metadata import MetadataUserInput\n\n\n
[docs]@experimental\nclass MaterializeResult(\n NamedTuple(\n "_MaterializeResult",\n [\n ("asset_key", PublicAttr[Optional[AssetKey]]),\n ("metadata", PublicAttr[Optional[MetadataUserInput]]),\n ("check_results", PublicAttr[Sequence[AssetCheckResult]]),\n ("data_version", PublicAttr[Optional[DataVersion]]),\n ],\n )\n):\n """An object representing a successful materialization of an asset. These can be returned from\n @asset and @multi_asset decorated functions to pass metadata or specify specific assets were\n materialized.\n\n Attributes:\n asset_key (Optional[AssetKey]): Optional in @asset, required in @multi_asset to discern which asset this refers to.\n metadata (Optional[MetadataUserInput]): Metadata to record with the corresponding AssetMaterialization event.\n """\n\n def __new__(\n cls,\n *, # enforce kwargs\n asset_key: Optional[CoercibleToAssetKey] = None,\n metadata: Optional[MetadataUserInput] = None,\n check_results: Optional[Sequence[AssetCheckResult]] = None,\n data_version: Optional[DataVersion] = None,\n ):\n asset_key = AssetKey.from_coercible(asset_key) if asset_key else None\n\n return super().__new__(\n cls,\n asset_key=asset_key,\n metadata=check.opt_nullable_mapping_param(\n metadata,\n "metadata",\n key_type=str,\n ),\n check_results=check.opt_sequence_param(\n check_results, "check_results", of_type=AssetCheckResult\n ),\n data_version=check.opt_inst_param(data_version, "data_version", DataVersion),\n )\n\n def check_result_named(self, check_name: str) -> AssetCheckResult:\n for check_result in self.check_results:\n if check_result.check_name == check_name:\n return check_result\n\n check.failed(f"Could not find check result named {check_name}")
\n
", "current_page_name": "_modules/dagster/_core/definitions/result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.result"}, "run_config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_config

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias\n\nfrom dagster._config import (\n    ALL_CONFIG_BUILTINS,\n    ConfigType,\n    Field,\n    Permissive,\n    Selector,\n    Shape,\n)\nfrom dagster._config.pythonic_config import Config\nfrom dagster._core.definitions.asset_layer import AssetLayer\nfrom dagster._core.definitions.executor_definition import (\n    ExecutorDefinition,\n    execute_in_process_executor,\n    in_process_executor,\n)\nfrom dagster._core.definitions.input import InputDefinition\nfrom dagster._core.definitions.output import OutputDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.storage.input_manager import IInputManagerDefinition\nfrom dagster._core.storage.output_manager import IOutputManagerDefinition\nfrom dagster._core.types.dagster_type import ALL_RUNTIME_BUILTINS, construct_dagster_type_dictionary\nfrom dagster._utils import check\n\nfrom .configurable import ConfigurableDefinition\nfrom .definition_config_schema import IDefinitionConfigSchema\nfrom .dependency import DependencyStructure, GraphNode, Node, NodeHandle, NodeInput, OpNode\nfrom .graph_definition import GraphDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .op_definition import NodeDefinition, OpDefinition\nfrom .resource_definition import ResourceDefinition\n\nif TYPE_CHECKING:\n    from .source_asset import SourceAsset\n\n\ndef define_resource_dictionary_cls(\n    resource_defs: Mapping[str, ResourceDefinition],\n    required_resources: AbstractSet[str],\n) -> Shape:\n    fields = {}\n    for resource_name, resource_def in resource_defs.items():\n        if resource_def.config_schema:\n            is_required = None\n            if resource_name not in required_resources:\n                # explicitly make 
section not required if resource is not required\n                # for the current mode\n                is_required = False\n\n            fields[resource_name] = def_config_field(\n                resource_def,\n                is_required=is_required,\n                description=resource_def.description,\n            )\n\n    return Shape(fields=fields)\n\n\ndef remove_none_entries(ddict: Mapping[Any, Any]) -> dict:\n    return {k: v for k, v in ddict.items() if v is not None}\n\n\ndef def_config_field(\n    configurable_def: ConfigurableDefinition,\n    is_required: Optional[bool] = None,\n    description: Optional[str] = None,\n) -> Field:\n    return Field(\n        Shape(\n            {"config": configurable_def.config_field} if configurable_def.has_config_field else {}\n        ),\n        is_required=is_required,\n        description=description,\n    )\n\n\nclass RunConfigSchemaCreationData(NamedTuple):\n    job_name: str\n    nodes: Sequence[Node]\n    graph_def: GraphDefinition\n    dependency_structure: DependencyStructure\n    executor_def: ExecutorDefinition\n    resource_defs: Mapping[str, ResourceDefinition]\n    logger_defs: Mapping[str, LoggerDefinition]\n    ignored_nodes: Sequence[Node]\n    required_resources: AbstractSet[str]\n    direct_inputs: Mapping[str, Any]\n    asset_layer: AssetLayer\n\n\ndef define_logger_dictionary_cls(creation_data: RunConfigSchemaCreationData) -> Shape:\n    return Shape(\n        {\n            logger_name: def_config_field(logger_definition, is_required=False)\n            for logger_name, logger_definition in creation_data.logger_defs.items()\n        }\n    )\n\n\ndef define_execution_field(executor_defs: Sequence[ExecutorDefinition], description: str) -> Field:\n    default_in_process = False\n    for executor_def in executor_defs:\n        if executor_def == in_process_executor:\n            default_in_process = True\n\n    selector = selector_for_named_defs(executor_defs)\n\n    if default_in_process:\n   
     return Field(\n            selector, default_value={in_process_executor.name: {}}, description=description\n        )\n\n    # If we are using the execute_in_process executor, then ignore all executor config.\n    if len(executor_defs) == 1 and executor_defs[0] == execute_in_process_executor:\n        return Field(Permissive(), is_required=False, default_value={}, description=description)\n\n    return Field(selector, description=description)\n\n\ndef define_single_execution_field(executor_def: ExecutorDefinition, description: str) -> Field:\n    return def_config_field(executor_def, description=description)\n\n\ndef define_run_config_schema_type(creation_data: RunConfigSchemaCreationData) -> ConfigType:\n    execution_field = define_single_execution_field(\n        creation_data.executor_def,\n        "Configure how steps are executed within a run.",\n    )\n\n    top_level_node = GraphNode(\n        name=creation_data.graph_def.name,\n        definition=creation_data.graph_def,\n        graph_definition=creation_data.graph_def,\n    )\n\n    fields = {\n        "execution": execution_field,\n        "loggers": Field(\n            define_logger_dictionary_cls(creation_data),\n            description="Configure how loggers emit messages within a run.",\n        ),\n        "resources": Field(\n            define_resource_dictionary_cls(\n                creation_data.resource_defs,\n                creation_data.required_resources,\n            ),\n            description="Configure how shared resources are implemented within a run.",\n        ),\n        "inputs": get_inputs_field(\n            node=top_level_node,\n            handle=NodeHandle(top_level_node.name, parent=None),\n            dependency_structure=creation_data.dependency_structure,\n            resource_defs=creation_data.resource_defs,\n            node_ignored=False,\n            direct_inputs=creation_data.direct_inputs,\n            input_source_assets={},\n            
asset_layer=creation_data.asset_layer,\n        ),\n    }\n\n    if creation_data.graph_def.has_config_mapping:\n        config_schema = cast(IDefinitionConfigSchema, creation_data.graph_def.config_schema)\n        nodes_field = Field(\n            {"config": config_schema.as_field()},\n            description="Configure runtime parameters for ops or assets.",\n        )\n    else:\n        nodes_field = Field(\n            define_node_shape(\n                nodes=creation_data.nodes,\n                ignored_nodes=creation_data.ignored_nodes,\n                dependency_structure=creation_data.dependency_structure,\n                resource_defs=creation_data.resource_defs,\n                asset_layer=creation_data.asset_layer,\n                node_input_source_assets=creation_data.graph_def.node_input_source_assets,\n            ),\n            description="Configure runtime parameters for ops or assets.",\n        )\n\n    fields["ops"] = nodes_field\n\n    return Shape(\n        fields=remove_none_entries(fields),\n    )\n\n\n# Common pattern for a set of named definitions (e.g. 
executors)\n# to build a selector so that one of them is selected\ndef selector_for_named_defs(named_defs) -> Selector:\n    return Selector({named_def.name: def_config_field(named_def) for named_def in named_defs})\n\n\ndef get_inputs_field(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    node_ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n    direct_inputs: Optional[Mapping[str, Any]] = None,\n) -> Optional[Field]:\n    direct_inputs = check.opt_mapping_param(direct_inputs, "direct_inputs")\n    inputs_field_fields = {}\n    for name, inp in node.definition.input_dict.items():\n        inp_handle = NodeInput(node, inp)\n        has_upstream = input_has_upstream(dependency_structure, inp_handle, node, name)\n        if inp.input_manager_key:\n            input_field = get_input_manager_input_field(node, inp, resource_defs)\n        elif (\n            # if you have asset definitions, input will be loaded from the source asset\n            asset_layer.has_assets_defs\n            or asset_layer.has_asset_check_defs\n            and asset_layer.asset_key_for_input(handle, name)\n            and not has_upstream\n        ):\n            input_field = None\n        elif name in direct_inputs and not has_upstream:\n            input_field = None\n        elif name in input_source_assets and not has_upstream:\n            input_field = None\n        elif inp.dagster_type.loader and not has_upstream:\n            input_field = get_type_loader_input_field(node, name, inp)\n        else:\n            input_field = None\n\n        if input_field:\n            inputs_field_fields[name] = input_field\n\n    if not inputs_field_fields:\n        return None\n    if node_ignored:\n        return Field(\n            Shape(inputs_field_fields),\n            is_required=False,\n            description=(\n                "This op 
is not present in the current op selection, "\n                "the input config values are allowed but ignored."\n            ),\n        )\n    else:\n        return Field(Shape(inputs_field_fields))\n\n\ndef input_has_upstream(\n    dependency_structure: DependencyStructure,\n    input_handle: NodeInput,\n    node: Node,\n    input_name: str,\n) -> bool:\n    return dependency_structure.has_deps(input_handle) or node.container_maps_input(input_name)\n\n\ndef get_input_manager_input_field(\n    node: Node,\n    input_def: InputDefinition,\n    resource_defs: Mapping[str, ResourceDefinition],\n) -> Optional[Field]:\n    if input_def.input_manager_key:\n        if input_def.input_manager_key not in resource_defs:\n            raise DagsterInvalidDefinitionError(\n                f"Input '{input_def.name}' for {node.describe_node()} requires input_manager_key"\n                f" '{input_def.input_manager_key}', but no resource has been provided. Please"\n                " include a resource definition for that key in the provided resource_defs."\n            )\n\n        input_manager = resource_defs[input_def.input_manager_key]\n        if not isinstance(input_manager, IInputManagerDefinition):\n            raise DagsterInvalidDefinitionError(\n                f"Input '{input_def.name}' for {node.describe_node()} requires input_manager_key "\n                f"'{input_def.input_manager_key}', but the resource definition provided is not an "\n                "IInputManagerDefinition"\n            )\n\n        input_config_schema = input_manager.input_config_schema\n        if input_config_schema:\n            return input_config_schema.as_field()\n        return None\n\n    return None\n\n\ndef get_type_loader_input_field(node: Node, input_name: str, input_def: InputDefinition) -> Field:\n    loader = check.not_none(input_def.dagster_type.loader)\n    return Field(\n        loader.schema_type,\n        is_required=(not 
node.definition.input_has_default(input_name)),\n    )\n\n\ndef get_outputs_field(\n    node: Node,\n    resource_defs: Mapping[str, ResourceDefinition],\n) -> Optional[Field]:\n    output_manager_fields = {}\n    for name, output_def in node.definition.output_dict.items():\n        output_manager_output_field = get_output_manager_output_field(\n            node, output_def, resource_defs\n        )\n        if output_manager_output_field:\n            output_manager_fields[name] = output_manager_output_field\n\n    return Field(Shape(output_manager_fields)) if output_manager_fields else None\n\n\ndef get_output_manager_output_field(\n    node: Node, output_def: OutputDefinition, resource_defs: Mapping[str, ResourceDefinition]\n) -> Optional[ConfigType]:\n    if output_def.io_manager_key not in resource_defs:\n        raise DagsterInvalidDefinitionError(\n            f'Output "{output_def.name}" for {node.describe_node()} requires io_manager_key '\n            f'"{output_def.io_manager_key}", but no resource has been provided. 
Please include a '\n            "resource definition for that key in the provided resource_defs."\n        )\n    if not isinstance(resource_defs[output_def.io_manager_key], IOutputManagerDefinition):\n        raise DagsterInvalidDefinitionError(\n            f'Output "{output_def.name}" for {node.describe_node()} requires io_manager_key '\n            f'"{output_def.io_manager_key}", but the resource definition provided is not an '\n            "IOutputManagerDefinition"\n        )\n    output_manager_def = resource_defs[output_def.io_manager_key]\n    if (\n        output_manager_def\n        and isinstance(output_manager_def, IOutputManagerDefinition)\n        and output_manager_def.output_config_schema\n    ):\n        return output_manager_def.output_config_schema.as_field()\n\n    return None\n\n\ndef node_config_field(fields: Mapping[str, Optional[Field]], ignored: bool) -> Optional[Field]:\n    trimmed_fields = remove_none_entries(fields)\n    if trimmed_fields:\n        if ignored:\n            return Field(\n                Shape(trimmed_fields),\n                is_required=False,\n                description=(\n                    "This op is not present in the current op selection, "\n                    "the config values are allowed but ignored."\n                ),\n            )\n        else:\n            return Field(Shape(trimmed_fields))\n    else:\n        return None\n\n\ndef construct_leaf_node_config(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    config_schema: Optional[IDefinitionConfigSchema],\n    resource_defs: Mapping[str, ResourceDefinition],\n    ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n) -> Optional[Field]:\n    return node_config_field(\n        {\n            "inputs": get_inputs_field(\n                node,\n                handle,\n                dependency_structure,\n                resource_defs,\n                
ignored,\n                asset_layer,\n                input_source_assets,\n            ),\n            "outputs": get_outputs_field(node, resource_defs),\n            "config": config_schema.as_field() if config_schema else None,\n        },\n        ignored=ignored,\n    )\n\n\ndef define_node_field(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n) -> Optional[Field]:\n    # All nodes regardless of compositing status get the same inputs and outputs\n    # config. The only thing the varies is on extra element of configuration\n    # 1) Vanilla op definition: a 'config' key with the config_schema as the value\n    # 2) Graph with field mapping: a 'config' key with the config_schema of\n    #    the config mapping (via GraphDefinition#config_schema)\n    # 3) Graph without field mapping: an 'ops' key with recursively defined\n    #    ops dictionary\n    # 4) `configured` graph with field mapping: a 'config' key with the config_schema that was\n    #    provided when `configured` was called (via GraphDefinition#config_schema)\n\n    assert isinstance(node, (OpNode, GraphNode)), f"Invalid node type: {type(node)}"\n\n    if isinstance(node, OpNode):\n        return construct_leaf_node_config(\n            node,\n            handle,\n            dependency_structure,\n            node.definition.config_schema,\n            resource_defs,\n            ignored,\n            asset_layer,\n            input_source_assets,\n        )\n\n    graph_def = node.definition\n\n    if graph_def.has_config_mapping:\n        # has_config_mapping covers cases 2 & 4 from above (only config mapped graphs can\n        # be `configured`)...\n        return construct_leaf_node_config(\n            node,\n            handle,\n            dependency_structure,\n            # ...and in both 
cases, the correct schema for 'config' key is exposed by this property:\n            graph_def.config_schema,\n            resource_defs,\n            ignored,\n            asset_layer,\n            input_source_assets,\n        )\n        # This case omits an 'ops' key, thus if a graph is `configured` or has a field\n        # mapping, the user cannot stub any config, inputs, or outputs for inner (child) nodes.\n    else:\n        fields = {\n            "inputs": get_inputs_field(\n                node,\n                handle,\n                dependency_structure,\n                resource_defs,\n                ignored,\n                asset_layer,\n                input_source_assets,\n            ),\n            "outputs": get_outputs_field(node, resource_defs),\n            "ops": Field(\n                define_node_shape(\n                    nodes=graph_def.nodes,\n                    ignored_nodes=None,\n                    dependency_structure=graph_def.dependency_structure,\n                    parent_handle=handle,\n                    resource_defs=resource_defs,\n                    asset_layer=asset_layer,\n                    node_input_source_assets=graph_def.node_input_source_assets,\n                )\n            ),\n        }\n\n        return node_config_field(fields, ignored=ignored)\n\n\ndef define_node_shape(\n    nodes: Sequence[Node],\n    ignored_nodes: Optional[Sequence[Node]],\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    asset_layer: AssetLayer,\n    node_input_source_assets: Mapping[str, Mapping[str, "SourceAsset"]],\n    parent_handle: Optional[NodeHandle] = None,\n) -> Shape:\n    """Examples of what this method is used to generate the schema for:\n    1.\n        inputs: ...\n        ops:\n      >    op1: ...\n      >    op2: ...\n\n    2.\n        inputs:\n        ops:\n          graph1: ...\n            inputs: ...\n            ops:\n      >       op1: ...\n      >  
     inner_graph: ...\n\n\n    """\n    ignored_nodes = check.opt_sequence_param(ignored_nodes, "ignored_nodes", of_type=Node)\n\n    fields = {}\n    for node in nodes:\n        node_field = define_node_field(\n            node,\n            NodeHandle(node.name, parent_handle),\n            dependency_structure,\n            resource_defs,\n            ignored=False,\n            asset_layer=asset_layer,\n            input_source_assets=node_input_source_assets.get(node.name, {}),\n        )\n\n        if node_field:\n            fields[node.name] = node_field\n\n    for node in ignored_nodes:\n        node_field = define_node_field(\n            node,\n            NodeHandle(node.name, parent_handle),\n            dependency_structure,\n            resource_defs,\n            ignored=True,\n            asset_layer=asset_layer,\n            input_source_assets=node_input_source_assets.get(node.name, {}),\n        )\n        if node_field:\n            fields[node.name] = node_field\n\n    return Shape(fields)\n\n\ndef iterate_node_def_config_types(node_def: NodeDefinition) -> Iterator[ConfigType]:\n    if isinstance(node_def, OpDefinition):\n        if node_def.has_config_field:\n            yield from node_def.get_config_field().config_type.type_iterator()\n    elif isinstance(node_def, GraphDefinition):\n        for node in node_def.nodes:\n            yield from iterate_node_def_config_types(node.definition)\n\n    else:\n        check.invariant(f"Unexpected NodeDefinition type {type(node_def)}")\n\n\ndef _gather_all_schemas(node_defs: Sequence[NodeDefinition]) -> Iterator[ConfigType]:\n    dagster_types = construct_dagster_type_dictionary(node_defs)\n    for dagster_type in list(dagster_types.values()) + list(ALL_RUNTIME_BUILTINS):\n        if dagster_type.loader:\n            yield from dagster_type.loader.schema_type.type_iterator()\n\n\ndef _gather_all_config_types(\n    node_defs: Sequence[NodeDefinition], run_config_schema_type: ConfigType\n) -> 
Iterator[ConfigType]:\n    for node_def in node_defs:\n        yield from iterate_node_def_config_types(node_def)\n\n    yield from run_config_schema_type.type_iterator()\n\n\ndef construct_config_type_dictionary(\n    node_defs: Sequence[NodeDefinition],\n    run_config_schema_type: ConfigType,\n) -> Tuple[Mapping[str, ConfigType], Mapping[str, ConfigType]]:\n    type_dict_by_name = {t.given_name: t for t in ALL_CONFIG_BUILTINS if t.given_name}\n    type_dict_by_key = {t.key: t for t in ALL_CONFIG_BUILTINS}\n    all_types = list(_gather_all_config_types(node_defs, run_config_schema_type)) + list(\n        _gather_all_schemas(node_defs)\n    )\n\n    for config_type in all_types:\n        name = config_type.given_name\n        if name and name in type_dict_by_name:\n            if type(config_type) is not type(type_dict_by_name[name]):\n                raise DagsterInvalidDefinitionError(\n                    "Type names must be unique. You have constructed two different "\n                    f'instances of types with the same name "{name}".'\n                )\n        elif name:\n            type_dict_by_name[name] = config_type\n\n        type_dict_by_key[config_type.key] = config_type\n\n    return type_dict_by_name, type_dict_by_key\n\n\ndef _convert_config_classes_inner(configs: Any) -> Any:\n    if not isinstance(configs, dict):\n        return configs\n\n    return {\n        k: (\n            {"config": v._convert_to_config_dictionary()}  # noqa: SLF001\n            if isinstance(v, Config)\n            else _convert_config_classes_inner(v)\n        )\n        for k, v in configs.items()\n    }\n\n\ndef _convert_config_classes(configs: Dict[str, Any]) -> Dict[str, Any]:\n    return _convert_config_classes_inner(configs)\n\n\n
[docs]class RunConfig:\n """Container for all the configuration that can be passed to a run. Accepts Pythonic definitions\n for op and asset config and resources and converts them under the hood to the appropriate config dictionaries.\n\n Example usage:\n\n .. code-block:: python\n\n class MyAssetConfig(Config):\n a_str: str\n\n @asset\n def my_asset(config: MyAssetConfig):\n assert config.a_str == "foo"\n\n materialize(\n [my_asset],\n run_config=RunConfig(\n ops={"my_asset": MyAssetConfig(a_str="foo")}\n )\n )\n\n """\n\n def __init__(\n self,\n ops: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n loggers: Optional[Dict[str, Any]] = None,\n execution: Optional[Dict[str, Any]] = None,\n ):\n self.ops = check.opt_dict_param(ops, "ops")\n self.resources = check.opt_dict_param(resources, "resources")\n self.loggers = check.opt_dict_param(loggers, "loggers")\n self.execution = check.opt_dict_param(execution, "execution")\n\n def to_config_dict(self):\n return {\n "loggers": self.loggers,\n "resources": _convert_config_classes(self.resources),\n "ops": _convert_config_classes(self.ops),\n "execution": self.execution,\n }
\n\n\nCoercibleToRunConfig: TypeAlias = Union[Dict[str, Any], RunConfig]\n\nT = TypeVar("T")\n\n\ndef convert_config_input(inp: Union[CoercibleToRunConfig, T]) -> Union[T, Mapping[str, Any]]:\n if isinstance(inp, RunConfig):\n return inp.to_config_dict()\n else:\n return inp\n
", "current_page_name": "_modules/dagster/_core/definitions/run_config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_config"}, "run_request": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_request

\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param\nfrom dagster._core.definitions.asset_check_evaluation import AssetCheckEvaluation\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization, AssetObservation\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.storage.tags import PARTITION_NAME_TAG\nfrom dagster._serdes.serdes import whitelist_for_serdes\nfrom dagster._utils.error import SerializableErrorInfo\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.partition import PartitionsDefinition\n    from dagster._core.definitions.run_config import RunConfig\n    from dagster._core.definitions.unresolved_asset_job_definition import (\n        UnresolvedAssetJobDefinition,\n    )\n\n\n@whitelist_for_serdes(old_storage_names={"JobType"})\nclass InstigatorType(Enum):\n    SCHEDULE = "SCHEDULE"\n    SENSOR = "SENSOR"\n    AUTO_MATERIALIZE = "AUTO_MATERIALIZE"\n\n\n
[docs]@whitelist_for_serdes\nclass SkipReason(NamedTuple("_SkipReason", [("skip_message", PublicAttr[Optional[str]])])):\n """Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\n why no runs were requested.\n\n Attributes:\n skip_message (Optional[str]): A message displayed in the Dagster UI for why this evaluation resulted\n in no requested runs.\n """\n\n def __new__(cls, skip_message: Optional[str] = None):\n return super(SkipReason, cls).__new__(\n cls,\n skip_message=check.opt_str_param(skip_message, "skip_message"),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass AddDynamicPartitionsRequest(\n NamedTuple(\n "_AddDynamicPartitionsRequest",\n [\n ("partitions_def_name", str),\n ("partition_keys", Sequence[str]),\n ],\n )\n):\n """A request to add partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule."""\n\n def __new__(\n cls,\n partitions_def_name: str,\n partition_keys: Sequence[str],\n ):\n return super(AddDynamicPartitionsRequest, cls).__new__(\n cls,\n partitions_def_name=check.str_param(partitions_def_name, "partitions_def_name"),\n partition_keys=check.list_param(partition_keys, "partition_keys", of_type=str),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass DeleteDynamicPartitionsRequest(\n NamedTuple(\n "_AddDynamicPartitionsRequest",\n [\n ("partitions_def_name", str),\n ("partition_keys", Sequence[str]),\n ],\n )\n):\n """A request to delete partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule."""\n\n def __new__(\n cls,\n partitions_def_name: str,\n partition_keys: Sequence[str],\n ):\n return super(DeleteDynamicPartitionsRequest, cls).__new__(\n cls,\n partitions_def_name=check.str_param(partitions_def_name, "partitions_def_name"),\n partition_keys=check.list_param(partition_keys, "partition_keys", of_type=str),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RunRequest(\n NamedTuple(\n "_RunRequest",\n [\n ("run_key", PublicAttr[Optional[str]]),\n ("run_config", PublicAttr[Mapping[str, Any]]),\n ("tags", PublicAttr[Mapping[str, str]]),\n ("job_name", PublicAttr[Optional[str]]),\n ("asset_selection", PublicAttr[Optional[Sequence[AssetKey]]]),\n ("stale_assets_only", PublicAttr[bool]),\n ("partition_key", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Represents all the information required to launch a single run. Must be returned by a\n SensorDefinition or ScheduleDefinition's evaluation function for a run to be launched.\n\n Attributes:\n run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n a :py:class:`PartitionedConfig`, this value will override replace the config\n provided by it.\n tags (Optional[Dict[str, Any]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n job_name (Optional[str]): (Experimental) The name of the job this run request will launch.\n Required for sensors that target multiple jobs.\n asset_selection (Optional[Sequence[AssetKey]]): A sequence of AssetKeys that should be\n launched with this run.\n stale_assets_only (bool): Set to true to further narrow the asset\n selection to stale assets. If passed without an asset selection, all stale assets in the\n job will be materialized. 
If the job does not materialize assets, this flag is ignored.\n partition_key (Optional[str]): The partition key for this run request.\n """\n\n def __new__(\n cls,\n run_key: Optional[str] = None,\n run_config: Optional[Union["RunConfig", Mapping[str, Any]]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n job_name: Optional[str] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n stale_assets_only: bool = False,\n partition_key: Optional[str] = None,\n ):\n from dagster._core.definitions.run_config import convert_config_input\n\n return super(RunRequest, cls).__new__(\n cls,\n run_key=check.opt_str_param(run_key, "run_key"),\n run_config=check.opt_mapping_param(\n convert_config_input(run_config), "run_config", key_type=str\n ),\n tags=validate_tags(check.opt_mapping_param(tags, "tags", key_type=str)),\n job_name=check.opt_str_param(job_name, "job_name"),\n asset_selection=check.opt_nullable_sequence_param(\n asset_selection, "asset_selection", of_type=AssetKey\n ),\n stale_assets_only=check.bool_param(stale_assets_only, "stale_assets_only"),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n )\n\n def with_replaced_attrs(self, **kwargs: Any) -> "RunRequest":\n fields = self._asdict()\n for k in fields.keys():\n if k in kwargs:\n fields[k] = kwargs[k]\n return RunRequest(**fields)\n\n def with_resolved_tags_and_config(\n self,\n target_definition: Union["JobDefinition", "UnresolvedAssetJobDefinition"],\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "RunRequest":\n from dagster._core.definitions.job_definition import JobDefinition\n from dagster._core.definitions.partition import (\n PartitionedConfig,\n PartitionsDefinition,\n )\n\n if self.partition_key is None:\n check.failed(\n "Cannot resolve partition for run request without partition 
key",\n )\n\n partitions_def = target_definition.partitions_def\n if partitions_def is None:\n check.failed(\n "Cannot resolve partition for run request when target job"\n f" '{target_definition.name}' is unpartitioned.",\n )\n partitions_def = cast(PartitionsDefinition, partitions_def)\n\n partitioned_config = (\n target_definition.partitioned_config\n if isinstance(target_definition, JobDefinition)\n else PartitionedConfig.from_flexible_config(target_definition.config, partitions_def)\n )\n if partitioned_config is None:\n check.failed(\n "Cannot resolve partition for run request on unpartitioned job",\n )\n\n _check_valid_partition_key_after_dynamic_partitions_requests(\n self.partition_key,\n partitions_def,\n dynamic_partitions_requests,\n current_time,\n dynamic_partitions_store,\n )\n\n tags = {\n **(self.tags or {}),\n **partitioned_config.get_tags_for_partition_key(\n self.partition_key,\n job_name=target_definition.name,\n ),\n }\n\n return self.with_replaced_attrs(\n run_config=(\n self.run_config\n if self.run_config\n else partitioned_config.get_run_config_for_partition_key(self.partition_key)\n ),\n tags=tags,\n )\n\n def has_resolved_partition(self) -> bool:\n # Backcompat run requests yielded via `run_request_for_partition` already have resolved\n # partitioning\n return self.tags.get(PARTITION_NAME_TAG) is not None if self.partition_key else True
\n\n\ndef _check_valid_partition_key_after_dynamic_partitions_requests(\n partition_key: str,\n partitions_def: "PartitionsDefinition",\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n):\n from dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\n from dagster._core.definitions.partition import (\n DynamicPartitionsDefinition,\n )\n\n if isinstance(partitions_def, MultiPartitionsDefinition):\n multipartition_key = partitions_def.get_partition_key_from_str(partition_key)\n\n for dimension in partitions_def.partitions_defs:\n _check_valid_partition_key_after_dynamic_partitions_requests(\n multipartition_key.keys_by_dimension[dimension.name],\n dimension.partitions_def,\n dynamic_partitions_requests,\n current_time,\n dynamic_partitions_store,\n )\n\n elif isinstance(partitions_def, DynamicPartitionsDefinition) and partitions_def.name:\n if not dynamic_partitions_store:\n check.failed(\n "Cannot resolve partition for run request on dynamic partitions without"\n " dynamic_partitions_store"\n )\n\n add_partition_keys: Set[str] = set()\n delete_partition_keys: Set[str] = set()\n for req in dynamic_partitions_requests:\n if isinstance(req, AddDynamicPartitionsRequest):\n if req.partitions_def_name == partitions_def.name:\n add_partition_keys.update(set(req.partition_keys))\n elif isinstance(req, DeleteDynamicPartitionsRequest):\n if req.partitions_def_name == partitions_def.name:\n delete_partition_keys.update(set(req.partition_keys))\n\n partition_keys_after_requests_resolved = (\n set(\n dynamic_partitions_store.get_dynamic_partitions(\n partitions_def_name=partitions_def.name\n )\n )\n | add_partition_keys\n ) - delete_partition_keys\n\n if partition_key not in partition_keys_after_requests_resolved:\n check.failed(\n f"Dynamic partition key {partition_key} 
for partitions def"\n f" '{partitions_def.name}' is invalid. After dynamic partitions requests are"\n " applied, it does not exist in the set of valid partition keys."\n )\n\n else:\n partitions_def.validate_partition_key(\n partition_key,\n dynamic_partitions_store=dynamic_partitions_store,\n current_time=current_time,\n )\n\n\n@whitelist_for_serdes(\n storage_name="PipelineRunReaction",\n storage_field_names={\n "dagster_run": "pipeline_run",\n },\n)\nclass DagsterRunReaction(\n NamedTuple(\n "_DagsterRunReaction",\n [\n ("dagster_run", Optional[DagsterRun]),\n ("error", Optional[SerializableErrorInfo]),\n ("run_status", Optional[DagsterRunStatus]),\n ],\n )\n):\n """Represents a request that reacts to an existing dagster run. If success, it will report logs\n back to the run.\n\n Attributes:\n dagster_run (Optional[DagsterRun]): The dagster run that originates this reaction.\n error (Optional[SerializableErrorInfo]): user code execution error.\n run_status: (Optional[DagsterRunStatus]): The run status that triggered the reaction.\n """\n\n def __new__(\n cls,\n dagster_run: Optional[DagsterRun],\n error: Optional[SerializableErrorInfo] = None,\n run_status: Optional[DagsterRunStatus] = None,\n ):\n return super(DagsterRunReaction, cls).__new__(\n cls,\n dagster_run=check.opt_inst_param(dagster_run, "dagster_run", DagsterRun),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n run_status=check.opt_inst_param(run_status, "run_status", DagsterRunStatus),\n )\n\n\n
[docs]@experimental_param(\n param="asset_events", additional_warn_text="Runless asset events are experimental"\n)\nclass SensorResult(\n NamedTuple(\n "_SensorResult",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_reason", Optional[SkipReason]),\n ("cursor", Optional[str]),\n (\n "dynamic_partitions_requests",\n Optional[\n Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]\n ],\n ),\n (\n "asset_events",\n List[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]],\n ),\n ],\n )\n):\n """The result of a sensor evaluation.\n\n Attributes:\n run_requests (Optional[Sequence[RunRequest]]): A list\n of run requests to be executed.\n skip_reason (Optional[Union[str, SkipReason]]): A skip message indicating why sensor\n evaluation was skipped.\n cursor (Optional[str]): The cursor value for this sensor, which will be provided on the\n context for the next sensor evaluation.\n dynamic_partitions_requests (Optional[Sequence[Union[DeleteDynamicPartitionsRequest,\n AddDynamicPartitionsRequest]]]): A list of dynamic partition requests to request dynamic\n partition addition and deletion. Run requests will be evaluated using the state of the\n partitions with these changes applied.\n asset_events (Optional[Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]]): (Experimental) A\n list of materializations, observations, and asset check evaluations that the system\n will persist on your behalf at the end of sensor evaluation. 
These events will be not\n be associated with any particular run, but will be queryable and viewable in the asset catalog.\n\n\n """\n\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_reason: Optional[Union[str, SkipReason]] = None,\n cursor: Optional[str] = None,\n dynamic_partitions_requests: Optional[\n Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]\n ] = None,\n asset_events: Optional[\n Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]\n ] = None,\n ):\n if skip_reason and len(run_requests if run_requests else []) > 0:\n check.failed(\n "Expected a single skip reason or one or more run requests: received values for "\n "both run_requests and skip_reason"\n )\n\n skip_reason = check.opt_inst_param(skip_reason, "skip_reason", (SkipReason, str))\n if isinstance(skip_reason, str):\n skip_reason = SkipReason(skip_reason)\n\n return super(SensorResult, cls).__new__(\n cls,\n run_requests=check.opt_sequence_param(run_requests, "run_requests", RunRequest),\n skip_reason=skip_reason,\n cursor=check.opt_str_param(cursor, "cursor"),\n dynamic_partitions_requests=check.opt_sequence_param(\n dynamic_partitions_requests,\n "dynamic_partitions_requests",\n (AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest),\n ),\n asset_events=list(\n check.opt_sequence_param(\n asset_events,\n "asset_check_evaluations",\n (AssetObservation, AssetMaterialization, AssetCheckEvaluation),\n )\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/run_request", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_request"}, "run_status_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_status_sensor_definition

\nimport functools\nimport logging\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n    overload,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param, public\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvariantViolationError,\n    RunStatusSensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._core.events import PIPELINE_RUN_STATUS_TO_EVENT_TYPE, DagsterEvent, DagsterEventType\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus, RunsFilter\nfrom dagster._serdes import (\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._seven import JSONDecodeError\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .sensor_definition import (\n    DagsterRunReaction,\n    DefaultSensorStatus,\n    RawSensorEvaluationFunctionReturn,\n    RunRequest,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorResult,\n    SensorType,\n    SkipReason,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom 
.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.resource_definition import ResourceDefinition\n    from dagster._core.definitions.selector import (\n        CodeLocationSelector,\n        JobSelector,\n        RepositorySelector,\n    )\n\nRunStatusSensorEvaluationFunction: TypeAlias = Union[\n    Callable[..., RawSensorEvaluationFunctionReturn],\n    Callable[..., RawSensorEvaluationFunctionReturn],\n]\nRunFailureSensorEvaluationFn: TypeAlias = Union[\n    Callable[..., RawSensorEvaluationFunctionReturn],\n    Callable[..., RawSensorEvaluationFunctionReturn],\n]\n\n\n@whitelist_for_serdes(old_storage_names={"PipelineSensorCursor"})\nclass RunStatusSensorCursor(\n    NamedTuple(\n        "_RunStatusSensorCursor",\n        [("record_id", int), ("update_timestamp", str)],\n    )\n):\n    def __new__(cls, record_id, update_timestamp):\n        return super(RunStatusSensorCursor, cls).__new__(\n            cls,\n            record_id=check.int_param(record_id, "record_id"),\n            update_timestamp=check.str_param(update_timestamp, "update_timestamp"),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            obj = deserialize_value(json_str, RunStatusSensorCursor)\n            return isinstance(obj, RunStatusSensorCursor)\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    def to_json(self) -> str:\n        return serialize_value(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> "RunStatusSensorCursor":\n        return deserialize_value(json_str, RunStatusSensorCursor)\n\n\n
[docs]class RunStatusSensorContext:\n """The ``context`` object available to a decorated function of ``run_status_sensor``."""\n\n def __init__(\n self,\n sensor_name,\n dagster_run,\n dagster_event,\n instance,\n context: Optional[\n SensorEvaluationContext\n ] = None, # deprecated arg, but we need to keep it for backcompat\n resource_defs: Optional[Mapping[str, "ResourceDefinition"]] = None,\n logger: Optional[logging.Logger] = None,\n partition_key: Optional[str] = None,\n _resources: Optional[Resources] = None,\n _cm_scope_entered: bool = False,\n ) -> None:\n self._exit_stack = ExitStack()\n self._sensor_name = check.str_param(sensor_name, "sensor_name")\n self._dagster_run = check.inst_param(dagster_run, "dagster_run", DagsterRun)\n self._dagster_event = check.inst_param(dagster_event, "dagster_event", DagsterEvent)\n self._instance = check.inst_param(instance, "instance", DagsterInstance)\n self._logger: Optional[logging.Logger] = logger or (context.log if context else None)\n self._partition_key = check.opt_str_param(partition_key, "partition_key")\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resource_defs\n self._resources = _resources\n self._cm_scope_entered = _cm_scope_entered\n\n def for_run_failure(self) -> "RunFailureSensorContext":\n """Converts RunStatusSensorContext to RunFailureSensorContext."""\n return RunFailureSensorContext(\n sensor_name=self._sensor_name,\n dagster_run=self._dagster_run,\n dagster_event=self._dagster_event,\n instance=self._instance,\n logger=self._logger,\n partition_key=self._partition_key,\n resource_defs=self._resource_defs,\n _resources=self._resources,\n _cm_scope_entered=self._cm_scope_entered,\n )\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n @property\n def resources(self) -> Resources:\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from 
dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n """\n This is similar to what we do in e.g. the op context - we set up a resource\n building context manager, and immediately enter it. This is so that in cases\n where a user is not using any context-manager based resources, they don't\n need to enter this SensorEvaluationContext themselves.\n\n For example:\n\n my_sensor(build_sensor_context(resources={"my_resource": my_non_cm_resource})\n\n will work ok, but for a CM resource we must do\n\n with build_sensor_context(resources={"my_resource": my_cm_resource}) as context:\n my_sensor(context)\n """\n\n instance = self.instance if self._instance else None\n\n resources_cm = build_resources(resources=self._resource_defs or {}, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. You can use the following syntax"\n " to open a context manager: `with build_schedule_context(...) 
as context:`"\n )\n\n return self._resources\n\n @public\n @property\n def sensor_name(self) -> str:\n """The name of the sensor."""\n return self._sensor_name\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """The run of the job."""\n return self._dagster_run\n\n @public\n @property\n def dagster_event(self) -> DagsterEvent:\n """The event associated with the job run status."""\n return self._dagster_event\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """The current instance."""\n return self._instance\n\n @public\n @property\n def log(self) -> logging.Logger:\n """The logger for the current sensor evaluation."""\n if not self._logger:\n self._logger = InstigationLogger()\n\n return self._logger\n\n @public\n @property\n def partition_key(self) -> Optional[str]:\n """Optional[str]: The partition key of the relevant run."""\n return self._partition_key\n\n def __enter__(self) -> "RunStatusSensorContext":\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None
\n\n\n
[docs]class RunFailureSensorContext(RunStatusSensorContext):\n """The ``context`` object available to a decorated function of ``run_failure_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n dagster_run (DagsterRun): the failed run.\n """\n\n @public\n @property\n def failure_event(self) -> DagsterEvent:\n """The run failure event.\n\n If the run failed because of an error inside a step, get_step_failure_events will have more\n details on the step failure.\n """\n return self.dagster_event\n\n
[docs] @public\n def get_step_failure_events(self) -> Sequence[DagsterEvent]:\n """The step failure event for each step in the run that failed.\n\n Examples:\n .. code-block:: python\n\n error_strings_by_step_key = {\n # includes the stack trace\n event.step_key: event.event_specific_data.error.to_string()\n for event in context.get_step_failure_events()\n }\n """\n records = self.instance.get_records_for_run(\n run_id=self.dagster_run.run_id, of_type=DagsterEventType.STEP_FAILURE\n ).records\n return [cast(DagsterEvent, record.event_log_entry.dagster_event) for record in records]
\n\n\n
[docs]def build_run_status_sensor_context(\n sensor_name: str,\n dagster_event: DagsterEvent,\n dagster_instance: DagsterInstance,\n dagster_run: DagsterRun,\n context: Optional[SensorEvaluationContext] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n) -> RunStatusSensorContext:\n """Builds run status sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@run_status_sensor` or `@run_failure_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n dagster_event (DagsterEvent): A DagsterEvent with the same event type as the one that\n triggers the run_status_sensor\n dagster_instance (DagsterInstance): The dagster instance configured for the context.\n dagster_run (DagsterRun): DagsterRun object from running a job\n resources (Optional[Mapping[str, object]]): A dictionary of resources to be made available\n to the sensor.\n\n Examples:\n .. code-block:: python\n\n instance = DagsterInstance.ephemeral()\n result = my_job.execute_in_process(instance=instance)\n\n dagster_run = result.dagster_run\n dagster_event = result.get_job_success_event() # or get_job_failure_event()\n\n context = build_run_status_sensor_context(\n sensor_name="run_status_sensor_to_invoke",\n dagster_instance=instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n )\n run_status_sensor_to_invoke(context)\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return RunStatusSensorContext(\n sensor_name=sensor_name,\n instance=dagster_instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n resource_defs=wrap_resources_for_execution(resources),\n logger=context.log if context else None,\n partition_key=partition_key,\n )
\n\n\n@overload\ndef run_failure_sensor(\n name: RunFailureSensorEvaluationFn,\n) -> SensorDefinition: ...\n\n\n@overload\ndef run_failure_sensor(\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Callable[[RunFailureSensorEvaluationFn], SensorDefinition,]: ...\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef run_failure_sensor(\n name: Optional[Union[RunFailureSensorEvaluationFn, str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Union[SensorDefinition, Callable[[RunFailureSensorEvaluationFn], SensorDefinition,]]:\n """Creates a sensor that reacts to job failure events, where the decorated function will be\n run when a run fails.\n\n Takes a :py:class:`~dagster.RunFailureSensorContext`.\n\n Args:\n name (Optional[str]): The name of the job failure sensor. Defaults to the name of the\n decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n The jobs in the current repository that will be monitored by this failure sensor.\n Defaults to None, which means the alert will be sent when any job in the current\n repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. 
If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n (deprecated in favor of monitored_jobs) The jobs in the current repository that will be\n monitored by this failure sensor. Defaults to None, which means the alert will be sent\n when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJob]]): The job a RunRequest should\n execute if yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJob]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def inner(\n fn: RunFailureSensorEvaluationFn,\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n if name is None or callable(name):\n sensor_name = fn.__name__\n else:\n sensor_name = name\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_status_sensor(\n run_status=DagsterRunStatus.FAILURE,\n name=sensor_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n request_job=request_job,\n request_jobs=request_jobs,\n )\n @functools.wraps(fn)\n def _run_failure_sensor(*args, **kwargs) -> Any:\n args_modified = [\n arg.for_run_failure() if isinstance(arg, RunStatusSensorContext) else arg\n for arg in args\n ]\n kwargs_modified = {\n k: v.for_run_failure() if isinstance(v, RunStatusSensorContext) else v\n for k, v in kwargs.items()\n }\n return fn(*args_modified, **kwargs_modified)\n\n return _run_failure_sensor\n\n # This case is for when 
decorator is used bare, without arguments\n if callable(name):\n return inner(name)\n\n return inner
\n\n\n
[docs]class RunStatusSensorDefinition(SensorDefinition):\n """Define a sensor that reacts to a given status of job execution, where the decorated\n function will be evaluated when a run is at the given status.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n run_status (DagsterRunStatus): The status of a run which will be\n monitored by the sensor.\n run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, DagsterRunReaction]]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.RunStatusSensorContext`.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, JobSelector, RepositorySelector, CodeLocationSelector]]]):\n The jobs in the current repository that will be monitored by this sensor. Defaults to\n None, which means the alert will be sent when any job in the repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition]]): The job a RunRequest should\n execute if yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def __init__(\n self,\n name: str,\n run_status: DagsterRunStatus,\n run_status_sensor_fn: RunStatusSensorEvaluationFunction,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n from dagster._core.definitions.selector import (\n CodeLocationSelector,\n JobSelector,\n RepositorySelector,\n )\n from dagster._core.event_api import RunShardedEventsCursor\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n check.str_param(name, "name")\n check.inst_param(run_status, "run_status", DagsterRunStatus)\n check.callable_param(run_status_sensor_fn, "run_status_sensor_fn")\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.opt_list_param(\n monitored_jobs,\n "monitored_jobs",\n (\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n RepositorySelector,\n JobSelector,\n CodeLocationSelector,\n ),\n )\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n resource_arg_names: Set[str] = {arg.name for arg in 
get_resource_args(run_status_sensor_fn)}\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n # coerce CodeLocationSelectors to RepositorySelectors with repo name "__repository__"\n monitored_jobs = [\n job.to_repository_selector() if isinstance(job, CodeLocationSelector) else job\n for job in (monitored_jobs or [])\n ]\n\n self._run_status_sensor_fn = check.callable_param(\n run_status_sensor_fn, "run_status_sensor_fn"\n )\n event_type = PIPELINE_RUN_STATUS_TO_EVENT_TYPE[run_status]\n\n # split monitored_jobs into external repos, external jobs, and jobs in the current repo\n other_repos = (\n [x for x in monitored_jobs if isinstance(x, RepositorySelector)]\n if monitored_jobs\n else []\n )\n\n other_repo_jobs = (\n [x for x in monitored_jobs if isinstance(x, JobSelector)] if monitored_jobs else []\n )\n\n current_repo_jobs = (\n [x for x in monitored_jobs if not isinstance(x, (JobSelector, RepositorySelector))]\n if monitored_jobs\n else []\n )\n\n def _wrapped_fn(\n context: SensorEvaluationContext,\n ) -> Iterator[Union[RunRequest, SkipReason, DagsterRunReaction, SensorResult]]:\n # initiate the cursor to (most recent event id, current timestamp) when:\n # * it's the first time starting the sensor\n # * or, the cursor isn't in valid format (backcompt)\n if context.cursor is None or not RunStatusSensorCursor.is_valid(context.cursor):\n most_recent_event_records = list(\n context.instance.get_event_records(\n EventRecordsFilter(event_type=event_type), ascending=False, limit=1\n )\n )\n most_recent_event_id = (\n most_recent_event_records[0].storage_id\n if len(most_recent_event_records) == 1\n else -1\n )\n\n new_cursor = RunStatusSensorCursor(\n update_timestamp=pendulum.now("UTC").isoformat(),\n record_id=most_recent_event_id,\n )\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initiating {name}. 
Set cursor to {new_cursor}")\n return\n\n record_id, update_timestamp = RunStatusSensorCursor.from_json(context.cursor)\n\n # Fetch events after the cursor id\n # * we move the cursor forward to the latest visited event's id to avoid revisits\n # * when the daemon is down, bc we persist the cursor info, we can go back to where we\n # left and backfill alerts for the qualified events (up to 5 at a time) during the downtime\n # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage.\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n after_cursor=RunShardedEventsCursor(\n id=record_id,\n run_updated_after=cast(datetime, pendulum.parse(update_timestamp)),\n ),\n event_type=event_type,\n ),\n ascending=True,\n limit=5,\n )\n\n for event_record in event_records:\n event_log_entry = event_record.event_log_entry\n storage_id = event_record.storage_id\n\n # get run info\n run_records = context.instance.get_run_records(\n filters=RunsFilter(run_ids=[event_log_entry.run_id])\n )\n\n # skip if we couldn't find the right run\n if len(run_records) != 1:\n # bc we couldn't find the run, we use the event timestamp as the approximate\n # run update timestamp\n approximate_update_timestamp = utc_datetime_from_timestamp(\n event_log_entry.timestamp\n )\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=approximate_update_timestamp.isoformat(),\n ).to_json()\n )\n continue\n\n dagster_run = run_records[0].dagster_run\n update_timestamp = run_records[0].update_timestamp\n\n job_match = False\n\n # if monitor_all_repositories is provided, then we want to run the sensor for all jobs in all repositories\n if monitor_all_repositories:\n job_match = True\n\n # check if the run is in the current repository and (if provided) one of jobs specified in monitored_jobs\n if (\n not job_match\n and\n # the job has a repository (not manually executed)\n 
dagster_run.external_job_origin\n and\n # the job belongs to the current repository\n dagster_run.external_job_origin.external_repository_origin.repository_name\n == context.repository_name\n ):\n if monitored_jobs:\n if dagster_run.job_name in map(lambda x: x.name, current_repo_jobs):\n job_match = True\n else:\n job_match = True\n\n if not job_match:\n # check if the run is one of the jobs specified by JobSelector or RepositorySelector (ie in another repo)\n # make a JobSelector for the run in question\n external_repository_origin = check.not_none(\n dagster_run.external_job_origin\n ).external_repository_origin\n run_job_selector = JobSelector(\n location_name=external_repository_origin.code_location_origin.location_name,\n repository_name=external_repository_origin.repository_name,\n job_name=dagster_run.job_name,\n )\n if run_job_selector in other_repo_jobs:\n job_match = True\n\n # make a RepositorySelector for the run in question\n run_repo_selector = RepositorySelector(\n location_name=external_repository_origin.code_location_origin.location_name,\n repository_name=external_repository_origin.repository_name,\n )\n if run_repo_selector in other_repos:\n job_match = True\n\n if not job_match:\n # the run in question doesn't match any of the criteria for we advance the cursor and move on\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n continue\n\n serializable_error = None\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n try:\n with RunStatusSensorContext(\n sensor_name=name,\n dagster_run=dagster_run,\n dagster_event=event_log_entry.dagster_event,\n instance=context.instance,\n resource_defs=context.resource_defs,\n logger=context.log,\n partition_key=dagster_run.tags.get("dagster/partition"),\n ) as sensor_context, user_code_error_boundary(\n RunStatusSensorExecutionError,\n lambda: f'Error occurred 
during the execution sensor "{name}".',\n ):\n context_param_name = get_context_param_name(run_status_sensor_fn)\n context_param = (\n {context_param_name: sensor_context} if context_param_name else {}\n )\n\n sensor_return = run_status_sensor_fn(\n **context_param,\n **resource_args_populated,\n )\n\n if sensor_return is not None:\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=update_timestamp.isoformat(),\n ).to_json()\n )\n\n if isinstance(sensor_return, SensorResult):\n if sensor_return.cursor:\n raise DagsterInvariantViolationError(\n f"Error in run status sensor {name}: Sensor returned a"\n " SensorResult with a cursor value. The cursor is managed"\n " by the sensor and should not be modified by a user."\n )\n yield sensor_return\n elif isinstance(\n sensor_return,\n (RunRequest, SkipReason, DagsterRunReaction),\n ):\n yield sensor_return\n else:\n yield from sensor_return\n return\n except RunStatusSensorExecutionError as run_status_sensor_execution_error:\n # When the user code errors, we report error to the sensor tick not the original run.\n serializable_error = serializable_error_info_from_exc_info(\n run_status_sensor_execution_error.original_exc_info\n )\n\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n\n # Yield DagsterRunReaction to indicate the execution success/failure.\n # The sensor machinery would\n # * report back to the original run if success\n # * update cursor and job state\n yield DagsterRunReaction(\n dagster_run=dagster_run,\n run_status=run_status,\n error=serializable_error,\n )\n\n super(RunStatusSensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n job=request_job,\n jobs=request_jobs,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def 
__call__(self, *args, **kwargs) -> RawSensorEvaluationFunctionReturn:\n context_param_name = get_context_param_name(self._run_status_sensor_fn)\n context = get_sensor_context_from_args_or_kwargs(\n self._run_status_sensor_fn,\n args,\n kwargs,\n context_type=RunStatusSensorContext,\n )\n context_param = {context_param_name: context} if context_param_name and context else {}\n\n resources = validate_and_get_resource_dict(\n context.resources if context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n return self._run_status_sensor_fn(**context_param, **resources)\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.RUN_STATUS
\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef run_status_sensor(\n run_status: DagsterRunStatus,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Callable[[RunStatusSensorEvaluationFunction], RunStatusSensorDefinition,]:\n """Creates a sensor that reacts to a given status of job execution, where the decorated\n function will be run when a job is at the given status.\n\n Takes a :py:class:`~dagster.RunStatusSensorContext`.\n\n Args:\n run_status (DagsterRunStatus): The status of run execution which will be\n monitored by the sensor.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n Jobs in the current repository that will be monitored by this sensor. Defaults to None, which means the alert will\n be sent when any job in the repository matches the requested run_status. 
Jobs in external repositories can be monitored by using\n RepositorySelector or JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the Dagster instance.\n If set to True, an error will be raised if you also specify monitored_jobs or job_selection.\n Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n (deprecated in favor of monitored_jobs) Jobs in the current repository that will be\n monitored by this sensor. Defaults to None, which means the alert will be sent when\n any job in the repository matches the requested run_status.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job that should be\n executed if a RunRequest is yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def inner(\n fn: RunStatusSensorEvaluationFunction,\n ) -> RunStatusSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n if jobs and monitor_all_repositories:\n DagsterInvalidDefinitionError(\n "Cannot specify both monitor_all_repositories and"\n f" {'monitored_jobs' if monitored_jobs else 'job_selection'}."\n )\n\n return RunStatusSensorDefinition(\n name=sensor_name,\n run_status=run_status,\n run_status_sensor_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n request_job=request_job,\n request_jobs=request_jobs,\n 
)\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/run_status_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_status_sensor_definition"}, "schedule_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.schedule_definition

\nimport copy\nimport logging\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, deprecated_param, public\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import IHasInternalInit, ensure_gen\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.schedules import is_valid_cron_schedule\n\nfrom ..decorator_utils import has_at_least_one_parameter\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom ..instance import DagsterInstance\nfrom ..instance.ref import InstanceRef\nfrom ..storage.dagster_run import DagsterRun\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .target import DirectTarget, ExecutableDefinition, RepoRelativeTarget\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom .utils import check_valid_name, validate_tags\n\nif TYPE_CHECKING:\n    from dagster import ResourceDefinition\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\nT = TypeVar("T")\n\nRunConfig: TypeAlias = Mapping[str, Any]\nRunRequestIterator: TypeAlias = Iterator[Union[RunRequest, 
SkipReason]]\n\nScheduleEvaluationFunctionReturn: TypeAlias = Union[\n    RunRequest, SkipReason, RunConfig, RunRequestIterator, Sequence[RunRequest]\n]\nRawScheduleEvaluationFunction: TypeAlias = Callable[..., ScheduleEvaluationFunctionReturn]\n\nScheduleRunConfigFunction: TypeAlias = Union[\n    Callable[["ScheduleEvaluationContext"], RunConfig],\n    Callable[[], RunConfig],\n]\n\nScheduleTagsFunction: TypeAlias = Callable[["ScheduleEvaluationContext"], Mapping[str, str]]\nScheduleShouldExecuteFunction: TypeAlias = Callable[["ScheduleEvaluationContext"], bool]\nScheduleExecutionFunction: TypeAlias = Union[\n    Callable[["ScheduleEvaluationContext"], Any],\n    "DecoratedScheduleFunction",\n]\n\n\n@whitelist_for_serdes\nclass DefaultScheduleStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\ndef get_or_create_schedule_context(\n    fn: Callable, *args: Any, **kwargs: Any\n) -> "ScheduleEvaluationContext":\n    """Based on the passed resource function and the arguments passed to it, returns the\n    user-passed ScheduleEvaluationContext or creates one if it is not passed.\n\n    Raises an exception if the user passes more than one argument or if the user-provided\n    function requires a context parameter but none is passed.\n    """\n    from dagster._config.pythonic_config import is_coercible_to_resource\n    from dagster._core.definitions.sensor_definition import get_context_param_name\n\n    context_param_name = get_context_param_name(fn)\n\n    kwarg_keys_non_resource = set(kwargs.keys()) - {param.name for param in get_resource_args(fn)}\n    if len(args) + len(kwarg_keys_non_resource) > 1:\n        raise DagsterInvalidInvocationError(\n            "Schedule invocation received multiple non-resource arguments. 
Only a first "\n            "positional context parameter should be provided when invoking."\n        )\n\n    if any(is_coercible_to_resource(arg) for arg in args):\n        raise DagsterInvalidInvocationError(\n            "If directly invoking a schedule, you may not provide resources as"\n            " positional arguments, only as keyword arguments."\n        )\n\n    context: Optional[ScheduleEvaluationContext] = None\n\n    if len(args) > 0:\n        context = check.opt_inst(args[0], ScheduleEvaluationContext)\n    elif len(kwargs) > 0:\n        if context_param_name and context_param_name not in kwargs:\n            raise DagsterInvalidInvocationError(\n                f"Schedule invocation expected argument '{context_param_name}'."\n            )\n        context = check.opt_inst(\n            kwargs.get(context_param_name or "context"), ScheduleEvaluationContext\n        )\n    elif context_param_name:\n        # If the context parameter is present but no value was provided, we error\n        raise DagsterInvalidInvocationError(\n            "Schedule evaluation function expected context argument, but no context argument "\n            "was provided when invoking."\n        )\n\n    context = context or build_schedule_context()\n    resource_args_from_kwargs = {}\n\n    resource_args = {param.name for param in get_resource_args(fn)}\n    for resource_arg in resource_args:\n        if resource_arg in kwargs:\n            resource_args_from_kwargs[resource_arg] = kwargs[resource_arg]\n\n    if resource_args_from_kwargs:\n        return context.merge_resources(resource_args_from_kwargs)\n\n    return context\n\n\n
[docs]class ScheduleEvaluationContext:\n """The context object available as the first argument various functions defined on a :py:class:`dagster.ScheduleDefinition`.\n\n A `ScheduleEvaluationContext` object is passed as the first argument to ``run_config_fn``, ``tags_fn``,\n and ``should_execute``.\n\n Users should not instantiate this object directly. To construct a `ScheduleEvaluationContext` for testing purposes, use :py:func:`dagster.build_schedule_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import schedule, ScheduleEvaluationContext\n\n @schedule\n def the_schedule(context: ScheduleEvaluationContext):\n ...\n\n """\n\n __slots__ = [\n "_instance_ref",\n "_scheduled_execution_time",\n "_exit_stack",\n "_instance",\n "_log_key",\n "_logger",\n "_repository_name",\n "_resource_defs",\n "_schedule_name",\n "_resources_cm",\n "_resources",\n "_cm_scope_entered",\n "_repository_def",\n ]\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n scheduled_execution_time: Optional[datetime],\n repository_name: Optional[str] = None,\n schedule_name: Optional[str] = None,\n resources: Optional[Mapping[str, "ResourceDefinition"]] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n ):\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._exit_stack = ExitStack()\n self._instance = None\n\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._scheduled_execution_time = check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n )\n self._log_key = (\n [\n repository_name,\n schedule_name,\n scheduled_execution_time.strftime("%Y%m%d_%H%M%S"),\n ]\n if repository_name and schedule_name and scheduled_execution_time\n else None\n )\n self._logger = None\n self._repository_name = repository_name\n self._schedule_name = schedule_name\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resources\n 
self._resources = None\n self._cm_scope_entered = False\n self._repository_def = check.opt_inst_param(\n repository_def, "repository_def", RepositoryDefinition\n )\n\n def __enter__(self) -> "ScheduleEvaluationContext":\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n @public\n @property\n def resources(self) -> Resources:\n """Mapping of resource key to resource definition to be made available\n during schedule execution.\n """\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n # Early exit if no resources are defined. This skips unnecessary initialization\n # entirely. This allows users to run user code servers in cases where they\n # do not have access to the instance if they use a subset of features do\n # that do not require instance access. In this case, if they do not use\n # resources on schedules they do not require the instance, so we do not\n # instantiate it\n #\n # Tracking at https://github.com/dagster-io/dagster/issues/14345\n if not self._resource_defs:\n self._resources = ScopedResourcesBuilder.build_empty()\n return self._resources\n\n instance = self.instance if self._instance or self._instance_ref else None\n\n resources_cm = build_resources(resources=self._resource_defs, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. 
You can use the following syntax"\n " to open a context manager: `with build_sensor_context(...) as context:`"\n )\n\n return self._resources\n\n def merge_resources(self, resources_dict: Mapping[str, Any]) -> "ScheduleEvaluationContext":\n """Merge the specified resources into this context.\n This method is intended to be used by the Dagster framework, and should not be called by user code.\n\n Args:\n resources_dict (Mapping[str, Any]): The resources to replace in the context.\n """\n check.invariant(\n self._resources is None, "Cannot merge resources in context that has been initialized."\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return ScheduleEvaluationContext(\n instance_ref=self._instance_ref,\n scheduled_execution_time=self._scheduled_execution_time,\n repository_name=self._repository_name,\n schedule_name=self._schedule_name,\n resources={\n **(self._resource_defs or {}),\n **wrap_resources_for_execution(resources_dict),\n },\n repository_def=self._repository_def,\n )\n\n @public\n @property\n def instance(self) -> "DagsterInstance":\n """DagsterInstance: The current DagsterInstance."""\n # self._instance_ref should only ever be None when this ScheduleEvaluationContext was\n # constructed under test.\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was provided."\n )\n if not self._instance:\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def instance_ref(self) -> Optional[InstanceRef]:\n """The serialized instance configured to run the schedule."""\n return self._instance_ref\n\n @public\n @property\n def scheduled_execution_time(self) -> datetime:\n """The time in which the execution was scheduled to happen. 
May differ slightly\n from both the actual execution time and the time at which the run config is computed.\n """\n if self._scheduled_execution_time is None:\n check.failed(\n "Attempting to access scheduled_execution_time, but no scheduled_execution_time was"\n " set on this context"\n )\n\n return self._scheduled_execution_time\n\n @property\n def log(self) -> logging.Logger:\n if self._logger:\n return self._logger\n\n if not self._instance_ref:\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n repository_name=self._repository_name,\n name=self._schedule_name,\n )\n )\n\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n self.instance,\n repository_name=self._repository_name,\n name=self._schedule_name,\n )\n )\n return cast(InstigationLogger, self._logger)\n\n def has_captured_logs(self):\n return self._logger and self._logger.has_captured_logs()\n\n @property\n def log_key(self) -> Optional[List[str]]:\n return self._log_key\n\n @property\n def repository_def(self) -> "RepositoryDefinition":\n if not self._repository_def:\n raise DagsterInvariantViolationError(\n "Attempted to access repository_def, but no repository_def was provided."\n )\n return self._repository_def
\n\n\nclass DecoratedScheduleFunction(NamedTuple):\n """Wrapper around the decorated schedule function. Keeps track of both to better support the\n optimal return value for direct invocation of the evaluation function.\n """\n\n decorated_fn: RawScheduleEvaluationFunction\n wrapped_fn: Callable[[ScheduleEvaluationContext], RunRequestIterator]\n has_context_arg: bool\n\n\n
[docs]def build_schedule_context(\n instance: Optional[DagsterInstance] = None,\n scheduled_execution_time: Optional[datetime] = None,\n resources: Optional[Mapping[str, object]] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n instance_ref: Optional["InstanceRef"] = None,\n) -> ScheduleEvaluationContext:\n """Builds schedule execution context using the provided parameters.\n\n The instance provided to ``build_schedule_context`` must be persistent;\n DagsterInstance.ephemeral() will result in an error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the schedule.\n scheduled_execution_time (datetime): The time in which the execution was scheduled to\n happen. May differ slightly from both the actual execution time and the time at which\n the run config is computed.\n\n Examples:\n .. code-block:: python\n\n context = build_schedule_context(instance)\n\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n\n return ScheduleEvaluationContext(\n instance_ref=(\n instance_ref\n if instance_ref\n else instance.get_ref() if instance and instance.is_persistent else None\n ),\n scheduled_execution_time=check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n ),\n resources=wrap_resources_for_execution(resources),\n repository_def=repository_def,\n )
\n\n\n@whitelist_for_serdes\nclass ScheduleExecutionData(\n NamedTuple(\n "_ScheduleExecutionData",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_message", Optional[str]),\n ("captured_log_key", Optional[Sequence[str]]),\n ],\n )\n):\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_message: Optional[str] = None,\n captured_log_key: Optional[Sequence[str]] = None,\n ):\n check.opt_sequence_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_list_param(captured_log_key, "captured_log_key", str)\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return super(ScheduleExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n captured_log_key=captured_log_key,\n )\n\n\ndef validate_and_get_schedule_resource_dict(\n resources: Resources, schedule_name: str, required_resource_keys: Set[str]\n) -> Dict[str, Any]:\n """Validates that the context has all the required resources and returns a dictionary of\n resource key to resource object.\n """\n for k in required_resource_keys:\n if not hasattr(resources, k):\n raise DagsterInvalidDefinitionError(\n f"Resource with key '{k}' required by schedule '{schedule_name}' was not provided."\n )\n\n return {k: getattr(resources, k) for k in required_resource_keys}\n\n\n
[docs]@deprecated_param(\n param="environment_vars",\n breaking_version="2.0",\n additional_warn_text=(\n "It is no longer necessary. Schedules will have access to all environment variables set in"\n " the containing environment, and can safely be deleted."\n ),\n)\nclass ScheduleDefinition(IHasInternalInit):\n """Define a schedule that targets a job.\n\n Args:\n name (Optional[str]): The name of the schedule to create. Defaults to the job name plus\n "_schedule".\n cron_schedule (Union[str, Sequence[str]]): A valid cron string or sequence of cron strings\n specifying when the schedule will run, e.g., ``'45 23 * * 6'`` for a schedule that runs\n at 11:45 PM every Saturday. If a sequence is provided, then the schedule will run for\n the union of all execution times for the provided cron strings, e.g.,\n ``['45 23 * * 6', '30 9 * * 0]`` for a schedule that runs at 11:45 PM every Saturday and\n 9:30 AM every Sunday.\n execution_fn (Callable[ScheduleEvaluationContext]): The core evaluation function for the\n schedule, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.ScheduleEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n run_config (Optional[Mapping]): The config that parameterizes this execution,\n as a dict.\n run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Mapping]]]): A function that\n takes a ScheduleEvaluationContext object and returns the run configuration that\n parameterizes this execution, as a dict. You may set only one of ``run_config``,\n ``run_config_fn``, and ``execution_fn``.\n tags (Optional[Mapping[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]]): A\n function that generates tags to attach to the schedules runs. 
Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags``, ``tags_fn``, and ``execution_fn``.\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs\n at schedule execution time to determine whether a schedule should execute or skip. Takes\n a :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this\n schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n required_resource_keys (Optional[Set[str]]): The set of resource keys required by the schedule.\n """\n\n def with_updated_job(self, new_job: ExecutableDefinition) -> "ScheduleDefinition":\n """Returns a copy of this schedule with the job replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return ScheduleDefinition.dagster_internal_init(\n name=self.name,\n cron_schedule=self._cron_schedule,\n job_name=self.job_name,\n execution_timezone=self.execution_timezone,\n execution_fn=self._execution_fn,\n description=self.description,\n job=new_job,\n default_status=self.default_status,\n environment_vars=self._environment_vars,\n required_resource_keys=self._raw_required_resource_keys,\n run_config=None, # run_config, tags, should_execute encapsulated in execution_fn\n run_config_fn=None,\n tags=None,\n tags_fn=None,\n should_execute=None,\n )\n\n def __init__(\n self,\n name: Optional[str] = None,\n *,\n cron_schedule: Optional[Union[str, Sequence[str]]] = None,\n job_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n run_config_fn: Optional[ScheduleRunConfigFunction] = None,\n tags: Optional[Mapping[str, str]] = None,\n tags_fn: Optional[ScheduleTagsFunction] = None,\n should_execute: Optional[ScheduleShouldExecuteFunction] = None,\n environment_vars: Optional[Mapping[str, str]] = None,\n execution_timezone: Optional[str] = None,\n execution_fn: Optional[ScheduleExecutionFunction] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n self._cron_schedule = check.inst_param(cron_schedule, "cron_schedule", (str, Sequence))\n if not isinstance(self._cron_schedule, str):\n check.sequence_param(self._cron_schedule, "cron_schedule", of_type=str) # 
type: ignore\n\n if not is_valid_cron_schedule(self._cron_schedule): # type: ignore\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{self._cron_schedule}' for schedule '{name}''. "\n "Dagster recognizes standard cron expressions consisting of 5 fields."\n )\n\n if job is not None:\n self._target: Union[DirectTarget, RepoRelativeTarget] = DirectTarget(job)\n else:\n self._target = RepoRelativeTarget(\n job_name=check.str_param(job_name, "job_name"),\n op_selection=None,\n )\n\n if name:\n self._name = check_valid_name(name)\n elif job_name:\n self._name = job_name + "_schedule"\n elif job:\n self._name = job.name + "_schedule"\n\n self._description = check.opt_str_param(description, "description")\n\n self._environment_vars = check.opt_mapping_param(\n environment_vars, "environment_vars", key_type=str, value_type=str\n )\n\n self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")\n\n if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both execution_fn and individual run_config/tags arguments "\n "to ScheduleDefinition. Must provide only one of the two."\n )\n elif execution_fn:\n self._execution_fn: Optional[Union[Callable[..., Any], DecoratedScheduleFunction]] = (\n None\n )\n if isinstance(execution_fn, DecoratedScheduleFunction):\n self._execution_fn = execution_fn\n else:\n self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")\n self._run_config_fn = None\n else:\n if run_config_fn and run_config:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both run_config_fn and run_config as arguments"\n " to ScheduleDefinition. 
Must provide only one of the two."\n )\n\n def _default_run_config_fn(context: ScheduleEvaluationContext) -> RunConfig:\n return check.opt_dict_param(run_config, "run_config")\n\n self._run_config_fn = check.opt_callable_param(\n run_config_fn, "run_config_fn", default=_default_run_config_fn\n )\n\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n tags = validate_tags(tags, allow_reserved_tags=False)\n tags_fn = lambda _context: tags\n else:\n tags_fn = check.opt_callable_param(\n tags_fn, "tags_fn", default=lambda _context: cast(Mapping[str, str], {})\n )\n self._tags_fn = tags_fn\n self._tags = tags\n\n self._should_execute: ScheduleShouldExecuteFunction = check.opt_callable_param(\n should_execute, "should_execute", default=lambda _context: True\n )\n\n # Several type-ignores are present in this function to work around bugs in mypy\n # inference.\n def _execution_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n f"Error occurred during the execution of should_execute for schedule {name}"\n ),\n ):\n if not self._should_execute(context):\n yield SkipReason(f"should_execute function for {name} returned false.")\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n f"Error occurred during the execution of run_config_fn for schedule {name}"\n ),\n ):\n _run_config_fn = check.not_none(self._run_config_fn)\n evaluated_run_config = copy.deepcopy(\n _run_config_fn(context)\n if has_at_least_one_parameter(_run_config_fn)\n else _run_config_fn() # type: ignore # (strict type guard)\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {name}",\n ):\n evaluated_tags = validate_tags(tags_fn(context), allow_reserved_tags=False)\n\n 
yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n\n self._execution_fn = _execution_fn\n\n if self._execution_timezone:\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(self._execution_timezone) # type: ignore\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n f"Invalid execution timezone {self._execution_timezone} for {name}"\n ) from e\n\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultScheduleStatus\n )\n\n resource_arg_names: Set[str] = (\n {arg.name for arg in get_resource_args(self._execution_fn.decorated_fn)}\n if isinstance(self._execution_fn, DecoratedScheduleFunction)\n else set()\n )\n\n check.param_invariant(\n len(required_resource_keys or []) == 0 or len(resource_arg_names) == 0,\n "Cannot specify resource requirements in both @schedule decorator and as arguments to"\n " the decorated function",\n )\n\n self._raw_required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._required_resource_keys = self._raw_required_resource_keys or resource_arg_names\n\n @staticmethod\n def dagster_internal_init(\n *,\n name: Optional[str],\n cron_schedule: Optional[Union[str, Sequence[str]]],\n job_name: Optional[str],\n run_config: Optional[Any],\n run_config_fn: Optional[ScheduleRunConfigFunction],\n tags: Optional[Mapping[str, str]],\n tags_fn: Optional[ScheduleTagsFunction],\n should_execute: Optional[ScheduleShouldExecuteFunction],\n environment_vars: Optional[Mapping[str, str]],\n execution_timezone: Optional[str],\n execution_fn: Optional[ScheduleExecutionFunction],\n description: Optional[str],\n job: Optional[ExecutableDefinition],\n default_status: DefaultScheduleStatus,\n required_resource_keys: Optional[Set[str]],\n ) -> "ScheduleDefinition":\n return ScheduleDefinition(\n name=name,\n cron_schedule=cron_schedule,\n job_name=job_name,\n run_config=run_config,\n 
run_config_fn=run_config_fn,\n tags=tags,\n tags_fn=tags_fn,\n should_execute=should_execute,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n execution_fn=execution_fn,\n description=description,\n job=job,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> ScheduleEvaluationFunctionReturn:\n from dagster._core.definitions.sensor_definition import get_context_param_name\n\n from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n if not isinstance(self._execution_fn, DecoratedScheduleFunction):\n raise DagsterInvalidInvocationError(\n "Schedule invocation is only supported for schedules created via the schedule "\n "decorators."\n )\n\n context_param_name = get_context_param_name(self._execution_fn.decorated_fn)\n context = get_or_create_schedule_context(self._execution_fn.decorated_fn, *args, **kwargs)\n context_param = {context_param_name: context} if context_param_name else {}\n\n resources = validate_and_get_schedule_resource_dict(\n context.resources, self._name, self._required_resource_keys\n )\n result = self._execution_fn.decorated_fn(**context_param, **resources)\n\n if isinstance(result, dict):\n return copy.deepcopy(result)\n else:\n return result\n\n @public\n @property\n def name(self) -> str:\n """str: The name of the schedule."""\n return self._name\n\n @public\n @property\n def job_name(self) -> str:\n """str: The name of the job targeted by this schedule."""\n return self._target.job_name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A description for this schedule."""\n return self._description\n\n @public\n @property\n def cron_schedule(self) -> Union[str, Sequence[str]]:\n """Union[str, Sequence[str]]: The cron schedule representing when this schedule will be evaluated."""\n return self._cron_schedule # type: ignore\n\n @public\n @deprecated(\n breaking_version="2.0",\n 
additional_warn_text="Setting this property no longer has any effect.",\n )\n @property\n def environment_vars(self) -> Mapping[str, str]:\n """Mapping[str, str]: Environment variables to export to the cron schedule."""\n return self._environment_vars\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this schedule."""\n return self._required_resource_keys\n\n @public\n @property\n def execution_timezone(self) -> Optional[str]:\n """Optional[str]: The timezone in which this schedule will be evaluated."""\n return self._execution_timezone\n\n @public\n @property\n def job(self) -> Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]:\n """Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]: The job that is\n targeted by this schedule.\n """\n if isinstance(self._target, DirectTarget):\n return self._target.target\n raise DagsterInvalidDefinitionError("No job was provided to ScheduleDefinition.")\n\n def evaluate_tick(self, context: "ScheduleEvaluationContext") -> ScheduleExecutionData:\n """Evaluate schedule using the provided context.\n\n Args:\n context (ScheduleEvaluationContext): The context with which to evaluate this schedule.\n\n Returns:\n ScheduleExecutionData: Contains list of run requests, or skip message if present.\n\n """\n from dagster._core.definitions.partition import CachingDynamicPartitionsLoader\n\n check.inst_param(context, "context", ScheduleEvaluationContext)\n execution_fn: Callable[..., "ScheduleEvaluationFunctionReturn"]\n if isinstance(self._execution_fn, DecoratedScheduleFunction):\n execution_fn = self._execution_fn.wrapped_fn\n else:\n execution_fn = cast(\n Callable[..., "ScheduleEvaluationFunctionReturn"],\n self._execution_fn,\n )\n\n result = list(ensure_gen(execution_fn(context)))\n\n skip_message: Optional[str] = None\n\n run_requests: List[RunRequest] = []\n if not result or result == [None]:\n run_requests = 
[]\n skip_message = "Schedule function returned an empty result"\n elif len(result) == 1:\n item = check.inst(result[0], (SkipReason, RunRequest))\n if isinstance(item, RunRequest):\n run_requests = [item]\n skip_message = None\n elif isinstance(item, SkipReason):\n run_requests = []\n skip_message = item.skip_message\n else:\n # NOTE: mypy is not correctly reading this cast-- not sure why\n # (pyright reads it fine). Hence the type-ignores below.\n result = cast(List[RunRequest], check.is_list(result, of_type=RunRequest))\n check.invariant(\n not any(not request.run_key for request in result),\n "Schedules that return multiple RunRequests must specify a run_key in each"\n " RunRequest",\n )\n run_requests = result\n skip_message = None\n\n dynamic_partitions_store = (\n CachingDynamicPartitionsLoader(context.instance) if context.instance_ref else None\n )\n\n # clone all the run requests with resolved tags and config\n resolved_run_requests = []\n for run_request in run_requests:\n if run_request.partition_key and not run_request.has_resolved_partition():\n if context.repository_def is None:\n raise DagsterInvariantViolationError(\n "Must provide repository def to build_schedule_context when yielding"\n " partitioned run requests"\n )\n\n scheduled_target = context.repository_def.get_job(self._target.job_name)\n resolved_request = run_request.with_resolved_tags_and_config(\n target_definition=scheduled_target,\n dynamic_partitions_requests=[],\n current_time=context.scheduled_execution_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n else:\n resolved_request = run_request\n\n resolved_run_requests.append(\n resolved_request.with_replaced_attrs(\n tags=merge_dicts(resolved_request.tags, DagsterRun.tags_for_schedule(self))\n )\n )\n\n return ScheduleExecutionData(\n run_requests=resolved_run_requests,\n skip_message=skip_message,\n captured_log_key=context.log_key if context.has_captured_logs() else None,\n )\n\n def has_loadable_target(self):\n 
return isinstance(self._target, DirectTarget)\n\n @property\n def targets_unresolved_asset_job(self) -> bool:\n return self.has_loadable_target() and isinstance(\n self.load_target(), UnresolvedAssetJobDefinition\n )\n\n def load_target(\n self,\n ) -> Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]:\n if isinstance(self._target, DirectTarget):\n return self._target.load()\n\n check.failed("Target is not loadable")\n\n @public\n @property\n def default_status(self) -> DefaultScheduleStatus:\n """DefaultScheduleStatus: The default status for this schedule when it is first loaded in\n a code location.\n """\n return self._default_status
\n
", "current_page_name": "_modules/dagster/_core/definitions/schedule_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.schedule_definition"}, "selector": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.selector

\nfrom typing import AbstractSet, Iterable, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.repository_definition import SINGLETON_REPOSITORY_NAME\nfrom dagster._serdes import create_snapshot_id, whitelist_for_serdes\n\n\nclass JobSubsetSelector(\n    NamedTuple(\n        "_JobSubsetSelector",\n        [\n            ("location_name", str),\n            ("repository_name", str),\n            ("job_name", str),\n            ("op_selection", Optional[Sequence[str]]),\n            ("asset_selection", Optional[AbstractSet[AssetKey]]),\n            ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n        ],\n    )\n):\n    """The information needed to resolve a job within a host process."""\n\n    def __new__(\n        cls,\n        location_name: str,\n        repository_name: str,\n        job_name: str,\n        op_selection: Optional[Sequence[str]],\n        asset_selection: Optional[Iterable[AssetKey]] = None,\n        asset_check_selection: Optional[Iterable[AssetCheckKey]] = None,\n    ):\n        asset_selection = set(asset_selection) if asset_selection else None\n        asset_check_selection = (\n            set(asset_check_selection) if asset_check_selection is not None else None\n        )\n        return super(JobSubsetSelector, cls).__new__(\n            cls,\n            location_name=check.str_param(location_name, "location_name"),\n            repository_name=check.str_param(repository_name, "repository_name"),\n            job_name=check.str_param(job_name, "job_name"),\n            op_selection=check.opt_nullable_sequence_param(op_selection, "op_selection", str),\n            asset_selection=check.opt_nullable_set_param(\n                asset_selection, "asset_selection", AssetKey\n            ),\n            
asset_check_selection=check.opt_nullable_set_param(\n                asset_check_selection, "asset_check_selection", AssetCheckKey\n            ),\n        )\n\n    def to_graphql_input(self):\n        return {\n            "repositoryLocationName": self.location_name,\n            "repositoryName": self.repository_name,\n            "pipelineName": self.job_name,\n            "solidSelection": self.op_selection,\n        }\n\n    def with_op_selection(self, op_selection: Optional[Sequence[str]]) -> Self:\n        check.invariant(\n            self.op_selection is None,\n            f"Can not invoke with_op_selection when op_selection={self.op_selection} is"\n            " already set",\n        )\n        return JobSubsetSelector(\n            self.location_name, self.repository_name, self.job_name, op_selection\n        )\n\n\n
[docs]@whitelist_for_serdes\nclass JobSelector(\n NamedTuple(\n "_JobSelector", [("location_name", str), ("repository_name", str), ("job_name", str)]\n )\n):\n def __new__(\n cls,\n location_name: str,\n repository_name: Optional[str] = None,\n job_name: Optional[str] = None,\n ):\n return super(JobSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.opt_str_param(\n repository_name,\n "repository_name",\n default=SINGLETON_REPOSITORY_NAME,\n ),\n job_name=check.str_param(\n job_name,\n "job_name",\n "Must provide job_name argument even though it is marked as optional in the "\n "function signature. repository_name, a truly optional parameter, is before "\n "that argument and actually optional. Use of keyword arguments is "\n "recommended to avoid confusion.",\n ),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "jobName": self.job_name,\n }\n\n @property\n def selector_id(self):\n return create_snapshot_id(self)\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return JobSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n job_name=graphql_data["jobName"],\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RepositorySelector(\n NamedTuple("_RepositorySelector", [("location_name", str), ("repository_name", str)])\n):\n def __new__(cls, location_name: str, repository_name: str):\n return super(RepositorySelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n }\n\n @property\n def selector_id(self):\n return create_snapshot_id(self)\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return RepositorySelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n )
\n\n\nclass CodeLocationSelector(NamedTuple("_CodeLocationSelector", [("location_name", str)])):\n def __new__(cls, location_name: str):\n return super(CodeLocationSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n )\n\n def to_repository_selector(self) -> RepositorySelector:\n return RepositorySelector(\n location_name=self.location_name, repository_name=SINGLETON_REPOSITORY_NAME\n )\n\n\nclass ScheduleSelector(\n NamedTuple(\n "_ScheduleSelector",\n [("location_name", str), ("repository_name", str), ("schedule_name", str)],\n )\n):\n def __new__(cls, location_name: str, repository_name: str, schedule_name: str):\n return super(ScheduleSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n schedule_name=check.str_param(schedule_name, "schedule_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "scheduleName": self.schedule_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return ScheduleSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n schedule_name=graphql_data["scheduleName"],\n )\n\n\nclass ResourceSelector(NamedTuple):\n location_name: str\n repository_name: str\n resource_name: str\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "resourceName": self.resource_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return ResourceSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n resource_name=graphql_data["resourceName"],\n )\n\n\nclass SensorSelector(\n NamedTuple(\n "_SensorSelector", [("location_name", str), ("repository_name", str), ("sensor_name", 
str)]\n )\n):\n def __new__(cls, location_name: str, repository_name: str, sensor_name: str):\n return super(SensorSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "sensorName": self.sensor_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return SensorSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n sensor_name=graphql_data["sensorName"],\n )\n\n\n@whitelist_for_serdes\nclass InstigatorSelector(\n NamedTuple(\n "_InstigatorSelector", [("location_name", str), ("repository_name", str), ("name", str)]\n )\n):\n def __new__(cls, location_name: str, repository_name: str, name: str):\n return super(InstigatorSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n name=check.str_param(name, "name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "name": self.name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return InstigatorSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n name=graphql_data["name"],\n )\n\n\nclass GraphSelector(\n NamedTuple(\n "_GraphSelector", [("location_name", str), ("repository_name", str), ("graph_name", str)]\n )\n):\n """The information needed to resolve a graph within a host process."""\n\n def __new__(cls, location_name: str, repository_name: str, graph_name: str):\n return super(GraphSelector, cls).__new__(\n cls,\n 
location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n graph_name=check.str_param(graph_name, "graph_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "graphName": self.graph_name,\n }\n\n\n@whitelist_for_serdes\nclass PartitionSetSelector(\n NamedTuple(\n "_PartitionSetSelector",\n [("location_name", str), ("repository_name", str), ("partition_set_name", str)],\n )\n):\n """The information needed to resolve a partition set within a host process."""\n\n def __new__(cls, location_name: str, repository_name: str, partition_set_name: str):\n return super(PartitionSetSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n partition_set_name=check.str_param(partition_set_name, "partition_set_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "partitionSetName": self.partition_set_name,\n }\n\n\nclass PartitionRangeSelector(\n NamedTuple(\n "_PartitionRangeSelector",\n [("start", str), ("end", str)],\n )\n):\n """The information needed to resolve a partition range."""\n\n def __new__(cls, start: str, end: str):\n return super(PartitionRangeSelector, cls).__new__(\n cls,\n start=check.inst_param(start, "start", str),\n end=check.inst_param(end, "end", str),\n )\n\n def to_graphql_input(self):\n return {\n "start": self.start,\n "end": self.end,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return PartitionRangeSelector(\n start=graphql_data["start"],\n end=graphql_data["end"],\n )\n\n\nclass PartitionsSelector(\n NamedTuple(\n "_PartitionsSelector",\n [("partition_range", PartitionRangeSelector)],\n )\n):\n """The information needed to define selection partitions.\n Using 
partition_range as property name to avoid shadowing Python 'range' builtin .\n """\n\n def __new__(cls, partition_range: PartitionRangeSelector):\n return super(PartitionsSelector, cls).__new__(\n cls,\n partition_range=check.inst_param(partition_range, "range", PartitionRangeSelector),\n )\n\n def to_graphql_input(self):\n return {\n "range": self.partition_range.to_graphql_input(),\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return PartitionsSelector(\n partition_range=PartitionRangeSelector.from_graphql_input(graphql_data["range"])\n )\n\n\nclass PartitionsByAssetSelector(\n NamedTuple(\n "PartitionsByAssetSelector",\n [\n ("asset_key", AssetKey),\n ("partitions", Optional[PartitionsSelector]),\n ],\n )\n):\n """The information needed to define partitions selection for a given asset key."""\n\n def __new__(cls, asset_key: AssetKey, partitions: Optional[PartitionsSelector] = None):\n return super(PartitionsByAssetSelector, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n partitions=check.opt_inst_param(partitions, "partitions", PartitionsSelector),\n )\n\n def to_graphql_input(self):\n return {\n "assetKey": self.asset_key.to_graphql_input(),\n "partitions": self.partitions.to_graphql_input() if self.partitions else None,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n asset_key = graphql_data["assetKey"]\n partitions = graphql_data.get("partitions")\n return PartitionsByAssetSelector(\n asset_key=AssetKey.from_graphql_input(asset_key),\n partitions=PartitionsSelector.from_graphql_input(partitions) if partitions else None,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/selector", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.selector"}, "sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.sensor_definition

\nimport inspect\nimport logging\nfrom collections import defaultdict\nfrom contextlib import ExitStack\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_check_evaluation import AssetCheckEvaluation\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n)\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.partition import (\n    CachingDynamicPartitionsLoader,\n)\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.definitions.resource_definition import (\n    Resources,\n)\nfrom dagster._core.definitions.scoped_resources_builder import ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import IHasInternalInit, normalize_to_repository\n\nfrom ..decorator_utils import (\n    get_function_params,\n)\nfrom .asset_selection import AssetSelection\nfrom .graph_definition import GraphDefinition\nfrom .run_request import (\n    AddDynamicPartitionsRequest,\n    DagsterRunReaction,\n    DeleteDynamicPartitionsRequest,\n    RunRequest,\n    SensorResult,\n    SkipReason,\n)\nfrom .target import DirectTarget, ExecutableDefinition, 
RepoRelativeTarget\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster import ResourceDefinition\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n\n@whitelist_for_serdes\nclass DefaultSensorStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\n@whitelist_for_serdes\nclass SensorType(Enum):\n    STANDARD = "STANDARD"\n    RUN_STATUS = "RUN_STATUS"\n    ASSET = "ASSET"\n    MULTI_ASSET = "MULTI_ASSET"\n    FRESHNESS_POLICY = "FRESHNESS_POLICY"\n    UNKNOWN = "UNKNOWN"\n\n\nDEFAULT_SENSOR_DAEMON_INTERVAL = 30\n\n\n
[docs]class SensorEvaluationContext:\n """The context object available as the argument to the evaluation function of a :py:class:`dagster.SensorDefinition`.\n\n Users should not instantiate this object directly. To construct a\n `SensorEvaluationContext` for testing purposes, use :py:func:`dagster.\n build_sensor_context`.\n\n Attributes:\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest\n last_completion_time (float): DEPRECATED The last time that the sensor was evaluated (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n repository_def (Optional[RepositoryDefinition]): The repository or that\n the sensor belongs to. If needed by the sensor top-level resource definitions will be\n pulled from this repository. You can provide either this or `definitions`.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n resources (Optional[Dict[str, Any]]): A dict of resource keys to resource\n definitions to be made available during sensor execution.\n\n Example:\n .. 
code-block:: python\n\n from dagster import sensor, SensorEvaluationContext\n\n @sensor\n def the_sensor(context: SensorEvaluationContext):\n ...\n\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n repository_def: Optional["RepositoryDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n sensor_name: Optional[str] = None,\n resources: Optional[Mapping[str, "ResourceDefinition"]] = None,\n definitions: Optional["Definitions"] = None,\n ):\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._exit_stack = ExitStack()\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._last_completion_time = check.opt_float_param(\n last_completion_time, "last_completion_time"\n )\n self._last_run_key = check.opt_str_param(last_run_key, "last_run_key")\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._repository_name = check.opt_str_param(repository_name, "repository_name")\n self._repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n error_on_none=False,\n )\n self._instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n self._sensor_name = sensor_name\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resources\n self._resources = None\n self._cm_scope_entered = False\n\n self._log_key = (\n [\n repository_name,\n sensor_name,\n pendulum.now("UTC").strftime("%Y%m%d_%H%M%S"),\n ]\n if repository_name and sensor_name\n else None\n )\n self._logger: Optional[InstigationLogger] = None\n self._cursor_updated = False\n\n def __enter__(self) -> "SensorEvaluationContext":\n 
self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n def merge_resources(self, resources_dict: Mapping[str, Any]) -> "SensorEvaluationContext":\n """Merge the specified resources into this context.\n\n This method is intended to be used by the Dagster framework, and should not be called by user code.\n\n Args:\n resources_dict (Mapping[str, Any]): The resources to replace in the context.\n """\n check.invariant(\n self._resources is None, "Cannot merge resources in context that has been initialized."\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return SensorEvaluationContext(\n instance_ref=self._instance_ref,\n last_completion_time=self._last_completion_time,\n last_run_key=self._last_run_key,\n cursor=self._cursor,\n repository_name=self._repository_name,\n repository_def=self._repository_def,\n instance=self._instance,\n sensor_name=self._sensor_name,\n resources={\n **(self._resource_defs or {}),\n **wrap_resources_for_execution(resources_dict),\n },\n )\n\n @public\n @property\n def resources(self) -> Resources:\n """Resources: A mapping from resource key to instantiated resources for this sensor."""\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n """\n This is similar to what we do in e.g. the op context - we set up a resource\n building context manager, and immediately enter it. 
This is so that in cases\n where a user is not using any context-manager based resources, they don't\n need to enter this SensorEvaluationContext themselves.\n\n For example:\n\n my_sensor(build_sensor_context(resources={"my_resource": my_non_cm_resource})\n\n will work ok, but for a CM resource we must do\n\n with build_sensor_context(resources={"my_resource": my_cm_resource}) as context:\n my_sensor(context)\n """\n\n # Early exit if no resources are defined. This skips unnecessary initialization\n # entirely. This allows users to run user code servers in cases where they\n # do not have access to the instance if they use a subset of features do\n # that do not require instance access. In this case, if they do not use\n # resources on sensors they do not require the instance, so we do not\n # instantiate it\n #\n # Tracking at https://github.com/dagster-io/dagster/issues/14345\n if not self._resource_defs:\n self._resources = ScopedResourcesBuilder.build_empty()\n return self._resources\n\n instance = self.instance if self._instance or self._instance_ref else None\n\n resources_cm = build_resources(resources=self._resource_defs or {}, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. You can use the following syntax"\n " to open a context manager: `with build_schedule_context(...) 
as context:`"\n )\n\n return self._resources\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current DagsterInstance."""\n # self._instance_ref should only ever be None when this SensorEvaluationContext was\n # constructed under test.\n if not self._instance:\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was"\n " provided."\n )\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def instance_ref(self) -> Optional[InstanceRef]:\n return self._instance_ref\n\n @public\n @property\n def last_completion_time(self) -> Optional[float]:\n """Optional[float]: Timestamp representing the last time this sensor completed an evaluation."""\n return self._last_completion_time\n\n @public\n @property\n def last_run_key(self) -> Optional[str]:\n """Optional[str]: The run key supplied to the most recent RunRequest produced by this sensor."""\n return self._last_run_key\n\n @public\n @property\n def cursor(self) -> Optional[str]:\n """The cursor value for this sensor, which was set in an earlier sensor evaluation."""\n return self._cursor\n\n
[docs] @public\n def update_cursor(self, cursor: Optional[str]) -> None:\n """Updates the cursor value for this sensor, which will be provided on the context for the\n next sensor evaluation.\n\n This can be used to keep track of progress and avoid duplicate work across sensor\n evaluations.\n\n Args:\n cursor (Optional[str]):\n """\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._cursor_updated = True
\n\n @property\n def cursor_updated(self) -> bool:\n return self._cursor_updated\n\n @public\n @property\n def repository_name(self) -> Optional[str]:\n """Optional[str]: The name of the repository that this sensor resides in."""\n return self._repository_name\n\n @public\n @property\n def repository_def(self) -> Optional["RepositoryDefinition"]:\n """Optional[RepositoryDefinition]: The RepositoryDefinition that this sensor resides in."""\n return self._repository_def\n\n @property\n def log(self) -> logging.Logger:\n if self._logger:\n return self._logger\n\n if not self._instance_ref:\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n repository_name=self._repository_name,\n name=self._sensor_name,\n )\n )\n return cast(logging.Logger, self._logger)\n\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n self.instance,\n repository_name=self._repository_name,\n name=self._sensor_name,\n )\n )\n return cast(logging.Logger, self._logger)\n\n def has_captured_logs(self):\n return self._logger and self._logger.has_captured_logs()\n\n @property\n def log_key(self) -> Optional[List[str]]:\n return self._log_key
\n\n\nRawSensorEvaluationFunctionReturn = Union[\n Iterator[Union[SkipReason, RunRequest, DagsterRunReaction, SensorResult]],\n Sequence[RunRequest],\n SkipReason,\n RunRequest,\n DagsterRunReaction,\n SensorResult,\n]\nRawSensorEvaluationFunction: TypeAlias = Callable[..., RawSensorEvaluationFunctionReturn]\n\nSensorEvaluationFunction: TypeAlias = Callable[..., Sequence[Union[SkipReason, RunRequest]]]\n\n\ndef get_context_param_name(fn: Callable) -> Optional[str]:\n """Determines the sensor's context parameter name by excluding all resource parameters."""\n resource_params = {param.name for param in get_resource_args(fn)}\n\n return next(\n (param.name for param in get_function_params(fn) if param.name not in resource_params), None\n )\n\n\ndef validate_and_get_resource_dict(\n resources: Resources, sensor_name: str, required_resource_keys: Set[str]\n) -> Dict[str, Any]:\n """Validates that the context has all the required resources and returns a dictionary of\n resource key to resource object.\n """\n for k in required_resource_keys:\n if not hasattr(resources, k):\n raise DagsterInvalidDefinitionError(\n f"Resource with key '{k}' required by sensor '{sensor_name}' was not provided."\n )\n\n return {k: getattr(resources, k) for k in required_resource_keys}\n\n\ndef _check_dynamic_partitions_requests(\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n) -> None:\n req_keys_to_add_by_partitions_def_name = defaultdict(set)\n req_keys_to_delete_by_partitions_def_name = defaultdict(set)\n\n for req in dynamic_partitions_requests:\n duplicate_req_keys_to_delete = req_keys_to_delete_by_partitions_def_name.get(\n req.partitions_def_name, set()\n ).intersection(req.partition_keys)\n duplicate_req_keys_to_add = req_keys_to_add_by_partitions_def_name.get(\n req.partitions_def_name, set()\n ).intersection(req.partition_keys)\n if isinstance(req, AddDynamicPartitionsRequest):\n if duplicate_req_keys_to_delete:\n 
raise DagsterInvariantViolationError(\n "Dynamic partition requests cannot contain both add and delete requests for"\n " the same partition keys.Invalid request: partitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_delete}"\n )\n elif duplicate_req_keys_to_add:\n raise DagsterInvariantViolationError(\n "Cannot request to add duplicate dynamic partition keys: \\npartitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_add}"\n )\n req_keys_to_add_by_partitions_def_name[req.partitions_def_name].update(\n req.partition_keys\n )\n elif isinstance(req, DeleteDynamicPartitionsRequest):\n if duplicate_req_keys_to_delete:\n raise DagsterInvariantViolationError(\n "Cannot request to add duplicate dynamic partition keys: \\npartitions_def_name"\n f" '{req.partitions_def_name}', partition_keys:"\n f" {req_keys_to_add_by_partitions_def_name}"\n )\n elif duplicate_req_keys_to_add:\n raise DagsterInvariantViolationError(\n "Dynamic partition requests cannot contain both add and delete requests for"\n " the same partition keys.Invalid request: partitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_add}"\n )\n req_keys_to_delete_by_partitions_def_name[req.partitions_def_name].update(\n req.partition_keys\n )\n else:\n check.failed(f"Unexpected dynamic partition request type: {req}")\n\n\n
[docs]class SensorDefinition(IHasInternalInit):\n """Define a sensor that initiates a set of runs based on some external state.\n\n Args:\n evaluation_fn (Callable[[SensorEvaluationContext]]): The core evaluation function for the\n sensor, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n name (Optional[str]): The name of the sensor to create. Defaults to name of evaluation_fn\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[GraphDefinition, JobDefinition, UnresolvedAssetJob]): The job to execute when this sensor fires.\n jobs (Optional[Sequence[GraphDefinition, JobDefinition, UnresolvedAssetJob]]): (experimental) A list of jobs to execute when this sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n asset_selection (AssetSelection): (Experimental) an asset selection to launch a run for if\n the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n\n def with_updated_jobs(self, new_jobs: Sequence[ExecutableDefinition]) -> "SensorDefinition":\n """Returns a copy of this sensor with the jobs replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return SensorDefinition.dagster_internal_init(\n name=self.name,\n evaluation_fn=self._raw_fn,\n minimum_interval_seconds=self.minimum_interval_seconds,\n description=self.description,\n job_name=None, # if original init was passed job name, was resolved to a job\n jobs=new_jobs if len(new_jobs) > 1 else None,\n job=new_jobs[0] if len(new_jobs) == 1 else None,\n default_status=self.default_status,\n asset_selection=self.asset_selection,\n required_resource_keys=self._raw_required_resource_keys,\n )\n\n def with_updated_job(self, new_job: ExecutableDefinition) -> "SensorDefinition":\n """Returns a copy of this sensor with the job replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return self.with_updated_jobs([new_job])\n\n def __init__(\n self,\n name: Optional[str] = None,\n *,\n evaluation_fn: Optional[RawSensorEvaluationFunction] = None,\n job_name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n asset_selection: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n from dagster._config.pythonic_config import validate_resource_annotated_function\n\n if evaluation_fn is None:\n raise DagsterInvalidDefinitionError("Must provide evaluation_fn to SensorDefinition.")\n\n if (\n sum(\n [\n int(job is not None),\n int(jobs is not None),\n int(job_name is not None),\n int(asset_selection is not None),\n ]\n )\n > 1\n ):\n raise 
DagsterInvalidDefinitionError(\n "Attempted to provide more than one of 'job', 'jobs', 'job_name', and "\n "'asset_selection' params to SensorDefinition. Must provide only one."\n )\n\n jobs = jobs if jobs else [job] if job else None\n\n targets: Optional[List[Union[RepoRelativeTarget, DirectTarget]]] = None\n if job_name:\n targets = [\n RepoRelativeTarget(\n job_name=check.str_param(job_name, "job_name"),\n op_selection=None,\n )\n ]\n elif job:\n targets = [DirectTarget(job)]\n elif jobs:\n targets = [DirectTarget(job) for job in jobs]\n elif asset_selection:\n targets = []\n\n if name:\n self._name = check_valid_name(name)\n else:\n self._name = evaluation_fn.__name__\n\n self._raw_fn: RawSensorEvaluationFunction = check.callable_param(\n evaluation_fn, "evaluation_fn"\n )\n self._evaluation_fn: Union[\n SensorEvaluationFunction,\n Callable[\n [SensorEvaluationContext],\n List[Union[SkipReason, RunRequest, DagsterRunReaction]],\n ],\n ] = wrap_sensor_evaluation(self._name, evaluation_fn)\n self._min_interval = check.opt_int_param(\n minimum_interval_seconds, "minimum_interval_seconds", DEFAULT_SENSOR_DAEMON_INTERVAL\n )\n self._description = check.opt_str_param(description, "description")\n self._targets: Sequence[Union[RepoRelativeTarget, DirectTarget]] = check.opt_list_param(\n targets, "targets", (DirectTarget, RepoRelativeTarget)\n )\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultSensorStatus\n )\n self._asset_selection = check.opt_inst_param(\n asset_selection, "asset_selection", AssetSelection\n )\n validate_resource_annotated_function(self._raw_fn)\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(self._raw_fn)}\n\n check.param_invariant(\n len(required_resource_keys or []) == 0 or len(resource_arg_names) == 0,\n "Cannot specify resource requirements in both @sensor decorator and as arguments to"\n " the decorated function",\n )\n self._raw_required_resource_keys = check.opt_set_param(\n 
required_resource_keys, "required_resource_keys", of_type=str\n )\n self._required_resource_keys = self._raw_required_resource_keys or resource_arg_names\n\n @staticmethod\n def dagster_internal_init(\n *,\n name: Optional[str],\n evaluation_fn: Optional[RawSensorEvaluationFunction],\n job_name: Optional[str],\n minimum_interval_seconds: Optional[int],\n description: Optional[str],\n job: Optional[ExecutableDefinition],\n jobs: Optional[Sequence[ExecutableDefinition]],\n default_status: DefaultSensorStatus,\n asset_selection: Optional[AssetSelection],\n required_resource_keys: Optional[Set[str]],\n ) -> "SensorDefinition":\n return SensorDefinition(\n name=name,\n evaluation_fn=evaluation_fn,\n job_name=job_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=asset_selection,\n required_resource_keys=required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> RawSensorEvaluationFunctionReturn:\n context_param_name_if_present = get_context_param_name(self._raw_fn)\n context = get_or_create_sensor_context(self._raw_fn, *args, **kwargs)\n\n context_param = (\n {context_param_name_if_present: context} if context_param_name_if_present else {}\n )\n\n resources = validate_and_get_resource_dict(\n context.resources, self.name, self._required_resource_keys\n )\n return self._raw_fn(**context_param, **resources)\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this sensor."""\n return self._required_resource_keys\n\n @public\n @property\n def name(self) -> str:\n """str: The name of this sensor."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A description for this sensor."""\n return self._description\n\n @public\n @property\n def minimum_interval_seconds(self) -> Optional[int]:\n """Optional[int]: The 
minimum number of seconds between sequential evaluations of this sensor."""\n return self._min_interval\n\n @property\n def targets(self) -> Sequence[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets\n\n @public\n @property\n def job(self) -> Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]:\n """Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]: The job that is\n targeted by this schedule.\n """\n if self._targets:\n if len(self._targets) == 1 and isinstance(self._targets[0], DirectTarget):\n return self._targets[0].target\n elif len(self._targets) > 1:\n raise DagsterInvalidDefinitionError(\n "Job property not available when SensorDefinition has multiple jobs."\n )\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n @public\n @property\n def jobs(self) -> List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]]:\n """List[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]: A list of jobs\n that are targeted by this schedule.\n """\n if self._targets and all(isinstance(target, DirectTarget) for target in self._targets):\n return [target.target for target in self._targets] # type: ignore # (illegible conditional)\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.STANDARD\n\n def evaluate_tick(self, context: "SensorEvaluationContext") -> "SensorExecutionData":\n """Evaluate sensor using the provided context.\n\n Args:\n context (SensorEvaluationContext): The context with which to evaluate this sensor.\n\n Returns:\n SensorExecutionData: Contains list of run requests, or skip message if present.\n\n """\n context = check.inst_param(context, "context", SensorEvaluationContext)\n\n result = self._evaluation_fn(context)\n\n skip_message: Optional[str] = None\n run_requests: List[RunRequest] = []\n dagster_run_reactions: List[DagsterRunReaction] = 
[]\n dynamic_partitions_requests: Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ] = []\n updated_cursor = context.cursor\n asset_events = []\n\n if not result or result == [None]:\n skip_message = "Sensor function returned an empty result"\n elif len(result) == 1:\n item = result[0]\n check.inst(item, (SkipReason, RunRequest, DagsterRunReaction, SensorResult))\n\n if isinstance(item, SensorResult):\n run_requests = list(item.run_requests) if item.run_requests else []\n skip_message = (\n item.skip_reason.skip_message\n if item.skip_reason\n else (None if run_requests else "Sensor function returned an empty result")\n )\n\n _check_dynamic_partitions_requests(\n item.dynamic_partitions_requests or [],\n )\n dynamic_partitions_requests = item.dynamic_partitions_requests or []\n\n if item.cursor and context.cursor_updated:\n raise DagsterInvariantViolationError(\n "SensorResult.cursor cannot be set if context.update_cursor() was called."\n )\n updated_cursor = item.cursor\n asset_events = item.asset_events\n\n elif isinstance(item, RunRequest):\n run_requests = [item]\n elif isinstance(item, SkipReason):\n skip_message = item.skip_message if isinstance(item, SkipReason) else None\n elif isinstance(item, DagsterRunReaction):\n dagster_run_reactions = (\n [cast(DagsterRunReaction, item)] if isinstance(item, DagsterRunReaction) else []\n )\n else:\n check.failed(f"Unexpected type {type(item)} in sensor result")\n else:\n if any(isinstance(item, SensorResult) for item in result):\n check.failed(\n "When a SensorResult is returned from a sensor, it must be the only object"\n " returned."\n )\n\n check.is_list(result, (SkipReason, RunRequest, DagsterRunReaction))\n has_skip = any(map(lambda x: isinstance(x, SkipReason), result))\n run_requests = [item for item in result if isinstance(item, RunRequest)]\n dagster_run_reactions = [\n item for item in result if isinstance(item, DagsterRunReaction)\n ]\n\n if has_skip:\n if 
len(run_requests) > 0:\n check.failed(\n "Expected a single SkipReason or one or more RunRequests: received both "\n "RunRequest and SkipReason"\n )\n elif len(dagster_run_reactions) > 0:\n check.failed(\n "Expected a single SkipReason or one or more DagsterRunReaction: "\n "received both DagsterRunReaction and SkipReason"\n )\n else:\n check.failed("Expected a single SkipReason: received multiple SkipReasons")\n\n _check_dynamic_partitions_requests(dynamic_partitions_requests)\n resolved_run_requests = self.resolve_run_requests(\n run_requests, context, self._asset_selection, dynamic_partitions_requests\n )\n\n return SensorExecutionData(\n resolved_run_requests,\n skip_message,\n updated_cursor,\n dagster_run_reactions,\n captured_log_key=context.log_key if context.has_captured_logs() else None,\n dynamic_partitions_requests=dynamic_partitions_requests,\n asset_events=asset_events,\n )\n\n def has_loadable_targets(self) -> bool:\n for target in self._targets:\n if isinstance(target, DirectTarget):\n return True\n return False\n\n def load_targets(\n self,\n ) -> Sequence[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]]:\n """Returns job/graph definitions that have been directly passed into the sensor definition.\n Any jobs or graphs that are referenced by name will not be loaded.\n """\n targets = []\n for target in self._targets:\n if isinstance(target, DirectTarget):\n targets.append(target.load())\n return targets\n\n def resolve_run_requests(\n self,\n run_requests: Sequence[RunRequest],\n context: SensorEvaluationContext,\n asset_selection: Optional[AssetSelection],\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n ) -> Sequence[RunRequest]:\n def _get_repo_job_by_name(context: SensorEvaluationContext, job_name: str) -> JobDefinition:\n if context.repository_def is None:\n raise DagsterInvariantViolationError(\n "Must provide repository def to build_sensor_context when 
yielding partitioned"\n " run requests"\n )\n return context.repository_def.get_job(job_name)\n\n has_multiple_targets = len(self._targets) > 1\n target_names = [target.job_name for target in self._targets]\n\n if run_requests and len(self._targets) == 0 and not self._asset_selection:\n raise Exception(\n f"Error in sensor {self._name}: Sensor evaluation function returned a RunRequest "\n "for a sensor lacking a specified target (job_name, job, or jobs). Targets "\n "can be specified by providing job, jobs, or job_name to the @sensor "\n "decorator."\n )\n\n if asset_selection:\n run_requests = [\n *_run_requests_with_base_asset_jobs(run_requests, context, asset_selection)\n ]\n\n dynamic_partitions_store = (\n CachingDynamicPartitionsLoader(context.instance) if context.instance_ref else None\n )\n\n # Run requests may contain an invalid target, or a partition key that does not exist.\n # We will resolve these run requests, applying the target and partition config/tags.\n resolved_run_requests = []\n for run_request in run_requests:\n if run_request.job_name is None and has_multiple_targets:\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest that did not"\n " specify job_name for the requested run. Expected one of:"\n f" {target_names}"\n )\n elif (\n run_request.job_name\n and run_request.job_name not in target_names\n and not asset_selection\n ):\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest with job_name "\n f"{run_request.job_name}. 
Expected one of: {target_names}"\n )\n\n if run_request.partition_key and not run_request.has_resolved_partition():\n selected_job = _get_repo_job_by_name(\n context, run_request.job_name if run_request.job_name else target_names[0]\n )\n resolved_run_requests.append(\n run_request.with_resolved_tags_and_config(\n target_definition=selected_job,\n current_time=None,\n dynamic_partitions_store=dynamic_partitions_store,\n dynamic_partitions_requests=dynamic_partitions_requests,\n )\n )\n else:\n resolved_run_requests.append(run_request)\n\n return resolved_run_requests\n\n @property\n def _target(self) -> Optional[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets[0] if self._targets else None\n\n @public\n @property\n def job_name(self) -> Optional[str]:\n """Optional[str]: The name of the job that is targeted by this sensor."""\n if len(self._targets) > 1:\n raise DagsterInvalidInvocationError(\n f"Cannot use `job_name` property for sensor {self.name}, which targets multiple"\n " jobs."\n )\n return self._targets[0].job_name\n\n @public\n @property\n def default_status(self) -> DefaultSensorStatus:\n """DefaultSensorStatus: The default status for this sensor when it is first loaded in\n a code location.\n """\n return self._default_status\n\n @property\n def asset_selection(self) -> Optional[AssetSelection]:\n return self._asset_selection
\n\n\n@whitelist_for_serdes(\n storage_field_names={"dagster_run_reactions": "pipeline_run_reactions"},\n)\nclass SensorExecutionData(\n NamedTuple(\n "_SensorExecutionData",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_message", Optional[str]),\n ("cursor", Optional[str]),\n ("dagster_run_reactions", Optional[Sequence[DagsterRunReaction]]),\n ("captured_log_key", Optional[Sequence[str]]),\n (\n "dynamic_partitions_requests",\n Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ],\n ),\n (\n "asset_events",\n Sequence[Union[AssetMaterialization, AssetObservation, AssetCheckEvaluation]],\n ),\n ],\n )\n):\n dagster_run_reactions: Optional[Sequence[DagsterRunReaction]]\n\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_message: Optional[str] = None,\n cursor: Optional[str] = None,\n dagster_run_reactions: Optional[Sequence[DagsterRunReaction]] = None,\n captured_log_key: Optional[Sequence[str]] = None,\n dynamic_partitions_requests: Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ] = None,\n asset_events: Optional[\n Sequence[Union[AssetMaterialization, AssetObservation, AssetCheckEvaluation]]\n ] = None,\n ):\n check.opt_sequence_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_str_param(cursor, "cursor")\n check.opt_sequence_param(dagster_run_reactions, "dagster_run_reactions", DagsterRunReaction)\n check.opt_list_param(captured_log_key, "captured_log_key", str)\n check.opt_sequence_param(\n dynamic_partitions_requests,\n "dynamic_partitions_requests",\n (AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest),\n )\n check.opt_sequence_param(\n asset_events,\n "asset_events",\n (AssetMaterialization, AssetObservation, AssetCheckEvaluation),\n )\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return 
super(SensorExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n cursor=cursor,\n dagster_run_reactions=dagster_run_reactions,\n captured_log_key=captured_log_key,\n dynamic_partitions_requests=dynamic_partitions_requests,\n asset_events=asset_events or [],\n )\n\n\ndef wrap_sensor_evaluation(\n sensor_name: str,\n fn: RawSensorEvaluationFunction,\n) -> SensorEvaluationFunction:\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(fn)}\n\n def _wrapped_fn(context: SensorEvaluationContext):\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, sensor_name, resource_arg_names\n )\n\n context_param_name_if_present = get_context_param_name(fn)\n context_param = (\n {context_param_name_if_present: context} if context_param_name_if_present else {}\n )\n raw_evaluation_result = fn(**context_param, **resource_args_populated)\n\n def check_returned_scalar(scalar):\n if isinstance(scalar, (SkipReason, RunRequest, SensorResult)):\n return scalar\n elif scalar is not None:\n raise Exception(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{scalar} of type {type(scalar)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n\n if inspect.isgenerator(raw_evaluation_result):\n result = []\n try:\n while True:\n result.append(next(raw_evaluation_result))\n except StopIteration as e:\n # captures the case where the evaluation function has a yield and also returns a\n # value\n if e.value is not None:\n result.append(check_returned_scalar(e.value))\n\n return result\n elif isinstance(raw_evaluation_result, list):\n return raw_evaluation_result\n else:\n return [check_returned_scalar(raw_evaluation_result)]\n\n return _wrapped_fn\n\n\n
[docs]def build_sensor_context(\n instance: Optional[DagsterInstance] = None,\n cursor: Optional[str] = None,\n repository_name: Optional[str] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n sensor_name: Optional[str] = None,\n resources: Optional[Mapping[str, object]] = None,\n definitions: Optional["Definitions"] = None,\n instance_ref: Optional["InstanceRef"] = None,\n) -> SensorEvaluationContext:\n """Builds sensor execution context using the provided parameters.\n\n This function can be used to provide a context to the invocation of a sensor definition.If\n provided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\n error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.\n cursor (Optional[str]): A cursor value to provide to the evaluation of the sensor.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n repository_def (Optional[RepositoryDefinition]): The repository that the sensor belongs to.\n If needed by the sensor top-level resource definitions will be pulled from this repository.\n You can provide either this or `definitions`.\n resources (Optional[Mapping[str, ResourceDefinition]]): A set of resource definitions\n to provide to the sensor. If passed, these will override any resource definitions\n provided by the repository.\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n\n Examples:\n .. 
code-block:: python\n\n context = build_sensor_context()\n my_sensor(context)\n\n """\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n check.opt_str_param(cursor, "cursor")\n check.opt_str_param(repository_name, "repository_name")\n repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n error_on_none=False,\n )\n\n return SensorEvaluationContext(\n instance_ref=instance_ref,\n last_completion_time=None,\n last_run_key=None,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n repository_def=repository_def,\n sensor_name=sensor_name,\n resources=wrap_resources_for_execution(resources),\n )
\n\n\nT = TypeVar("T")\n\n\ndef get_sensor_context_from_args_or_kwargs(\n fn: Callable,\n args: Tuple[Any, ...],\n kwargs: Dict[str, Any],\n context_type: Type[T],\n) -> Optional[T]:\n from dagster._config.pythonic_config import is_coercible_to_resource\n\n context_param_name = get_context_param_name(fn)\n\n kwarg_keys_non_resource = set(kwargs.keys()) - {param.name for param in get_resource_args(fn)}\n if len(args) + len(kwarg_keys_non_resource) > 1:\n raise DagsterInvalidInvocationError(\n "Sensor invocation received multiple non-resource arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n if any(is_coercible_to_resource(arg) for arg in args):\n raise DagsterInvalidInvocationError(\n "If directly invoking a sensor, you may not provide resources as"\n " positional"\n " arguments, only as keyword arguments."\n )\n\n context: Optional[T] = None\n\n if len(args) > 0:\n context = check.opt_inst(args[0], context_type)\n elif len(kwargs) > 0:\n if context_param_name and context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Sensor invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst(kwargs.get(context_param_name or "context"), context_type)\n elif context_param_name:\n # If the context parameter is present but no value was provided, we error\n raise DagsterInvalidInvocationError(\n "Sensor evaluation function expected context argument, but no context argument "\n "was provided when invoking."\n )\n\n return context\n\n\ndef get_or_create_sensor_context(\n fn: Callable,\n *args: Any,\n **kwargs: Any,\n) -> SensorEvaluationContext:\n """Based on the passed resource function and the arguments passed to it, returns the\n user-passed SensorEvaluationContext or creates one if it is not passed.\n\n Raises an exception if the user passes more than one argument or if the user-provided\n function requires a context parameter but none is passed.\n """\n context = (\n 
get_sensor_context_from_args_or_kwargs(\n fn,\n args,\n kwargs,\n context_type=SensorEvaluationContext,\n )\n or build_sensor_context()\n )\n resource_args_from_kwargs = {}\n\n resource_args = {param.name for param in get_resource_args(fn)}\n for resource_arg in resource_args:\n if resource_arg in kwargs:\n resource_args_from_kwargs[resource_arg] = kwargs[resource_arg]\n\n if resource_args_from_kwargs:\n return context.merge_resources(resource_args_from_kwargs)\n\n return context\n\n\ndef _run_requests_with_base_asset_jobs(\n run_requests: Iterable[RunRequest],\n context: SensorEvaluationContext,\n outer_asset_selection: AssetSelection,\n) -> Sequence[RunRequest]:\n """For sensors that target asset selections instead of jobs, finds the corresponding base asset\n for a selected set of assets.\n """\n asset_graph = context.repository_def.asset_graph # type: ignore # (possible none)\n result = []\n for run_request in run_requests:\n if run_request.asset_selection:\n asset_keys = run_request.asset_selection\n\n unexpected_asset_keys = (\n AssetSelection.keys(*asset_keys) - outer_asset_selection\n ).resolve(asset_graph)\n if unexpected_asset_keys:\n raise DagsterInvalidSubsetError(\n "RunRequest includes asset keys that are not part of sensor's asset_selection:"\n f" {unexpected_asset_keys}"\n )\n else:\n asset_keys = outer_asset_selection.resolve(asset_graph)\n\n base_job = context.repository_def.get_implicit_job_def_for_assets(asset_keys) # type: ignore # (possible none)\n result.append(\n run_request.with_replaced_attrs(\n job_name=base_job.name, asset_selection=list(asset_keys) # type: ignore # (possible none)\n )\n )\n\n return result\n
", "current_page_name": "_modules/dagster/_core/definitions/sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.sensor_definition"}, "source_asset": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.source_asset

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param, public\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.data_version import (\n    DATA_VERSION_TAG,\n    DataVersion,\n    DataVersionsByPartition,\n)\nfrom dagster._core.definitions.events import AssetKey, AssetObservation, CoercibleToAssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataMapping,\n    normalize_metadata,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.resource_requirement import (\n    ResourceAddable,\n    ResourceRequirement,\n    SourceAssetIOManagerRequirement,\n    ensure_requirements_satisfied,\n    get_resource_key_conflicts,\n)\nfrom dagster._core.definitions.utils import (\n    DEFAULT_GROUP_NAME,\n    DEFAULT_IO_MANAGER_KEY,\n    validate_group_name,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidObservationError,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.decorators.op_decorator import (\n        DecoratedOpFunction,\n    )\nfrom dagster._core.storage.io_manager import IOManagerDefinition\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import disable_dagster_warnings\n\n# Going with this catch-all for the time-being to permit pythonic resources\nSourceAssetObserveFunction: TypeAlias = Callable[..., Any]\n\n\ndef 
wrap_source_asset_observe_fn_in_op_compute_fn(\n    source_asset: "SourceAsset",\n) -> "DecoratedOpFunction":\n    from dagster._core.definitions.decorators.op_decorator import (\n        DecoratedOpFunction,\n        is_context_provided,\n    )\n    from dagster._core.execution.context.compute import (\n        OpExecutionContext,\n    )\n\n    check.not_none(source_asset.observe_fn, "Must be an observable source asset")\n    assert source_asset.observe_fn  # for type checker\n\n    observe_fn = source_asset.observe_fn\n\n    observe_fn_has_context = is_context_provided(get_function_params(observe_fn))\n\n    def fn(context: OpExecutionContext) -> None:\n        resource_kwarg_keys = [param.name for param in get_resource_args(observe_fn)]\n        resource_kwargs = {key: getattr(context.resources, key) for key in resource_kwarg_keys}\n        observe_fn_return_value = (\n            observe_fn(context, **resource_kwargs)\n            if observe_fn_has_context\n            else observe_fn(**resource_kwargs)\n        )\n\n        if isinstance(observe_fn_return_value, DataVersion):\n            if source_asset.partitions_def is not None:\n                raise DagsterInvalidObservationError(\n                    f"{source_asset.key} is partitioned, so its observe function should return a"\n                    " DataVersionsByPartition, not a DataVersion"\n                )\n\n            context.log_event(\n                AssetObservation(\n                    asset_key=source_asset.key,\n                    tags={DATA_VERSION_TAG: observe_fn_return_value.value},\n                )\n            )\n        elif isinstance(observe_fn_return_value, DataVersionsByPartition):\n            if source_asset.partitions_def is None:\n                raise DagsterInvalidObservationError(\n                    f"{source_asset.key} is not partitioned, so its observe function should return"\n                    " a DataVersion, not a DataVersionsByPartition"\n                
)\n\n            for (\n                partition_key,\n                data_version,\n            ) in observe_fn_return_value.data_versions_by_partition.items():\n                context.log_event(\n                    AssetObservation(\n                        asset_key=source_asset.key,\n                        tags={DATA_VERSION_TAG: data_version.value},\n                        partition=partition_key,\n                    )\n                )\n        else:\n            raise DagsterInvalidObservationError(\n                f"Observe function for {source_asset.key} must return a DataVersion or"\n                " DataVersionsByPartition, but returned a value of type"\n                f" {type(observe_fn_return_value)}"\n            )\n\n    return DecoratedOpFunction(fn)\n\n\n
[docs]@experimental_param(param="resource_defs")\n@experimental_param(param="io_manager_def")\nclass SourceAsset(ResourceAddable):\n """A SourceAsset represents an asset that will be loaded by (but not updated by) Dagster.\n\n Attributes:\n key (Union[AssetKey, Sequence[str], str]): The key of the asset.\n metadata (Mapping[str, MetadataValue]): Metadata associated with the asset.\n io_manager_key (Optional[str]): The key for the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n io_manager_def (Optional[IOManagerDefinition]): (Experimental) The definition of the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): (Experimental) resource definitions that may be required by the :py:class:`dagster.IOManagerDefinition` provided in the `io_manager_def` argument.\n description (Optional[str]): The description of the asset.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n observe_fn (Optional[SourceAssetObserveFunction]) Observation function for the source asset.\n """\n\n key: PublicAttr[AssetKey]\n metadata: PublicAttr[MetadataMapping]\n raw_metadata: PublicAttr[ArbitraryMetadataMapping]\n io_manager_key: PublicAttr[Optional[str]]\n _io_manager_def: PublicAttr[Optional[IOManagerDefinition]]\n description: PublicAttr[Optional[str]]\n partitions_def: PublicAttr[Optional[PartitionsDefinition]]\n group_name: PublicAttr[str]\n resource_defs: PublicAttr[Dict[str, ResourceDefinition]]\n observe_fn: PublicAttr[Optional[SourceAssetObserveFunction]]\n _node_def: Optional[OpDefinition] # computed lazily\n auto_observe_interval_minutes: Optional[float]\n\n def __init__(\n self,\n key: CoercibleToAssetKey,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n io_manager_key: Optional[str] = None,\n io_manager_def: 
Optional[object] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n group_name: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n observe_fn: Optional[SourceAssetObserveFunction] = None,\n *,\n auto_observe_interval_minutes: Optional[float] = None,\n # This is currently private because it is necessary for source asset observation functions,\n # but we have not yet decided on a final API for associated one or more ops with a source\n # asset. If we were to make this public, then we would have a canonical public\n # `required_resource_keys` used for observation that might end up conflicting with a set of\n # required resource keys for a different operation.\n _required_resource_keys: Optional[AbstractSet[str]] = None,\n # Add additional fields to with_resources and with_group below\n ):\n from dagster._core.execution.build_resources import (\n wrap_resources_for_execution,\n )\n\n self.key = AssetKey.from_coercible(key)\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n self.raw_metadata = metadata\n self.metadata = normalize_metadata(metadata, allow_invalid=True)\n\n resource_defs_dict = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n if io_manager_def:\n if not io_manager_key:\n io_manager_key = self.key.to_python_identifier("io_manager")\n\n if (\n io_manager_key in resource_defs_dict\n and resource_defs_dict[io_manager_key] != io_manager_def\n ):\n raise DagsterInvalidDefinitionError(\n f"Provided conflicting definitions for io manager key '{io_manager_key}'."\n " Please provide only one definition per key."\n )\n\n resource_defs_dict[io_manager_key] = io_manager_def\n\n self.resource_defs = wrap_resources_for_execution(resource_defs_dict)\n\n self.io_manager_key = check.opt_str_param(io_manager_key, "io_manager_key")\n self.partitions_def = check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n )\n self.group_name = 
validate_group_name(group_name)\n self.description = check.opt_str_param(description, "description")\n self.observe_fn = check.opt_callable_param(observe_fn, "observe_fn")\n self._required_resource_keys = check.opt_set_param(\n _required_resource_keys, "_required_resource_keys", of_type=str\n )\n self._node_def = None\n self.auto_observe_interval_minutes = check.opt_numeric_param(\n auto_observe_interval_minutes, "auto_observe_interval_minutes"\n )\n\n def get_io_manager_key(self) -> str:\n return self.io_manager_key or DEFAULT_IO_MANAGER_KEY\n\n @property\n def io_manager_def(self) -> Optional[IOManagerDefinition]:\n io_manager_key = self.get_io_manager_key()\n return cast(\n Optional[IOManagerDefinition],\n self.resource_defs.get(io_manager_key) if io_manager_key else None,\n )\n\n @public\n @property\n def op(self) -> OpDefinition:\n """OpDefinition: The OpDefinition associated with the observation function of an observable\n source asset.\n\n Throws an error if the asset is not observable.\n """\n check.invariant(\n isinstance(self.node_def, OpDefinition),\n "The NodeDefinition for this AssetsDefinition is not of type OpDefinition.",\n )\n return cast(OpDefinition, self.node_def)\n\n @public\n @property\n def is_observable(self) -> bool:\n """bool: Whether the asset is observable."""\n return self.node_def is not None\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return {requirement.key for requirement in self.get_resource_requirements()}\n\n @property\n def node_def(self) -> Optional[OpDefinition]:\n """Op that generates observation metadata for a source asset."""\n if self.observe_fn is None:\n return None\n\n if self._node_def is None:\n self._node_def = OpDefinition(\n compute_fn=wrap_source_asset_observe_fn_in_op_compute_fn(self),\n name=self.key.to_python_identifier(),\n description=self.description,\n required_resource_keys=self._required_resource_keys,\n )\n return self._node_def\n\n def with_resources(self, resource_defs) -> 
"SourceAsset":\n from dagster._core.execution.resources_init import get_transitive_required_resource_keys\n\n overlapping_keys = get_resource_key_conflicts(self.resource_defs, resource_defs)\n if overlapping_keys:\n raise DagsterInvalidInvocationError(\n f"SourceAsset with key {self.key} has conflicting resource "\n "definitions with provided resources for the following keys: "\n f"{sorted(list(overlapping_keys))}. Either remove the existing "\n "resources from the asset or change the resource keys so that "\n "they don't overlap."\n )\n\n merged_resource_defs = merge_dicts(resource_defs, self.resource_defs)\n\n # Ensure top-level resource requirements are met - except for\n # io_manager, since that is a default it can be resolved later.\n ensure_requirements_satisfied(merged_resource_defs, list(self.get_resource_requirements()))\n\n io_manager_def = merged_resource_defs.get(self.get_io_manager_key())\n if not io_manager_def and self.get_io_manager_key() != DEFAULT_IO_MANAGER_KEY:\n raise DagsterInvalidDefinitionError(\n f"SourceAsset with asset key {self.key} requires IO manager with key"\n f" '{self.get_io_manager_key()}', but none was provided."\n )\n relevant_keys = get_transitive_required_resource_keys(\n {*self._required_resource_keys, self.get_io_manager_key()}, merged_resource_defs\n )\n\n relevant_resource_defs = {\n key: resource_def\n for key, resource_def in merged_resource_defs.items()\n if key in relevant_keys\n }\n\n io_manager_key = (\n self.get_io_manager_key()\n if self.get_io_manager_key() != DEFAULT_IO_MANAGER_KEY\n else None\n )\n with disable_dagster_warnings():\n return SourceAsset(\n key=self.key,\n io_manager_key=io_manager_key,\n description=self.description,\n partitions_def=self.partitions_def,\n metadata=self.raw_metadata,\n resource_defs=relevant_resource_defs,\n group_name=self.group_name,\n observe_fn=self.observe_fn,\n auto_observe_interval_minutes=self.auto_observe_interval_minutes,\n 
_required_resource_keys=self._required_resource_keys,\n )\n\n def with_attributes(\n self, group_name: Optional[str] = None, key: Optional[AssetKey] = None\n ) -> "SourceAsset":\n if group_name is not None and self.group_name != DEFAULT_GROUP_NAME:\n raise DagsterInvalidDefinitionError(\n "A group name has already been provided to source asset"\n f" {self.key.to_user_string()}"\n )\n\n with disable_dagster_warnings():\n return SourceAsset(\n key=key or self.key,\n metadata=self.raw_metadata,\n io_manager_key=self.io_manager_key,\n io_manager_def=self.io_manager_def,\n description=self.description,\n partitions_def=self.partitions_def,\n group_name=group_name,\n resource_defs=self.resource_defs,\n observe_fn=self.observe_fn,\n auto_observe_interval_minutes=self.auto_observe_interval_minutes,\n _required_resource_keys=self._required_resource_keys,\n )\n\n def get_resource_requirements(self) -> Iterator[ResourceRequirement]:\n if self.node_def is not None:\n yield from self.node_def.get_resource_requirements()\n yield SourceAssetIOManagerRequirement(\n key=self.get_io_manager_key(), asset_key=self.key.to_string()\n )\n for source_key, resource_def in self.resource_defs.items():\n yield from resource_def.get_resource_requirements(outer_context=source_key)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, SourceAsset):\n return False\n else:\n return (\n self.key == other.key\n and self.raw_metadata == other.raw_metadata\n and self.io_manager_key == other.io_manager_key\n and self.description == other.description\n and self.group_name == other.group_name\n and self.resource_defs == other.resource_defs\n and self.observe_fn == other.observe_fn\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/source_asset", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.source_asset"}, "step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.step_launcher

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Iterator, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.execution.plan.state import KnownExecutionState\n\n\n
[docs]class StepRunRef(\n NamedTuple(\n "_StepRunRef",\n [\n ("run_config", Mapping[str, object]),\n ("dagster_run", DagsterRun),\n ("run_id", str),\n ("retry_mode", RetryMode),\n ("step_key", str),\n ("recon_job", ReconstructableJob),\n ("known_state", Optional["KnownExecutionState"]),\n ],\n )\n):\n """A serializable object that specifies what's needed to hydrate a step so\n that it can be executed in a process outside the plan process.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n run_config: Mapping[str, object],\n dagster_run: DagsterRun,\n run_id: str,\n retry_mode: RetryMode,\n step_key: str,\n recon_job: ReconstructableJob,\n known_state: Optional["KnownExecutionState"],\n ):\n from dagster._core.execution.plan.state import KnownExecutionState\n\n return super(StepRunRef, cls).__new__(\n cls,\n check.mapping_param(run_config, "run_config", key_type=str),\n check.inst_param(dagster_run, "dagster_run", DagsterRun),\n check.str_param(run_id, "run_id"),\n check.inst_param(retry_mode, "retry_mode", RetryMode),\n check.str_param(step_key, "step_key"),\n check.inst_param(recon_job, "recon_job", ReconstructableJob),\n check.opt_inst_param(known_state, "known_state", KnownExecutionState),\n )
\n\n\n
[docs]class StepLauncher(ABC):\n """A StepLauncher is responsible for executing steps, either in-process or in an external process."""\n\n @abstractmethod\n def launch_step(self, step_context: "StepExecutionContext") -> Iterator["DagsterEvent"]:\n """Args:\n step_context (StepExecutionContext): The context that we're executing the step in.\n\n Returns:\n Iterator[DagsterEvent]: The events for the step.\n """
\n
", "current_page_name": "_modules/dagster/_core/definitions/step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.step_launcher"}, "time_window_partition_mapping": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.time_window_partition_mapping

\nfrom datetime import datetime\nfrom typing import NamedTuple, Optional, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param\nfrom dagster._core.definitions.partition import PartitionsDefinition, PartitionsSubset\nfrom dagster._core.definitions.partition_mapping import PartitionMapping, UpstreamPartitionsResult\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    TimeWindowPartitionsSubset,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._serdes import whitelist_for_serdes\n\n\n
[docs]@whitelist_for_serdes\n@experimental_param(param="allow_nonexistent_upstream_partitions")\nclass TimeWindowPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_TimeWindowPartitionMapping",\n [\n ("start_offset", PublicAttr[int]),\n ("end_offset", PublicAttr[int]),\n ("allow_nonexistent_upstream_partitions", PublicAttr[bool]),\n ],\n ),\n):\n """The default mapping between two TimeWindowPartitionsDefinitions.\n\n A partition in the downstream partitions definition is mapped to all partitions in the upstream\n asset whose time windows overlap it.\n\n This means that, if the upstream and downstream partitions definitions share the same time\n period, then this mapping is essentially the identity partition mapping - plus conversion of\n datetime formats.\n\n If the upstream time period is coarser than the downstream time period, then each partition in\n the downstream asset will map to a single (larger) upstream partition. E.g. if the downstream is\n hourly and the upstream is daily, then each hourly partition in the downstream will map to the\n daily partition in the upstream that contains that hour.\n\n If the upstream time period is finer than the downstream time period, then each partition in the\n downstream asset will map to multiple upstream partitions. E.g. if the downstream is daily and\n the upstream is hourly, then each daily partition in the downstream asset will map to the 24\n hourly partitions in the upstream that occur on that day.\n\n Attributes:\n start_offset (int): If not 0, then the starts of the upstream windows are shifted by this\n offset relative to the starts of the downstream windows. For example, if start_offset=-1\n and end_offset=0, then the downstream partition "2022-07-04" would map to the upstream\n partitions "2022-07-03" and "2022-07-04". Only permitted to be non-zero when the\n upstream and downstream PartitionsDefinitions are the same. 
Defaults to 0.\n end_offset (int): If not 0, then the ends of the upstream windows are shifted by this\n offset relative to the ends of the downstream windows. For example, if start_offset=0\n and end_offset=1, then the downstream partition "2022-07-04" would map to the upstream\n partitions "2022-07-04" and "2022-07-05". Only permitted to be non-zero when the\n upstream and downstream PartitionsDefinitions are the same. Defaults to 0.\n allow_nonexistent_upstream_partitions (bool): Defaults to false. If true, does not\n raise an error when mapped upstream partitions fall outside the start-end time window of the\n partitions def. For example, if the upstream partitions def starts on "2023-01-01" but\n the downstream starts on "2022-01-01", setting this bool to true would return no\n partition keys when get_upstream_partitions_for_partitions is called with "2022-06-01".\n When set to false, would raise an error.\n\n Examples:\n .. code-block:: python\n\n from dagster import DailyPartitionsDefinition, TimeWindowPartitionMapping, AssetIn, asset\n\n partitions_def = DailyPartitionsDefinition(start_date="2020-01-01")\n\n @asset(partitions_def=partitions_def)\n def asset1():\n ...\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "asset1": AssetIn(\n partition_mapping=TimeWindowPartitionMapping(start_offset=-1)\n )\n }\n )\n def asset2(asset1):\n ...\n """\n\n def __new__(\n cls,\n start_offset: int = 0,\n end_offset: int = 0,\n allow_nonexistent_upstream_partitions: bool = False,\n ):\n return super(TimeWindowPartitionMapping, cls).__new__(\n cls,\n start_offset=check.int_param(start_offset, "start_offset"),\n end_offset=check.int_param(end_offset, "end_offset"),\n allow_nonexistent_upstream_partitions=check.bool_param(\n allow_nonexistent_upstream_partitions,\n "allow_nonexistent_upstream_partitions",\n ),\n )\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: 
PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if not isinstance(downstream_partitions_subset, TimeWindowPartitionsSubset):\n check.failed("downstream_partitions_subset must be a TimeWindowPartitionsSubset")\n\n return self._map_partitions(\n downstream_partitions_subset.partitions_def,\n upstream_partitions_def,\n downstream_partitions_subset,\n start_offset=self.start_offset,\n end_offset=self.end_offset,\n current_time=current_time,\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: Optional[PartitionsDefinition],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n """Returns the partitions in the downstream asset that map to the given upstream partitions.\n\n Filters for partitions that exist at the given current_time, fetching the current time\n if not provided.\n """\n return self._map_partitions(\n upstream_partitions_subset.partitions_def,\n downstream_partitions_def,\n upstream_partitions_subset,\n end_offset=-self.start_offset,\n start_offset=-self.end_offset,\n current_time=current_time,\n ).partitions_subset\n\n def _map_partitions(\n self,\n from_partitions_def: PartitionsDefinition,\n to_partitions_def: Optional[PartitionsDefinition],\n from_partitions_subset: PartitionsSubset,\n start_offset: int,\n end_offset: int,\n current_time: Optional[datetime] = None,\n ) -> UpstreamPartitionsResult:\n """Maps the partitions in from_partitions_subset to partitions in to_partitions_def.\n\n If partitions in from_partitions_subset represent time windows that do not exist in\n to_partitions_def, raises an error if raise_error_on_invalid_mapped_partition is True.\n Otherwise, filters out the partitions that do not exist in to_partitions_def and returns\n the filtered 
subset, also returning a bool indicating whether there were mapped time windows\n that did not exist in to_partitions_def.\n """\n if not isinstance(from_partitions_subset, TimeWindowPartitionsSubset):\n check.failed("from_partitions_subset must be a TimeWindowPartitionsSubset")\n\n if not isinstance(from_partitions_def, TimeWindowPartitionsDefinition):\n check.failed("from_partitions_def must be a TimeWindowPartitionsDefinition")\n\n if not isinstance(to_partitions_def, TimeWindowPartitionsDefinition):\n check.failed("to_partitions_def must be a TimeWindowPartitionsDefinition")\n\n if (start_offset != 0 or end_offset != 0) and (\n from_partitions_def.cron_schedule != to_partitions_def.cron_schedule\n ):\n raise DagsterInvalidDefinitionError(\n "Can't use the start_offset or end_offset parameters of"\n " TimeWindowPartitionMapping when the cron schedule of the upstream"\n " PartitionsDefinition is different than the cron schedule of the downstream"\n f" one. Attempted to map from cron schedule '{from_partitions_def.cron_schedule}' "\n f"to cron schedule '{to_partitions_def.cron_schedule}'."\n )\n\n if to_partitions_def.timezone != from_partitions_def.timezone:\n raise DagsterInvalidDefinitionError("Timezones don't match")\n\n # skip fancy mapping logic in the simple case\n if from_partitions_def == to_partitions_def and start_offset == 0 and end_offset == 0:\n return UpstreamPartitionsResult(from_partitions_subset, [])\n\n time_windows = []\n for from_partition_time_window in from_partitions_subset.included_time_windows:\n from_start_dt, from_end_dt = from_partition_time_window\n offsetted_start_dt = _offsetted_datetime(\n from_partitions_def, from_start_dt, start_offset\n )\n offsetted_end_dt = _offsetted_datetime(from_partitions_def, from_end_dt, end_offset)\n\n to_start_partition_key = (\n to_partitions_def.get_partition_key_for_timestamp(\n offsetted_start_dt.timestamp(), end_closed=False\n )\n if offsetted_start_dt is not None\n else None\n )\n 
to_end_partition_key = (\n to_partitions_def.get_partition_key_for_timestamp(\n offsetted_end_dt.timestamp(), end_closed=True\n )\n if offsetted_end_dt is not None\n else None\n )\n\n if to_start_partition_key is not None or to_end_partition_key is not None:\n window_start = (\n to_partitions_def.start_time_for_partition_key(to_start_partition_key)\n if to_start_partition_key\n else cast(TimeWindow, to_partitions_def.get_first_partition_window()).start\n )\n window_end = (\n to_partitions_def.end_time_for_partition_key(to_end_partition_key)\n if to_end_partition_key\n else cast(TimeWindow, to_partitions_def.get_last_partition_window()).end\n )\n\n if window_start < window_end:\n time_windows.append(TimeWindow(window_start, window_end))\n\n first_window = to_partitions_def.get_first_partition_window(current_time=current_time)\n last_window = to_partitions_def.get_last_partition_window(current_time=current_time)\n\n filtered_time_windows = []\n required_but_nonexistent_partition_keys = set()\n\n for time_window in time_windows:\n if (\n first_window\n and last_window\n and time_window.start <= last_window.start\n and time_window.end >= first_window.end\n ):\n window_start = max(time_window.start, first_window.start)\n window_end = min(time_window.end, last_window.end)\n filtered_time_windows.append(TimeWindow(window_start, window_end))\n\n if self.allow_nonexistent_upstream_partitions:\n # If allowed to have nonexistent upstream partitions, do not consider\n # out of range partitions to be invalid\n continue\n else:\n invalid_time_window = None\n if not (first_window and last_window) or (\n time_window.start < first_window.start and time_window.end > last_window.end\n ):\n invalid_time_window = time_window\n elif time_window.start < first_window.start:\n invalid_time_window = TimeWindow(\n time_window.start, min(time_window.end, first_window.start)\n )\n elif time_window.end > last_window.end:\n invalid_time_window = TimeWindow(\n max(time_window.start, 
last_window.end), time_window.end\n )\n\n if invalid_time_window:\n required_but_nonexistent_partition_keys.update(\n set(\n to_partitions_def.get_partition_keys_in_time_window(\n time_window=invalid_time_window\n )\n )\n )\n\n return UpstreamPartitionsResult(\n TimeWindowPartitionsSubset(\n to_partitions_def,\n num_partitions=sum(\n len(to_partitions_def.get_partition_keys_in_time_window(time_window))\n for time_window in filtered_time_windows\n ),\n included_time_windows=filtered_time_windows,\n ),\n sorted(list(required_but_nonexistent_partition_keys)),\n )
\n\n\ndef _offsetted_datetime(\n partitions_def: TimeWindowPartitionsDefinition, dt: datetime, offset: int\n) -> Optional[datetime]:\n for _ in range(abs(offset)):\n if offset < 0:\n prev_window = partitions_def.get_prev_partition_window(dt)\n if prev_window is None:\n return None\n\n dt = prev_window.start\n else:\n # TODO: what if we're at the end of the line?\n next_window = partitions_def.get_next_partition_window(dt)\n if next_window is None:\n return None\n\n dt = next_window.end\n\n return dt\n
", "current_page_name": "_modules/dagster/_core/definitions/time_window_partition_mapping", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.time_window_partition_mapping"}, "time_window_partitions": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.time_window_partitions

\nimport functools\nimport hashlib\nimport json\nimport re\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    FrozenSet,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._utils.partitions import DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\nfrom dagster._utils.schedules import (\n    cron_string_iterator,\n    is_valid_cron_schedule,\n    reverse_cron_string_iterator,\n)\n\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidDeserializationVersionError,\n)\nfrom .partition import (\n    DEFAULT_DATE_FORMAT,\n    PartitionedConfig,\n    PartitionsDefinition,\n    PartitionsSubset,\n    ScheduleType,\n    cron_schedule_from_schedule_type_and_offsets,\n)\nfrom .partition_key_range import PartitionKeyRange\n\n\n
[docs]class TimeWindow(NamedTuple):\n """An interval that is closed at the start and open at the end.\n\n Attributes:\n start (datetime): A pendulum datetime that marks the start of the window.\n end (datetime): A pendulum datetime that marks the end of the window.\n """\n\n start: PublicAttr[datetime]\n end: PublicAttr[datetime]
\n\n\n
[docs]class TimeWindowPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_TimeWindowPartitionsDefinition",\n [\n ("start", PublicAttr[datetime]),\n ("timezone", PublicAttr[str]),\n ("end", PublicAttr[Optional[datetime]]),\n ("fmt", PublicAttr[str]),\n ("end_offset", PublicAttr[int]),\n ("cron_schedule", PublicAttr[str]),\n ],\n ),\n):\n r"""A set of partitions where each partitions corresponds to a time window.\n\n The provided cron_schedule determines the bounds of the time windows. E.g. a cron_schedule of\n "0 0 \\\\* \\\\* \\\\*" will result in daily partitions that start at midnight and end at midnight of the\n following day.\n\n The string partition_key associated with each partition corresponds to the start of the\n partition's time window.\n\n The first partition in the set will start on at the first cron_schedule tick that is equal to\n or after the given start datetime. The last partition in the set will end before the current\n time, unless the end_offset argument is set to a positive number.\n\n Args:\n cron_schedule (str): Determines the bounds of the time windows.\n start (datetime): The first partition in the set will start on at the first cron_schedule\n tick that is equal to or after this value.\n timezone (Optional[str]): The timezone in which each time should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end (datetime): The last partition (excluding) in the set.\n fmt (str): The date format to use for partition_keys.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n """\n\n def __new__(\n cls,\n start: Union[datetime, str],\n fmt: str,\n end: Union[datetime, str, None] = None,\n schedule_type: Optional[ScheduleType] = None,\n timezone: Optional[str] = None,\n end_offset: int = 0,\n minute_offset: Optional[int] = None,\n hour_offset: Optional[int] = None,\n day_offset: Optional[int] = None,\n cron_schedule: Optional[str] = None,\n ):\n check.opt_str_param(timezone, "timezone")\n timezone = timezone or "UTC"\n\n if isinstance(start, datetime):\n start_dt = pendulum.instance(start, tz=timezone)\n else:\n start_dt = pendulum.instance(datetime.strptime(start, fmt), tz=timezone)\n\n if not end:\n end_dt = None\n elif isinstance(end, datetime):\n end_dt = pendulum.instance(end, tz=timezone)\n else:\n end_dt = pendulum.instance(datetime.strptime(end, fmt), tz=timezone)\n\n if cron_schedule is not None:\n check.invariant(\n schedule_type is None and not minute_offset and not hour_offset and not day_offset,\n "If cron_schedule argument is provided, then schedule_type, minute_offset, "\n "hour_offset, and day_offset can't also be provided",\n )\n else:\n if schedule_type is None:\n check.failed("One of schedule_type and cron_schedule must be provided")\n\n cron_schedule = cron_schedule_from_schedule_type_and_offsets(\n schedule_type=schedule_type,\n minute_offset=minute_offset or 0,\n hour_offset=hour_offset or 0,\n day_offset=day_offset or 0,\n )\n\n if not is_valid_cron_schedule(cron_schedule):\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{cron_schedule}' for a"\n " TimeWindowPartitionsDefinition."\n )\n\n return super(TimeWindowPartitionsDefinition, cls).__new__(\n cls, start_dt, timezone, end_dt, fmt, end_offset, cron_schedule\n )\n\n def get_current_timestamp(self, current_time: Optional[datetime] = None) -> float:\n return (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else 
pendulum.now(self.timezone)\n ).timestamp()\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # Method added for performance reasons.\n # Fetching partition keys requires significantly more compute time to\n # string format datetimes.\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n\n num_partitions = 0\n for time_window in self._iterate_time_windows(self.start):\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n break\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n num_partitions += 1\n\n if time_window.end.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n if self.end_offset < 0:\n num_partitions += self.end_offset\n\n return num_partitions\n\n def get_partition_keys_between_indexes(\n self, start_idx: int, end_idx: int, current_time: Optional[datetime] = None\n ) -> List[str]:\n # Fetches the partition keys between the given start and end indices.\n # Start index is inclusive, end index is exclusive.\n # Method added for performance reasons, to only string format\n # partition keys included within the indices.\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n partition_keys = []\n reached_end = False\n\n for idx, time_window in enumerate(self._iterate_time_windows(self.start)):\n if time_window.end.timestamp() >= current_timestamp:\n reached_end = True\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n reached_end = True\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n if idx >= start_idx and idx < end_idx:\n partition_keys.append(time_window.start.strftime(self.fmt))\n if time_window.end.timestamp() > 
current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n if len(partition_keys) >= end_idx - start_idx:\n break\n\n if reached_end and self.end_offset < 0:\n partition_keys = partition_keys[: self.end_offset]\n\n return partition_keys\n\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n partition_keys: List[str] = []\n for time_window in self._iterate_time_windows(self.start):\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n break\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n partition_keys.append(time_window.start.strftime(self.fmt))\n\n if time_window.end.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n if self.end_offset < 0:\n partition_keys = partition_keys[: self.end_offset]\n\n return partition_keys\n\n def _get_validated_time_window_for_partition_key(\n self, partition_key: str, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n """Returns a TimeWindow for the given partition key if it is valid, otherwise returns None."""\n try:\n time_window = self.time_window_for_partition_key(partition_key)\n except ValueError:\n return None\n\n first_partition_window = self.get_first_partition_window(current_time=current_time)\n last_partition_window = self.get_last_partition_window(current_time=current_time)\n if (\n first_partition_window is None\n or last_partition_window is None\n or time_window.start < first_partition_window.start\n or time_window.start > last_partition_window.start\n or time_window.start.strftime(self.fmt) != partition_key\n ):\n return None\n\n return time_window\n\n def __str__(self) -> str:\n schedule_str = (\n self.schedule_type.value.capitalize() 
if self.schedule_type else self.cron_schedule\n )\n partition_def_str = (\n f"{schedule_str}, starting {self.start.strftime(self.fmt)} {self.timezone}."\n )\n if self.end_offset != 0:\n partition_def_str += (\n " End offsetted by"\n f" {self.end_offset} partition{'' if self.end_offset == 1 else 's'}."\n )\n return partition_def_str\n\n def __repr__(self):\n # Between python 3.8 and 3.9 the repr of a datetime object changed.\n # Replaces start time with timestamp as a workaround to make sure the repr is consistent across versions.\n return (\n f"TimeWindowPartitionsDefinition(start={self.start.timestamp()},"\n f" timezone='{self.timezone}', fmt='{self.fmt}', end_offset={self.end_offset},"\n f" cron_schedule='{self.cron_schedule}')"\n )\n\n def __hash__(self):\n return hash(tuple(self.__repr__()))\n\n @functools.lru_cache(maxsize=100)\n def _time_window_for_partition_key(self, *, partition_key: str) -> TimeWindow:\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n return next(iter(self._iterate_time_windows(partition_key_dt)))\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n return self._time_window_for_partition_key(partition_key=partition_key)\n\n @functools.lru_cache(maxsize=5)\n def time_windows_for_partition_keys(\n self,\n partition_keys: FrozenSet[str],\n validate: bool = True,\n ) -> Sequence[TimeWindow]:\n if len(partition_keys) == 0:\n return []\n\n sorted_pks = sorted(partition_keys, key=lambda pk: datetime.strptime(pk, self.fmt))\n cur_windows_iterator = iter(\n self._iterate_time_windows(\n pendulum.instance(datetime.strptime(sorted_pks[0], self.fmt), tz=self.timezone)\n )\n )\n partition_key_time_windows: List[TimeWindow] = []\n for partition_key in sorted_pks:\n next_window = next(cur_windows_iterator)\n if next_window.start.strftime(self.fmt) == partition_key:\n partition_key_time_windows.append(next_window)\n else:\n cur_windows_iterator = iter(\n 
self._iterate_time_windows(\n pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n )\n )\n partition_key_time_windows.append(next(cur_windows_iterator))\n\n if validate:\n start_time_window = self.get_first_partition_window()\n end_time_window = self.get_last_partition_window()\n\n if start_time_window is None or end_time_window is None:\n check.failed("No partitions in the PartitionsDefinition")\n\n start_timestamp = start_time_window.start.timestamp()\n end_timestamp = end_time_window.end.timestamp()\n\n partition_key_time_windows = [\n tw\n for tw in partition_key_time_windows\n if tw.start.timestamp() >= start_timestamp and tw.end.timestamp() <= end_timestamp\n ]\n return partition_key_time_windows\n\n def start_time_for_partition_key(self, partition_key: str) -> datetime:\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n # the datetime format might not include granular components, so we need to recover them\n # we make the assumption that the parsed partition key is <= the start datetime\n return next(iter(self._iterate_time_windows(partition_key_dt))).start\n\n def get_next_partition_key(\n self, partition_key: str, current_time: Optional[datetime] = None\n ) -> Optional[str]:\n last_partition_window = self.get_last_partition_window(current_time)\n if last_partition_window is None:\n return None\n\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n windows_iter = iter(self._iterate_time_windows(partition_key_dt))\n next(windows_iter)\n start_time = next(windows_iter).start\n if start_time >= last_partition_window.end:\n return None\n else:\n return start_time.strftime(self.fmt)\n\n def get_next_partition_window(\n self, end_dt: datetime, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n last_partition_window = self.get_last_partition_window(current_time)\n if last_partition_window is 
None:\n return None\n\n windows_iter = iter(self._iterate_time_windows(end_dt))\n next_window = next(windows_iter)\n if next_window.start >= last_partition_window.end:\n return None\n else:\n return next_window\n\n def get_prev_partition_window(self, start_dt: datetime) -> Optional[TimeWindow]:\n windows_iter = iter(self._reverse_iterate_time_windows(start_dt))\n prev_window = next(windows_iter)\n first_partition_window = self.get_first_partition_window()\n if first_partition_window is None or prev_window.start < first_partition_window.start:\n return None\n else:\n return prev_window\n\n @functools.lru_cache(maxsize=5)\n def _get_first_partition_window(self, *, current_time: datetime) -> Optional[TimeWindow]:\n current_timestamp = current_time.timestamp()\n\n time_window = next(iter(self._iterate_time_windows(self.start)))\n\n if self.end_offset == 0:\n return time_window if time_window.end.timestamp() <= current_timestamp else None\n elif self.end_offset > 0:\n iterator = iter(self._iterate_time_windows(current_time))\n # first returned time window is time window of current time\n curr_window_plus_offset = next(iterator)\n for _ in range(self.end_offset):\n curr_window_plus_offset = next(iterator)\n return (\n time_window\n if time_window.end.timestamp() <= curr_window_plus_offset.start.timestamp()\n else None\n )\n else:\n # end offset < 0\n end_window = None\n iterator = iter(self._reverse_iterate_time_windows(current_time))\n for _ in range(abs(self.end_offset)):\n end_window = next(iterator)\n\n if end_window is None:\n check.failed("end_window should not be None")\n\n return (\n time_window if time_window.end.timestamp() <= end_window.start.timestamp() else None\n )\n\n def get_first_partition_window(\n self, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n current_time = cast(\n datetime,\n (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ),\n )\n return 
self._get_first_partition_window(current_time=current_time)\n\n @functools.lru_cache(maxsize=5)\n def _get_last_partition_window(self, *, current_time: datetime) -> Optional[TimeWindow]:\n if self.get_first_partition_window(current_time) is None:\n return None\n\n current_time = (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n )\n\n if self.end and self.end < current_time:\n current_time = self.end\n\n if self.end_offset == 0:\n return next(iter(self._reverse_iterate_time_windows(current_time)))\n else:\n # TODO: make this efficient\n last_partition_key = super().get_last_partition_key(current_time)\n return (\n self.time_window_for_partition_key(last_partition_key)\n if last_partition_key\n else None\n )\n\n def get_last_partition_window(\n self, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n current_time = cast(\n datetime,\n (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ),\n )\n return self._get_last_partition_window(current_time=current_time)\n\n def get_first_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[str]:\n first_window = self.get_first_partition_window(current_time)\n if first_window is None:\n return None\n\n return first_window.start.strftime(self.fmt)\n\n def get_last_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[str]:\n last_window = self.get_last_partition_window(current_time)\n if last_window is None:\n return None\n\n return last_window.start.strftime(self.fmt)\n\n def end_time_for_partition_key(self, partition_key: str) -> datetime:\n return self.time_window_for_partition_key(partition_key).end\n\n @functools.lru_cache(maxsize=5)\n def get_partition_keys_in_time_window(self, time_window: 
TimeWindow) -> Sequence[str]:\n result: List[str] = []\n for partition_time_window in self._iterate_time_windows(time_window.start):\n if partition_time_window.start < time_window.end:\n result.append(partition_time_window.start.strftime(self.fmt))\n else:\n break\n return result\n\n def get_partition_key_range_for_time_window(self, time_window: TimeWindow) -> PartitionKeyRange:\n start_partition_key = self.get_partition_key_for_timestamp(time_window.start.timestamp())\n end_partition_key = self.get_partition_key_for_timestamp(\n cast(TimeWindow, self.get_prev_partition_window(time_window.end)).start.timestamp()\n )\n\n return PartitionKeyRange(start_partition_key, end_partition_key)\n\n def get_partition_keys_in_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n start_time = self.start_time_for_partition_key(partition_key_range.start)\n end_time = self.end_time_for_partition_key(partition_key_range.end)\n\n return self.get_partition_keys_in_time_window(TimeWindow(start_time, end_time))\n\n @public\n @property\n def schedule_type(self) -> Optional[ScheduleType]:\n """Optional[ScheduleType]: An enum representing the partition cadence (hourly, daily,\n weekly, or monthly).\n """\n if re.fullmatch(r"\\d+ \\* \\* \\* \\*", self.cron_schedule):\n return ScheduleType.HOURLY\n elif re.fullmatch(r"\\d+ \\d+ \\* \\* \\*", self.cron_schedule):\n return ScheduleType.DAILY\n elif re.fullmatch(r"\\d+ \\d+ \\* \\* \\d+", self.cron_schedule):\n return ScheduleType.WEEKLY\n elif re.fullmatch(r"\\d+ \\d+ \\d+ \\* \\*", self.cron_schedule):\n return ScheduleType.MONTHLY\n else:\n return None\n\n @public\n @property\n def minute_offset(self) -> int:\n """int: Number of minutes past the hour to "split" partitions. 
Defaults to 0.\n\n For example, returns 15 if each partition starts at 15 minutes past the hour.\n """\n match = re.fullmatch(r"(\\d+) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no minute offset")\n return int(match.groups()[0])\n\n @public\n @property\n def hour_offset(self) -> int:\n """int: Number of hours past 00:00 to "split" partitions. Defaults to 0.\n\n For example, returns 1 if each partition starts at 01:00.\n """\n match = re.fullmatch(r"(\\d+|\\*) (\\d+) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no hour offset")\n return int(match.groups()[1])\n\n @public\n @property\n def day_offset(self) -> int:\n """int: For a weekly or monthly partitions definition, returns the day to "split" partitions\n by. Each partition will start on this day, and end before this day in the following\n week/month. Returns 0 if the day_offset parameter is unset in the\n WeeklyPartitionsDefinition, MonthlyPartitionsDefinition, or the provided cron schedule.\n\n For weekly partitions, returns a value between 0 (representing Sunday) and 6 (representing\n Saturday). 
Providing a value of 1 means that a partition will exist weekly from Monday to\n the following Sunday.\n\n For monthly partitions, returns a value between 0 (the first day of the month) and 31 (the\n last possible day of the month).\n """\n schedule_type = self.schedule_type\n if schedule_type == ScheduleType.WEEKLY:\n match = re.fullmatch(r"(\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no day offset")\n return int(match.groups()[4])\n elif schedule_type == ScheduleType.MONTHLY:\n match = re.fullmatch(r"(\\d+|\\*) (\\d+|\\*) (\\d+) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no day offset")\n return int(match.groups()[2])\n else:\n check.failed(f"Unsupported schedule type for day_offset: {schedule_type}")\n\n
[docs] @public\n def get_cron_schedule(\n self,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n ) -> str:\n """The schedule executes at the cadence specified by the partitioning, but may overwrite\n the minute/hour/day offset of the partitioning.\n\n This is useful e.g. if you have partitions that span midnight to midnight but you want to\n schedule a job that runs at 2 am.\n """\n if (\n minute_of_hour is None\n and hour_of_day is None\n and day_of_week is None\n and day_of_month is None\n ):\n return self.cron_schedule\n\n schedule_type = self.schedule_type\n if schedule_type is None:\n check.failed(\n f"{self.cron_schedule} does not support"\n " minute_of_hour/hour_of_day/day_of_week/day_of_month arguments"\n )\n\n minute_of_hour = cast(\n int,\n check.opt_int_param(minute_of_hour, "minute_of_hour", default=self.minute_offset),\n )\n\n if schedule_type == ScheduleType.HOURLY:\n check.invariant(\n hour_of_day is None, "Cannot set hour parameter with hourly partitions."\n )\n else:\n hour_of_day = cast(\n int, check.opt_int_param(hour_of_day, "hour_of_day", default=self.hour_offset)\n )\n\n if schedule_type == ScheduleType.DAILY:\n check.invariant(\n day_of_week is None, "Cannot set day of week parameter with daily partitions."\n )\n check.invariant(\n day_of_month is None, "Cannot set day of month parameter with daily partitions."\n )\n\n if schedule_type == ScheduleType.MONTHLY:\n default = self.day_offset or 1\n day_offset = check.opt_int_param(day_of_month, "day_of_month", default=default)\n elif schedule_type == ScheduleType.WEEKLY:\n default = self.day_offset or 0\n day_offset = check.opt_int_param(day_of_week, "day_of_week", default=default)\n else:\n day_offset = 0\n\n return cron_schedule_from_schedule_type_and_offsets(\n schedule_type,\n minute_offset=minute_of_hour,\n hour_offset=hour_of_day or 0,\n day_offset=day_offset,\n )
\n\n def _iterate_time_windows(self, start: datetime) -> Iterable[TimeWindow]:\n """Returns an infinite generator of time windows that start after the given start time."""\n start_timestamp = pendulum.instance(start, tz=self.timezone).timestamp()\n iterator = cron_string_iterator(\n start_timestamp=start_timestamp,\n cron_string=self.cron_schedule,\n execution_timezone=self.timezone,\n )\n prev_time = next(iterator)\n while prev_time.timestamp() < start_timestamp:\n prev_time = next(iterator)\n\n while True:\n next_time = next(iterator)\n yield TimeWindow(prev_time, next_time)\n prev_time = next_time\n\n def _reverse_iterate_time_windows(self, end: datetime) -> Iterable[TimeWindow]:\n """Returns an infinite generator of time windows that end before the given end time."""\n end_timestamp = pendulum.instance(end, tz=self.timezone).timestamp()\n iterator = reverse_cron_string_iterator(\n end_timestamp=end_timestamp,\n cron_string=self.cron_schedule,\n execution_timezone=self.timezone,\n )\n\n prev_time = next(iterator)\n while prev_time.timestamp() > end_timestamp:\n prev_time = next(iterator)\n\n while True:\n next_time = next(iterator)\n yield TimeWindow(next_time, prev_time)\n prev_time = next_time\n\n def get_partition_key_for_timestamp(self, timestamp: float, end_closed: bool = False) -> str:\n """Args:\n timestamp (float): Timestamp from the unix epoch, UTC.\n end_closed (bool): Whether the interval is closed at the end or at the beginning.\n """\n iterator = cron_string_iterator(\n timestamp, self.cron_schedule, self.timezone, start_offset=-1\n )\n # prev will be < timestamp\n prev = next(iterator)\n # prev_next will be >= timestamp\n prev_next = next(iterator)\n\n if end_closed or prev_next.timestamp() > timestamp:\n return prev.strftime(self.fmt)\n else:\n return prev_next.strftime(self.fmt)\n\n def less_than(self, partition_key1: str, partition_key2: str) -> bool:\n """Returns true if the partition_key1 is earlier than partition_key2."""\n return 
self.start_time_for_partition_key(\n partition_key1\n ) < self.start_time_for_partition_key(partition_key2)\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset"]:\n return TimeWindowPartitionsSubset\n\n def empty_subset(self) -> "PartitionsSubset":\n return self.partitions_subset_class.empty_subset(self)\n\n def is_valid_partition_key(self, partition_key: str) -> bool:\n try:\n partition_time = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n return partition_time >= self.start\n except ValueError:\n return False\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(self.__repr__().encode("utf-8")).hexdigest()\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n return bool(self._get_validated_time_window_for_partition_key(partition_key, current_time))
\n\n\n
[docs]class DailyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of daily partitions.\n\n The first partition in the set will start at the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset and/or hour_offset are used, the start and end times of\n each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n DailyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n DailyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(DailyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.DAILY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\ndef wrap_time_window_run_config_fn(\n run_config_fn: Optional[Callable[[datetime, datetime], Mapping[str, Any]]],\n partitions_def: TimeWindowPartitionsDefinition,\n) -> Callable[[str], Mapping[str, Any]]:\n def _run_config_wrapper(key: str) -> Mapping[str, Any]:\n if not run_config_fn:\n return {}\n time_window = partitions_def.time_window_for_partition_key(key)\n return run_config_fn(time_window.start, time_window.end)\n\n return _run_config_wrapper\n\n\ndef wrap_time_window_tags_fn(\n tags_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]],\n partitions_def: TimeWindowPartitionsDefinition,\n) -> Callable[[str], Mapping[str, str]]:\n def _tag_wrapper(key: str) -> Mapping[str, str]:\n if not tags_fn:\n return {}\n time_window = partitions_def.time_window_for_partition_key(key)\n return tags_fn(time_window.start, time_window.end)\n\n return _tag_wrapper\n\n\n
[docs]def daily_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[DailyPartitionsDefinition],\n]:\n """Defines run config over a set of daily partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the bounds\n of the date partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset and/or hour_offset are used, the start and end times of each partition\n will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @daily_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n @daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[DailyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = DailyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class HourlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of hourly partitions.\n\n The first partition in the set will start on the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset is provided, the start and end times of each partition\n will be minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n\n return super(HourlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.HOURLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def hourly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[HourlyPartitionsDefinition],\n]:\n """Defines run config over a set of hourly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset is provided, the start and end times of each partition will be\n minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[HourlyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = HourlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class MonthlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of monthly partitions.\n\n The first partition in the set will start at the soonest first of the month after start_date\n at midnight. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and\n end date of each partition will be day_offset. If minute_offset and/or hour_offset are used,\n the start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n MonthlyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n MonthlyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(MonthlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.MONTHLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def monthly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[MonthlyPartitionsDefinition],\n]:\n """Defines run config over a set of monthly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at midnight on the soonest first of the month after\n start_date. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and end\n date of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\n start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. 
"America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @monthly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n @monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[MonthlyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = MonthlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class WeeklyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """Defines a set of weekly partitions.\n\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n WeeklyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n WeeklyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(WeeklyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.WEEKLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def weekly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[WeeklyPartitionsDefinition],\n]:\n """Defines run config over a set of weekly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. 
"America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @weekly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n @weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[WeeklyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = WeeklyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\nclass TimeWindowPartitionsSubset(PartitionsSubset):\n # Every time we change the serialization format, we should increment the version number.\n # This will ensure that we can gracefully degrade when deserializing old data.\n SERIALIZATION_VERSION = 1\n\n def __init__(\n self,\n partitions_def: TimeWindowPartitionsDefinition,\n num_partitions: int,\n included_time_windows: Optional[Sequence[TimeWindow]] = None,\n included_partition_keys: Optional[AbstractSet[str]] = None,\n ):\n self._partitions_def = check.inst_param(\n partitions_def, "partitions_def", TimeWindowPartitionsDefinition\n )\n self._included_time_windows = included_time_windows\n self._num_partitions = num_partitions\n\n check.param_invariant(\n not (included_partition_keys and included_time_windows),\n "Cannot specify both included_partition_keys and included_time_windows",\n )\n self._included_time_windows = check.opt_nullable_sequence_param(\n included_time_windows, "included_time_windows", of_type=TimeWindow\n )\n\n self._included_partition_keys = check.opt_nullable_set_param(\n included_partition_keys, "included_partition_keys", of_type=str\n )\n\n @property\n def included_time_windows(self) -> Sequence[TimeWindow]:\n if self._included_time_windows is None:\n result_time_windows, _ = self._add_partitions_to_time_windows(\n initial_windows=[],\n partition_keys=list(check.not_none(self._included_partition_keys)),\n validate=False,\n )\n self._included_time_windows = result_time_windows\n return self._included_time_windows\n\n def _get_partition_time_windows_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n ) -> Sequence[TimeWindow]:\n """Returns a list of partition time windows that are not in the subset.\n Each time window is a single partition.\n """\n first_tw = self._partitions_def.get_first_partition_window(current_time=current_time)\n last_tw = self._partitions_def.get_last_partition_window(current_time=current_time)\n\n if not first_tw or not last_tw:\n 
check.failed("No partitions found")\n\n if len(self.included_time_windows) == 0:\n return [TimeWindow(first_tw.start, last_tw.end)]\n\n time_windows = []\n if first_tw.start < self.included_time_windows[0].start:\n time_windows.append(TimeWindow(first_tw.start, self.included_time_windows[0].start))\n\n for i in range(len(self.included_time_windows) - 1):\n if self.included_time_windows[i].start >= last_tw.end:\n break\n if self.included_time_windows[i].end < last_tw.end:\n if self.included_time_windows[i + 1].start <= last_tw.end:\n time_windows.append(\n TimeWindow(\n self.included_time_windows[i].end,\n self.included_time_windows[i + 1].start,\n )\n )\n else:\n time_windows.append(\n TimeWindow(\n self.included_time_windows[i].end,\n last_tw.end,\n )\n )\n\n if last_tw.end > self.included_time_windows[-1].end:\n time_windows.append(TimeWindow(self.included_time_windows[-1].end, last_tw.end))\n\n return time_windows\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n partition_keys: List[str] = []\n for tw in self._get_partition_time_windows_not_in_subset(current_time):\n partition_keys.extend(self._partitions_def.get_partition_keys_in_time_window(tw))\n return partition_keys\n\n @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n if self._included_partition_keys is None:\n return [\n pk\n for time_window in self.included_time_windows\n for pk in self._partitions_def.get_partition_keys_in_time_window(time_window)\n ]\n return list(self._included_partition_keys) if self._included_partition_keys else []\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n return [\n self._partitions_def.get_partition_key_range_for_time_window(window)\n for window in 
self.included_time_windows\n ]\n\n def _add_partitions_to_time_windows(\n self,\n initial_windows: Sequence[TimeWindow],\n partition_keys: Sequence[str],\n validate: bool = True,\n ) -> Tuple[Sequence[TimeWindow], int]:\n """Merges a set of partition keys into an existing set of time windows, returning the\n minimized set of time windows and the number of partitions added.\n """\n result_windows = [*initial_windows]\n time_windows = self._partitions_def.time_windows_for_partition_keys(\n frozenset(partition_keys), validate=validate\n )\n num_added_partitions = 0\n for window in sorted(time_windows):\n # go in reverse order because it's more common to add partitions at the end than the\n # beginning\n for i in reversed(range(len(result_windows))):\n included_window = result_windows[i]\n lt_end_of_range = window.start < included_window.end\n gte_start_of_range = window.start >= included_window.start\n\n if lt_end_of_range and gte_start_of_range:\n break\n\n if not lt_end_of_range:\n merge_with_range = included_window.end == window.start\n merge_with_later_range = i + 1 < len(result_windows) and (\n window.end == result_windows[i + 1].start\n )\n\n if merge_with_range and merge_with_later_range:\n result_windows[i] = TimeWindow(\n included_window.start, result_windows[i + 1].end\n )\n del result_windows[i + 1]\n elif merge_with_range:\n result_windows[i] = TimeWindow(included_window.start, window.end)\n elif merge_with_later_range:\n result_windows[i + 1] = TimeWindow(window.start, result_windows[i + 1].end)\n else:\n result_windows.insert(i + 1, window)\n\n num_added_partitions += 1\n break\n else:\n if result_windows and window.start == result_windows[0].start:\n result_windows[0] = TimeWindow(window.start, included_window.end) # type: ignore\n else:\n result_windows.insert(0, window)\n\n num_added_partitions += 1\n\n return result_windows, num_added_partitions\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "TimeWindowPartitionsSubset":\n # if 
we are representing things as a static set of keys, continue doing so\n if self._included_partition_keys is not None:\n new_partitions = {*self._included_partition_keys, *partition_keys}\n return TimeWindowPartitionsSubset(\n self._partitions_def,\n num_partitions=len(new_partitions),\n included_partition_keys=new_partitions,\n )\n\n result_windows, added_partitions = self._add_partitions_to_time_windows(\n self.included_time_windows, list(partition_keys)\n )\n\n return TimeWindowPartitionsSubset(\n self._partitions_def,\n num_partitions=self._num_partitions + added_partitions,\n included_time_windows=result_windows,\n )\n\n @classmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition, serialized: str\n ) -> "PartitionsSubset":\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n check.failed("Partitions definition must be a TimeWindowPartitionsDefinition")\n partitions_def = cast(TimeWindowPartitionsDefinition, partitions_def)\n\n loaded = json.loads(serialized)\n\n def tuples_to_time_windows(tuples):\n return [\n TimeWindow(\n pendulum.from_timestamp(tup[0], tz=partitions_def.timezone),\n pendulum.from_timestamp(tup[1], tz=partitions_def.timezone),\n )\n for tup in tuples\n ]\n\n if isinstance(loaded, list):\n # backwards compatibility\n time_windows = tuples_to_time_windows(loaded)\n num_partitions = sum(\n len(partitions_def.get_partition_keys_in_time_window(time_window))\n for time_window in time_windows\n )\n elif isinstance(loaded, dict) and (\n "version" not in loaded or loaded["version"] == cls.SERIALIZATION_VERSION\n ): # version 1\n time_windows = tuples_to_time_windows(loaded["time_windows"])\n num_partitions = loaded["num_partitions"]\n else:\n raise DagsterInvalidDeserializationVersionError(\n f"Attempted to deserialize partition subset with version {loaded.get('version')},"\n f" but only version {cls.SERIALIZATION_VERSION} is supported."\n )\n\n return TimeWindowPartitionsSubset(\n partitions_def, 
num_partitions=num_partitions, included_time_windows=time_windows\n )\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n if serialized_partitions_def_unique_id:\n return (\n partitions_def.get_serializable_unique_identifier()\n == serialized_partitions_def_unique_id\n )\n\n if (\n serialized_partitions_def_class_name\n # note: all TimeWindowPartitionsDefinition subclasses will get serialized as raw\n # TimeWindowPartitionsDefinitions, so this class name check will not always pass,\n # hence the unique id check above\n and serialized_partitions_def_class_name != partitions_def.__class__.__name__\n ):\n return False\n\n data = json.loads(serialized)\n return isinstance(data, list) or (\n isinstance(data, dict)\n and data.get("time_windows") is not None\n and data.get("num_partitions") is not None\n )\n\n @classmethod\n def empty_subset(cls, partitions_def: PartitionsDefinition) -> "PartitionsSubset":\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n check.failed("Partitions definition must be a TimeWindowPartitionsDefinition")\n partitions_def = cast(TimeWindowPartitionsDefinition, partitions_def)\n return cls(partitions_def, 0, [], set())\n\n def serialize(self) -> str:\n return json.dumps(\n {\n "version": self.SERIALIZATION_VERSION,\n "time_windows": [\n (window.start.timestamp(), window.end.timestamp())\n for window in self.included_time_windows\n ],\n "num_partitions": self._num_partitions,\n }\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition:\n return self._partitions_def\n\n def __eq__(self, other):\n return (\n isinstance(other, TimeWindowPartitionsSubset)\n and self._partitions_def == other._partitions_def\n and (\n # faster comparison, but will not catch all cases\n (\n self._included_time_windows == other._included_time_windows\n and 
self._included_partition_keys == other._included_partition_keys\n )\n # slower comparison, catches all cases\n or self.included_time_windows == other.included_time_windows\n )\n )\n\n def __len__(self) -> int:\n return self._num_partitions\n\n def __contains__(self, partition_key: str) -> bool:\n if self._included_partition_keys is not None:\n return partition_key in self._included_partition_keys\n\n time_window = self._partitions_def.time_window_for_partition_key(partition_key)\n\n return any(\n time_window.start >= included_time_window.start\n and time_window.start < included_time_window.end\n for included_time_window in self.included_time_windows\n )\n\n def __repr__(self) -> str:\n return f"TimeWindowPartitionsSubset({self.get_partition_key_ranges()})"\n\n\nclass PartitionRangeStatus(Enum):\n MATERIALIZING = "MATERIALIZING"\n MATERIALIZED = "MATERIALIZED"\n FAILED = "FAILED"\n\n\nPARTITION_RANGE_STATUS_PRIORITY = [\n PartitionRangeStatus.MATERIALIZING,\n PartitionRangeStatus.FAILED,\n PartitionRangeStatus.MATERIALIZED,\n]\n\n\nclass PartitionTimeWindowStatus:\n def __init__(self, time_window: TimeWindow, status: PartitionRangeStatus):\n self.time_window = time_window\n self.status = status\n\n def __repr__(self):\n return f"({self.time_window.start} - {self.time_window.end}): {self.status.value}"\n\n def __eq__(self, other):\n return (\n isinstance(other, PartitionTimeWindowStatus)\n and self.time_window == other.time_window\n and self.status == other.status\n )\n\n\ndef _flatten(\n high_pri_time_windows: List[PartitionTimeWindowStatus],\n low_pri_time_windows: List[PartitionTimeWindowStatus],\n) -> List[PartitionTimeWindowStatus]:\n high_pri_time_windows = sorted(high_pri_time_windows, key=lambda t: t.time_window.start)\n low_pri_time_windows = sorted(low_pri_time_windows, key=lambda t: t.time_window.start)\n\n high_pri_idx = 0\n low_pri_idx = 0\n\n filtered_low_pri: List[PartitionTimeWindowStatus] = []\n\n # slice and dice the low pri time windows so there's 
no overlap with high pri\n while True:\n if low_pri_idx >= len(low_pri_time_windows):\n # reached end of materialized\n break\n if high_pri_idx >= len(high_pri_time_windows):\n # reached end of failed, add all remaining materialized bc there's no overlap\n filtered_low_pri.extend(low_pri_time_windows[low_pri_idx:])\n break\n\n low_pri_tw = low_pri_time_windows[low_pri_idx]\n high_pri_tw = high_pri_time_windows[high_pri_idx]\n\n if low_pri_tw.time_window.start < high_pri_tw.time_window.start:\n if low_pri_tw.time_window.end <= high_pri_tw.time_window.start:\n # low_pri_tw is entirely before high pri\n filtered_low_pri.append(low_pri_tw)\n low_pri_idx += 1\n else:\n # high pri cuts the low pri short\n filtered_low_pri.append(\n PartitionTimeWindowStatus(\n TimeWindow(\n low_pri_tw.time_window.start,\n high_pri_tw.time_window.start,\n ),\n low_pri_tw.status,\n )\n )\n\n if low_pri_tw.time_window.end > high_pri_tw.time_window.end:\n # the low pri time window will continue on the other end of the high pri\n # and get split in two. Modify low_pri[low_pri_idx] to be\n # the second half of the low pri time window. It will be added in the next iteration.\n # (don't add it now, because we need to check if it overlaps with the next high pri)\n low_pri_time_windows[low_pri_idx] = PartitionTimeWindowStatus(\n TimeWindow(high_pri_tw.time_window.end, low_pri_tw.time_window.end),\n low_pri_tw.status,\n )\n high_pri_idx += 1\n else:\n # the rest of the low pri time window is inside the high pri time window\n low_pri_idx += 1\n else:\n if low_pri_tw.time_window.start >= high_pri_tw.time_window.end:\n # high pri is entirely before low pri. The next high pri may overlap\n high_pri_idx += 1\n elif low_pri_tw.time_window.end <= high_pri_tw.time_window.end:\n # low pri is entirely within high pri, skip it\n low_pri_idx += 1\n else:\n # high pri cuts out the start of the low pri. It will continue on the other end.\n # Modify low_pri[low_pri_idx] to shorten the start. 
It will be added\n # in the next iteration. (don't add it now, because we need to check if it overlaps with the next high pri)\n low_pri_time_windows[low_pri_idx] = PartitionTimeWindowStatus(\n TimeWindow(high_pri_tw.time_window.end, low_pri_tw.time_window.end),\n low_pri_tw.status,\n )\n high_pri_idx += 1\n\n # combine the high pri windwos with the filtered low pri windows\n flattened_time_windows = high_pri_time_windows\n flattened_time_windows.extend(filtered_low_pri)\n flattened_time_windows.sort(key=lambda t: t.time_window.start)\n return flattened_time_windows\n\n\ndef fetch_flattened_time_window_ranges(\n subsets: Mapping[PartitionRangeStatus, TimeWindowPartitionsSubset]\n) -> Sequence[PartitionTimeWindowStatus]:\n """Given potentially overlapping subsets, return a flattened list of timewindows where the highest priority status wins\n on overlaps.\n """\n prioritized_subsets = sorted(\n [(status, subset) for status, subset in subsets.items()],\n key=lambda t: PARTITION_RANGE_STATUS_PRIORITY.index(t[0]),\n )\n\n # progressively add lower priority time windows to the list of higher priority time windows\n flattened_time_window_statuses = []\n for status, subset in prioritized_subsets:\n subset_time_window_statuses = [\n PartitionTimeWindowStatus(tw, status) for tw in subset.included_time_windows\n ]\n flattened_time_window_statuses = _flatten(\n flattened_time_window_statuses, subset_time_window_statuses\n )\n\n return flattened_time_window_statuses\n\n\ndef has_one_dimension_time_window_partitioning(\n partitions_def: Optional[PartitionsDefinition],\n) -> bool:\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return True\n elif isinstance(partitions_def, MultiPartitionsDefinition):\n time_window_dims = [\n dim\n for dim in partitions_def.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n if len(time_window_dims) == 1:\n return 
True\n\n return False\n\n\ndef get_time_partitions_def(\n partitions_def: Optional[PartitionsDefinition],\n) -> Optional[TimeWindowPartitionsDefinition]:\n """For a given PartitionsDefinition, return the associated TimeWindowPartitionsDefinition if it\n exists.\n """\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if partitions_def is None:\n return None\n elif isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return partitions_def\n elif isinstance(\n partitions_def, MultiPartitionsDefinition\n ) and has_one_dimension_time_window_partitioning(partitions_def):\n return cast(\n TimeWindowPartitionsDefinition, partitions_def.time_window_dimension.partitions_def\n )\n else:\n return None\n\n\ndef get_time_partition_key(\n partitions_def: Optional[PartitionsDefinition], partition_key: Optional[str]\n) -> str:\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if partitions_def is None or partition_key is None:\n check.failed(\n "Cannot get time partitions key from when partitions def is None or partition key is"\n " None"\n )\n elif isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return partition_key\n elif isinstance(partitions_def, MultiPartitionsDefinition):\n return partitions_def.get_partition_key_from_str(partition_key).keys_by_dimension[\n partitions_def.time_window_dimension.name\n ]\n else:\n check.failed(f"Cannot get time partition from non-time partitions def {partitions_def}")\n
", "current_page_name": "_modules/dagster/_core/definitions/time_window_partitions", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.time_window_partitions"}, "unresolved_asset_job_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.unresolved_asset_job_definition

\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Mapping, NamedTuple, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated\nfrom dagster._core.definitions import AssetKey\nfrom dagster._core.definitions.run_request import RunRequest\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.instance import DynamicPartitionsStore\n\nfrom .asset_layer import build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .metadata import RawMetadataValue\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import (\n        AssetSelection,\n        ExecutorDefinition,\n        HookDefinition,\n        JobDefinition,\n        PartitionedConfig,\n        PartitionsDefinition,\n        ResourceDefinition,\n    )\n    from dagster._core.definitions.asset_graph import InternalAssetGraph\n    from dagster._core.definitions.asset_selection import CoercibleToAssetSelection\n    from dagster._core.definitions.run_config import RunConfig\n\n\nclass UnresolvedAssetJobDefinition(\n    NamedTuple(\n        "_UnresolvedAssetJobDefinition",\n        [\n            ("name", str),\n            ("selection", "AssetSelection"),\n            (\n                "config",\n                Optional[Union[ConfigMapping, Mapping[str, Any], "PartitionedConfig"]],\n            ),\n            ("description", Optional[str]),\n            ("tags", Optional[Mapping[str, Any]]),\n            ("metadata", Optional[Mapping[str, RawMetadataValue]]),\n            ("partitions_def", Optional["PartitionsDefinition"]),\n            ("executor_def", Optional["ExecutorDefinition"]),\n            ("hooks", Optional[AbstractSet["HookDefinition"]]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        selection: "AssetSelection",\n        config: Optional[\n            Union[ConfigMapping, Mapping[str, Any], 
"PartitionedConfig", "RunConfig"]\n        ] = None,\n        description: Optional[str] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet["HookDefinition"]] = None,\n    ):\n        from dagster._core.definitions import (\n            AssetSelection,\n            ExecutorDefinition,\n            HookDefinition,\n            PartitionsDefinition,\n        )\n        from dagster._core.definitions.run_config import convert_config_input\n\n        return super(UnresolvedAssetJobDefinition, cls).__new__(\n            cls,\n            name=check.str_param(name, "name"),\n            selection=check.inst_param(selection, "selection", AssetSelection),\n            config=convert_config_input(config),\n            description=check.opt_str_param(description, "description"),\n            tags=check.opt_mapping_param(tags, "tags"),\n            metadata=check.opt_mapping_param(metadata, "metadata"),\n            partitions_def=check.opt_inst_param(\n                partitions_def, "partitions_def", PartitionsDefinition\n            ),\n            executor_def=check.opt_inst_param(executor_def, "partitions_def", ExecutorDefinition),\n            hooks=check.opt_nullable_set_param(hooks, "hooks", of_type=HookDefinition),\n        )\n\n    @deprecated(\n        breaking_version="2.0.0",\n        additional_warn_text="Directly instantiate `RunRequest(partition_key=...)` instead.",\n    )\n    def run_request_for_partition(\n        self,\n        partition_key: str,\n        run_key: Optional[str] = None,\n        tags: Optional[Mapping[str, str]] = None,\n        asset_selection: Optional[Sequence[AssetKey]] = None,\n        run_config: Optional[Mapping[str, Any]] = None,\n        current_time: Optional[datetime] = None,\n        
dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n    ) -> RunRequest:\n        """Creates a RunRequest object for a run that processes the given partition.\n\n        Args:\n            partition_key: The key of the partition to request a run for.\n            run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n                only one run is created per run key across all sensor evaluations.  For schedules,\n                ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n                value means that a run will always be launched per evaluation.\n            tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n                to the launched run.\n            run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n                a :py:class:`PartitionedConfig`, this value will override replace the config\n                provided by it.\n            current_time (Optional[datetime]): Used to determine which time-partitions exist.\n                Defaults to now.\n            dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n                object that is responsible for fetching dynamic partitions. Required when the\n                partitions definition is a DynamicPartitionsDefinition with a name defined. 
Users\n                can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n        Returns:\n            RunRequest: an object that requests a run to process the given partition.\n        """\n        from dagster._core.definitions.partition import (\n            DynamicPartitionsDefinition,\n            PartitionedConfig,\n        )\n\n        if not self.partitions_def:\n            check.failed("Called run_request_for_partition on a non-partitioned job")\n\n        partitioned_config = PartitionedConfig.from_flexible_config(\n            self.config, self.partitions_def\n        )\n\n        if (\n            isinstance(self.partitions_def, DynamicPartitionsDefinition)\n            and self.partitions_def.name\n        ):\n            # Do not support using run_request_for_partition with dynamic partitions,\n            # since this requires querying the instance once per run request for the\n            # existent dynamic partitions\n            check.failed(\n                "run_request_for_partition is not supported for dynamic partitions. 
Instead, use"\n                " RunRequest(partition_key=...)"\n            )\n\n        self.partitions_def.validate_partition_key(\n            partition_key,\n            current_time=current_time,\n            dynamic_partitions_store=dynamic_partitions_store,\n        )\n\n        run_config = (\n            run_config\n            if run_config is not None\n            else partitioned_config.get_run_config_for_partition_key(partition_key)\n        )\n        run_request_tags = {\n            **(tags or {}),\n            **partitioned_config.get_tags_for_partition_key(partition_key),\n        }\n\n        return RunRequest(\n            job_name=self.name,\n            run_key=run_key,\n            run_config=run_config,\n            tags=run_request_tags,\n            asset_selection=asset_selection,\n            partition_key=partition_key,\n        )\n\n    def resolve(\n        self,\n        asset_graph: "InternalAssetGraph",\n        default_executor_def: Optional["ExecutorDefinition"] = None,\n        resource_defs: Optional[Mapping[str, "ResourceDefinition"]] = None,\n    ) -> "JobDefinition":\n        """Resolve this UnresolvedAssetJobDefinition into a JobDefinition."""\n        assets = asset_graph.assets\n        source_assets = asset_graph.source_assets\n        selected_asset_keys = self.selection.resolve(asset_graph)\n        selected_asset_checks = self.selection.resolve_checks(asset_graph)\n\n        asset_keys_by_partitions_def = defaultdict(set)\n        for asset_key in selected_asset_keys:\n            partitions_def = asset_graph.get_partitions_def(asset_key)\n            if partitions_def is not None:\n                asset_keys_by_partitions_def[partitions_def].add(asset_key)\n\n        if len(asset_keys_by_partitions_def) > 1:\n            keys_by_partitions_def_str = "\\n".join(\n                f"{partitions_def}: {asset_keys}"\n                for partitions_def, asset_keys in asset_keys_by_partitions_def.items()\n            )\n   
         raise DagsterInvalidDefinitionError(\n                f"Multiple partitioned assets exist in assets job '{self.name}'. Selected assets"\n                " must have the same partitions definitions, but the selected assets have"\n                f" different partitions definitions: \\n{keys_by_partitions_def_str}"\n            )\n\n        inferred_partitions_def = (\n            next(iter(asset_keys_by_partitions_def.keys()))\n            if asset_keys_by_partitions_def\n            else None\n        )\n        if (\n            inferred_partitions_def\n            and self.partitions_def != inferred_partitions_def\n            and self.partitions_def is not None\n        ):\n            raise DagsterInvalidDefinitionError(\n                f"Job '{self.name}' received a partitions_def of {self.partitions_def}, but the"\n                f" selected assets {next(iter(asset_keys_by_partitions_def.values()))} have a"\n                f" non-matching partitions_def of {inferred_partitions_def}"\n            )\n\n        return build_asset_selection_job(\n            name=self.name,\n            assets=assets,\n            asset_checks=asset_graph.asset_checks,\n            config=self.config,\n            source_assets=source_assets,\n            description=self.description,\n            tags=self.tags,\n            metadata=self.metadata,\n            asset_selection=selected_asset_keys,\n            asset_check_selection=selected_asset_checks,\n            partitions_def=self.partitions_def if self.partitions_def else inferred_partitions_def,\n            executor_def=self.executor_def or default_executor_def,\n            hooks=self.hooks,\n            resource_defs=resource_defs,\n        )\n\n\n
[docs]def define_asset_job(\n name: str,\n selection: Optional["CoercibleToAssetSelection"] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, Any], "PartitionedConfig", "RunConfig"]\n ] = None,\n description: Optional[str] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet["HookDefinition"]] = None,\n) -> UnresolvedAssetJobDefinition:\n """Creates a definition of a job which will either materialize a selection of assets or observe\n a selection of source assets. This will only be resolved to a JobDefinition once placed in a\n code location.\n\n Args:\n name (str):\n The name for the job.\n selection (Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]):\n The assets that will be materialized or observed when the job is run.\n\n The selected assets must all be included in the assets that are passed to the assets\n argument of the Definitions object that this job is included on.\n\n The string "my_asset*" selects my_asset and all downstream assets within the code\n location. A list of strings represents the union of all assets selected by strings\n within the list.\n\n The selection will be resolved to a set of assets when the location is loaded. If the\n selection resolves to all source assets, the created job will perform source asset\n observations. If the selection resolves to all regular assets, the created job will\n materialize assets. 
If the selection resolves to a mixed set of source assets and\n regular assets, an error will be thrown.\n\n config:\n Describes how the Job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n tags (Optional[Mapping[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Mapping[str, RawMetadataValue]]): Arbitrary metadata about the job.\n Keys are displayed string labels, and values are one of the following: string, float,\n int, JSON-serializable dict, JSON-serializable list, and one of the data classes\n returned by a MetadataValue static method.\n description (Optional[str]):\n A description for the Job.\n partitions_def (Optional[PartitionsDefinition]):\n Defines the set of partitions for this job. All AssetDefinitions selected for this job\n must have a matching PartitionsDefinition. If no PartitionsDefinition is provided, the\n PartitionsDefinition will be inferred from the selected AssetDefinitions.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. 
Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n\n\n Returns:\n UnresolvedAssetJobDefinition: The job, which can be placed inside a code location.\n\n Examples:\n .. code-block:: python\n\n # A job that targets all assets in the code location:\n @asset\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets")],\n )\n\n # A job that targets a single asset\n @asset\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets", selection=[asset1])],\n )\n\n # A job that targets all the assets in a group:\n defs = Definitions(\n assets=assets,\n jobs=[define_asset_job("marketing_job", selection=AssetSelection.groups("marketing"))],\n )\n\n @observable_source_asset\n def source_asset():\n ...\n\n # A job that observes a source asset:\n defs = Definitions(\n assets=assets,\n jobs=[define_asset_job("observation_job", selection=[source_asset])],\n )\n\n # Resources are supplied to the assets, not the job:\n @asset(required_resource_keys={"slack_client"})\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets")],\n resources={"slack_client": prod_slack_client},\n )\n\n """\n from dagster._core.definitions import AssetSelection\n\n # convert string-based selections to AssetSelection objects\n if selection is None:\n resolved_selection = AssetSelection.all()\n else:\n resolved_selection = AssetSelection.from_coercible(selection)\n\n return UnresolvedAssetJobDefinition(\n name=name,\n selection=resolved_selection,\n config=config,\n description=description,\n tags=tags,\n metadata=metadata,\n partitions_def=partitions_def,\n executor_def=executor_def,\n hooks=hooks,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/unresolved_asset_job_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.unresolved_asset_job_definition"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.utils

\nimport keyword\nimport os\nimport re\nfrom glob import glob\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, cast\n\nimport yaml\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.storage.tags import check_reserved_tags\nfrom dagster._utils.yaml_utils import merge_yaml_strings, merge_yamls\n\nDEFAULT_OUTPUT = "result"\nDEFAULT_GROUP_NAME = "default"  # asset group_name used when none is provided\nDEFAULT_IO_MANAGER_KEY = "io_manager"\n\nDISALLOWED_NAMES = set(\n    [\n        "context",\n        "conf",\n        "config",\n        "meta",\n        "arg_dict",\n        "dict",\n        "input_arg_dict",\n        "output_arg_dict",\n        "int",\n        "str",\n        "float",\n        "bool",\n        "input",\n        "output",\n        "type",\n    ]\n    + list(keyword.kwlist)  # just disallow all python keywords\n)\n\nVALID_NAME_REGEX_STR = r"^[A-Za-z0-9_]+$"\nVALID_NAME_REGEX = re.compile(VALID_NAME_REGEX_STR)\n\n\nclass NoValueSentinel:\n    """Sentinel value to distinguish unset from None."""\n\n\ndef has_valid_name_chars(name: str) -> bool:\n    return bool(VALID_NAME_REGEX.match(name))\n\n\ndef check_valid_name(name: str, allow_list: Optional[List[str]] = None) -> str:\n    check.str_param(name, "name")\n\n    if allow_list and name in allow_list:\n        return name\n\n    if name in DISALLOWED_NAMES:\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. It conflicts with a Dagster or python'\n            " reserved keyword."\n        )\n\n    check_valid_chars(name)\n\n    check.invariant(is_valid_name(name))\n    return name\n\n\ndef check_valid_chars(name: str):\n    if not has_valid_name_chars(name):\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. 
Names must be in regex'\n            f" {VALID_NAME_REGEX_STR}."\n        )\n\n\ndef is_valid_name(name: str) -> bool:\n    check.str_param(name, "name")\n\n    return name not in DISALLOWED_NAMES and has_valid_name_chars(name)\n\n\ndef _kv_str(key: object, value: object) -> str:\n    return f'{key}="{value!r}"'\n\n\ndef struct_to_string(name: str, **kwargs: object) -> str:\n    # Sort the kwargs to ensure consistent representations across Python versions\n    props_str = ", ".join([_kv_str(key, value) for key, value in sorted(kwargs.items())])\n    return f"{name}({props_str})"\n\n\ndef validate_tags(\n    tags: Optional[Mapping[str, Any]], allow_reserved_tags: bool = True\n) -> Mapping[str, str]:\n    valid_tags: Dict[str, str] = {}\n    for key, value in check.opt_mapping_param(tags, "tags", key_type=str).items():\n        if not isinstance(value, str):\n            valid = False\n            err_reason = f'Could not JSON encode value "{value}"'\n            str_val = None\n            try:\n                str_val = seven.json.dumps(value)\n                err_reason = (\n                    'JSON encoding "{json}" of value "{val}" is not equivalent to original value'\n                    .format(json=str_val, val=value)\n                )\n\n                valid = seven.json.loads(str_val) == value\n            except Exception:\n                pass\n\n            if not valid:\n                raise DagsterInvalidDefinitionError(\n                    f'Invalid value for tag "{key}", {err_reason}. 
Tag values must be strings '\n                    "or meet the constraint that json.loads(json.dumps(value)) == value."\n                )\n\n            valid_tags[key] = str_val  # type: ignore  # (possible none)\n        else:\n            valid_tags[key] = value\n\n    if not allow_reserved_tags:\n        check_reserved_tags(valid_tags)\n\n    return valid_tags\n\n\ndef validate_group_name(group_name: Optional[str]) -> str:\n    """Ensures a string name is valid and returns a default if no name provided."""\n    if group_name:\n        check_valid_chars(group_name)\n        return group_name\n    return DEFAULT_GROUP_NAME\n\n\n
[docs]def config_from_files(config_files: Sequence[str]) -> Mapping[str, Any]:\n """Constructs run config from YAML files.\n\n Args:\n config_files (List[str]): List of paths or glob patterns for yaml files\n to load and parse as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from provided YAML files.\n\n Raises:\n DagsterInvariantViolationError: When a path or glob pattern produces no results, or\n when one of the YAML files is invalid and has a parse error.\n """\n config_files = check.opt_sequence_param(config_files, "config_files")\n\n filenames = []\n for file_glob in config_files or []:\n globbed_files = glob(file_glob)\n if not globbed_files:\n raise DagsterInvariantViolationError(\n f'File or glob pattern "{file_glob}" for "config_files" produced no results.'\n )\n\n filenames += [os.path.realpath(globbed_file) for globbed_file in globbed_files]\n\n try:\n run_config = merge_yamls(filenames)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing files {filenames} "\n f"loaded by file/patterns {config_files}."\n ) from err\n\n return check.is_dict(cast(Dict[str, object], run_config), key_type=str)
\n\n\n
[docs]def config_from_yaml_strings(yaml_strings: Sequence[str]) -> Mapping[str, Any]:\n """Static constructor for run configs from YAML strings.\n\n Args:\n yaml_strings (List[str]): List of yaml strings to parse as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from the provided YAML strings.\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n yaml_strings = check.sequence_param(yaml_strings, "yaml_strings", of_type=str)\n\n try:\n run_config = merge_yaml_strings(yaml_strings)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing YAMLs {yaml_strings} "\n ) from err\n\n return check.is_dict(cast(Dict[str, object], run_config), key_type=str)
\n\n\n
[docs]def config_from_pkg_resources(pkg_resource_defs: Sequence[Tuple[str, str]]) -> Mapping[str, Any]:\n """Load a run config from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n .. code-block:: python\n\n config_from_pkg_resources(\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n pkg_resource_defs (List[(str, str)]): List of pkg_resource modules/files to\n load as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from the YAML package resources.\n\n Raises:\n DagsterInvariantViolationError: When a package resource cannot be loaded or decoded,\n or when one of the YAML documents is invalid and has a parse error.\n """\n import pkg_resources # expensive, import only on use\n\n pkg_resource_defs = check.sequence_param(pkg_resource_defs, "pkg_resource_defs", of_type=tuple)\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs}."\n ) from err\n\n return config_from_yaml_strings(yaml_strings=yaml_strings)
\n
", "current_page_name": "_modules/dagster/_core/definitions/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.utils"}, "version_strategy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.version_strategy

\nimport hashlib\nimport inspect\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Any, NamedTuple, Optional\n\nfrom dagster._annotations import public\n\nif TYPE_CHECKING:\n    from .op_definition import OpDefinition\n    from .resource_definition import ResourceDefinition\n\n\n
[docs]class OpVersionContext(NamedTuple):\n """Provides execution-time information for computing the version for an op.\n\n Attributes:\n op_def (OpDefinition): The definition of the op to compute a version for.\n op_config (Any): The parsed config to be passed to the op during execution.\n """\n\n op_def: "OpDefinition"\n op_config: Any
\n\n\n
[docs]class ResourceVersionContext(NamedTuple):\n """Provides execution-time information for computing the version for a resource.\n\n Attributes:\n resource_def (ResourceDefinition): The definition of the resource whose version will be computed.\n resource_config (Any): The parsed config to be passed to the resource during execution.\n """\n\n resource_def: "ResourceDefinition"\n resource_config: Any
\n\n\n
[docs]class VersionStrategy(ABC):\n """Abstract class for defining a strategy to version ops and resources.\n\n When subclassing, `get_op_version` must be implemented, and\n `get_resource_version` can be optionally implemented.\n\n `get_op_version` should ingest an OpVersionContext, and `get_resource_version` should ingest a\n ResourceVersionContext. From that, each synthesize a unique string called\n a `version`, which will\n be tagged to outputs of that op in the job. Providing a\n `VersionStrategy` instance to a\n job will enable memoization on that job, such that only steps whose\n outputs do not have an up-to-date version will run.\n """\n\n
[docs] @public\n @abstractmethod\n def get_op_version(self, context: OpVersionContext) -> str:\n """Computes a version for an op.\n\n Args:\n context (OpVersionContext): The context for computing the version.\n\n Returns:\n str: The version for the op.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n """Computes a version for a resource.\n\n Args:\n context (ResourceVersionContext): The context for computing the version.\n\n Returns:\n Optional[str]: The version for the resource. If None, the resource will not be\n memoized.\n """\n return None
\n\n\n
[docs]class SourceHashVersionStrategy(VersionStrategy):\n """VersionStrategy that checks for changes to the source code of ops and resources.\n\n Only checks for changes within the immediate body of the op/resource's\n decorated function (or compute function, if the op/resource was\n constructed directly from a definition).\n """\n\n def _get_source_hash(self, fn):\n code_as_str = inspect.getsource(fn)\n return hashlib.sha1(code_as_str.encode("utf-8")).hexdigest()\n\n
[docs] @public\n def get_op_version(self, context: OpVersionContext) -> str:\n """Computes a version for an op by hashing its source code.\n\n Args:\n context (OpVersionContext): The context for computing the version.\n\n Returns:\n str: The version for the op.\n """\n compute_fn = context.op_def.compute_fn\n if callable(compute_fn):\n return self._get_source_hash(compute_fn)\n else:\n return self._get_source_hash(compute_fn.decorated_fn)
\n\n
[docs] @public\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n """Computes a version for a resource by hashing its source code.\n\n Args:\n context (ResourceVersionContext): The context for computing the version.\n\n Returns:\n Optional[str]: The version for the resource. If None, the resource will not be\n memoized.\n """\n return self._get_source_hash(context.resource_def.resource_fn)
\n
", "current_page_name": "_modules/dagster/_core/definitions/version_strategy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.version_strategy"}}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.errors

\n"""Core Dagster error classes.\n\nAll errors thrown by the Dagster framework inherit from :py:class:`~dagster.DagsterError`. Users\nshould not subclass this base class for their own exceptions.\n\nThere is another exception base class, :py:class:`~dagster.DagsterUserCodeExecutionError`, which is\nused by the framework in concert with the :py:func:`~dagster._core.errors.user_code_error_boundary`.\n\nDagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\n:py:class:`~dagster.DagsterUserCodeExecutionError`.\n\nThe wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.\n"""\n\nimport sys\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, Type\n\nimport dagster._check as check\nfrom dagster._utils.interrupts import raise_interrupts_as\n\nif TYPE_CHECKING:\n    from dagster._core.log_manager import DagsterLogManager\n\n\nclass DagsterExecutionInterruptedError(BaseException):\n    """Pipeline execution was interrupted during the execution process.\n\n    Just like KeyboardInterrupt this inherits from BaseException\n    as to not be accidentally caught by code that catches Exception\n    and thus prevent the interpreter from exiting.\n    """\n\n\n
[docs]class DagsterError(Exception):\n """Base class for all errors thrown by the Dagster framework.\n\n Users should not subclass this base class for their own exceptions.\n """\n\n @property\n def is_user_code_error(self):\n """Returns true if this error is attributable to user code."""\n return False
\n\n\n
[docs]class DagsterInvalidDefinitionError(DagsterError):\n """Indicates that the rules for a definition have been violated by the user."""
\n\n\nclass DagsterInvalidObservationError(DagsterError):\n """Indicates that an invalid value was returned from a source asset observation function."""\n\n\n
[docs]class DagsterInvalidSubsetError(DagsterError):\n """Indicates that a subset of a pipeline is invalid because either:\n - One or more ops in the specified subset do not exist on the job.\n - The subset produces an invalid job.\n """
\n\n\nclass DagsterInvalidDeserializationVersionError(DagsterError):\n """Indicates that a serialized value has an unsupported version and cannot be deserialized."""\n\n\nPYTHONIC_CONFIG_ERROR_VERBIAGE = """\nThis config type can be a:\n - Python primitive type\n - int, float, bool, str, list\n - A Python Dict or List type containing other valid types\n - Custom data classes extending dagster.Config\n - A Pydantic discriminated union type (https://docs.pydantic.dev/usage/types/#discriminated-unions-aka-tagged-unions)\n"""\n\nPYTHONIC_RESOURCE_ADDITIONAL_TYPES = """\n\nIf this config type represents a resource dependency, its annotation must either:\n - Extend dagster.ConfigurableResource, dagster.ConfigurableIOManager, or\n - Be wrapped in a ResourceDependency annotation, e.g. ResourceDependency[{invalid_type_str}]\n"""\n\n\ndef _generate_pythonic_config_error_message(\n config_class: Optional[Type],\n field_name: Optional[str],\n invalid_type: Any,\n is_resource: bool = False,\n) -> str:\n invalid_type_name = getattr(invalid_type, "__name__", "<my type>")\n pythonic_config_error_verbiage = (\n PYTHONIC_CONFIG_ERROR_VERBIAGE + (PYTHONIC_RESOURCE_ADDITIONAL_TYPES if is_resource else "")\n ).format(invalid_type_str=invalid_type_name)\n\n return ("""\nError defining Dagster config class{config_class}{field_name}.\nUnable to resolve config type {invalid_type} to a supported Dagster config type.\n\n{PYTHONIC_CONFIG_ERROR_VERBIAGE}""").format(\n config_class=f" {config_class!r}" if config_class else "",\n field_name=f" on field '{field_name}'" if field_name else "",\n invalid_type=repr(invalid_type),\n PYTHONIC_CONFIG_ERROR_VERBIAGE=pythonic_config_error_verbiage,\n )\n\n\nclass DagsterInvalidPythonicConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a Pythonic config or resource class with an invalid value."""\n\n def __init__(\n self,\n config_class: Optional[Type],\n field_name: Optional[str],\n invalid_type: Any,\n is_resource: 
bool = False,\n **kwargs,\n ):\n self.invalid_type = invalid_type\n self.field_name = field_name\n self.config_class = config_class\n super(DagsterInvalidPythonicConfigDefinitionError, self).__init__(\n _generate_pythonic_config_error_message(\n config_class=config_class,\n field_name=field_name,\n invalid_type=invalid_type,\n is_resource=is_resource,\n ),\n **kwargs,\n )\n\n\nclass DagsterInvalidDagsterTypeInPythonicConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a Pythonic config or resource class with a DagsterType\n annotated field.\n """\n\n def __init__(\n self,\n config_class_name: str,\n field_name: Optional[str],\n **kwargs,\n ):\n self.field_name = field_name\n super(DagsterInvalidDagsterTypeInPythonicConfigDefinitionError, self).__init__(\n f"""Error defining Dagster config class '{config_class_name}' on field '{field_name}'. DagsterTypes cannot be used to annotate a config type. DagsterType is meant only for type checking and coercion in op and asset inputs and outputs.\n{PYTHONIC_CONFIG_ERROR_VERBIAGE}""",\n **kwargs,\n )\n\n\nCONFIG_ERROR_VERBIAGE = """\nThis value can be a:\n - Field\n - Python primitive types that resolve to dagster config types\n - int, float, bool, str, list.\n - A dagster config type: Int, Float, Bool, Array, Optional, Selector, Shape, Permissive, Map\n - A bare python dictionary, which is wrapped in Field(Shape(...)). Any values\n in the dictionary get resolved by the same rules, recursively.\n - A python list with a single entry that can resolve to a type, e.g. [int]\n"""\n\n\n
[docs]class DagsterInvalidConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a config with an invalid value.\n\n Acceptable values for config types are any of:\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type: :py:data:`~dagster.Int`, :py:data:`~dagster.Float`,\n :py:data:`~dagster.Bool`, :py:data:`~dagster.String`,\n :py:data:`~dagster.StringSource`, :py:data:`~dagster.Any`,\n :py:class:`~dagster.Array`, :py:data:`~dagster.Noneable`, :py:data:`~dagster.Enum`,\n :py:class:`~dagster.Selector`, :py:class:`~dagster.Shape`, or\n :py:class:`~dagster.Permissive`.\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n 5. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self, original_root, current_value, stack, reason=None, **kwargs):\n self.original_root = original_root\n self.current_value = current_value\n self.stack = stack\n super(DagsterInvalidConfigDefinitionError, self).__init__(\n (\n "Error defining config. Original value passed: {original_root}. "\n "{stack_str}{current_value} "\n "cannot be resolved.{reason_str}"\n + CONFIG_ERROR_VERBIAGE\n ).format(\n original_root=repr(original_root),\n stack_str="Error at stack path :" + ":".join(stack) + ". " if stack else "",\n current_value=repr(current_value),\n reason_str=f" Reason: {reason}." if reason else "",\n ),\n **kwargs,\n )
\n\n\n
[docs]class DagsterInvariantViolationError(DagsterError):\n """Indicates the user has violated a well-defined invariant that can only be enforced\n at runtime.\n """
\n\n\n
[docs]class DagsterExecutionStepNotFoundError(DagsterError):\n """Thrown when the user specifies execution step keys that do not exist."""\n\n def __init__(self, *args, **kwargs):\n self.step_keys = check.list_param(kwargs.pop("step_keys"), "step_keys", str)\n super(DagsterExecutionStepNotFoundError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterExecutionPlanSnapshotNotFoundError(DagsterError):\n """Thrown when an expected execution plan snapshot could not be found on a PipelineRun."""\n\n\n
[docs]class DagsterRunNotFoundError(DagsterError):\n """Thrown when a run cannot be found in run storage."""\n\n def __init__(self, *args, **kwargs):\n self.invalid_run_id = check.str_param(kwargs.pop("invalid_run_id"), "invalid_run_id")\n super(DagsterRunNotFoundError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterStepOutputNotFoundError(DagsterError):\n """Indicates that previous step outputs required for an execution step to proceed are not\n available.\n """\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterStepOutputNotFoundError, self).__init__(*args, **kwargs)
\n\n\n@contextmanager\ndef raise_execution_interrupts() -> Iterator[None]:\n with raise_interrupts_as(DagsterExecutionInterruptedError):\n yield\n\n\n
[docs]@contextmanager\ndef user_code_error_boundary(\n error_cls: Type["DagsterUserCodeExecutionError"],\n msg_fn: Callable[[], str],\n log_manager: Optional["DagsterLogManager"] = None,\n **kwargs: object,\n) -> Iterator[None]:\n """Wraps the execution of user-space code in an error boundary. This places a uniform\n policy around any user code invoked by the framework. This ensures that all user\n errors are wrapped in an exception derived from DagsterUserCodeExecutionError,\n and that the original stack trace of the user error is preserved, so that it\n can be reported without confusing framework code in the stack trace, if a\n tool author wishes to do so.\n\n Examples:\n .. code-block:: python\n\n with user_code_error_boundary(\n # Pass a class that inherits from DagsterUserCodeExecutionError\n DagsterExecutionStepExecutionError,\n # Pass a function that produces a message\n "Error occurred during step execution"\n ):\n call_user_provided_function()\n\n """\n check.callable_param(msg_fn, "msg_fn")\n check.class_param(error_cls, "error_cls", superclass=DagsterUserCodeExecutionError)\n\n with raise_execution_interrupts():\n if log_manager:\n log_manager.begin_python_log_capture()\n try:\n yield\n except DagsterError as de:\n # The system has thrown an error that is part of the user-framework contract\n raise de\n except Exception as e:\n # An exception has been thrown by user code and computation should cease\n # with the error reported further up the stack\n raise error_cls(\n msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs\n ) from e\n finally:\n if log_manager:\n log_manager.end_python_log_capture()
\n\n\n
[docs]class DagsterUserCodeExecutionError(DagsterError):\n """This is the base class for any exception that is meant to wrap an\n :py:class:`~python:Exception` thrown by user code. It wraps that existing user code.\n The ``original_exc_info`` argument to the constructor is meant to be a tuple of the type\n returned by :py:func:`sys.exc_info <python:sys.exc_info>` at the call site of the constructor.\n\n Users should not subclass this base class for their own exceptions and should instead throw\n freely from user code. User exceptions will be automatically wrapped and rethrown.\n """\n\n def __init__(self, *args, **kwargs):\n # original_exc_info should be gotten from a sys.exc_info() call at the\n # callsite inside of the exception handler. this will allow consuming\n # code to *re-raise* the user error in it's original format\n # for cleaner error reporting that does not have framework code in it\n user_exception = check.inst_param(kwargs.pop("user_exception"), "user_exception", Exception)\n original_exc_info = check.tuple_param(kwargs.pop("original_exc_info"), "original_exc_info")\n\n check.invariant(original_exc_info[0] is not None)\n\n super(DagsterUserCodeExecutionError, self).__init__(args[0], *args[1:], **kwargs)\n\n self.user_exception = check.opt_inst_param(user_exception, "user_exception", Exception)\n self.original_exc_info = original_exc_info\n\n @property\n def is_user_code_error(self) -> bool:\n return True
\n\n\n
[docs]class DagsterTypeCheckError(DagsterUserCodeExecutionError):\n """Indicates an error in the op type system at runtime. E.g. a op receives an\n unexpected input, or produces an output that does not match the type of the output definition.\n """
\n\n\nclass DagsterExecutionLoadInputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while loading an input for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.input_name = check.str_param(kwargs.pop("input_name"), "input_name")\n super(DagsterExecutionLoadInputError, self).__init__(*args, **kwargs)\n\n\nclass DagsterExecutionHandleOutputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while handling an output for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterExecutionHandleOutputError, self).__init__(*args, **kwargs)\n\n\n
[docs]class DagsterExecutionStepExecutionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of an execution step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.op_name = check.str_param(kwargs.pop("op_name"), "op_name")\n self.op_def_name = check.str_param(kwargs.pop("op_def_name"), "op_def_name")\n super(DagsterExecutionStepExecutionError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterResourceFunctionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of the ``resource_fn`` in a\n :py:class:`~dagster.ResourceDefinition` during resource initialization.\n """
\n\n\n
[docs]class DagsterConfigMappingFunctionError(DagsterUserCodeExecutionError):\n """Indicates that an unexpected error occurred while executing the body of a config mapping\n function defined in a :py:class:`~dagster.JobDefinition` or `~dagster.GraphDefinition` during\n config parsing.\n """
\n\n\nclass DagsterTypeLoadingError(DagsterUserCodeExecutionError):\n """Indicates that an unexpected error occurred while executing the body of an type load\n function defined in a :py:class:`~dagster.DagsterTypeLoader` during loading of a custom type.\n """\n\n\n
[docs]class DagsterUnknownResourceError(DagsterError, AttributeError):\n # inherits from AttributeError as it is raised within a __getattr__ call... used to support\n # object hasattr method\n """Indicates that an unknown resource was accessed in the body of an execution step. May often\n happen by accessing a resource in the compute function of an op without first supplying the\n op with the correct `required_resource_keys` argument.\n """\n\n def __init__(self, resource_name, *args, **kwargs):\n self.resource_name = check.str_param(resource_name, "resource_name")\n msg = (\n f"Unknown resource `{resource_name}`. Specify `{resource_name}` as a required resource "\n "on the compute / config function that accessed it."\n )\n super(DagsterUnknownResourceError, self).__init__(msg, *args, **kwargs)
\n\n\nclass DagsterInvalidInvocationError(DagsterError):\n """Indicates that an error has occurred when an op has been invoked, but before the actual\n core compute has been reached.\n """\n\n\n
[docs]class DagsterInvalidConfigError(DagsterError):\n """Thrown when provided config is invalid (does not type check against the relevant config\n schema).\n """\n\n def __init__(self, preamble, errors, config_value, *args, **kwargs):\n from dagster._config import EvaluationError\n\n check.str_param(preamble, "preamble")\n self.errors = check.list_param(errors, "errors", of_type=EvaluationError)\n self.config_value = config_value\n\n error_msg = preamble\n error_messages = []\n\n for i_error, error in enumerate(self.errors):\n error_messages.append(error.message)\n error_msg += f"\\n Error {i_error + 1}: {error.message}"\n\n self.message = error_msg\n self.error_messages = error_messages\n\n super(DagsterInvalidConfigError, self).__init__(error_msg, *args, **kwargs)
\n\n\n
[docs]class DagsterUnmetExecutorRequirementsError(DagsterError):\n """Indicates the resolved executor is incompatible with the state of other systems\n such as the :py:class:`~dagster._core.instance.DagsterInstance` or system storage configuration.\n """
\n\n\n
[docs]class DagsterSubprocessError(DagsterError):\n """An exception has occurred in one or more of the child processes dagster manages.\n This error forwards the message and stack trace for all of the collected errors.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.subprocess_error_infos = check.list_param(\n kwargs.pop("subprocess_error_infos"), "subprocess_error_infos", SerializableErrorInfo\n )\n super(DagsterSubprocessError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterUserCodeUnreachableError(DagsterError):\n """Dagster was unable to reach a user code server to fetch information about user code."""\n\n\nclass DagsterUserCodeProcessError(DagsterError):\n """An exception has occurred in a user code process that the host process raising this error\n was communicating with.\n """\n\n @staticmethod\n def from_error_info(error_info):\n from dagster._utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterUserCodeProcessError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterUserCodeProcessError, self).__init__(*args, **kwargs)\n\n\nclass DagsterMaxRetriesExceededError(DagsterError):\n """Raised when raise_on_error is true, and retries were exceeded, this error should be raised."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterMaxRetriesExceededError, self).__init__(*args, **kwargs)\n\n @staticmethod\n def from_error_info(error_info):\n from dagster._utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterMaxRetriesExceededError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n\nclass DagsterCodeLocationNotFoundError(DagsterError):\n pass\n\n\nclass DagsterCodeLocationLoadError(DagsterError):\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.load_error_infos = 
check.list_param(\n kwargs.pop("load_error_infos"),\n "load_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterCodeLocationLoadError, self).__init__(*args, **kwargs)\n\n\nclass DagsterLaunchFailedError(DagsterError):\n """Indicates an error while attempting to launch a pipeline run."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterLaunchFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterBackfillFailedError(DagsterError):\n """Indicates an error while attempting to launch a backfill."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterBackfillFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterRunAlreadyExists(DagsterError):\n """Indicates that a pipeline run already exists in a run storage."""\n\n\nclass DagsterSnapshotDoesNotExist(DagsterError):\n """Indicates you attempted to create a pipeline run with a nonexistent snapshot id."""\n\n\nclass DagsterRunConflict(DagsterError):\n """Indicates that a conflicting pipeline run exists in a run storage."""\n\n\n
[docs]class DagsterTypeCheckDidNotPass(DagsterError):\n """Indicates that a type check failed.\n\n This is raised when ``raise_on_error`` is ``True`` in calls to the synchronous job and\n graph execution APIs (e.g. `graph.execute_in_process()`, `job.execute_in_process()` -- typically\n within a test), and a :py:class:`~dagster.DagsterType`'s type check fails by returning either\n ``False`` or an instance of :py:class:`~dagster.TypeCheck` whose ``success`` member is ``False``.\n """\n\n def __init__(self, description=None, metadata=None, dagster_type=None):\n from dagster import DagsterType\n from dagster._core.definitions.metadata import normalize_metadata\n\n super(DagsterTypeCheckDidNotPass, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n )\n self.dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)
\n\n\nclass DagsterAssetCheckFailedError(DagsterError):\n """Indicates than an asset check failed."""\n\n\n
[docs]class DagsterEventLogInvalidForRun(DagsterError):\n """Raised when the event logs for a historical run are malformed or invalid."""\n\n def __init__(self, run_id):\n self.run_id = check.str_param(run_id, "run_id")\n super(DagsterEventLogInvalidForRun, self).__init__(\n f"Event logs invalid for run id {run_id}"\n )
\n\n\nclass ScheduleExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of schedule."""\n\n\nclass SensorExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of a sensor (or its job)."""\n\n\nclass PartitionExecutionError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions of a partition set schedule."""\n\n\nclass DagsterInvalidAssetKey(DagsterError):\n """Error raised by invalid asset key."""\n\n\nclass DagsterInvalidMetadata(DagsterError):\n """Error raised by invalid metadata parameters."""\n\n\nclass HookExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined hook."""\n\n\nclass RunStatusSensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined run status sensor."""\n\n\nclass FreshnessPolicySensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined freshness policy sensor."""\n\n\nclass DagsterImportError(DagsterError):\n """Import error raised while importing user-code."""\n\n\nclass JobError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions for a defined Job."""\n\n\nclass DagsterUnknownStepStateError(DagsterError):\n """When job execution completes with steps in an unknown state."""\n\n\nclass DagsterObjectStoreError(DagsterError):\n """Errors during an object store operation."""\n\n\nclass DagsterInvalidPropertyError(DagsterError):\n """Indicates that an invalid property was accessed. 
May often happen by accessing a property\n that no longer exists after breaking changes.\n """\n\n\nclass DagsterHomeNotSetError(DagsterError):\n """The user has tried to use a command that requires an instance or invoke DagsterInstance.get()\n without setting DAGSTER_HOME env var.\n """\n\n\nclass DagsterUnknownPartitionError(DagsterError):\n """The user has tried to access run config for a partition name that does not exist."""\n\n\nclass DagsterUndefinedDataVersionError(DagsterError):\n """The user attempted to retrieve the most recent logical version for an asset, but no logical version is defined."""\n\n\nclass DagsterAssetBackfillDataLoadError(DagsterError):\n """Indicates that an asset backfill is now unloadable. May happen when (1) a code location containing\n targeted assets is unloadable or (2) and asset or an asset's partitions definition has been removed.\n """\n\n\nclass DagsterDefinitionChangedDeserializationError(DagsterError):\n """Indicates that a stored value can't be deserialized because the definition needed to interpret\n it has changed.\n """\n\n\nclass DagsterPipesExecutionError(DagsterError):\n """Indicates that an error occurred during the execution of an external process."""\n
", "current_page_name": "_modules/dagster/_core/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.errors"}, "event_api": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.event_api

\nfrom datetime import datetime\nfrom typing import Callable, Mapping, NamedTuple, Optional, Sequence, Union\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization, AssetObservation\nfrom dagster._core.errors import DagsterInvalidInvocationError\nfrom dagster._core.events import DagsterEventType\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._serdes import whitelist_for_serdes\n\nEventHandlerFn: TypeAlias = Callable[[EventLogEntry, str], None]\n\n\n
[docs]class RunShardedEventsCursor(NamedTuple):\n """Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\n performance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\n run-sharded storages, the id field is ignored, since they may not be unique across shards.\n """\n\n id: int\n run_updated_after: datetime
\n\n\n
[docs]@whitelist_for_serdes\nclass EventLogRecord(NamedTuple):\n """Internal representation of an event record, as stored in a\n :py:class:`~dagster._core.storage.event_log.EventLogStorage`.\n\n Users should not instantiate this class directly.\n """\n\n storage_id: PublicAttr[int]\n event_log_entry: PublicAttr[EventLogEntry]\n\n @property\n def run_id(self) -> str:\n return self.event_log_entry.run_id\n\n @property\n def timestamp(self) -> float:\n return self.event_log_entry.timestamp\n\n @property\n def asset_key(self) -> Optional[AssetKey]:\n dagster_event = self.event_log_entry.dagster_event\n if dagster_event:\n return dagster_event.asset_key\n\n return None\n\n @property\n def partition_key(self) -> Optional[str]:\n dagster_event = self.event_log_entry.dagster_event\n if dagster_event:\n return dagster_event.partition\n\n return None\n\n @property\n def asset_materialization(self) -> Optional[AssetMaterialization]:\n return self.event_log_entry.asset_materialization\n\n @property\n def asset_observation(self) -> Optional[AssetObservation]:\n return self.event_log_entry.asset_observation
\n\n\n
[docs]@whitelist_for_serdes\nclass EventRecordsFilter(\n NamedTuple(\n "_EventRecordsFilter",\n [\n ("event_type", DagsterEventType),\n ("asset_key", Optional[AssetKey]),\n ("asset_partitions", Optional[Sequence[str]]),\n ("after_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("before_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("after_timestamp", Optional[float]),\n ("before_timestamp", Optional[float]),\n ("storage_ids", Optional[Sequence[int]]),\n ("tags", Optional[Mapping[str, Union[str, Sequence[str]]]]),\n ],\n )\n):\n """Defines a set of filter fields for fetching a set of event log entries or event log records.\n\n Args:\n event_type (DagsterEventType): Filter argument for dagster event type\n asset_key (Optional[AssetKey]): Asset key for which to get asset materialization event\n entries / records.\n asset_partitions (Optional[List[str]]): Filter parameter such that only asset\n events with a partition value matching one of the provided values. Only\n valid when the `asset_key` parameter is provided.\n after_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that only\n records with storage_id greater than the provided value are returned. Using a\n run-sharded events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n before_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that\n records with storage_id less than the provided value are returned. 
Using a run-sharded\n events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n after_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp greater than the provided value are returned.\n before_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp less than the provided value are returned.\n """\n\n def __new__(\n cls,\n event_type: DagsterEventType,\n asset_key: Optional[AssetKey] = None,\n asset_partitions: Optional[Sequence[str]] = None,\n after_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n before_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n after_timestamp: Optional[float] = None,\n before_timestamp: Optional[float] = None,\n storage_ids: Optional[Sequence[int]] = None,\n tags: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,\n ):\n check.opt_sequence_param(asset_partitions, "asset_partitions", of_type=str)\n check.inst_param(event_type, "event_type", DagsterEventType)\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str)\n if tags and event_type is not DagsterEventType.ASSET_MATERIALIZATION:\n raise DagsterInvalidInvocationError(\n "Can only filter by tags for asset materialization events"\n )\n\n # type-ignores work around mypy type inference bug\n return super(EventRecordsFilter, cls).__new__(\n cls,\n event_type=event_type,\n asset_key=check.opt_inst_param(asset_key, "asset_key", AssetKey),\n asset_partitions=asset_partitions,\n after_cursor=check.opt_inst_param(\n after_cursor, "after_cursor", (int, RunShardedEventsCursor)\n ),\n before_cursor=check.opt_inst_param(\n before_cursor, "before_cursor", (int, RunShardedEventsCursor)\n ),\n after_timestamp=check.opt_float_param(after_timestamp, "after_timestamp"),\n before_timestamp=check.opt_float_param(before_timestamp, "before_timestamp"),\n 
storage_ids=check.opt_nullable_sequence_param(storage_ids, "storage_ids", of_type=int),\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/_core/event_api", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.event_api"}, "events": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.events

\n"""Structured representations of system events."""\nimport logging\nimport os\nimport sys\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    HookDefinition,\n    NodeHandle,\n)\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.events import AssetLineageInfo, ObjectStoreOperationType\nfrom dagster._core.definitions.metadata import (\n    MetadataFieldSerializer,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import HookExecutionError\nfrom dagster._core.execution.context.system import IPlanContext, IStepContext, StepExecutionContext\nfrom dagster._core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster._core.execution.plan.inputs import StepInputData\nfrom dagster._core.execution.plan.objects import StepFailureData, StepRetryData, StepSuccessData\nfrom dagster._core.execution.plan.outputs import StepOutputData\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.captured_log_manager import CapturedLogContext\nfrom dagster._core.storage.dagster_run import DagsterRunStatus\nfrom dagster._serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.serdes import UnpackContext\nfrom dagster._utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info\nfrom dagster._utils.timing import format_duration\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.events import ObjectStoreOperation\n    from dagster._core.execution.plan.plan 
import ExecutionPlan\n    from dagster._core.execution.plan.step import StepKind\n\n\nEventSpecificData = Union[\n    StepOutputData,\n    StepFailureData,\n    StepSuccessData,\n    "StepMaterializationData",\n    "StepExpectationResultData",\n    StepInputData,\n    "EngineEventData",\n    "HookErroredData",\n    StepRetryData,\n    "JobFailureData",\n    "JobCanceledData",\n    "ObjectStoreOperationResultData",\n    "HandledOutputData",\n    "LoadedInputData",\n    "ComputeLogsCaptureData",\n    "AssetObservationData",\n    "AssetMaterializationPlannedData",\n    "AssetCheckEvaluation",\n    "AssetCheckEvaluationPlanned",\n]\n\n\n
[docs]class DagsterEventType(str, Enum):\n """The types of events that may be yielded by op and job execution."""\n\n STEP_OUTPUT = "STEP_OUTPUT"\n STEP_INPUT = "STEP_INPUT"\n STEP_FAILURE = "STEP_FAILURE"\n STEP_START = "STEP_START"\n STEP_SUCCESS = "STEP_SUCCESS"\n STEP_SKIPPED = "STEP_SKIPPED"\n\n # The process carrying out step execution is starting/started. Shown as a\n # marker start/end in the Dagster UI.\n STEP_WORKER_STARTING = "STEP_WORKER_STARTING"\n STEP_WORKER_STARTED = "STEP_WORKER_STARTED"\n\n # Resource initialization for execution has started/succeede/failed. Shown\n # as a marker start/end in the Dagster UI.\n RESOURCE_INIT_STARTED = "RESOURCE_INIT_STARTED"\n RESOURCE_INIT_SUCCESS = "RESOURCE_INIT_SUCCESS"\n RESOURCE_INIT_FAILURE = "RESOURCE_INIT_FAILURE"\n\n STEP_UP_FOR_RETRY = "STEP_UP_FOR_RETRY" # "failed" but want to retry\n STEP_RESTARTED = "STEP_RESTARTED"\n\n ASSET_MATERIALIZATION = "ASSET_MATERIALIZATION"\n ASSET_MATERIALIZATION_PLANNED = "ASSET_MATERIALIZATION_PLANNED"\n ASSET_OBSERVATION = "ASSET_OBSERVATION"\n STEP_EXPECTATION_RESULT = "STEP_EXPECTATION_RESULT"\n ASSET_CHECK_EVALUATION_PLANNED = "ASSET_CHECK_EVALUATION_PLANNED"\n ASSET_CHECK_EVALUATION = "ASSET_CHECK_EVALUATION"\n\n # We want to display RUN_* events in the Dagster UI and in our LogManager output, but in order to\n # support backcompat for our storage layer, we need to keep the persisted value to be strings\n # of the form "PIPELINE_*". 
We may have user code that pass in the DagsterEventType\n # enum values into storage APIs (like get_event_records, which takes in an EventRecordsFilter).\n RUN_ENQUEUED = "PIPELINE_ENQUEUED"\n RUN_DEQUEUED = "PIPELINE_DEQUEUED"\n RUN_STARTING = "PIPELINE_STARTING" # Launch is happening, execution hasn't started yet\n RUN_START = "PIPELINE_START" # Execution has started\n RUN_SUCCESS = "PIPELINE_SUCCESS"\n RUN_FAILURE = "PIPELINE_FAILURE"\n RUN_CANCELING = "PIPELINE_CANCELING"\n RUN_CANCELED = "PIPELINE_CANCELED"\n\n # Keep these legacy enum values around, to keep back-compatability for user code that might be\n # using these constants to filter event records\n PIPELINE_ENQUEUED = RUN_ENQUEUED\n PIPELINE_DEQUEUED = RUN_DEQUEUED\n PIPELINE_STARTING = RUN_STARTING\n PIPELINE_START = RUN_START\n PIPELINE_SUCCESS = RUN_SUCCESS\n PIPELINE_FAILURE = RUN_FAILURE\n PIPELINE_CANCELING = RUN_CANCELING\n PIPELINE_CANCELED = RUN_CANCELED\n\n OBJECT_STORE_OPERATION = "OBJECT_STORE_OPERATION"\n ASSET_STORE_OPERATION = "ASSET_STORE_OPERATION"\n LOADED_INPUT = "LOADED_INPUT"\n HANDLED_OUTPUT = "HANDLED_OUTPUT"\n\n ENGINE_EVENT = "ENGINE_EVENT"\n\n HOOK_COMPLETED = "HOOK_COMPLETED"\n HOOK_ERRORED = "HOOK_ERRORED"\n HOOK_SKIPPED = "HOOK_SKIPPED"\n\n ALERT_START = "ALERT_START"\n ALERT_SUCCESS = "ALERT_SUCCESS"\n ALERT_FAILURE = "ALERT_FAILURE"\n\n LOGS_CAPTURED = "LOGS_CAPTURED"
\n\n\nEVENT_TYPE_VALUE_TO_DISPLAY_STRING = {\n "PIPELINE_ENQUEUED": "RUN_ENQUEUED",\n "PIPELINE_DEQUEUED": "RUN_DEQUEUED",\n "PIPELINE_STARTING": "RUN_STARTING",\n "PIPELINE_START": "RUN_START",\n "PIPELINE_SUCCESS": "RUN_SUCCESS",\n "PIPELINE_FAILURE": "RUN_FAILURE",\n "PIPELINE_CANCELING": "RUN_CANCELING",\n "PIPELINE_CANCELED": "RUN_CANCELED",\n}\n\nSTEP_EVENTS = {\n DagsterEventType.STEP_INPUT,\n DagsterEventType.STEP_START,\n DagsterEventType.STEP_OUTPUT,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.STEP_SUCCESS,\n DagsterEventType.STEP_SKIPPED,\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.STEP_EXPECTATION_RESULT,\n DagsterEventType.ASSET_CHECK_EVALUATION,\n DagsterEventType.OBJECT_STORE_OPERATION,\n DagsterEventType.HANDLED_OUTPUT,\n DagsterEventType.LOADED_INPUT,\n DagsterEventType.STEP_RESTARTED,\n DagsterEventType.STEP_UP_FOR_RETRY,\n}\n\nFAILURE_EVENTS = {\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.RUN_CANCELED,\n}\n\nPIPELINE_EVENTS = {\n DagsterEventType.RUN_ENQUEUED,\n DagsterEventType.RUN_DEQUEUED,\n DagsterEventType.RUN_STARTING,\n DagsterEventType.RUN_START,\n DagsterEventType.RUN_SUCCESS,\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.RUN_CANCELING,\n DagsterEventType.RUN_CANCELED,\n}\n\nHOOK_EVENTS = {\n DagsterEventType.HOOK_COMPLETED,\n DagsterEventType.HOOK_ERRORED,\n DagsterEventType.HOOK_SKIPPED,\n}\n\nALERT_EVENTS = {\n DagsterEventType.ALERT_START,\n DagsterEventType.ALERT_SUCCESS,\n DagsterEventType.ALERT_FAILURE,\n}\n\nMARKER_EVENTS = {\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.STEP_WORKER_STARTING,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n}\n\n\nEVENT_TYPE_TO_PIPELINE_RUN_STATUS = {\n DagsterEventType.RUN_START: DagsterRunStatus.STARTED,\n DagsterEventType.RUN_SUCCESS: DagsterRunStatus.SUCCESS,\n 
DagsterEventType.RUN_FAILURE: DagsterRunStatus.FAILURE,\n DagsterEventType.RUN_ENQUEUED: DagsterRunStatus.QUEUED,\n DagsterEventType.RUN_STARTING: DagsterRunStatus.STARTING,\n DagsterEventType.RUN_CANCELING: DagsterRunStatus.CANCELING,\n DagsterEventType.RUN_CANCELED: DagsterRunStatus.CANCELED,\n}\n\nPIPELINE_RUN_STATUS_TO_EVENT_TYPE = {v: k for k, v in EVENT_TYPE_TO_PIPELINE_RUN_STATUS.items()}\n\nASSET_EVENTS = {\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n}\n\nASSET_CHECK_EVENTS = {\n DagsterEventType.ASSET_CHECK_EVALUATION,\n DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED,\n}\n\n\ndef _assert_type(\n method: str,\n expected_type: Union[DagsterEventType, Sequence[DagsterEventType]],\n actual_type: DagsterEventType,\n) -> None:\n _expected_type = (\n [expected_type] if isinstance(expected_type, DagsterEventType) else expected_type\n )\n check.invariant(\n actual_type in _expected_type,\n f"{method} only callable when event_type is"\n f" {','.join([t.value for t in _expected_type])}, called on {actual_type}",\n )\n\n\ndef _validate_event_specific_data(\n event_type: DagsterEventType, event_specific_data: Optional["EventSpecificData"]\n) -> Optional["EventSpecificData"]:\n if event_type == DagsterEventType.STEP_OUTPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepOutputData)\n elif event_type == DagsterEventType.STEP_FAILURE:\n check.inst_param(event_specific_data, "event_specific_data", StepFailureData)\n elif event_type == DagsterEventType.STEP_SUCCESS:\n check.inst_param(event_specific_data, "event_specific_data", StepSuccessData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION:\n check.inst_param(event_specific_data, "event_specific_data", StepMaterializationData)\n elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:\n check.inst_param(event_specific_data, "event_specific_data", StepExpectationResultData)\n elif event_type == 
DagsterEventType.STEP_INPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepInputData)\n elif event_type in (\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.STEP_WORKER_STARTING,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n ):\n check.inst_param(event_specific_data, "event_specific_data", EngineEventData)\n elif event_type == DagsterEventType.HOOK_ERRORED:\n check.inst_param(event_specific_data, "event_specific_data", HookErroredData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n check.inst_param(\n event_specific_data, "event_specific_data", AssetMaterializationPlannedData\n )\n elif event_type == DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED:\n check.inst_param(event_specific_data, "event_specific_data", AssetCheckEvaluationPlanned)\n elif event_type == DagsterEventType.ASSET_CHECK_EVALUATION:\n check.inst_param(event_specific_data, "event_specific_data", AssetCheckEvaluation)\n\n return event_specific_data\n\n\ndef log_step_event(step_context: IStepContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n step_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for step {step_context.step.key}",\n dagster_event=event,\n )\n\n\ndef log_job_event(job_context: IPlanContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n job_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for pipeline {job_context.job_name}",\n dagster_event=event,\n )\n\n\ndef log_resource_event(log_manager: DagsterLogManager, event: "DagsterEvent") -> None:\n event_specific_data = cast(EngineEventData, 
event.event_specific_data)\n\n log_level = logging.ERROR if event_specific_data.error else logging.DEBUG\n log_manager.log_dagster_event(level=log_level, msg=event.message or "", dagster_event=event)\n\n\nclass DagsterEventSerializer(NamedTupleSerializer["DagsterEvent"]):\n def before_unpack(self, context, unpacked_dict: Any) -> Dict[str, Any]:\n event_type_value, event_specific_data = _handle_back_compat(\n unpacked_dict["event_type_value"], unpacked_dict.get("event_specific_data")\n )\n unpacked_dict["event_type_value"] = event_type_value\n unpacked_dict["event_specific_data"] = event_specific_data\n\n return unpacked_dict\n\n def handle_unpack_error(\n self,\n exc: Exception,\n context: UnpackContext,\n storage_dict: Dict[str, Any],\n ) -> "DagsterEvent":\n event_type_value, _ = _handle_back_compat(\n storage_dict["event_type_value"], storage_dict.get("event_specific_data")\n )\n step_key = storage_dict.get("step_key")\n orig_message = storage_dict.get("message")\n new_message = (\n f"Could not deserialize event of type {event_type_value}. This event may have been"\n " written by a newer version of Dagster."\n + (f' Original message: "{orig_message}"' if orig_message else "")\n )\n return DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n job_name=storage_dict["pipeline_name"],\n message=new_message,\n step_key=step_key,\n event_specific_data=EngineEventData(\n error=serializable_error_info_from_exc_info(sys.exc_info())\n ),\n )\n\n\n
[docs]@whitelist_for_serdes(\n serializer=DagsterEventSerializer,\n storage_field_names={\n "node_handle": "solid_handle",\n "job_name": "pipeline_name",\n },\n)\nclass DagsterEvent(\n NamedTuple(\n "_DagsterEvent",\n [\n ("event_type_value", str),\n ("job_name", str),\n ("step_handle", Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]]),\n ("node_handle", Optional[NodeHandle]),\n ("step_kind_value", Optional[str]),\n ("logging_tags", Optional[Mapping[str, str]]),\n ("event_specific_data", Optional["EventSpecificData"]),\n ("message", Optional[str]),\n ("pid", Optional[int]),\n ("step_key", Optional[str]),\n ],\n )\n):\n """Events yielded by op and job execution.\n\n Users should not instantiate this class.\n\n Attributes:\n event_type_value (str): Value for a DagsterEventType.\n job_name (str)\n node_handle (NodeHandle)\n step_kind_value (str): Value for a StepKind.\n logging_tags (Dict[str, str])\n event_specific_data (Any): Type must correspond to event_type_value.\n message (str)\n pid (int)\n step_key (Optional[str]): DEPRECATED\n """\n\n @staticmethod\n def from_step(\n event_type: "DagsterEventType",\n step_context: IStepContext,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n message=check.opt_str_param(message, "message"),\n pid=os.getpid(),\n )\n\n log_step_event(step_context, event)\n\n return event\n\n @staticmethod\n def from_job(\n event_type: DagsterEventType,\n job_context: IPlanContext,\n message: Optional[str] = None,\n event_specific_data: Optional["EventSpecificData"] = 
None,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n ) -> "DagsterEvent":\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n )\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=job_context.job_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n step_handle=step_handle,\n pid=os.getpid(),\n )\n\n log_job_event(job_context, event)\n\n return event\n\n @staticmethod\n def from_resource(\n event_type: DagsterEventType,\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n message: Optional[str] = None,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=job_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(\n DagsterEventType.ENGINE_EVENT, event_specific_data\n ),\n step_handle=execution_plan.step_handle_for_single_step_plans(),\n pid=os.getpid(),\n )\n log_resource_event(log_manager, event)\n return event\n\n def __new__(\n cls,\n event_type_value: str,\n job_name: str,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n node_handle: Optional[NodeHandle] = None,\n step_kind_value: Optional[str] = None,\n logging_tags: Optional[Mapping[str, str]] = None,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n pid: Optional[int] = None,\n # legacy\n step_key: Optional[str] = None,\n ):\n # old events may contain node_handle but not step_handle\n if node_handle is not None and step_handle is None:\n step_handle = StepHandle(node_handle)\n\n # Legacy events may have step_key set directly, preserve those to stay 
in sync\n # with legacy execution plan snapshots.\n if step_handle is not None and step_key is None:\n step_key = step_handle.to_key()\n\n return super(DagsterEvent, cls).__new__(\n cls,\n check.str_param(event_type_value, "event_type_value"),\n check.str_param(job_name, "job_name"),\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n ),\n check.opt_inst_param(node_handle, "node_handle", NodeHandle),\n check.opt_str_param(step_kind_value, "step_kind_value"),\n check.opt_mapping_param(logging_tags, "logging_tags"),\n _validate_event_specific_data(DagsterEventType(event_type_value), event_specific_data),\n check.opt_str_param(message, "message"),\n check.opt_int_param(pid, "pid"),\n check.opt_str_param(step_key, "step_key"),\n )\n\n @property\n def node_name(self) -> str:\n check.invariant(self.node_handle is not None)\n node_handle = cast(NodeHandle, self.node_handle)\n return node_handle.name\n\n @public\n @property\n def event_type(self) -> DagsterEventType:\n """DagsterEventType: The type of this event."""\n return DagsterEventType(self.event_type_value)\n\n @public\n @property\n def is_step_event(self) -> bool:\n """bool: If this event relates to a specific step."""\n return self.event_type in STEP_EVENTS\n\n @public\n @property\n def is_hook_event(self) -> bool:\n """bool: If this event relates to the execution of a hook."""\n return self.event_type in HOOK_EVENTS\n\n @property\n def is_alert_event(self) -> bool:\n return self.event_type in ALERT_EVENTS\n\n @property\n def step_kind(self) -> "StepKind":\n from dagster._core.execution.plan.step import StepKind\n\n return StepKind(self.step_kind_value)\n\n @public\n @property\n def is_step_success(self) -> bool:\n """bool: If this event is of type STEP_SUCCESS."""\n return self.event_type == DagsterEventType.STEP_SUCCESS\n\n @public\n @property\n def is_successful_output(self) -> bool:\n """bool: If this event is of type STEP_OUTPUT."""\n return self.event_type == 
DagsterEventType.STEP_OUTPUT\n\n @public\n @property\n def is_step_start(self) -> bool:\n """bool: If this event is of type STEP_START."""\n return self.event_type == DagsterEventType.STEP_START\n\n @public\n @property\n def is_step_failure(self) -> bool:\n """bool: If this event is of type STEP_FAILURE."""\n return self.event_type == DagsterEventType.STEP_FAILURE\n\n @public\n @property\n def is_resource_init_failure(self) -> bool:\n """bool: If this event is of type RESOURCE_INIT_FAILURE."""\n return self.event_type == DagsterEventType.RESOURCE_INIT_FAILURE\n\n @public\n @property\n def is_step_skipped(self) -> bool:\n """bool: If this event is of type STEP_SKIPPED."""\n return self.event_type == DagsterEventType.STEP_SKIPPED\n\n @public\n @property\n def is_step_up_for_retry(self) -> bool:\n """bool: If this event is of type STEP_UP_FOR_RETRY."""\n return self.event_type == DagsterEventType.STEP_UP_FOR_RETRY\n\n @public\n @property\n def is_step_restarted(self) -> bool:\n """bool: If this event is of type STEP_RESTARTED."""\n return self.event_type == DagsterEventType.STEP_RESTARTED\n\n @property\n def is_job_success(self) -> bool:\n return self.event_type == DagsterEventType.RUN_SUCCESS\n\n @property\n def is_job_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n @property\n def is_run_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n @public\n @property\n def is_failure(self) -> bool:\n """bool: If this event represents the failure of a run or step."""\n return self.event_type in FAILURE_EVENTS\n\n @property\n def is_job_event(self) -> bool:\n return self.event_type in PIPELINE_EVENTS\n\n @public\n @property\n def is_engine_event(self) -> bool:\n """bool: If this event is of type ENGINE_EVENT."""\n return self.event_type == DagsterEventType.ENGINE_EVENT\n\n @public\n @property\n def is_handled_output(self) -> bool:\n """bool: If this event is of type HANDLED_OUTPUT."""\n return self.event_type 
== DagsterEventType.HANDLED_OUTPUT\n\n @public\n @property\n def is_loaded_input(self) -> bool:\n """bool: If this event is of type LOADED_INPUT."""\n return self.event_type == DagsterEventType.LOADED_INPUT\n\n @public\n @property\n def is_step_materialization(self) -> bool:\n """bool: If this event is of type ASSET_MATERIALIZATION."""\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION\n\n @public\n @property\n def is_expectation_result(self) -> bool:\n """bool: If this event is of type STEP_EXPECTATION_RESULT."""\n return self.event_type == DagsterEventType.STEP_EXPECTATION_RESULT\n\n @public\n @property\n def is_asset_observation(self) -> bool:\n """bool: If this event is of type ASSET_OBSERVATION."""\n return self.event_type == DagsterEventType.ASSET_OBSERVATION\n\n @public\n @property\n def is_asset_materialization_planned(self) -> bool:\n """bool: If this event is of type ASSET_MATERIALIZATION_PLANNED."""\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED\n\n @public\n @property\n def asset_key(self) -> Optional[AssetKey]:\n """Optional[AssetKey]: For events that correspond to a specific asset_key / partition\n (ASSET_MATERIALIZTION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\n asset key. Otherwise, returns None.\n """\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.asset_key\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.asset_key\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.asset_key\n else:\n return None\n\n @public\n @property\n def partition(self) -> Optional[str]:\n """Optional[AssetKey]: For events that correspond to a specific asset_key / partition\n (ASSET_MATERIALIZTION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\n partition. 
Otherwise, returns None.\n """\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.partition\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.partition\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.partition\n else:\n return None\n\n @property\n def step_input_data(self) -> "StepInputData":\n _assert_type("step_input_data", DagsterEventType.STEP_INPUT, self.event_type)\n return cast(StepInputData, self.event_specific_data)\n\n @property\n def step_output_data(self) -> StepOutputData:\n _assert_type("step_output_data", DagsterEventType.STEP_OUTPUT, self.event_type)\n return cast(StepOutputData, self.event_specific_data)\n\n @property\n def step_success_data(self) -> "StepSuccessData":\n _assert_type("step_success_data", DagsterEventType.STEP_SUCCESS, self.event_type)\n return cast(StepSuccessData, self.event_specific_data)\n\n @property\n def step_failure_data(self) -> "StepFailureData":\n _assert_type("step_failure_data", DagsterEventType.STEP_FAILURE, self.event_type)\n return cast(StepFailureData, self.event_specific_data)\n\n @property\n def step_retry_data(self) -> "StepRetryData":\n _assert_type("step_retry_data", DagsterEventType.STEP_UP_FOR_RETRY, self.event_type)\n return cast(StepRetryData, self.event_specific_data)\n\n @property\n def step_materialization_data(self) -> "StepMaterializationData":\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data)\n\n @property\n def asset_observation_data(self) -> "AssetObservationData":\n _assert_type("asset_observation_data", DagsterEventType.ASSET_OBSERVATION, self.event_type)\n return cast(AssetObservationData, self.event_specific_data)\n\n @property\n def 
asset_materialization_planned_data(self) -> "AssetMaterializationPlannedData":\n _assert_type(\n "asset_materialization_planned",\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n self.event_type,\n )\n return cast(AssetMaterializationPlannedData, self.event_specific_data)\n\n @property\n def asset_check_planned_data(self) -> "AssetCheckEvaluationPlanned":\n _assert_type(\n "asset_check_planned",\n DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED,\n self.event_type,\n )\n return cast(AssetCheckEvaluationPlanned, self.event_specific_data)\n\n @property\n def step_expectation_result_data(self) -> "StepExpectationResultData":\n _assert_type(\n "step_expectation_result_data",\n DagsterEventType.STEP_EXPECTATION_RESULT,\n self.event_type,\n )\n return cast(StepExpectationResultData, self.event_specific_data)\n\n @property\n def materialization(self) -> AssetMaterialization:\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data).materialization\n\n @property\n def asset_check_evaluation_data(self) -> AssetCheckEvaluation:\n _assert_type(\n "asset_check_evaluation", DagsterEventType.ASSET_CHECK_EVALUATION, self.event_type\n )\n return cast(AssetCheckEvaluation, self.event_specific_data)\n\n @property\n def job_failure_data(self) -> "JobFailureData":\n _assert_type("job_failure_data", DagsterEventType.RUN_FAILURE, self.event_type)\n return cast(JobFailureData, self.event_specific_data)\n\n @property\n def engine_event_data(self) -> "EngineEventData":\n _assert_type(\n "engine_event_data",\n [\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.STEP_WORKER_STARTING,\n ],\n self.event_type,\n )\n return cast(EngineEventData, self.event_specific_data)\n\n @property\n def hook_completed_data(self) 
-> Optional["EventSpecificData"]:\n _assert_type("hook_completed_data", DagsterEventType.HOOK_COMPLETED, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_errored_data(self) -> "HookErroredData":\n _assert_type("hook_errored_data", DagsterEventType.HOOK_ERRORED, self.event_type)\n return cast(HookErroredData, self.event_specific_data)\n\n @property\n def hook_skipped_data(self) -> Optional["EventSpecificData"]:\n _assert_type("hook_skipped_data", DagsterEventType.HOOK_SKIPPED, self.event_type)\n return self.event_specific_data\n\n @property\n def logs_captured_data(self) -> "ComputeLogsCaptureData":\n _assert_type("logs_captured_data", DagsterEventType.LOGS_CAPTURED, self.event_type)\n return cast(ComputeLogsCaptureData, self.event_specific_data)\n\n @staticmethod\n def step_output_event(\n step_context: StepExecutionContext, step_output_data: StepOutputData\n ) -> "DagsterEvent":\n output_def = step_context.op.output_def_named(\n step_output_data.step_output_handle.output_name\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_OUTPUT,\n step_context=step_context,\n event_specific_data=step_output_data,\n message=(\n 'Yielded output "{output_name}"{mapping_clause} of type'\n ' "{output_type}".{type_check_clause}'.format(\n output_name=step_output_data.step_output_handle.output_name,\n output_type=output_def.dagster_type.display_name,\n type_check_clause=(\n (\n " Warning! 
Type check failed."\n if not step_output_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_output_data.type_check_data\n else " (No type check)."\n ),\n mapping_clause=(\n f' mapping key "{step_output_data.step_output_handle.mapping_key}"'\n if step_output_data.step_output_handle.mapping_key\n else ""\n ),\n )\n ),\n )\n\n @staticmethod\n def step_failure_event(\n step_context: IStepContext,\n step_failure_data: "StepFailureData",\n message=None,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_FAILURE,\n step_context=step_context,\n event_specific_data=step_failure_data,\n message=(message or f'Execution of step "{step_context.step.key}" failed.'),\n )\n\n @staticmethod\n def step_retry_event(\n step_context: IStepContext, step_retry_data: "StepRetryData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_UP_FOR_RETRY,\n step_context=step_context,\n event_specific_data=step_retry_data,\n message=(\n 'Execution of step "{step_key}" failed and has requested a retry{wait_str}.'.format(\n step_key=step_context.step.key,\n wait_str=(\n f" in {step_retry_data.seconds_to_wait} seconds"\n if step_retry_data.seconds_to_wait\n else ""\n ),\n )\n ),\n )\n\n @staticmethod\n def step_input_event(\n step_context: StepExecutionContext, step_input_data: "StepInputData"\n ) -> "DagsterEvent":\n input_def = step_context.op_def.input_def_named(step_input_data.input_name)\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_INPUT,\n step_context=step_context,\n event_specific_data=step_input_data,\n message='Got input "{input_name}" of type "{input_type}".{type_check_clause}'.format(\n input_name=step_input_data.input_name,\n input_type=input_def.dagster_type.display_name,\n type_check_clause=(\n (\n " Warning! 
Type check failed."\n if not step_input_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_input_data.type_check_data\n else " (No type check)."\n ),\n ),\n )\n\n @staticmethod\n def step_start_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_START,\n step_context=step_context,\n message=f'Started execution of step "{step_context.step.key}".',\n )\n\n @staticmethod\n def step_restarted_event(step_context: IStepContext, previous_attempts: int) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_RESTARTED,\n step_context=step_context,\n message='Started re-execution (attempt # {n}) of step "{step_key}".'.format(\n step_key=step_context.step.key, n=previous_attempts + 1\n ),\n )\n\n @staticmethod\n def step_success_event(\n step_context: IStepContext, success: "StepSuccessData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SUCCESS,\n step_context=step_context,\n event_specific_data=success,\n message='Finished execution of step "{step_key}" in {duration}.'.format(\n step_key=step_context.step.key,\n duration=format_duration(success.duration_ms),\n ),\n )\n\n @staticmethod\n def step_skipped_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SKIPPED,\n step_context=step_context,\n message=f'Skipped execution of step "{step_context.step.key}".',\n )\n\n @staticmethod\n def asset_materialization(\n step_context: IStepContext,\n materialization: AssetMaterialization,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n step_context=step_context,\n event_specific_data=StepMaterializationData(materialization),\n message=(\n materialization.description\n if materialization.description\n else "Materialized value{label_clause}.".format(\n label_clause=f" {materialization.label}" 
if materialization.label else ""\n )\n ),\n )\n\n @staticmethod\n def asset_observation(\n step_context: IStepContext, observation: AssetObservation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n step_context=step_context,\n event_specific_data=AssetObservationData(observation),\n )\n\n @staticmethod\n def asset_check_evaluation(\n step_context: IStepContext, asset_check_evaluation: AssetCheckEvaluation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_CHECK_EVALUATION,\n step_context=step_context,\n event_specific_data=asset_check_evaluation,\n )\n\n @staticmethod\n def step_expectation_result(\n step_context: IStepContext, expectation_result: ExpectationResult\n ) -> "DagsterEvent":\n def _msg():\n if expectation_result.description:\n return expectation_result.description\n\n return "Expectation{label_clause} {result_verb}".format(\n label_clause=" " + expectation_result.label if expectation_result.label else "",\n result_verb="passed" if expectation_result.success else "failed",\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_EXPECTATION_RESULT,\n step_context=step_context,\n event_specific_data=StepExpectationResultData(expectation_result),\n message=_msg(),\n )\n\n @staticmethod\n def job_start(job_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_START,\n job_context,\n message=f'Started execution of run for "{job_context.job_name}".',\n )\n\n @staticmethod\n def job_success(job_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_SUCCESS,\n job_context,\n message=f'Finished execution of run for "{job_context.job_name}".',\n )\n\n @staticmethod\n def job_failure(\n job_context_or_name: Union[IPlanContext, str],\n context_msg: str,\n error_info: Optional[SerializableErrorInfo] = None,\n ) -> "DagsterEvent":\n check.str_param(context_msg, 
"context_msg")\n if isinstance(job_context_or_name, IPlanContext):\n return DagsterEvent.from_job(\n DagsterEventType.RUN_FAILURE,\n job_context_or_name,\n message=(\n f'Execution of run for "{job_context_or_name.job_name}" failed. {context_msg}'\n ),\n event_specific_data=JobFailureData(error_info),\n )\n else:\n # when the failure happens trying to bring up context, the job_context hasn't been\n # built and so can't use from_pipeline\n check.str_param(job_context_or_name, "pipeline_name")\n event = DagsterEvent(\n event_type_value=DagsterEventType.RUN_FAILURE.value,\n job_name=job_context_or_name,\n event_specific_data=JobFailureData(error_info),\n message=f'Execution of run for "{job_context_or_name}" failed. {context_msg}',\n pid=os.getpid(),\n )\n return event\n\n @staticmethod\n def job_canceled(\n job_context: IPlanContext, error_info: Optional[SerializableErrorInfo] = None\n ) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_CANCELED,\n job_context,\n message=f'Execution of run for "{job_context.job_name}" canceled.',\n event_specific_data=JobCanceledData(\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo)\n ),\n )\n\n @staticmethod\n def step_worker_starting(\n step_context: IStepContext,\n message: str,\n metadata: Mapping[str, MetadataValue],\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n DagsterEventType.STEP_WORKER_STARTING,\n step_context,\n message=message,\n event_specific_data=EngineEventData(\n metadata=metadata, marker_start="step_process_start"\n ),\n )\n\n @staticmethod\n def step_worker_started(\n log_manager: DagsterLogManager,\n job_name: str,\n message: str,\n metadata: Mapping[str, MetadataValue],\n step_key: Optional[str],\n ) -> "DagsterEvent":\n event = DagsterEvent(\n DagsterEventType.STEP_WORKER_STARTED.value,\n job_name=job_name,\n message=message,\n event_specific_data=EngineEventData(metadata=metadata, marker_end="step_process_start"),\n pid=os.getpid(),\n 
step_key=step_key,\n )\n log_manager.log_dagster_event(\n level=logging.DEBUG,\n msg=message,\n dagster_event=event,\n )\n return event\n\n @staticmethod\n def resource_init_start(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_STARTED,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Starting initialization of resources [{}].".format(\n ", ".join(sorted(resource_keys))\n ),\n event_specific_data=EngineEventData(metadata={}, marker_start="resources"),\n )\n\n @staticmethod\n def resource_init_success(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_instances: Mapping[str, Any],\n resource_init_times: Mapping[str, str],\n ) -> "DagsterEvent":\n metadata = {}\n for key in resource_instances.keys():\n metadata[key] = MetadataValue.python_artifact(resource_instances[key].__class__)\n metadata[f"{key}:init_time"] = resource_init_times[key]\n\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Finished initialization of resources [{}].".format(\n ", ".join(sorted(resource_init_times.keys()))\n ),\n event_specific_data=EngineEventData(\n metadata=metadata,\n marker_end="resources",\n ),\n )\n\n @staticmethod\n def resource_init_failure(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_FAILURE,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Initialization of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n 
metadata={},\n marker_end="resources",\n error=error,\n ),\n )\n\n @staticmethod\n def resource_teardown_failure(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.ENGINE_EVENT,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Teardown of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata={},\n marker_start=None,\n marker_end=None,\n error=error,\n ),\n )\n\n @staticmethod\n def engine_event(\n plan_context: IPlanContext,\n message: str,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n if isinstance(plan_context, IStepContext):\n return DagsterEvent.from_step(\n DagsterEventType.ENGINE_EVENT,\n step_context=plan_context,\n event_specific_data=event_specific_data,\n message=message,\n )\n else:\n return DagsterEvent.from_job(\n DagsterEventType.ENGINE_EVENT,\n plan_context,\n message,\n event_specific_data=event_specific_data,\n )\n\n @staticmethod\n def object_store_operation(\n step_context: IStepContext, object_store_operation_result: "ObjectStoreOperation"\n ) -> "DagsterEvent":\n object_store_name = (\n f"{object_store_operation_result.object_store_name} "\n if object_store_operation_result.object_store_name\n else ""\n )\n\n serialization_strategy_modifier = (\n f" using {object_store_operation_result.serialization_strategy_name}"\n if object_store_operation_result.serialization_strategy_name\n else ""\n )\n\n value_name = object_store_operation_result.value_name\n\n if (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.SET_OBJECT\n ):\n message = (\n f"Stored intermediate object for output {value_name} in "\n f"{object_store_name}object store{serialization_strategy_modifier}."\n )\n elif (\n 
ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.GET_OBJECT\n ):\n message = (\n f"Retrieved intermediate object for input {value_name} in "\n f"{object_store_name}object store{serialization_strategy_modifier}."\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.CP_OBJECT\n ):\n message = (\n "Copied intermediate object for input {value_name} from {key} to {dest_key}"\n ).format(\n value_name=value_name,\n key=object_store_operation_result.key,\n dest_key=object_store_operation_result.dest_key,\n )\n else:\n message = ""\n\n return DagsterEvent.from_step(\n DagsterEventType.OBJECT_STORE_OPERATION,\n step_context,\n event_specific_data=ObjectStoreOperationResultData(\n op=object_store_operation_result.op,\n value_name=value_name,\n address=object_store_operation_result.key,\n metadata={"key": MetadataValue.path(object_store_operation_result.key)},\n version=object_store_operation_result.version,\n mapping_key=object_store_operation_result.mapping_key,\n ),\n message=message,\n )\n\n @staticmethod\n def handled_output(\n step_context: IStepContext,\n output_name: str,\n manager_key: str,\n message_override: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ) -> "DagsterEvent":\n message = f'Handled output "{output_name}" using IO manager "{manager_key}"'\n return DagsterEvent.from_step(\n event_type=DagsterEventType.HANDLED_OUTPUT,\n step_context=step_context,\n event_specific_data=HandledOutputData(\n output_name=output_name,\n manager_key=manager_key,\n metadata=metadata if metadata else {},\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def loaded_input(\n step_context: IStepContext,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n message_override: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ) -> 
"DagsterEvent":\n message = f'Loaded input "{input_name}" using input manager "{manager_key}"'\n if upstream_output_name:\n message += f', from output "{upstream_output_name}" of step "{upstream_step_key}"'\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.LOADED_INPUT,\n step_context=step_context,\n event_specific_data=LoadedInputData(\n input_name=input_name,\n manager_key=manager_key,\n upstream_output_name=upstream_output_name,\n upstream_step_key=upstream_step_key,\n metadata=metadata if metadata else {},\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def hook_completed(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_COMPLETED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n message=(\n f'Finished the execution of hook "{hook_def.name}" triggered for'\n f' "{step_context.op.name}".'\n ),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def hook_errored(\n step_context: StepExecutionContext, error: HookExecutionError\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_ERRORED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n event_specific_data=_validate_event_specific_data(\n event_type,\n HookErroredData(\n error=serializable_error_info_from_exc_info(error.original_exc_info)\n ),\n ),\n )\n\n step_context.log.log_dagster_event(level=logging.ERROR, msg=str(error), dagster_event=event)\n\n return 
event\n\n @staticmethod\n def hook_skipped(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_SKIPPED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n message=(\n f'Skipped the execution of hook "{hook_def.name}". It did not meet its triggering '\n f'condition during the execution of "{step_context.op.name}".'\n ),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def legacy_compute_log_step_event(step_context: StepExecutionContext):\n step_key = step_context.step.key\n return DagsterEvent.from_step(\n DagsterEventType.LOGS_CAPTURED,\n step_context,\n message=f"Started capturing logs for step: {step_key}.",\n event_specific_data=ComputeLogsCaptureData(\n step_keys=[step_key],\n file_key=step_key,\n ),\n )\n\n @staticmethod\n def capture_logs(\n job_context: IPlanContext,\n step_keys: Sequence[str],\n log_key: Sequence[str],\n log_context: CapturedLogContext,\n ):\n file_key = log_key[-1]\n return DagsterEvent.from_job(\n DagsterEventType.LOGS_CAPTURED,\n job_context,\n message=f"Started capturing logs in process (pid: {os.getpid()}).",\n event_specific_data=ComputeLogsCaptureData(\n step_keys=step_keys,\n file_key=file_key,\n external_stdout_url=log_context.external_stdout_url,\n external_stderr_url=log_context.external_stderr_url,\n external_url=log_context.external_url,\n ),\n )
\n\n\ndef get_step_output_event(\n events: Sequence[DagsterEvent], step_key: str, output_name: Optional[str] = "result"\n) -> Optional["DagsterEvent"]:\n check.sequence_param(events, "events", of_type=DagsterEvent)\n check.str_param(step_key, "step_key")\n check.str_param(output_name, "output_name")\n for event in events:\n if (\n event.event_type == DagsterEventType.STEP_OUTPUT\n and event.step_key == step_key\n and event.step_output_data.output_name == output_name\n ):\n return event\n return None\n\n\n@whitelist_for_serdes\nclass AssetObservationData(\n NamedTuple("_AssetObservation", [("asset_observation", AssetObservation)])\n):\n def __new__(cls, asset_observation: AssetObservation):\n return super(AssetObservationData, cls).__new__(\n cls,\n asset_observation=check.inst_param(\n asset_observation, "asset_observation", AssetObservation\n ),\n )\n\n\n@whitelist_for_serdes\nclass StepMaterializationData(\n NamedTuple(\n "_StepMaterializationData",\n [\n ("materialization", AssetMaterialization),\n ("asset_lineage", Sequence[AssetLineageInfo]),\n ],\n )\n):\n def __new__(\n cls,\n materialization: AssetMaterialization,\n asset_lineage: Optional[Sequence[AssetLineageInfo]] = None,\n ):\n return super(StepMaterializationData, cls).__new__(\n cls,\n materialization=check.inst_param(\n materialization, "materialization", AssetMaterialization\n ),\n asset_lineage=check.opt_sequence_param(\n asset_lineage, "asset_lineage", of_type=AssetLineageInfo\n ),\n )\n\n\n@whitelist_for_serdes\nclass AssetMaterializationPlannedData(\n NamedTuple(\n "_AssetMaterializationPlannedData",\n [("asset_key", AssetKey), ("partition", Optional[str])],\n )\n):\n def __new__(cls, asset_key: AssetKey, partition: Optional[str] = None):\n return super(AssetMaterializationPlannedData, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n partition=check.opt_str_param(partition, "partition"),\n )\n\n\n@whitelist_for_serdes\nclass StepExpectationResultData(\n 
NamedTuple(\n "_StepExpectationResultData",\n [\n ("expectation_result", ExpectationResult),\n ],\n )\n):\n def __new__(cls, expectation_result: ExpectationResult):\n return super(StepExpectationResultData, cls).__new__(\n cls,\n expectation_result=check.inst_param(\n expectation_result, "expectation_result", ExpectationResult\n ),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass ObjectStoreOperationResultData(\n NamedTuple(\n "_ObjectStoreOperationResultData",\n [\n ("op", ObjectStoreOperationType),\n ("value_name", Optional[str]),\n ("metadata", Mapping[str, MetadataValue]),\n ("address", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n value_name: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n address: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperationResultData, cls).__new__(\n cls,\n op=cast(ObjectStoreOperationType, check.str_param(op, "op")),\n value_name=check.opt_str_param(value_name, "value_name"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n address=check.opt_str_param(address, "address"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass EngineEventData(\n NamedTuple(\n "_EngineEventData",\n [\n ("metadata", Mapping[str, MetadataValue]),\n ("error", Optional[SerializableErrorInfo]),\n ("marker_start", Optional[str]),\n ("marker_end", Optional[str]),\n ],\n )\n):\n # serdes log\n # * added optional error\n # * added marker_start / marker_end\n #\n def __new__(\n 
cls,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n error: Optional[SerializableErrorInfo] = None,\n marker_start: Optional[str] = None,\n marker_end: Optional[str] = None,\n ):\n return super(EngineEventData, cls).__new__(\n cls,\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n marker_start=check.opt_str_param(marker_start, "marker_start"),\n marker_end=check.opt_str_param(marker_end, "marker_end"),\n )\n\n @staticmethod\n def in_process(\n pid: int, step_keys_to_execute: Optional[Sequence[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata={\n "pid": MetadataValue.text(str(pid)),\n **(\n {"step_keys": MetadataValue.text(str(step_keys_to_execute))}\n if step_keys_to_execute\n else {}\n ),\n }\n )\n\n @staticmethod\n def multiprocess(\n pid: int, step_keys_to_execute: Optional[Sequence[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata={\n "pid": MetadataValue.text(str(pid)),\n **(\n {"step_keys": MetadataValue.text(str(step_keys_to_execute))}\n if step_keys_to_execute\n else {}\n ),\n }\n )\n\n @staticmethod\n def interrupted(steps_interrupted: Sequence[str]) -> "EngineEventData":\n return EngineEventData(\n metadata={"steps_interrupted": MetadataValue.text(str(steps_interrupted))}\n )\n\n @staticmethod\n def engine_error(error: SerializableErrorInfo) -> "EngineEventData":\n return EngineEventData(metadata={}, error=error)\n\n\n@whitelist_for_serdes(storage_name="PipelineFailureData")\nclass JobFailureData(\n NamedTuple(\n "_JobFailureData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(JobFailureData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes(storage_name="PipelineCanceledData")\nclass JobCanceledData(\n 
NamedTuple(\n "_JobCanceledData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(JobCanceledData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HookErroredData(\n NamedTuple(\n "_HookErroredData",\n [\n ("error", SerializableErrorInfo),\n ],\n )\n):\n def __new__(cls, error: SerializableErrorInfo):\n return super(HookErroredData, cls).__new__(\n cls, error=check.inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass HandledOutputData(\n NamedTuple(\n "_HandledOutputData",\n [\n ("output_name", str),\n ("manager_key", str),\n ("metadata", Mapping[str, MetadataValue]),\n ],\n )\n):\n def __new__(\n cls,\n output_name: str,\n manager_key: str,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ):\n return super(HandledOutputData, cls).__new__(\n cls,\n output_name=check.str_param(output_name, "output_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass LoadedInputData(\n NamedTuple(\n "_LoadedInputData",\n [\n ("input_name", str),\n ("manager_key", str),\n ("upstream_output_name", Optional[str]),\n ("upstream_step_key", Optional[str]),\n ("metadata", Mapping[str, MetadataValue]),\n ],\n )\n):\n def __new__(\n cls,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ):\n return super(LoadedInputData, cls).__new__(\n cls,\n 
input_name=check.str_param(input_name, "input_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n upstream_output_name=check.opt_str_param(upstream_output_name, "upstream_output_name"),\n upstream_step_key=check.opt_str_param(upstream_step_key, "upstream_step_key"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n )\n\n\n@whitelist_for_serdes(storage_field_names={"file_key": "log_key"})\nclass ComputeLogsCaptureData(\n NamedTuple(\n "_ComputeLogsCaptureData",\n [\n ("file_key", str), # renamed log_key => file_key to avoid confusion\n ("step_keys", Sequence[str]),\n ("external_url", Optional[str]),\n ("external_stdout_url", Optional[str]),\n ("external_stderr_url", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n file_key: str,\n step_keys: Sequence[str],\n external_url: Optional[str] = None,\n external_stdout_url: Optional[str] = None,\n external_stderr_url: Optional[str] = None,\n ):\n return super(ComputeLogsCaptureData, cls).__new__(\n cls,\n file_key=check.str_param(file_key, "file_key"),\n step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n external_url=check.opt_str_param(external_url, "external_url"),\n external_stdout_url=check.opt_str_param(external_stdout_url, "external_stdout_url"),\n external_stderr_url=check.opt_str_param(external_stderr_url, "external_stderr_url"),\n )\n\n\n###################################################################################################\n# THE GRAVEYARD\n#\n# -|- -|- -|-\n# | | |\n# _-'~~~~~`-_ . _-'~~~~~`-_ _-'~~~~~`-_\n# .' '. .' '. .' 
'.\n# | R I P | | R I P | | R I P |\n# | | | | | |\n# | Synthetic | | Asset | | Pipeline |\n# | Process | | Store | | Init |\n# | Events | | Operations | | Failures |\n# | | | | | |\n###################################################################################################\n\n\n# Old data structures referenced below\n# class AssetStoreOperationData(NamedTuple):\n# op: str\n# step_key: str\n# output_name: str\n# asset_store_key: str\n#\n#\n# class AssetStoreOperationType(Enum):\n# SET_ASSET = "SET_ASSET"\n# GET_ASSET = "GET_ASSET"\n#\n#\n# class PipelineInitFailureData(NamedTuple):\n# error: SerializableErrorInfo\n\n\ndef _handle_back_compat(\n event_type_value: str,\n event_specific_data: Optional[Dict[str, Any]],\n) -> Tuple[str, Optional[Dict[str, Any]]]:\n # transform old specific process events in to engine events\n if event_type_value in [\n "PIPELINE_PROCESS_START",\n "PIPELINE_PROCESS_STARTED",\n "PIPELINE_PROCESS_EXITED",\n ]:\n return "ENGINE_EVENT", {"__class__": "EngineEventData"}\n\n # changes asset store ops in to get/set asset\n elif event_type_value == "ASSET_STORE_OPERATION":\n assert (\n event_specific_data is not None\n ), "ASSET_STORE_OPERATION event must have specific data"\n if event_specific_data["op"] in (\n "GET_ASSET",\n '{"__enum__": "AssetStoreOperationType.GET_ASSET"}',\n ):\n return (\n "LOADED_INPUT",\n {\n "__class__": "LoadedInputData",\n "input_name": event_specific_data["output_name"],\n "manager_key": event_specific_data["asset_store_key"],\n },\n )\n if event_specific_data["op"] in (\n "SET_ASSET",\n '{"__enum__": "AssetStoreOperationType.SET_ASSET"}',\n ):\n return (\n "HANDLED_OUTPUT",\n {\n "__class__": "HandledOutputData",\n "output_name": event_specific_data["output_name"],\n "manager_key": event_specific_data["asset_store_key"],\n },\n )\n\n # previous name for ASSET_MATERIALIZATION was STEP_MATERIALIZATION\n if event_type_value == "STEP_MATERIALIZATION":\n assert event_specific_data is not None, 
"STEP_MATERIALIZATION event must have specific data"\n return "ASSET_MATERIALIZATION", event_specific_data\n\n # transform PIPELINE_INIT_FAILURE to PIPELINE_FAILURE\n if event_type_value == "PIPELINE_INIT_FAILURE":\n assert (\n event_specific_data is not None\n ), "PIPELINE_INIT_FAILURE event must have specific data"\n return "PIPELINE_FAILURE", {\n "__class__": "PipelineFailureData",\n "error": event_specific_data.get("error"),\n }\n\n return event_type_value, event_specific_data\n
", "current_page_name": "_modules/dagster/_core/events", "customsidebar": null, "favicon_url": null, "log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.events.log

\nfrom typing import Mapping, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.events import AssetMaterialization, AssetObservation\nfrom dagster._core.events import DagsterEvent, DagsterEventType\nfrom dagster._core.utils import coerce_valid_log_level\nfrom dagster._serdes.serdes import (\n    deserialize_value,\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._utils.error import SerializableErrorInfo\nfrom dagster._utils.log import (\n    JsonEventLoggerHandler,\n    StructuredLoggerHandler,\n    StructuredLoggerMessage,\n    construct_single_handler_logger,\n)\n\n\n
[docs]@whitelist_for_serdes(\n # These were originally distinguished from each other but ended up being empty subclasses\n # of EventLogEntry -- instead of using the subclasses we were relying on\n # EventLogEntry.is_dagster_event to distinguish events that originate in the logging\n # machinery from events that are yielded by user code\n old_storage_names={"DagsterEventRecord", "LogMessageRecord", "EventRecord"},\n old_fields={"message": ""},\n storage_field_names={"job_name": "pipeline_name"},\n)\nclass EventLogEntry(\n NamedTuple(\n "_EventLogEntry",\n [\n ("error_info", PublicAttr[Optional[SerializableErrorInfo]]),\n ("level", PublicAttr[Union[str, int]]),\n ("user_message", PublicAttr[str]),\n ("run_id", PublicAttr[str]),\n ("timestamp", PublicAttr[float]),\n ("step_key", PublicAttr[Optional[str]]),\n ("job_name", PublicAttr[Optional[str]]),\n ("dagster_event", PublicAttr[Optional[DagsterEvent]]),\n ],\n )\n):\n """Entries in the event log.\n\n Users should not instantiate this object directly. These entries may originate from the logging machinery (DagsterLogManager/context.log), from\n framework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n (e.g. Output).\n\n Args:\n error_info (Optional[SerializableErrorInfo]): Error info for an associated exception, if\n any, as generated by serializable_error_info_from_exc_info and friends.\n level (Union[str, int]): The Python log level at which to log this event. Note that\n framework and user code events are also logged to Python logging. This value may be an\n integer or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.\n user_message (str): For log messages, this is the user-generated message.\n run_id (str): The id of the run which generated this event.\n timestamp (float): The Unix timestamp of this event.\n step_key (Optional[str]): The step key for the step which generated this event. 
Some events\n are generated outside of a step context.\n job_name (Optional[str]): The job which generated this event. Some events are\n generated outside of a job context.\n dagster_event (Optional[DagsterEvent]): For framework and user events, the associated\n structured event.\n """\n\n def __new__(\n cls,\n error_info,\n level,\n user_message,\n run_id,\n timestamp,\n step_key=None,\n job_name=None,\n dagster_event=None,\n ):\n return super(EventLogEntry, cls).__new__(\n cls,\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo),\n coerce_valid_log_level(level),\n check.str_param(user_message, "user_message"),\n check.str_param(run_id, "run_id"),\n check.float_param(timestamp, "timestamp"),\n check.opt_str_param(step_key, "step_key"),\n check.opt_str_param(job_name, "job_name"),\n check.opt_inst_param(dagster_event, "dagster_event", DagsterEvent),\n )\n\n @public\n @property\n def is_dagster_event(self) -> bool:\n """bool: If this entry contains a DagsterEvent."""\n return bool(self.dagster_event)\n\n
[docs] @public\n def get_dagster_event(self) -> DagsterEvent:\n """DagsterEvent: Returns the DagsterEvent contained within this entry. If this entry does not\n contain a DagsterEvent, an error will be raised.\n """\n if not isinstance(self.dagster_event, DagsterEvent):\n check.failed(\n "Not a dagster event, check is_dagster_event before calling get_dagster_event",\n )\n\n return self.dagster_event
\n\n def to_json(self):\n return serialize_value(self)\n\n @staticmethod\n def from_json(json_str: str):\n return deserialize_value(json_str, EventLogEntry)\n\n @public\n @property\n def dagster_event_type(self) -> Optional[DagsterEventType]:\n """Optional[DagsterEventType]: The type of the DagsterEvent contained by this entry, if any."""\n return self.dagster_event.event_type if self.dagster_event else None\n\n @public\n @property\n def message(self) -> str:\n """Return the message from the structured DagsterEvent if present, fallback to user_message."""\n if self.is_dagster_event:\n msg = self.get_dagster_event().message\n if msg is not None:\n return msg\n\n return self.user_message\n\n @property\n def asset_materialization(self) -> Optional[AssetMaterialization]:\n if (\n self.dagster_event\n and self.dagster_event.event_type_value == DagsterEventType.ASSET_MATERIALIZATION\n ):\n materialization = self.dagster_event.step_materialization_data.materialization\n if isinstance(materialization, AssetMaterialization):\n return materialization\n\n return None\n\n @property\n def asset_observation(self) -> Optional[AssetObservation]:\n if (\n self.dagster_event\n and self.dagster_event.event_type_value == DagsterEventType.ASSET_OBSERVATION\n ):\n observation = self.dagster_event.asset_observation_data.asset_observation\n if isinstance(observation, AssetObservation):\n return observation\n\n return None\n\n @property\n def tags(self) -> Optional[Mapping[str, str]]:\n materialization = self.asset_materialization\n if materialization:\n return materialization.tags\n\n observation = self.asset_observation\n if observation:\n return observation.tags\n\n return None
\n\n\ndef construct_event_record(logger_message: StructuredLoggerMessage) -> EventLogEntry:\n check.inst_param(logger_message, "logger_message", StructuredLoggerMessage)\n\n return EventLogEntry(\n level=logger_message.level,\n user_message=logger_message.meta["orig_message"],\n run_id=logger_message.meta["run_id"],\n timestamp=logger_message.record.created,\n step_key=logger_message.meta.get("step_key"),\n job_name=logger_message.meta.get("job_name"),\n dagster_event=logger_message.meta.get("dagster_event"),\n error_info=None,\n )\n\n\ndef construct_event_logger(event_record_callback):\n """Callback receives a stream of event_records. Piggybacks on the logging machinery."""\n check.callable_param(event_record_callback, "event_record_callback")\n\n return construct_single_handler_logger(\n "event-logger",\n "debug",\n StructuredLoggerHandler(\n lambda logger_message: event_record_callback(construct_event_record(logger_message))\n ),\n )\n\n\ndef construct_json_event_logger(json_path):\n """Record a stream of event records to json."""\n check.str_param(json_path, "json_path")\n return construct_single_handler_logger(\n "json-event-record-logger",\n "debug",\n JsonEventLoggerHandler(\n json_path,\n lambda record: construct_event_record(\n StructuredLoggerMessage(\n name=record.name,\n message=record.msg,\n level=record.levelno,\n meta=record.dagster_meta,\n record=record,\n )\n ),\n ),\n )\n
", "current_page_name": "_modules/dagster/_core/events/log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.events"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.events.log"}, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.events"}, "execution": {"api": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.api

\nimport sys\nfrom contextlib import contextmanager\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions import IJob, JobDefinition\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.repository_definition import RepositoryLoadData\nfrom dagster._core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.context.system import PlanOrchestrationContext\nfrom dagster._core.execution.plan.execute_plan import inner_plan_execution_iterator\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.instance import DagsterInstance, InstanceRef\nfrom dagster._core.selector import parse_step_selection\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.system_config.objects import ResolvedRunConfig\nfrom dagster._core.telemetry import log_dagster_event, log_repo_stats, telemetry_wrapper\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.interrupts import capture_interrupts\nfrom dagster._utils.merger import merge_dicts\n\nfrom .context_creation_job import (\n    ExecutionContextManager,\n    PlanExecutionContextManager,\n    PlanOrchestrationContextManager,\n    orchestration_context_event_generator,\n    scoped_job_context,\n)\nfrom .job_execution_result import JobExecutionResult\n\nif TYPE_CHECKING:\n    from dagster._core.execution.plan.outputs 
import StepOutputHandle\n\n## Brief guide to the execution APIs\n# | function name               | operates over      | sync  | supports    | creates new DagsterRun  |\n# |                             |                    |       | reexecution | in instance             |\n# | --------------------------- | ------------------ | ----- | ----------- | ----------------------- |\n# | execute_job                 | ReconstructableJob | sync  | yes         | yes                     |\n# | execute_run_iterator        | DagsterRun         | async | (1)         | no                      |\n# | execute_run                 | DagsterRun         | sync  | (1)         | no                      |\n# | execute_plan_iterator       | ExecutionPlan      | async | (2)         | no                      |\n# | execute_plan                | ExecutionPlan      | sync  | (2)         | no                      |\n#\n# Notes on reexecution support:\n# (1) The appropriate bits must be set on the DagsterRun passed to this function. 
Specifically,\n#     parent_run_id and root_run_id must be set and consistent, and if a resolved_op_selection or\n#     step_keys_to_execute are set they must be consistent with the parent and root runs.\n# (2) As for (1), but the ExecutionPlan passed must also agree in all relevant bits.\n\n\ndef execute_run_iterator(\n    job: IJob,\n    dagster_run: DagsterRun,\n    instance: DagsterInstance,\n    resume_from_failure: bool = False,\n) -> Iterator[DagsterEvent]:\n    check.inst_param(job, "job", IJob)\n    check.inst_param(dagster_run, "dagster_run", DagsterRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if dagster_run.status == DagsterRunStatus.CANCELED:\n        # This can happen if the run was force-terminated while it was starting\n        def gen_execute_on_cancel():\n            yield instance.report_engine_event(\n                "Not starting execution since the run was canceled before execution could start",\n                dagster_run,\n            )\n\n        return gen_execute_on_cancel()\n\n    if not resume_from_failure:\n        if dagster_run.status not in (DagsterRunStatus.NOT_STARTED, DagsterRunStatus.STARTING):\n            if dagster_run.is_finished:\n\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a run worker that started after the run had already finished.",\n                        dagster_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            elif instance.run_monitoring_enabled:\n                # This can happen if the pod was unexpectedly restarted by the cluster - ignore it since\n                # the run monitoring daemon will also spin up a new pod\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a duplicate run that was started from somewhere other than"\n        
                " the run monitor daemon",\n                        dagster_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            else:\n\n                def gen_fail_restarted_run_worker():\n                    yield instance.report_engine_event(\n                        f"{dagster_run.job_name} ({dagster_run.run_id}) started a new"\n                        f" run worker while the run was already in state {dagster_run.status}."\n                        " This most frequently happens when the run worker unexpectedly stops"\n                        " and is restarted by the cluster. Marking the run as failed.",\n                        dagster_run,\n                    )\n                    yield instance.report_run_failed(dagster_run)\n\n                return gen_fail_restarted_run_worker()\n\n    else:\n        check.invariant(\n            dagster_run.status == DagsterRunStatus.STARTED\n            or dagster_run.status == DagsterRunStatus.STARTING,\n            desc=(\n                "Run of {} ({}) in state {}, expected STARTED or STARTING because it's "\n                "resuming from a run worker failure".format(\n                    dagster_run.job_name, dagster_run.run_id, dagster_run.status\n                )\n            ),\n        )\n\n    if (\n        dagster_run.resolved_op_selection\n        or dagster_run.asset_selection\n        or dagster_run.asset_check_selection\n    ):\n        # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created\n        # note that when we receive the solids to execute via DagsterRun, it won't support\n        # solid selection query syntax\n        job = job.get_subset(\n            op_selection=(\n                list(dagster_run.resolved_op_selection)\n                if dagster_run.resolved_op_selection\n                else None\n            ),\n            asset_selection=dagster_run.asset_selection,\n            
asset_check_selection=dagster_run.asset_check_selection,\n        )\n\n    execution_plan = _get_execution_plan_from_run(job, dagster_run, instance)\n    if isinstance(job, ReconstructableJob):\n        job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n    return iter(\n        ExecuteRunWithPlanIterable(\n            execution_plan=execution_plan,\n            iterator=job_execution_iterator,\n            execution_context_manager=PlanOrchestrationContextManager(\n                context_event_generator=orchestration_context_event_generator,\n                job=job,\n                execution_plan=execution_plan,\n                dagster_run=dagster_run,\n                instance=instance,\n                run_config=dagster_run.run_config,\n                raise_on_error=False,\n                executor_defs=None,\n                output_capture=None,\n                resume_from_failure=resume_from_failure,\n            ),\n        )\n    )\n\n\ndef execute_run(\n    job: IJob,\n    dagster_run: DagsterRun,\n    instance: DagsterInstance,\n    raise_on_error: bool = False,\n) -> JobExecutionResult:\n    """Executes an existing job run synchronously.\n\n    Synchronous version of execute_run_iterator.\n\n    Args:\n        job (IJob): The pipeline to execute.\n        dagster_run (DagsterRun): The run to execute\n        instance (DagsterInstance): The instance in which the run has been created.\n        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n            Defaults to ``False``.\n\n    Returns:\n        JobExecutionResult: The result of the execution.\n    """\n    if isinstance(job, JobDefinition):\n        raise DagsterInvariantViolationError(\n            "execute_run requires a reconstructable job but received job definition directly"\n            " instead. To support hand-off to other processes please wrap your definition in a call"\n            " to reconstructable(). 
Learn more about reconstructable here:"\n            " https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n        )\n\n    check.inst_param(job, "job", IJob)\n    check.inst_param(dagster_run, "dagster_run", DagsterRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if dagster_run.status == DagsterRunStatus.CANCELED:\n        message = "Not starting execution since the run was canceled before execution could start"\n        instance.report_engine_event(\n            message,\n            dagster_run,\n        )\n        raise DagsterInvariantViolationError(message)\n\n    check.invariant(\n        dagster_run.status == DagsterRunStatus.NOT_STARTED\n        or dagster_run.status == DagsterRunStatus.STARTING,\n        desc="Run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(\n            dagster_run.job_name, dagster_run.run_id, dagster_run.status\n        ),\n    )\n    if (\n        dagster_run.resolved_op_selection\n        or dagster_run.asset_selection\n        or dagster_run.asset_check_selection\n    ):\n        # when `execute_run` is directly called, the sub job hasn't been created\n        # note that when we receive the solids to execute via DagsterRun, it won't support\n        # solid selection query syntax\n        job = job.get_subset(\n            op_selection=(\n                list(dagster_run.resolved_op_selection)\n                if dagster_run.resolved_op_selection\n                else None\n            ),\n            asset_selection=dagster_run.asset_selection,\n            asset_check_selection=dagster_run.asset_check_selection,\n        )\n\n    execution_plan = _get_execution_plan_from_run(job, dagster_run, instance)\n    if isinstance(job, ReconstructableJob):\n        job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}\n\n    _execute_run_iterable = ExecuteRunWithPlanIterable(\n        
execution_plan=execution_plan,\n        iterator=job_execution_iterator,\n        execution_context_manager=PlanOrchestrationContextManager(\n            context_event_generator=orchestration_context_event_generator,\n            job=job,\n            execution_plan=execution_plan,\n            dagster_run=dagster_run,\n            instance=instance,\n            run_config=dagster_run.run_config,\n            raise_on_error=raise_on_error,\n            executor_defs=None,\n            output_capture=output_capture,\n        ),\n    )\n    event_list = list(_execute_run_iterable)\n\n    # We need to reload the run object after execution for it to be accurate\n    reloaded_dagster_run = check.not_none(instance.get_run_by_id(dagster_run.run_id))\n\n    return JobExecutionResult(\n        job.get_definition(),\n        scoped_job_context(\n            execution_plan,\n            job,\n            reloaded_dagster_run.run_config,\n            reloaded_dagster_run,\n            instance,\n        ),\n        event_list,\n        reloaded_dagster_run,\n    )\n\n\n@contextmanager\ndef ephemeral_instance_if_missing(\n    instance: Optional[DagsterInstance],\n) -> Iterator[DagsterInstance]:\n    if instance:\n        yield instance\n    else:\n        with DagsterInstance.ephemeral() as ephemeral_instance:\n            yield ephemeral_instance\n\n\n
[docs]class ReexecutionOptions(NamedTuple):\n """Reexecution options for python-based execution in Dagster.\n\n Args:\n parent_run_id (str): The run_id of the run to reexecute.\n step_selection (Sequence[str]):\n The list of step selections to reexecute. Must be a subset or match of the\n set of steps executed in the original run. For example:\n\n - ``['some_op']``: selects ``some_op`` itself.\n - ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n - ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n """\n\n parent_run_id: str\n step_selection: Sequence[str] = []\n\n @staticmethod\n def from_failure(run_id: str, instance: DagsterInstance) -> "ReexecutionOptions":\n """Creates reexecution options from a failed run.\n\n Args:\n run_id (str): The run_id of the failed run. Run must fail in order to be reexecuted.\n instance (DagsterInstance): The DagsterInstance that the original run occurred in.\n\n Returns:\n ReexecutionOptions: Reexecution options to pass to a python execution.\n """\n from dagster._core.execution.plan.state import KnownExecutionState\n\n parent_run = check.not_none(instance.get_run_by_id(run_id))\n check.invariant(\n parent_run.status == DagsterRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n # Tried to thread through KnownExecutionState to execution plan creation, but little benefit.\n # It is recalculated later by the re-execution machinery.\n step_keys_to_execute, _ = KnownExecutionState.build_resume_retry_reexecution(\n instance, parent_run=cast(DagsterRun, instance.get_run_by_id(run_id))\n )\n return ReexecutionOptions(parent_run_id=run_id, step_selection=step_keys_to_execute)
\n\n\n
[docs]def execute_job(\n job: ReconstructableJob,\n instance: "DagsterInstance",\n run_config: Any = None,\n tags: Optional[Mapping[str, Any]] = None,\n raise_on_error: bool = False,\n op_selection: Optional[Sequence[str]] = None,\n reexecution_options: Optional[ReexecutionOptions] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n) -> JobExecutionResult:\n """Execute a job synchronously.\n\n This API represents dagster's python entrypoint for out-of-process\n execution. For most testing purposes, :py:meth:`~dagster.JobDefinition.\n execute_in_process` will be more suitable, but when wanting to run\n execution using an out-of-process executor (such as :py:class:`dagster.\n multiprocess_executor`), then `execute_job` is suitable.\n\n `execute_job` expects a persistent :py:class:`DagsterInstance` for\n execution, meaning the `$DAGSTER_HOME` environment variable must be set.\n It also expects a reconstructable pointer to a :py:class:`JobDefinition` so\n that it can be reconstructed in separate processes. This can be done by\n wrapping the ``JobDefinition`` in a call to :py:func:`dagster.\n reconstructable`.\n\n .. code-block:: python\n\n from dagster import DagsterInstance, execute_job, job, reconstructable\n\n @job\n def the_job():\n ...\n\n instance = DagsterInstance.get()\n result = execute_job(reconstructable(the_job), instance=instance)\n assert result.success\n\n\n If using the :py:meth:`~dagster.GraphDefinition.to_job` method to\n construct the ``JobDefinition``, then the invocation must be wrapped in a\n module-scope function, which can be passed to ``reconstructable``.\n\n .. 
code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def the_graph():\n ...\n\n def define_job():\n return the_graph.to_job(...)\n\n result = execute_job(reconstructable(define_job), ...)\n\n Since `execute_job` is potentially executing outside of the current\n process, output objects need to be retrieved by use of the provided job's\n io managers. Output objects can be retrieved by opening the result of\n `execute_job` as a context manager.\n\n .. code-block:: python\n\n from dagster import execute_job\n\n with execute_job(...) as result:\n output_obj = result.output_for_node("some_op")\n\n ``execute_job`` can also be used to reexecute a run, by providing a :py:class:`ReexecutionOptions` object.\n\n .. code-block:: python\n\n from dagster import ReexecutionOptions, execute_job\n\n instance = DagsterInstance.get()\n\n options = ReexecutionOptions.from_failure(run_id=failed_run_id, instance)\n execute_job(reconstructable(job), instance, reexecution_options=options)\n\n Parameters:\n job (ReconstructableJob): A reconstructable pointer to a :py:class:`JobDefinition`.\n instance (DagsterInstance): The instance to execute against.\n run_config (Optional[dict]): The configuration that parametrizes this run, as a dict.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to run logs.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``False``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single\n op names) to execute. 
For example:\n\n - ``['some_op']``: selects ``some_op`` itself.\n - ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n - ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n reexecution_options (Optional[ReexecutionOptions]):\n Reexecution options to provide to the run, if this run is\n intended to be a reexecution of a previous run. Cannot be used in\n tandem with the ``op_selection`` argument.\n\n Returns:\n :py:class:`JobExecutionResult`: The result of job execution.\n """\n check.inst_param(job, "job", ReconstructableJob)\n check.inst_param(instance, "instance", DagsterInstance)\n check.opt_sequence_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n # get the repository load data here because we call job.get_definition() later in this fn\n job_def, _ = _job_with_repository_load_data(job)\n\n if reexecution_options is not None and op_selection is not None:\n raise DagsterInvariantViolationError(\n "re-execution and op selection cannot be used together at this time."\n )\n\n if reexecution_options:\n if run_config is None:\n run = check.not_none(instance.get_run_by_id(reexecution_options.parent_run_id))\n run_config = run.run_config\n return _reexecute_job(\n job_arg=job_def,\n parent_run_id=reexecution_options.parent_run_id,\n run_config=run_config,\n step_selection=list(reexecution_options.step_selection),\n tags=tags,\n instance=instance,\n raise_on_error=raise_on_error,\n )\n else:\n return _logged_execute_job(\n job_arg=job_def,\n instance=instance,\n run_config=run_config,\n tags=tags,\n op_selection=op_selection,\n raise_on_error=raise_on_error,\n asset_selection=asset_selection,\n )
\n\n\n@telemetry_wrapper\ndef _logged_execute_job(\n job_arg: Union[IJob, JobDefinition],\n instance: DagsterInstance,\n run_config: Optional[Mapping[str, object]] = None,\n tags: Optional[Mapping[str, str]] = None,\n op_selection: Optional[Sequence[str]] = None,\n raise_on_error: bool = True,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n) -> JobExecutionResult:\n check.inst_param(instance, "instance", DagsterInstance)\n\n job_arg, repository_load_data = _job_with_repository_load_data(job_arg)\n\n (\n job_arg,\n run_config,\n tags,\n resolved_op_selection,\n op_selection,\n ) = _check_execute_job_args(\n job_arg=job_arg,\n run_config=run_config,\n tags=tags,\n op_selection=op_selection,\n )\n\n log_repo_stats(instance=instance, job=job_arg, source="execute_pipeline")\n\n dagster_run = instance.create_run_for_job(\n job_def=job_arg.get_definition(),\n run_config=run_config,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n tags=tags,\n job_code_origin=(\n job_arg.get_python_origin() if isinstance(job_arg, ReconstructableJob) else None\n ),\n repository_load_data=repository_load_data,\n asset_selection=frozenset(asset_selection) if asset_selection else None,\n )\n\n return execute_run(\n job_arg,\n dagster_run,\n instance,\n raise_on_error=raise_on_error,\n )\n\n\ndef _reexecute_job(\n job_arg: Union[IJob, JobDefinition],\n parent_run_id: str,\n run_config: Optional[Mapping[str, object]] = None,\n step_selection: Optional[Sequence[str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> JobExecutionResult:\n """Reexecute an existing job run."""\n check.opt_sequence_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n job_arg, repository_load_data = _job_with_repository_load_data(job_arg)\n\n (job_arg, run_config, tags, _, 
_) = _check_execute_job_args(\n job_arg=job_arg,\n run_config=run_config,\n tags=tags,\n )\n\n parent_dagster_run = execute_instance.get_run_by_id(parent_run_id)\n if parent_dagster_run is None:\n check.failed(\n f"No parent run with id {parent_run_id} found in instance.",\n )\n\n execution_plan: Optional[ExecutionPlan] = None\n # resolve step selection DSL queries using parent execution information\n if step_selection:\n execution_plan = _resolve_reexecute_step_selection(\n execute_instance,\n job_arg,\n run_config,\n cast(DagsterRun, parent_dagster_run),\n step_selection,\n )\n\n if parent_dagster_run.asset_selection:\n job_arg = job_arg.get_subset(\n op_selection=None, asset_selection=parent_dagster_run.asset_selection\n )\n\n dagster_run = execute_instance.create_run_for_job(\n job_def=job_arg.get_definition(),\n execution_plan=execution_plan,\n run_config=run_config,\n tags=tags,\n op_selection=parent_dagster_run.op_selection,\n asset_selection=parent_dagster_run.asset_selection,\n resolved_op_selection=parent_dagster_run.resolved_op_selection,\n root_run_id=parent_dagster_run.root_run_id or parent_dagster_run.run_id,\n parent_run_id=parent_dagster_run.run_id,\n job_code_origin=(\n job_arg.get_python_origin() if isinstance(job_arg, ReconstructableJob) else None\n ),\n repository_load_data=repository_load_data,\n )\n\n return execute_run(\n job_arg,\n dagster_run,\n execute_instance,\n raise_on_error=raise_on_error,\n )\n check.failed("Should not reach here.")\n\n\ndef execute_plan_iterator(\n execution_plan: ExecutionPlan,\n job: IJob,\n dagster_run: DagsterRun,\n instance: DagsterInstance,\n retry_mode: Optional[RetryMode] = None,\n run_config: Optional[Mapping[str, object]] = None,\n) -> Iterator[DagsterEvent]:\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(job, "job", IJob)\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.inst_param(instance, "instance", DagsterInstance)\n retry_mode = 
check.opt_inst_param(retry_mode, "retry_mode", RetryMode, RetryMode.DISABLED)\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n if isinstance(job, ReconstructableJob):\n job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n return iter(\n ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=inner_plan_execution_iterator,\n execution_context_manager=PlanExecutionContextManager(\n job=job,\n retry_mode=retry_mode,\n execution_plan=execution_plan,\n run_config=run_config,\n dagster_run=dagster_run,\n instance=instance,\n ),\n )\n )\n\n\ndef execute_plan(\n execution_plan: ExecutionPlan,\n job: IJob,\n instance: DagsterInstance,\n dagster_run: DagsterRun,\n run_config: Optional[Mapping[str, object]] = None,\n retry_mode: Optional[RetryMode] = None,\n) -> Sequence[DagsterEvent]:\n """This is the entry point of dagster-graphql executions. For the dagster CLI entry point, see\n execute_job() above.\n """\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(job, "job", IJob)\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n run_config = check.opt_mapping_param(run_config, "run_config")\n check.opt_inst_param(retry_mode, "retry_mode", RetryMode)\n\n return list(\n execute_plan_iterator(\n execution_plan=execution_plan,\n job=job,\n run_config=run_config,\n dagster_run=dagster_run,\n instance=instance,\n retry_mode=retry_mode,\n )\n )\n\n\ndef _get_execution_plan_from_run(\n job: IJob,\n dagster_run: DagsterRun,\n instance: DagsterInstance,\n) -> ExecutionPlan:\n execution_plan_snapshot = (\n instance.get_execution_plan_snapshot(dagster_run.execution_plan_snapshot_id)\n if dagster_run.execution_plan_snapshot_id\n else None\n )\n\n # Rebuild from snapshot if able and selection has not changed\n if (\n execution_plan_snapshot is not None\n and execution_plan_snapshot.can_reconstruct_plan\n and 
job.resolved_op_selection == dagster_run.resolved_op_selection\n and job.asset_selection == dagster_run.asset_selection\n and job.asset_check_selection == dagster_run.asset_check_selection\n ):\n return ExecutionPlan.rebuild_from_snapshot(\n dagster_run.job_name,\n execution_plan_snapshot,\n )\n\n return create_execution_plan(\n job,\n run_config=dagster_run.run_config,\n step_keys_to_execute=dagster_run.step_keys_to_execute,\n instance_ref=instance.get_ref() if instance.is_persistent else None,\n repository_load_data=(\n execution_plan_snapshot.repository_load_data if execution_plan_snapshot else None\n ),\n known_state=(\n execution_plan_snapshot.initial_known_state if execution_plan_snapshot else None\n ),\n )\n\n\ndef create_execution_plan(\n job: Union[IJob, JobDefinition],\n run_config: Optional[Mapping[str, object]] = None,\n step_keys_to_execute: Optional[Sequence[str]] = None,\n known_state: Optional[KnownExecutionState] = None,\n instance_ref: Optional[InstanceRef] = None,\n tags: Optional[Mapping[str, str]] = None,\n repository_load_data: Optional[RepositoryLoadData] = None,\n) -> ExecutionPlan:\n if isinstance(job, IJob):\n # If you have repository_load_data, make sure to use it when building plan\n if isinstance(job, ReconstructableJob) and repository_load_data is not None:\n job = job.with_repository_load_data(repository_load_data)\n job_def = job.get_definition()\n else:\n job_def = job\n\n run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n tags = check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n known_state = check.opt_inst_param(\n known_state,\n "known_state",\n KnownExecutionState,\n default=KnownExecutionState(),\n )\n repository_load_data = check.opt_inst_param(\n repository_load_data, "repository_load_data", RepositoryLoadData\n )\n\n 
resolved_run_config = ResolvedRunConfig.build(job_def, run_config)\n\n return ExecutionPlan.build(\n job_def,\n resolved_run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance_ref=instance_ref,\n tags=tags,\n repository_load_data=repository_load_data,\n )\n\n\ndef job_execution_iterator(\n job_context: PlanOrchestrationContext, execution_plan: ExecutionPlan\n) -> Iterator[DagsterEvent]:\n """A complete execution of a pipeline. Yields pipeline start, success,\n and failure events.\n\n Args:\n pipeline_context (PlanOrchestrationContext):\n execution_plan (ExecutionPlan):\n """\n # TODO: restart event?\n if not job_context.resume_from_failure:\n yield DagsterEvent.job_start(job_context)\n\n job_exception_info = None\n job_canceled_info = None\n failed_steps = []\n generator_closed = False\n try:\n for event in job_context.executor.execute(job_context, execution_plan):\n if event.is_step_failure:\n failed_steps.append(event.step_key)\n elif event.is_resource_init_failure and event.step_key:\n failed_steps.append(event.step_key)\n\n # Telemetry\n log_dagster_event(event, job_context)\n\n yield event\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n job_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise\n except (KeyboardInterrupt, DagsterExecutionInterruptedError):\n job_canceled_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise\n except BaseException:\n job_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise # finally block will run before this is re-raised\n finally:\n if job_canceled_info:\n reloaded_run = job_context.instance.get_run_by_id(job_context.run_id)\n if reloaded_run and reloaded_run.status == 
DagsterRunStatus.CANCELING:\n event = DagsterEvent.job_canceled(job_context, job_canceled_info)\n elif reloaded_run and reloaded_run.status == DagsterRunStatus.CANCELED:\n # This happens if the run was force-terminated but was still able to send\n # a cancellation request\n event = DagsterEvent.engine_event(\n job_context,\n "Computational resources were cleaned up after the run was forcibly marked"\n " as canceled.",\n EngineEventData(),\n )\n elif job_context.instance.run_will_resume(job_context.run_id):\n event = DagsterEvent.engine_event(\n job_context,\n "Execution was interrupted unexpectedly. No user initiated termination"\n " request was found, not treating as failure because run will be resumed.",\n EngineEventData(),\n )\n elif reloaded_run and reloaded_run.status == DagsterRunStatus.FAILURE:\n event = DagsterEvent.engine_event(\n job_context,\n "Execution was interrupted for a run that was already in a failure state.",\n EngineEventData(),\n )\n else:\n event = DagsterEvent.job_failure(\n job_context,\n "Execution was interrupted unexpectedly. "\n "No user initiated termination request was found, treating as failure.",\n job_canceled_info,\n )\n elif job_exception_info:\n event = DagsterEvent.job_failure(\n job_context,\n "An exception was thrown during execution.",\n job_exception_info,\n )\n elif failed_steps:\n event = DagsterEvent.job_failure(\n job_context,\n f"Steps failed: {failed_steps}.",\n )\n else:\n event = DagsterEvent.job_success(job_context)\n if not generator_closed:\n yield event\n\n\nclass ExecuteRunWithPlanIterable:\n """Utility class to consolidate execution logic.\n\n This is a class and not a function because, e.g., in constructing a `scoped_pipeline_context`\n for `JobExecutionResult`, we need to pull out the `pipeline_context` after we're done\n yielding events. This broadly follows a pattern we make use of in other places,\n cf. 
`dagster._utils.EventGenerationManager`.\n """\n\n def __init__(\n self,\n execution_plan: ExecutionPlan,\n iterator: Callable[..., Iterator[DagsterEvent]],\n execution_context_manager: ExecutionContextManager[Any],\n ):\n self.execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n self.iterator = check.callable_param(iterator, "iterator")\n self.execution_context_manager = check.inst_param(\n execution_context_manager, "execution_context_manager", ExecutionContextManager\n )\n\n self.job_context = None\n\n def __iter__(self) -> Iterator[DagsterEvent]:\n # Since interrupts can't be raised at arbitrary points safely, delay them until designated\n # checkpoints during the execution.\n # To be maximally certain that interrupts are always caught during an execution process,\n # you can safely add an additional `with capture_interrupts()` at the very beginning of the\n # process that performs the execution.\n with capture_interrupts():\n yield from self.execution_context_manager.prepare_context()\n self.job_context = self.execution_context_manager.get_context()\n generator_closed = False\n try:\n if self.job_context: # False if we had a pipeline init failure\n yield from self.iterator(\n execution_plan=self.execution_plan,\n job_context=self.job_context,\n )\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n raise\n finally:\n for event in self.execution_context_manager.shutdown_context():\n if not generator_closed:\n yield event\n\n\ndef _check_execute_job_args(\n job_arg: Union[JobDefinition, IJob],\n run_config: Optional[Mapping[str, object]],\n tags: Optional[Mapping[str, str]],\n op_selection: Optional[Sequence[str]] = None,\n) -> Tuple[\n IJob,\n Optional[Mapping],\n Mapping[str, str],\n Optional[AbstractSet[str]],\n Optional[Sequence[str]],\n]:\n ijob = InMemoryJob(job_arg) if 
isinstance(job_arg, JobDefinition) else job_arg\n job_def = job_arg if isinstance(job_arg, JobDefinition) else job_arg.get_definition()\n\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str)\n check.opt_sequence_param(op_selection, "op_selection", of_type=str)\n\n tags = merge_dicts(job_def.tags, tags)\n\n # generate job subset from the given op_selection\n if op_selection:\n ijob = ijob.get_subset(op_selection=op_selection)\n\n return (\n ijob,\n run_config,\n tags,\n ijob.resolved_op_selection,\n op_selection,\n )\n\n\ndef _resolve_reexecute_step_selection(\n instance: DagsterInstance,\n job: IJob,\n run_config: Optional[Mapping],\n parent_dagster_run: DagsterRun,\n step_selection: Sequence[str],\n) -> ExecutionPlan:\n if parent_dagster_run.op_selection:\n job = job.get_subset(op_selection=parent_dagster_run.op_selection)\n\n state = KnownExecutionState.build_for_reexecution(instance, parent_dagster_run)\n\n parent_plan = create_execution_plan(\n job,\n parent_dagster_run.run_config,\n known_state=state,\n )\n step_keys_to_execute = parse_step_selection(parent_plan.get_all_step_deps(), step_selection)\n execution_plan = create_execution_plan(\n job,\n run_config,\n step_keys_to_execute=list(step_keys_to_execute),\n known_state=state.update_for_step_selection(step_keys_to_execute),\n tags=parent_dagster_run.tags,\n )\n return execution_plan\n\n\ndef _job_with_repository_load_data(\n job_arg: Union[JobDefinition, IJob],\n) -> Tuple[Union[JobDefinition, IJob], Optional[RepositoryLoadData]]:\n """For ReconstructableJob, generate and return any required RepositoryLoadData, alongside\n a ReconstructableJob with this repository load data baked in.\n """\n if isinstance(job_arg, ReconstructableJob):\n # Unless this ReconstructableJob alread has repository_load_data attached, this will\n # force the repository_load_data to be computed from scratch.\n repository_load_data = 
job_arg.repository.get_definition().repository_load_data\n return job_arg.with_repository_load_data(repository_load_data), repository_load_data\n return job_arg, None\n
", "current_page_name": "_modules/dagster/_core/execution/api", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.api"}, "build_resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.build_resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Generator, Mapping, Optional, cast\n\nimport dagster._check as check\nfrom dagster._config import process_config\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster._core.definitions.run_config import define_resource_dictionary_cls\nfrom dagster._core.errors import DagsterInvalidConfigError\nfrom dagster._core.execution.resources_init import resource_initialization_manager\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster._core.system_config.objects import ResourceConfig, config_map_resources\n\nfrom .api import ephemeral_instance_if_missing\nfrom .context_creation_job import initialize_console_manager\n\n\ndef get_mapped_resource_config(\n    resource_defs: Mapping[str, ResourceDefinition], resource_config: Mapping[str, Any]\n) -> Mapping[str, ResourceConfig]:\n    resource_config_schema = define_resource_dictionary_cls(\n        resource_defs, set(resource_defs.keys())\n    )\n    config_evr = process_config(resource_config_schema, resource_config)\n    if not config_evr.success:\n        raise DagsterInvalidConfigError(\n            "Error in config for resources ",\n            config_evr.errors,\n            resource_config,\n        )\n    config_value = cast(Dict[str, Any], config_evr.value)\n    return config_map_resources(resource_defs, config_value)\n\n\n
[docs]@contextmanager\ndef build_resources(\n resources: Mapping[str, Any],\n instance: Optional[DagsterInstance] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n dagster_run: Optional[DagsterRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n) -> Generator[Resources, None, None]:\n """Context manager that yields resources using provided resource definitions and run config.\n\n This API allows for using resources in an independent context. Resources will be initialized\n with the provided run config, and optionally, dagster_run. The resulting resources will be\n yielded on a dictionary keyed identically to that provided for `resource_defs`. Upon exiting the\n context, resources will also be torn down safely.\n\n Args:\n resources (Mapping[str, Any]): Resource instances or definitions to build. All\n required resource dependencies to a given resource must be contained within this\n dictionary, or the resource build will fail.\n instance (Optional[DagsterInstance]): The dagster instance configured to instantiate\n resources on.\n resource_config (Optional[Mapping[str, Any]]): A dict representing the config to be\n provided to each resource during initialization and teardown.\n dagster_run (Optional[PipelineRun]): The pipeline run to provide during resource\n initialization and teardown. If the provided resources require either the `dagster_run`\n or `run_id` attributes of the provided context during resource initialization and/or\n teardown, this must be provided, or initialization will fail.\n log_manager (Optional[DagsterLogManager]): Log Manager to use during resource\n initialization. Defaults to system log manager.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import resource, build_resources\n\n @resource\n def the_resource():\n return "foo"\n\n with build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n assert resources.from_def == "foo"\n assert resources.from_val == "bar"\n\n """\n resources = check.mapping_param(resources, "resource_defs", key_type=str)\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n log_manager = check.opt_inst_param(log_manager, "log_manager", DagsterLogManager)\n resource_defs = wrap_resources_for_execution(resources)\n mapped_resource_config = get_mapped_resource_config(resource_defs, resource_config)\n\n with ephemeral_instance_if_missing(instance) as dagster_instance:\n resources_manager = resource_initialization_manager(\n resource_defs=resource_defs,\n resource_configs=mapped_resource_config,\n log_manager=log_manager if log_manager else initialize_console_manager(dagster_run),\n execution_plan=None,\n dagster_run=dagster_run,\n resource_keys_to_init=set(resource_defs.keys()),\n instance=dagster_instance,\n emit_persistent_events=False,\n )\n try:\n list(resources_manager.generate_setup_events())\n instantiated_resources = check.inst(\n resources_manager.get_object(), ScopedResourcesBuilder\n )\n yield instantiated_resources.build(\n set(instantiated_resources.resource_instance_dict.keys())\n )\n finally:\n list(resources_manager.generate_teardown_events())
\n\n\ndef wrap_resources_for_execution(\n resources: Optional[Mapping[str, Any]] = None\n) -> Dict[str, ResourceDefinition]:\n return (\n {\n resource_key: wrap_resource_for_execution(resource)\n for resource_key, resource in resources.items()\n }\n if resources\n else {}\n )\n\n\ndef wrap_resource_for_execution(resource: Any) -> ResourceDefinition:\n from dagster._config.pythonic_config import ConfigurableResourceFactory, PartialResource\n\n # Wrap instantiated resource values in a resource definition.\n # If an instantiated IO manager is provided, wrap it in an IO manager definition.\n if isinstance(resource, (ConfigurableResourceFactory, PartialResource)):\n return resource.get_resource_definition()\n elif isinstance(resource, ResourceDefinition):\n return resource\n elif isinstance(resource, IOManager):\n return IOManagerDefinition.hardcoded_io_manager(resource)\n else:\n return ResourceDefinition.hardcoded_resource(resource)\n
", "current_page_name": "_modules/dagster/_core/execution/build_resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.build_resources"}, "context": {"compute": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.compute

\nfrom abc import ABC, ABCMeta, abstractmethod\nfrom inspect import _empty as EmptyAnnotation\nfrom typing import (\n    AbstractSet,\n    Any,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey, AssetCheckSpec\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.data_version import (\n    DataProvenance,\n    DataVersion,\n    extract_data_provenance_from_entry,\n)\nfrom dagster._core.definitions.decorators.op_decorator import DecoratedOpFunction\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    UserEvent,\n)\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._utils.forked_pdb import ForkedPdb\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n)\n\nfrom .system import StepExecutionContext\n\n\n# This metaclass has to 
exist for OpExecutionContext to have a metaclass\nclass AbstractComputeMetaclass(ABCMeta):\n    pass\n\n\nclass AbstractComputeExecutionContext(ABC, metaclass=AbstractComputeMetaclass):\n    """Base class for op context implemented by OpExecutionContext and DagstermillExecutionContext."""\n\n    @abstractmethod\n    def has_tag(self, key: str) -> bool:\n        """Implement this method to check if a logging tag is set."""\n\n    @abstractmethod\n    def get_tag(self, key: str) -> Optional[str]:\n        """Implement this method to get a logging tag."""\n\n    @property\n    @abstractmethod\n    def run_id(self) -> str:\n        """The run id for the context."""\n\n    @property\n    @abstractmethod\n    def op_def(self) -> OpDefinition:\n        """The op definition corresponding to the execution step being executed."""\n\n    @property\n    @abstractmethod\n    def job_def(self) -> JobDefinition:\n        """The job being executed."""\n\n    @property\n    @abstractmethod\n    def run(self) -> DagsterRun:\n        """The DagsterRun object corresponding to the execution."""\n\n    @property\n    @abstractmethod\n    def resources(self) -> Any:\n        """Resources available in the execution context."""\n\n    @property\n    @abstractmethod\n    def log(self) -> DagsterLogManager:\n        """The log manager available in the execution context."""\n\n    @property\n    @abstractmethod\n    def op_config(self) -> Any:\n        """The parsed config specific to this op."""\n\n\nclass OpExecutionContextMetaClass(AbstractComputeMetaclass):\n    def __instancecheck__(cls, instance) -> bool:\n        # This makes isinstance(context, OpExecutionContext) throw a deprecation warning when\n        # context is an AssetExecutionContext. 
This metaclass can be deleted once AssetExecutionContext\n        # has been split into it's own class in 1.7.0\n        if type(instance) is AssetExecutionContext and cls is not AssetExecutionContext:\n            deprecation_warning(\n                subject="AssetExecutionContext",\n                additional_warn_text=(\n                    "Starting in version 1.7.0 AssetExecutionContext will no longer be a subclass"\n                    " of OpExecutionContext."\n                ),\n                breaking_version="1.7.0",\n                stacklevel=1,\n            )\n        return super().__instancecheck__(instance)\n\n\n
[docs]class OpExecutionContext(AbstractComputeExecutionContext, metaclass=OpExecutionContextMetaClass):\n """The ``context`` object that can be made available as the first argument to the function\n used for computing an op or asset.\n\n This context object provides system information such as resources, config, and logging.\n\n To construct an execution context for testing purposes, use :py:func:`dagster.build_op_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import op, OpExecutionContext\n\n @op\n def hello_world(context: OpExecutionContext):\n context.log.info("Hello, world!")\n """\n\n __slots__ = ["_step_execution_context"]\n\n def __init__(self, step_execution_context: StepExecutionContext):\n self._step_execution_context = check.inst_param(\n step_execution_context,\n "step_execution_context",\n StepExecutionContext,\n )\n self._pdb: Optional[ForkedPdb] = None\n self._events: List[DagsterEvent] = []\n self._output_metadata: Dict[str, Any] = {}\n\n @public\n @property\n def op_config(self) -> Any:\n """Any: The parsed config specific to this op."""\n return self._step_execution_context.op_config\n\n @property\n def dagster_run(self) -> DagsterRun:\n """PipelineRun: The current pipeline run."""\n return self._step_execution_context.dagster_run\n\n @property\n def run(self) -> DagsterRun:\n """DagsterRun: The current run."""\n return self.dagster_run\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current Dagster instance."""\n return self._step_execution_context.instance\n\n @public\n @property\n def pdb(self) -> ForkedPdb:\n """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the op.\n\n Example:\n .. 
code-block:: python\n\n @op\n def debug(context):\n context.pdb.set_trace()\n """\n if self._pdb is None:\n self._pdb = ForkedPdb()\n\n return self._pdb\n\n @property\n def file_manager(self):\n """Deprecated access to the file manager.\n\n :meta private:\n """\n raise DagsterInvalidPropertyError(\n "You have attempted to access the file manager which has been moved to resources in"\n " 0.10.0. Please access it via `context.resources.file_manager` instead."\n )\n\n @public\n @property\n def resources(self) -> Any:\n """Resources: The currently available resources."""\n return self._step_execution_context.resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n """Optional[StepLauncher]: The current step launcher, if any."""\n return self._step_execution_context.step_launcher\n\n @public\n @property\n def run_id(self) -> str:\n """str: The id of the current execution's run."""\n return self._step_execution_context.run_id\n\n @public\n @property\n def run_config(self) -> Mapping[str, object]:\n """dict: The run config for the current execution."""\n return self._step_execution_context.run_config\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The currently executing pipeline."""\n return self._step_execution_context.job_def\n\n @public\n @property\n def job_name(self) -> str:\n """str: The name of the currently executing pipeline."""\n return self._step_execution_context.job_name\n\n @public\n @property\n def log(self) -> DagsterLogManager:\n """DagsterLogManager: The log manager available in the execution context."""\n return self._step_execution_context.log\n\n @property\n def node_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self._step_execution_context.node_handle\n\n @property\n def op_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self.node_handle\n\n @property\n def op(self) -> Node:\n 
"""Node: The object representing the invoked op within the graph.\n\n :meta private:\n\n """\n return self._step_execution_context.job_def.get_node(self.node_handle)\n\n @public\n @property\n def op_def(self) -> OpDefinition:\n """OpDefinition: The current op definition."""\n return cast(OpDefinition, self.op.definition)\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n return self._step_execution_context.has_partition_key\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run. Or if the current run is operating\n over a range of partitions (ie. a backfill of several partitions executed in a single run).\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_key)\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n """\n return self._step_execution_context.partition_key\n\n @deprecated(breaking_version="2.0", additional_warn_text="Use `partition_key_range` instead.")\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The range of partition keys for the current run.\n\n If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n end. Raises an error if the current run is not a partitioned run.\n """\n return self.partition_key_range\n\n @public\n @property\n def partition_key_range(self) -> PartitionKeyRange:\n """The range of partition keys for the current run.\n\n If run is for a single partition key, returns a `PartitionKeyRange` with the same start and\n end. Raises an error if the current run is not a partitioned run.\n\n Examples:\n .. 
code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_key_range)\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n """\n return self._step_execution_context.asset_partition_key_range\n\n @public\n @property\n def partition_time_window(self) -> TimeWindow:\n """The partition time window for the current run.\n\n Raises an error if the current run is not a partitioned run, or if the job's partition\n definition is not a TimeWindowPartitionsDefinition.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_time_window)\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n """\n return self._step_execution_context.partition_time_window\n\n
[docs] @public\n def has_tag(self, key: str) -> bool:\n """Check if a logging tag is set.\n\n Args:\n key (str): The tag to check.\n\n Returns:\n bool: Whether the tag is set.\n """\n return self._step_execution_context.has_tag(key)
\n\n
[docs] @public\n def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag.\n\n Args:\n key (tag): The tag to get.\n\n Returns:\n Optional[str]: The value of the tag, if present.\n """\n return self._step_execution_context.get_tag(key)
\n\n @property\n def run_tags(self) -> Mapping[str, str]:\n """Mapping[str, str]: The tags for the current run."""\n return self._step_execution_context.run_tags\n\n def has_events(self) -> bool:\n return bool(self._events)\n\n def consume_events(self) -> Iterator[DagsterEvent]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the beginning of the op's computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n
[docs] @public\n def log_event(self, event: UserEvent) -> None:\n """Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.\n\n Events logged with this method will appear in the list of DagsterEvents, as well as the event log.\n\n Args:\n event (Union[AssetMaterialization, AssetObservation, ExpectationResult]): The event to log.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import op, AssetMaterialization\n\n @op\n def log_materialization(context):\n context.log_event(AssetMaterialization("foo"))\n """\n if isinstance(event, AssetMaterialization):\n self._events.append(\n DagsterEvent.asset_materialization(self._step_execution_context, event)\n )\n elif isinstance(event, AssetObservation):\n self._events.append(DagsterEvent.asset_observation(self._step_execution_context, event))\n elif isinstance(event, ExpectationResult):\n self._events.append(\n DagsterEvent.step_expectation_result(self._step_execution_context, event)\n )\n else:\n check.failed(f"Unexpected event {event}")
\n\n
[docs] @public\n def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n """Add metadata to one of the outputs of an op.\n\n This can be invoked multiple times per output in the body of an op. If the same key is\n passed multiple times, the value associated with the last call will be used.\n\n Args:\n metadata (Mapping[str, Any]): The metadata to attach to the output\n output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n mapping_key (Optional[str]): The mapping key of the output to attach metadata to. If the\n output is not dynamic, this argument does not need to be provided.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import Out, op\n from typing import Tuple\n\n @op\n def add_metadata(context):\n context.add_output_metadata({"foo", "bar"})\n return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n @op(out={"a": Out(), "b": Out()})\n def add_metadata_two_outputs(context) -> Tuple[str, int]:\n context.add_output_metadata({"foo": "bar"}, output_name="b")\n context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n return ("dog", 5)\n\n """\n metadata = check.mapping_param(metadata, "metadata", key_type=str)\n output_name = check.opt_str_param(output_name, "output_name")\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n self._step_execution_context.add_output_metadata(\n metadata=metadata, output_name=output_name, mapping_key=mapping_key\n )
\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n return self._step_execution_context.get_output_metadata(\n output_name=output_name, mapping_key=mapping_key\n )\n\n def get_step_execution_context(self) -> StepExecutionContext:\n """Allows advanced users (e.g. framework authors) to punch through to the underlying\n step execution context.\n\n :meta private:\n\n Returns:\n StepExecutionContext: The underlying system context.\n """\n return self._step_execution_context\n\n @public\n @property\n def retry_number(self) -> int:\n """Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc."""\n return self._step_execution_context.previous_attempt_count\n\n def describe_op(self):\n return self._step_execution_context.describe_op()\n\n
[docs] @public\n def get_mapping_key(self) -> Optional[str]:\n """Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None."""\n return self._step_execution_context.step.get_mapping_key()
\n\n #############################################################################################\n # asset related methods\n #############################################################################################\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The AssetKey for the current asset. In a multi_asset, use asset_key_for_output instead."""\n if self.has_assets_def and len(self.assets_def.keys_by_output_name.keys()) > 1:\n raise DagsterInvariantViolationError(\n "Cannot call `context.asset_key` in a multi_asset with more than one asset. Use"\n " `context.asset_key_for_output` instead."\n )\n return next(iter(self.assets_def.keys_by_output_name.values()))\n\n @public\n @property\n def has_assets_def(self) -> bool:\n """If there is a backing AssetsDefinition for what is currently executing."""\n assets_def = self.job_def.asset_layer.assets_def_for_node(self.node_handle)\n return assets_def is not None\n\n @public\n @property\n def assets_def(self) -> AssetsDefinition:\n """The backing AssetsDefinition for what is currently executing, errors if not available."""\n assets_def = self.job_def.asset_layer.assets_def_for_node(self.node_handle)\n if assets_def is None:\n raise DagsterInvalidPropertyError(\n f"Op '{self.op.name}' does not have an assets definition."\n )\n return assets_def\n\n @public\n @property\n def selected_asset_keys(self) -> AbstractSet[AssetKey]:\n """Get the set of AssetKeys this execution is expected to materialize."""\n if not self.has_assets_def:\n return set()\n return self.assets_def.keys\n\n @public\n @property\n def has_asset_checks_def(self) -> bool:\n """Return a boolean indicating the presence of a backing AssetChecksDefinition\n for the current execution.\n\n Returns:\n bool: True if there is a backing AssetChecksDefinition for the current execution, otherwise False.\n """\n return self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle) is not None\n\n @public\n @property\n def 
asset_checks_def(self) -> AssetChecksDefinition:\n """The backing AssetChecksDefinition for what is currently executing, errors if not\n available.\n\n Returns:\n AssetChecksDefinition.\n """\n asset_checks_def = self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle)\n if asset_checks_def is None:\n raise DagsterInvalidPropertyError(\n f"Op '{self.op.name}' does not have an asset checks definition."\n )\n\n return asset_checks_def\n\n @public\n @property\n def selected_asset_check_keys(self) -> AbstractSet[AssetCheckKey]:\n if self.has_assets_def:\n return self.assets_def.check_keys\n\n if self.has_asset_checks_def:\n check.failed("Subset selection is not yet supported within an AssetChecksDefinition")\n\n return set()\n\n @public\n @property\n def selected_output_names(self) -> AbstractSet[str]:\n """Get the output names that correspond to the current selection of assets this execution is expected to materialize."""\n # map selected asset keys to the output names they correspond to\n selected_asset_keys = self.selected_asset_keys\n selected_outputs: Set[str] = set()\n for output_name in self.op.output_dict.keys():\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, output_name\n )\n if any( # For graph-backed assets, check if a downstream asset is selected\n [\n asset_key in selected_asset_keys\n for asset_key in self.job_def.asset_layer.downstream_dep_assets(\n self.node_handle, output_name\n )\n ]\n ) or (asset_info and asset_info.key in selected_asset_keys):\n selected_outputs.add(output_name)\n\n return selected_outputs\n\n
[docs] @public\n def asset_key_for_output(self, output_name: str = "result") -> AssetKey:\n """Return the AssetKey for the corresponding output."""\n asset_output_info = self.job_def.asset_layer.asset_info_for_output(\n node_handle=self.op_handle, output_name=output_name\n )\n if asset_output_info is None:\n check.failed(f"Output '{output_name}' has no asset")\n else:\n return asset_output_info.key
\n\n
[docs] @public\n def output_for_asset_key(self, asset_key: AssetKey) -> str:\n """Return the output name for the corresponding asset key."""\n node_output_handle = self.job_def.asset_layer.node_output_handle_for_asset(asset_key)\n if node_output_handle is None:\n check.failed(f"Asset key '{asset_key}' has no output")\n else:\n return node_output_handle.output_name
\n\n
[docs] @public\n def asset_key_for_input(self, input_name: str) -> AssetKey:\n """Return the AssetKey for the corresponding input."""\n key = self.job_def.asset_layer.asset_key_for_input(\n node_handle=self.op_handle, input_name=input_name\n )\n if key is None:\n check.failed(f"Input '{input_name}' has no asset")\n else:\n return key
\n\n
[docs] @public\n def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n """Returns the asset partition key for the given output.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_for_output())\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_for_output("first_asset"))\n context.log.info(context.asset_partition_key_for_output("second_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n # "2023-08-21"\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n """\n return self._step_execution_context.asset_partition_key_for_output(output_name)
\n\n
[docs] @public\n def asset_partitions_time_window_for_output(self, output_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partitions_time_window_for_output`` to get the TimeWindow of all of the partitions\n being materialized by the backfill.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_time_window_for_output())\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_time_window_for_output("first_asset"))\n context.log.info(context.asset_partitions_time_window_for_output("second_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # 
TimeWindow("2023-08-21", "2023-08-22")\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partitions_time_window_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n """\n return self._step_execution_context.asset_partitions_time_window_for_output(output_name)
\n\n
[docs] @public\n def asset_partition_key_range_for_output(\n self, output_name: str = "result"\n ) -> PartitionKeyRange:\n """Return the PartitionKeyRange for the corresponding output. Errors if the run is not partitioned.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_key_range_for_output`` to get all of the partitions being materialized\n by the backfill.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition key range for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_range_for_output())\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n },\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_range_for_output("first_asset"))\n context.log.info(context.asset_partition_key_range_for_output("second_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def 
self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_range_for_output())\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n """\n return self._step_execution_context.asset_partition_key_range_for_output(output_name)
\n\n
[docs] @public\n def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n """Return the PartitionKeyRange for the corresponding input. Errors if the asset depends on a\n non-contiguous chunk of the input.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_key_range_for_input`` to get the range of partition keys of the input that\n are relevant to that backfill.\n\n Args:\n input_name (str): The name of the input to get the partition key range for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n },\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_range_for_input("self_dependent_asset"))\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this 
asset will log:\n # PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n """\n return self._step_execution_context.asset_partition_key_range_for_input(input_name)
\n\n
[docs] @public\n def asset_partition_key_for_input(self, input_name: str) -> str:\n """Returns the partition key of the upstream asset corresponding to the given input.\n\n Args:\n input_name (str): The name of the input to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_for_input("upstream_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_for_input("self_dependent_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-20"\n\n """\n return self._step_execution_context.asset_partition_key_for_input(input_name)
\n\n
[docs] @public\n def asset_partitions_def_for_output(self, output_name: str = "result") -> PartitionsDefinition:\n """The PartitionsDefinition on the asset corresponding to this output.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the PartitionsDefinition for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_def_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n },\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_def_for_output("first_asset"))\n context.log.info(context.asset_partitions_def_for_output("second_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n # DailyPartitionsDefinition("2023-08-20")\n\n """\n asset_key = self.asset_key_for_output(output_name)\n result = self._step_execution_context.job_def.asset_layer.partitions_def_for_asset(\n asset_key\n )\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result
\n\n
[docs] @public\n def asset_partitions_def_for_input(self, input_name: str) -> PartitionsDefinition:\n """The PartitionsDefinition on the upstream asset corresponding to this input.\n\n Args:\n input_name (str): The name of the input to get the PartitionsDefinition for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_def_for_input("upstream_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n\n """\n asset_key = self.asset_key_for_input(input_name)\n result = self._step_execution_context.job_def.asset_layer.partitions_def_for_asset(\n asset_key\n )\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result
\n\n
[docs] @public\n def asset_partition_keys_for_output(self, output_name: str = "result") -> Sequence[str]:\n """Returns a list of the partition keys for the given output.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_keys_for_output`` to get all of the partitions being materialized\n by the backfill.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition keys for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_keys_for_output())\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n },\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_keys_for_output("first_asset"))\n context.log.info(context.asset_partition_keys_for_output("second_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: 
AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_keys_for_output())\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n """\n return self.asset_partitions_def_for_output(output_name).get_partition_keys_in_range(\n self._step_execution_context.asset_partition_key_range_for_output(output_name),\n dynamic_partitions_store=self.instance,\n )
\n\n
[docs] @public\n def asset_partition_keys_for_input(self, input_name: str) -> Sequence[str]:\n """Returns a list of the partition keys of the upstream asset corresponding to the\n given input.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_keys_for_input`` to get all of the partition keys of the input that\n are relevant to that backfill.\n\n Args:\n input_name (str): The name of the input to get the partition keys for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n },\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-20", "2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_keys_for_input("self_dependent_asset"))\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-20", 
"2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n """\n return list(\n self._step_execution_context.asset_partitions_subset_for_input(\n input_name\n ).get_partition_keys()\n )
\n\n
[docs] @public\n def asset_partitions_time_window_for_input(self, input_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partitions_time_window_for_input`` to get the time window of the input that\n is relevant to that backfill.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n\n Args:\n input_name (str): The name of the input to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n },\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-21")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-25")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n 
"self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partitions_time_window_for_input("self_dependent_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-21")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-25")\n\n """\n return self._step_execution_context.asset_partitions_time_window_for_input(input_name)
\n\n
[docs] @public\n @experimental\n def get_asset_provenance(self, asset_key: AssetKey) -> Optional[DataProvenance]:\n """Return the provenance information for the most recent materialization of an asset.\n\n Args:\n asset_key (AssetKey): Key of the asset for which to retrieve provenance.\n\n Returns:\n Optional[DataProvenance]: Provenance information for the most recent\n materialization of the asset. Returns `None` if the asset was never materialized or\n the materialization record is too old to contain provenance information.\n """\n record = self.instance.get_latest_data_version_record(asset_key)\n\n return (\n None if record is None else extract_data_provenance_from_entry(record.event_log_entry)\n )
\n\n def set_data_version(self, asset_key: AssetKey, data_version: DataVersion) -> None:\n """Set the data version for an asset being materialized by the currently executing step.\n This is useful for external execution situations where it is not possible to return\n an `Output`.\n\n Args:\n asset_key (AssetKey): Key of the asset for which to set the data version.\n data_version (DataVersion): The data version to set.\n """\n self._step_execution_context.set_data_version(asset_key, data_version)\n\n @property\n def asset_check_spec(self) -> AssetCheckSpec:\n asset_checks_def = check.not_none(\n self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle),\n "This context does not correspond to an AssetChecksDefinition",\n )\n return asset_checks_def.spec\n\n # In this mode no conversion is done on returned values and missing but expected outputs are not\n # allowed.\n @property\n def requires_typed_event_stream(self) -> bool:\n return self._step_execution_context.requires_typed_event_stream\n\n @property\n def typed_event_stream_error_message(self) -> Optional[str]:\n return self._step_execution_context.typed_event_stream_error_message\n\n def set_requires_typed_event_stream(self, *, error_message: Optional[str] = None) -> None:\n self._step_execution_context.set_requires_typed_event_stream(error_message=error_message)
\n\n\n
[docs]class AssetExecutionContext(OpExecutionContext):\n def __init__(self, step_execution_context: StepExecutionContext):\n super().__init__(step_execution_context=step_execution_context)
\n\n\ndef build_execution_context(\n step_context: StepExecutionContext,\n) -> Union[OpExecutionContext, AssetExecutionContext]:\n """Get the correct context based on the type of step (op or asset) and the user provided context\n type annotation. Follows these rules.\n\n step type annotation result\n asset AssetExecutionContext AssetExecutionContext\n asset OpExecutionContext OpExecutionContext\n asset None AssetExecutionContext\n op AssetExecutionContext Error - we cannot init an AssetExecutionContext w/o an AssetsDefinition\n op OpExecutionContext OpExecutionContext\n op None OpExecutionContext\n For ops in graph-backed assets\n step type annotation result\n op AssetExecutionContext AssetExecutionContext\n op OpExecutionContext OpExecutionContext\n op None OpExecutionContext\n """\n is_sda_step = step_context.is_sda_step\n is_op_in_graph_asset = is_sda_step and step_context.is_op_in_graph\n context_annotation = EmptyAnnotation\n compute_fn = step_context.op_def._compute_fn # noqa: SLF001\n compute_fn = (\n compute_fn\n if isinstance(compute_fn, DecoratedOpFunction)\n else DecoratedOpFunction(compute_fn)\n )\n if compute_fn.has_context_arg():\n context_param = compute_fn.get_context_arg()\n context_annotation = context_param.annotation\n\n # It would be nice to do this check at definition time, rather than at run time, but we don't\n # know if the op is part of an op job or a graph-backed asset until we have the step execution context\n if context_annotation is AssetExecutionContext and not is_sda_step:\n # AssetExecutionContext requires an AssetsDefinition during init, so an op in an op job\n # cannot be annotated with AssetExecutionContext\n raise DagsterInvalidDefinitionError(\n "Cannot annotate @op `context` parameter with type AssetExecutionContext unless the"\n " op is part of a graph-backed asset. 
`context` must be annotated with"\n " OpExecutionContext, or left blank."\n )\n\n if context_annotation is EmptyAnnotation:\n # if no type hint has been given, default to:\n # * AssetExecutionContext for sda steps, not in graph-backed assets\n # * OpExecutionContext for non sda steps\n # * OpExecutionContext for ops in graph-backed assets\n if is_op_in_graph_asset or not is_sda_step:\n return OpExecutionContext(step_context)\n return AssetExecutionContext(step_context)\n if context_annotation is AssetExecutionContext:\n return AssetExecutionContext(step_context)\n return OpExecutionContext(step_context)\n
", "current_page_name": "_modules/dagster/_core/execution/context/compute", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.compute"}, "hook": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.hook

\nimport warnings\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, Mapping, Optional, Set, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom ...definitions.composition import PendingNodeInvocation\nfrom ...definitions.decorators.graph_decorator import graph\nfrom ...definitions.dependency import Node\nfrom ...definitions.hook_definition import HookDefinition\nfrom ...definitions.op_definition import OpDefinition\nfrom ...definitions.resource_definition import IContainsGenerator, Resources\nfrom ...errors import DagsterInvalidPropertyError, DagsterInvariantViolationError\nfrom ...log_manager import DagsterLogManager\nfrom ..plan.step import ExecutionStep\nfrom ..plan.utils import RetryRequestedFromPolicy\nfrom .system import StepExecutionContext\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set when a `HookContext` is constructed from "\n        "`build_hook_context`."\n    )\n\n\ndef _check_property_on_test_context(\n    context: "HookContext", attr_str: str, user_facing_name: str, param_on_builder: str\n):\n    """Check if attribute is not None on context. If none, error, and point user in direction of\n    how to specify the parameter on the context object.\n    """\n    value = getattr(context, attr_str)\n    if value is None:\n        raise DagsterInvalidPropertyError(\n            f"Attribute '{user_facing_name}' was not provided when "\n            f"constructing context. Provide a value for the '{param_on_builder}' parameter on "\n            "'build_hook_context'. To learn more, check out the testing hooks section of Dagster's "\n            "concepts docs: https://docs.dagster.io/concepts/ops-jobs-graphs/op-hooks#testing-hooks"\n        )\n    else:\n        return value\n\n\n
[docs]class HookContext:\n """The ``context`` object available to a hook function on a DagsterEvent."""\n\n def __init__(\n self,\n step_execution_context: StepExecutionContext,\n hook_def: HookDefinition,\n ):\n self._step_execution_context = step_execution_context\n self._hook_def = check.inst_param(hook_def, "hook_def", HookDefinition)\n self._required_resource_keys = hook_def.required_resource_keys\n self._resources = step_execution_context.scoped_resources_builder.build(\n self._required_resource_keys\n )\n\n @public\n @property\n def job_name(self) -> str:\n """The name of the job where this hook is being triggered."""\n return self._step_execution_context.job_name\n\n @public\n @property\n def run_id(self) -> str:\n """The id of the run where this hook is being triggered."""\n return self._step_execution_context.run_id\n\n @public\n @property\n def hook_def(self) -> HookDefinition:\n """The hook that the context object belongs to."""\n return self._hook_def\n\n @public\n @property\n def instance(self) -> "DagsterInstance":\n """The instance configured to run the current job."""\n return self._step_execution_context.instance\n\n @property\n def op(self) -> Node:\n """The op instance associated with the hook."""\n return self._step_execution_context.op\n\n @property\n def step(self) -> ExecutionStep:\n warnings.warn(\n "The step property of HookContext has been deprecated, and will be removed "\n "in a future release."\n )\n return self._step_execution_context.step\n\n @public\n @property\n def step_key(self) -> str:\n """The key for the step where this hook is being triggered."""\n return self._step_execution_context.step.key\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """Resources required by this hook."""\n return self._required_resource_keys\n\n @public\n @property\n def resources(self) -> "Resources":\n """Resources available in the hook context."""\n return self._resources\n\n @property\n def solid_config(self) -> 
Any:\n solid_config = self._step_execution_context.resolved_run_config.ops.get(\n str(self._step_execution_context.step.node_handle)\n )\n return solid_config.config if solid_config else None\n\n @public\n @property\n def op_config(self) -> Any:\n """The parsed config specific to this op."""\n return self.solid_config\n\n # Because of the fact that we directly use the log manager of the step, if a user calls\n # hook_context.log.with_tags, then they will end up mutating the step's logging tags as well.\n # This is not problematic because the hook only runs after the step has been completed.\n @public\n @property\n def log(self) -> DagsterLogManager:\n """Centralized log dispatch from user code."""\n return self._step_execution_context.log\n\n @property\n def solid_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed solid.\n\n Returns:\n Optional[BaseException]: the exception object, None if the solid execution succeeds.\n """\n return self.op_exception\n\n @public\n @property\n def op_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed op."""\n exc = self._step_execution_context.step_exception\n\n if isinstance(exc, RetryRequestedFromPolicy):\n return exc.__cause__\n\n return exc\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n results: Dict[str, Union[Any, Dict[str, Any]]] = {}\n captured = self._step_execution_context.step_output_capture\n\n if captured is None:\n check.failed("Outputs were unexpectedly not captured for hook")\n\n # make the returned values more user-friendly\n for step_output_handle, value in captured.items():\n if step_output_handle.mapping_key:\n if results.get(step_output_handle.output_name) is None:\n 
results[step_output_handle.output_name] = {\n step_output_handle.mapping_key: value\n }\n else:\n results[step_output_handle.output_name][step_output_handle.mapping_key] = value\n else:\n results[step_output_handle.output_name] = value\n\n return results\n\n @public\n @property\n def op_output_values(self):\n """Computed output values in an op."""\n return self.solid_output_values
\n\n\nclass UnboundHookContext(HookContext):\n def __init__(\n self,\n resources: Mapping[str, Any],\n op: Optional[Union[OpDefinition, PendingNodeInvocation]],\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n instance: Optional["DagsterInstance"],\n ):\n from ..build_resources import build_resources, wrap_resources_for_execution\n from ..context_creation_job import initialize_console_manager\n\n self._op = None\n if op is not None:\n\n @graph(name="hook_context_container")\n def temp_graph():\n op()\n\n self._op = temp_graph.nodes[0]\n\n # Open resource context manager\n self._resource_defs = wrap_resources_for_execution(resources)\n self._resources_cm = build_resources(self._resource_defs)\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n self._instance = instance\n\n self._log = initialize_console_manager(None)\n\n self._cm_scope_entered = False\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc: Any):\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def 
step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def required_resource_keys(self) -> Set[str]:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def resources(self) -> "Resources":\n if self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_hook_context(...) as context:`"\n )\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log\n\n @property\n def op_exception(self) -> Optional[BaseException]:\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n @property\n def instance(self) -> "DagsterInstance":\n if not self._instance:\n raise DagsterInvariantViolationError(\n "Tried to access the HookContext instance, but no instance was provided to"\n " `build_hook_context`."\n )\n\n return self._instance\n\n\nclass BoundHookContext(HookContext):\n def __init__(\n self,\n hook_def: HookDefinition,\n resources: Resources,\n op: Optional[Node],\n log_manager: DagsterLogManager,\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n 
instance: Optional["DagsterInstance"],\n ):\n self._hook_def = hook_def\n self._resources = resources\n self._op = op\n self._log_manager = log_manager\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n self._instance = instance\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._hook_def.required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log_manager\n\n @property\n def op_exception(self):\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n @property\n def instance(self) -> 
"DagsterInstance":\n if not self._instance:\n raise DagsterInvariantViolationError(\n "Tried to access the HookContext instance, but no instance was provided to"\n " `build_hook_context`."\n )\n\n return self._instance\n\n\n
[docs]def build_hook_context(\n resources: Optional[Mapping[str, Any]] = None,\n op: Optional[Union[OpDefinition, PendingNodeInvocation]] = None,\n run_id: Optional[str] = None,\n job_name: Optional[str] = None,\n op_exception: Optional[Exception] = None,\n instance: Optional["DagsterInstance"] = None,\n) -> UnboundHookContext:\n """Builds hook context from provided parameters.\n\n ``build_hook_context`` can be used as either a function or a context manager. If there is a\n provided resource to ``build_hook_context`` that is a context manager, then it must be used as a\n context manager. This function can be used to provide the context argument to the invocation of\n a hook definition.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can\n either be values or resource definitions.\n op (Optional[OpDefinition, PendingNodeInvocation]): The op definition which the\n hook may be associated with.\n run_id (Optional[str]): The id of the run in which the hook is invoked (provided for mocking purposes).\n job_name (Optional[str]): The name of the job in which the hook is used (provided for mocking purposes).\n op_exception (Optional[Exception]): The exception that caused the hook to be triggered.\n instance (Optional[DagsterInstance]): The Dagster instance configured to run the hook.\n\n Examples:\n .. 
code-block:: python\n\n context = build_hook_context()\n hook_to_invoke(context)\n\n with build_hook_context(resources={"foo": context_manager_resource}) as context:\n hook_to_invoke(context)\n """\n op = check.opt_inst_param(op, "op", (OpDefinition, PendingNodeInvocation))\n\n from dagster._core.instance import DagsterInstance\n\n return UnboundHookContext(\n resources=check.opt_mapping_param(resources, "resources", key_type=str),\n op=op,\n run_id=check.opt_str_param(run_id, "run_id"),\n job_name=check.opt_str_param(job_name, "job_name"),\n op_exception=check.opt_inst_param(op_exception, "op_exception", Exception),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/hook", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.hook"}, "init": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.init

\nfrom typing import Any, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\n\n\n
[docs]class InitResourceContext:\n """The context object available as the argument to the initialization function of a :py:class:`dagster.ResourceDefinition`.\n\n Users should not instantiate this object directly. To construct an `InitResourceContext` for testing purposes, use :py:func:`dagster.build_init_resource_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import resource, InitResourceContext\n\n @resource\n def the_resource(init_context: InitResourceContext):\n init_context.log.info("Hello, world!")\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Resources,\n resource_def: Optional[ResourceDefinition] = None,\n instance: Optional[DagsterInstance] = None,\n dagster_run: Optional[DagsterRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n ):\n self._resource_config = resource_config\n self._resource_def = resource_def\n self._log_manager = log_manager\n self._instance = instance\n self._resources = resources\n self._dagster_run = dagster_run\n\n @public\n @property\n def resource_config(self) -> Any:\n """The configuration data provided by the run config. The schema\n for this data is defined by the ``config_field`` argument to\n :py:class:`ResourceDefinition`.\n """\n return self._resource_config\n\n @public\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n """The definition of the resource currently being constructed."""\n return self._resource_def\n\n @public\n @property\n def resources(self) -> Resources:\n """The resources that are available to the resource that we are initalizing."""\n return self._resources\n\n @public\n @property\n def instance(self) -> Optional[DagsterInstance]:\n """The Dagster instance configured for the current execution context."""\n return self._instance\n\n @property\n def dagster_run(self) -> Optional[DagsterRun]:\n """The dagster run to use. 
When initializing resources outside of execution context, this will be None."""\n return self._dagster_run\n\n @public\n @property\n def log(self) -> Optional[DagsterLogManager]:\n """The Dagster log manager configured for the current execution context."""\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @public\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n """The log manager for this run of the job."""\n return self._log_manager\n\n @public\n @property\n def run_id(self) -> Optional[str]:\n """The id for this run of the job or pipeline. When initializing resources outside of\n execution context, this will be None.\n """\n return self.dagster_run.run_id if self.dagster_run else None\n\n def replace_config(self, config: Any) -> "InitResourceContext":\n return InitResourceContext(\n resource_config=config,\n resources=self.resources,\n instance=self.instance,\n resource_def=self.resource_def,\n dagster_run=self.dagster_run,\n log_manager=self.log,\n )
\n\n\nclass UnboundInitResourceContext(InitResourceContext):\n """Resource initialization context outputted by ``build_init_resource_context``.\n\n Represents a context whose config has not yet been validated against a resource definition,\n hence the inability to access the `resource_def` attribute. When an instance of\n ``UnboundInitResourceContext`` is passed to a resource invocation, config is validated,\n and it is subsumed into an `InitResourceContext`, which contains the resource_def validated\n against.\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Optional[Union[Resources, Mapping[str, Any]]],\n instance: Optional[DagsterInstance],\n ):\n from dagster._core.execution.api import ephemeral_instance_if_missing\n from dagster._core.execution.build_resources import (\n build_resources,\n wrap_resources_for_execution,\n )\n from dagster._core.execution.context_creation_job import initialize_console_manager\n\n self._instance_provided = (\n check.opt_inst_param(instance, "instance", DagsterInstance) is not None\n )\n # Construct ephemeral instance if missing\n self._instance_cm = ephemeral_instance_if_missing(instance)\n # Pylint can't infer that the ephemeral_instance context manager has an __enter__ method,\n # so ignore lint error\n instance = self._instance_cm.__enter__()\n\n if isinstance(resources, Resources):\n check.failed("Should not have a Resources object directly from this initialization")\n\n self._resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resources, "resources")\n )\n\n self._resources_cm = build_resources(self._resource_defs, instance=instance)\n resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(resources, IContainsGenerator)\n\n self._cm_scope_entered = False\n super(UnboundInitResourceContext, self).__init__(\n resource_config=resource_config,\n resources=resources,\n resource_def=None,\n instance=instance,\n dagster_run=None,\n 
log_manager=initialize_console_manager(None),\n )\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n self._resources_cm.__exit__(*exc)\n if self._instance_provided:\n self._instance_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n if self._instance_provided and not self._cm_scope_entered:\n self._instance_cm.__exit__(None, None, None)\n\n @property\n def resource_config(self) -> Any:\n return self._resource_config\n\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def resources(self) -> Resources:\n """The resources that are available to the resource that we are initalizing."""\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_init_resource_context(...) as context:`"\n )\n return self._resources\n\n @property\n def instance(self) -> Optional[DagsterInstance]:\n return self._instance\n\n @property\n def log(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n @property\n def run_id(self) -> Optional[str]:\n return None\n\n\n
[docs]def build_init_resource_context(\n config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> InitResourceContext:\n """Builds resource initialization context from provided parameters.\n\n ``build_init_resource_context`` can be used as either a function or context manager. If there is a\n provided resource to ``build_init_resource_context`` that is a context manager, then it must be\n used as a context manager. This function can be used to provide the context argument to the\n invocation of a resource.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n config (Optional[Any]): The resource config to provide to the context.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_init_resource_context()\n resource_to_init(context)\n\n with build_init_resource_context(\n resources={"foo": context_manager_resource}\n ) as context:\n resource_to_init(context)\n\n """\n return UnboundInitResourceContext(\n resource_config=check.opt_mapping_param(config, "config", key_type=str),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n resources=check.opt_mapping_param(resources, "resources", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/init", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.init"}, "input": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.input

\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import AssetKey, AssetObservation, CoercibleToAssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n)\nfrom dagster._core.definitions.partition import PartitionsSubset\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.time_window_partitions import TimeWindow, TimeWindowPartitionsSubset\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance, DynamicPartitionsStore\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import PartitionsDefinition\n    from dagster._core.definitions.op_definition import OpDefinition\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.log_manager import DagsterLogManager\n    from dagster._core.types.dagster_type import DagsterType\n\n    from .output import OutputContext\n\n\n
[docs]class InputContext:\n """The ``context`` object available to the load_input method of :py:class:`InputManager`.\n\n Users should not instantiate this object directly. In order to construct\n an `InputContext` for testing an IO Manager's `load_input` method, use\n :py:func:`dagster.build_input_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import IOManager, InputContext\n\n class MyIOManager(IOManager):\n def load_input(self, context: InputContext):\n ...\n """\n\n def __init__(\n self,\n *,\n name: Optional[str] = None,\n job_name: Optional[str] = None,\n op_def: Optional["OpDefinition"] = None,\n config: Optional[Any] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Union["Resources", Mapping[str, Any]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n asset_key: Optional[AssetKey] = None,\n partition_key: Optional[str] = None,\n asset_partitions_subset: Optional[PartitionsSubset] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n ):\n from dagster._core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster._core.execution.build_resources import build_resources\n\n self._name = name\n self._job_name = job_name\n self._op_def = op_def\n self._config = config\n self._metadata = metadata or {}\n self._upstream_output = upstream_output\n self._dagster_type = dagster_type\n self._log = log_manager\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_key = asset_key\n if self._step_context and self._step_context.has_partition_key:\n self._partition_key: Optional[str] = self._step_context.partition_key\n else:\n self._partition_key = partition_key\n\n 
self._asset_partitions_subset = asset_partitions_subset\n self._asset_partitions_def = asset_partitions_def\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_mapping_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events: List["DagsterEvent"] = []\n self._observations: List[AssetObservation] = []\n self._instance = instance\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n\n @property\n def instance(self) -> DagsterInstance:\n if self._instance is None:\n raise DagsterInvariantViolationError(\n "Attempting to access instance, "\n "but it was not provided when constructing the InputContext"\n )\n return self._instance\n\n @public\n @property\n def has_input_name(self) -> bool:\n """If we're the InputContext is being used to load the result of a run from outside the run,\n then it won't have an input name.\n """\n return self._name is not None\n\n @public\n @property\n def name(self) -> str:\n """The name of the input that we're loading."""\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._name\n\n @property\n def job_name(self) -> str:\n if self._job_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access job_name, "\n "but it was not provided when constructing the InputContext"\n )\n return self._job_name\n\n @public\n @property\n def op_def(self) 
-> "OpDefinition":\n """The definition of the op that's loading the input."""\n if self._op_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._op_def\n\n @public\n @property\n def config(self) -> Any:\n """The config attached to the input that we're loading."""\n return self._config\n\n @public\n @property\n def metadata(self) -> Optional[ArbitraryMetadataMapping]:\n """A dict of metadata that is assigned to the InputDefinition that we're loading for.\n This property only contains metadata passed in explicitly with :py:class:`AssetIn`\n or :py:class:`In`. To access metadata of an upstream asset or operation definition,\n use the metadata in :py:attr:`.InputContext.upstream_output`.\n """\n return self._metadata\n\n @public\n @property\n def upstream_output(self) -> Optional["OutputContext"]:\n """Info about the output that produced the object we're loading."""\n return self._upstream_output\n\n @public\n @property\n def dagster_type(self) -> "DagsterType":\n """The type of this input.\n Dagster types do not propagate from an upstream output to downstream inputs,\n and this property only captures type information for the input that is either\n passed in explicitly with :py:class:`AssetIn` or :py:class:`In`, or can be\n infered from type hints. 
For an asset input, the Dagster type from the upstream\n asset definition is ignored.\n """\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._dagster_type\n\n @public\n @property\n def log(self) -> "DagsterLogManager":\n """The log manager to use for this input."""\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._log\n\n @public\n @property\n def resource_config(self) -> Optional[Mapping[str, Any]]:\n """The config associated with the resource that initializes the InputManager."""\n return self._resource_config\n\n @public\n @property\n def resources(self) -> Any:\n """The resources required by the resource that initializes the\n input manager. If using the :py:func:`@input_manager` decorator, these resources\n correspond to those requested with the `required_resource_keys` parameter.\n """\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the InputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_input_context(...) as context:`"\n )\n return self._resources\n\n @public\n @property\n def has_asset_key(self) -> bool:\n """Returns True if an asset is being loaded as input, otherwise returns False. 
A return value of False\n indicates that an output from an op is being loaded as the input.\n """\n return self._asset_key is not None\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The ``AssetKey`` of the asset that is being loaded as an input."""\n if self._asset_key is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, but no asset is associated with this input"\n )\n\n return self._asset_key\n\n @public\n @property\n def asset_partitions_def(self) -> "PartitionsDefinition":\n """The PartitionsDefinition on the upstream asset corresponding to this input."""\n if self._asset_partitions_def is None:\n if self.asset_key:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {self.asset_key}, but it is not"\n " partitioned"\n )\n else:\n raise DagsterInvariantViolationError(\n "Attempting to access partitions def for asset, but input does not correspond"\n " to an asset"\n )\n\n return self._asset_partitions_def\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._step_context\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n return self._partition_key is not None\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._partition_key is None:\n check.failed(\n "Tried to access partition_key on a non-partitioned run.",\n )\n\n return self._partition_key\n\n @public\n @property\n def has_asset_partitions(self) -> bool:\n """Returns True if the asset being loaded as input is partitioned."""\n return self._asset_partitions_subset is not None\n\n @public\n @property\n def 
asset_partition_key(self) -> str:\n """The partition key for input asset.\n\n Raises an error if the input asset has no partitioning, or if the run covers a partition\n range for the input asset.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed("The input does not correspond to a partitioned asset.")\n\n partition_keys = list(subset.get_partition_keys())\n if len(partition_keys) == 1:\n return partition_keys[0]\n else:\n check.failed(\n f"Tried to access partition key for asset '{self.asset_key}', "\n f"but the number of input partitions != 1: '{subset}'."\n )\n\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed(\n "Tried to access asset_partition_key_range, but the asset is not partitioned.",\n )\n\n partition_key_ranges = subset.get_partition_key_ranges(\n dynamic_partitions_store=self.instance\n )\n if len(partition_key_ranges) != 1:\n check.failed(\n "Tried to access asset_partition_key_range, but there are "\n f"({len(partition_key_ranges)}) key ranges associated with this input.",\n )\n\n return partition_key_ranges[0]\n\n @public\n @property\n def asset_partition_keys(self) -> Sequence[str]:\n """The partition keys for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n if self._asset_partitions_subset is None:\n check.failed(\n "Tried to access asset_partition_keys, but the asset is not partitioned.",\n )\n\n return list(self._asset_partitions_subset.get_partition_keys())\n\n @public\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a 
TimeWindowPartitionsDefinition.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed(\n "Tried to access asset_partitions_time_window, but the asset is not partitioned.",\n )\n\n if not isinstance(subset, TimeWindowPartitionsSubset):\n check.failed(\n "Tried to access asset_partitions_time_window, but the asset is not partitioned"\n " with time windows.",\n )\n\n time_windows = subset.included_time_windows\n if len(time_windows) != 1:\n check.failed(\n "Tried to access asset_partitions_time_window, but there are "\n f"({len(time_windows)}) time windows associated with this input.",\n )\n\n return time_windows[0]\n\n
[docs] @public\n def get_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step input.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the input.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the input is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n if self.upstream_output is None:\n raise DagsterInvariantViolationError(\n "InputContext.upstream_output not defined. Cannot compute an identifier"\n )\n\n return self.upstream_output.get_identifier()
\n\n
[docs] @public\n def get_asset_identifier(self) -> Sequence[str]:\n """The sequence of strings making up the AssetKey for the asset being loaded as an input.\n If the asset is partitioned, the identifier contains the partition key as the final element in the\n sequence. For example, for the asset key ``AssetKey(["foo", "bar", "baz"])``, materialized with\n partition key "2023-06-01", ``get_asset_identifier`` will return ``["foo", "bar", "baz", "2023-06-01"]``.\n """\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return [*self.asset_key.path, self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset identifier for an input with no asset key")
\n\n def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_input`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n def add_input_metadata(\n self,\n metadata: Mapping[str, Any],\n description: Optional[str] = None,\n ) -> None:\n """Accepts a dictionary of metadata. Metadata entries will appear on the LOADED_INPUT event.\n If the input is an asset, metadata will be attached to an asset observation.\n\n The asset observation will be yielded from the run and appear in the event log.\n Only valid if the context has an asset key.\n """\n from dagster._core.definitions.metadata import normalize_metadata\n from dagster._core.events import DagsterEvent\n\n metadata = check.mapping_param(metadata, "metadata", key_type=str)\n self._metadata = {**self._metadata, **normalize_metadata(metadata)}\n if self.has_asset_key:\n check.opt_str_param(description, "description")\n\n observation = AssetObservation(\n asset_key=self.asset_key,\n description=description,\n partition=self.asset_partition_key if self.has_asset_partitions else None,\n metadata=metadata,\n )\n self._observations.append(observation)\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, observation))\n\n def get_observations(\n self,\n ) -> Sequence[AssetObservation]:\n """Retrieve the list of user-generated asset observations that were observed via the context.\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import IOManager, build_input_context, AssetObservation\n\n class MyIOManager(IOManager):\n def load_input(self, context, obj):\n ...\n\n def test_load_input():\n mgr = MyIOManager()\n context = build_input_context()\n mgr.load_input(context)\n observations = context.get_observations()\n ...\n """\n return self._observations\n\n def consume_metadata(self) -> Mapping[str, MetadataValue]:\n result = self._metadata\n self._metadata = {}\n return result
\n\n\n
[docs]def build_input_context(\n name: Optional[str] = None,\n config: Optional[Any] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n op_def: Optional["OpDefinition"] = None,\n step_context: Optional["StepExecutionContext"] = None,\n asset_key: Optional[CoercibleToAssetKey] = None,\n partition_key: Optional[str] = None,\n asset_partition_key_range: Optional[PartitionKeyRange] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n) -> "InputContext":\n """Builds input context from provided parameters.\n\n ``build_input_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_input_context`` must be used as a\n context manager.\n\n Args:\n name (Optional[str]): The name of the input that we're loading.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n resource_config (Optional[Dict[str, Any]]): The resource config to make available from the\n input context. 
This usually corresponds to the config provided to the resource that\n loads the input manager.\n resources (Optional[Dict[str, Any]]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n asset_key (Optional[Union[AssetKey, Sequence[str], str]]): The asset key attached to the InputDefinition.\n op_def (Optional[OpDefinition]): The definition of the op that's loading the input.\n step_context (Optional[StepExecutionContext]): For internal use.\n partition_key (Optional[str]): String value representing partition key to execute with.\n asset_partition_key_range (Optional[str]): The range of asset partition keys to load.\n asset_partitions_def: Optional[PartitionsDefinition]: The PartitionsDefinition of the asset\n being loaded.\n\n Examples:\n .. code-block:: python\n\n build_input_context()\n\n with build_input_context(resources={"foo": context_manager_resource}) as context:\n do_something\n """\n from dagster._core.definitions import OpDefinition, PartitionsDefinition\n from dagster._core.execution.context.output import OutputContext\n from dagster._core.execution.context.system import StepExecutionContext\n from dagster._core.execution.context_creation_job import initialize_console_manager\n from dagster._core.types.dagster_type import DagsterType\n\n name = check.opt_str_param(name, "name")\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n upstream_output = check.opt_inst_param(upstream_output, "upstream_output", OutputContext)\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n step_context = check.opt_inst_param(step_context, "step_context", StepExecutionContext)\n asset_key = 
AssetKey.from_coercible(asset_key) if asset_key else None\n partition_key = check.opt_str_param(partition_key, "partition_key")\n asset_partition_key_range = check.opt_inst_param(\n asset_partition_key_range, "asset_partition_key_range", PartitionKeyRange\n )\n asset_partitions_def = check.opt_inst_param(\n asset_partitions_def, "asset_partitions_def", PartitionsDefinition\n )\n if asset_partitions_def and asset_partition_key_range:\n asset_partitions_subset = asset_partitions_def.empty_subset().with_partition_key_range(\n asset_partition_key_range, dynamic_partitions_store=instance\n )\n elif asset_partition_key_range:\n asset_partitions_subset = KeyRangeNoPartitionsDefPartitionsSubset(asset_partition_key_range)\n else:\n asset_partitions_subset = None\n\n return InputContext(\n name=name,\n job_name=None,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n resource_config=resource_config,\n resources=resources,\n step_context=step_context,\n op_def=op_def,\n asset_key=asset_key,\n partition_key=partition_key,\n asset_partitions_subset=asset_partitions_subset,\n asset_partitions_def=asset_partitions_def,\n instance=instance,\n )
\n\n\nclass KeyRangeNoPartitionsDefPartitionsSubset(PartitionsSubset):\n """For build_input_context when no PartitionsDefinition has been provided."""\n\n def __init__(self, key_range: PartitionKeyRange):\n self._key_range = key_range\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n raise NotImplementedError()\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n if self._key_range.start == self._key_range.end:\n return self._key_range.start\n else:\n raise NotImplementedError()\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n return [self._key_range]\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "PartitionsSubset":\n raise NotImplementedError()\n\n def with_partition_key_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset":\n raise NotImplementedError()\n\n def serialize(self) -> str:\n raise NotImplementedError()\n\n @property\n def partitions_def(self) -> "PartitionsDefinition":\n raise NotImplementedError()\n\n def __len__(self) -> int:\n raise NotImplementedError()\n\n def __contains__(self, value) -> bool:\n raise NotImplementedError()\n\n @classmethod\n def from_serialized(\n cls, partitions_def: "PartitionsDefinition", serialized: str\n ) -> "PartitionsSubset":\n raise NotImplementedError()\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: "PartitionsDefinition",\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n raise NotImplementedError()\n\n @classmethod\n def empty_subset(cls, partitions_def: "PartitionsDefinition") 
-> "PartitionsSubset":\n raise NotImplementedError()\n
", "current_page_name": "_modules/dagster/_core/execution/context/input", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.input"}, "invocation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.invocation

\nfrom contextlib import ExitStack\nfrom typing import (\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.composition import PendingNodeInvocation\nfrom dagster._core.definitions.decorators.op_decorator import DecoratedOpFunction\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    UserEvent,\n)\nfrom dagster._core.definitions.hook_definition import HookDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster._core.definitions.resource_requirement import ensure_requirements_satisfied\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidInvocationError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.execution.build_resources import build_resources, wrap_resources_for_execution\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.types.dagster_type import 
DagsterType\nfrom dagster._utils.forked_pdb import ForkedPdb\nfrom dagster._utils.merger import merge_dicts\n\nfrom .compute import OpExecutionContext\nfrom .system import StepExecutionContext, TypeCheckContext\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set on the context when a solid is directly invoked."\n    )\n\n\nclass UnboundOpExecutionContext(OpExecutionContext):\n    """The ``context`` object available as the first argument to a solid's compute function when\n    being invoked directly. Can also be used as a context manager.\n    """\n\n    def __init__(\n        self,\n        op_config: Any,\n        resources_dict: Mapping[str, Any],\n        resources_config: Mapping[str, Any],\n        instance: Optional[DagsterInstance],\n        partition_key: Optional[str],\n        partition_key_range: Optional[PartitionKeyRange],\n        mapping_key: Optional[str],\n        assets_def: Optional[AssetsDefinition],\n    ):\n        from dagster._core.execution.api import ephemeral_instance_if_missing\n        from dagster._core.execution.context_creation_job import initialize_console_manager\n\n        self._op_config = op_config\n        self._mapping_key = mapping_key\n\n        self._exit_stack = ExitStack()\n\n        # Construct ephemeral instance if missing\n        self._instance = self._exit_stack.enter_context(ephemeral_instance_if_missing(instance))\n\n        self._resources_config = resources_config\n        # Open resource context manager\n        self._resources_contain_cm = False\n        self._resource_defs = wrap_resources_for_execution(resources_dict)\n        self._resources = self._exit_stack.enter_context(\n            build_resources(\n                resources=self._resource_defs,\n                instance=self._instance,\n                resource_config=resources_config,\n            )\n        )\n        self._resources_contain_cm = 
isinstance(self._resources, IContainsGenerator)\n\n        self._log = initialize_console_manager(None)\n        self._pdb: Optional[ForkedPdb] = None\n        self._cm_scope_entered = False\n        check.invariant(\n            not (partition_key and partition_key_range),\n            "Must supply at most one of partition_key or partition_key_range",\n        )\n        self._partition_key = partition_key\n        self._partition_key_range = partition_key_range\n        self._user_events: List[UserEvent] = []\n        self._output_metadata: Dict[str, Any] = {}\n\n        self._assets_def = check.opt_inst_param(assets_def, "assets_def", AssetsDefinition)\n\n    def __enter__(self):\n        self._cm_scope_entered = True\n        return self\n\n    def __exit__(self, *exc):\n        self._exit_stack.close()\n\n    def __del__(self):\n        self._exit_stack.close()\n\n    @property\n    def op_config(self) -> Any:\n        return self._op_config\n\n    @property\n    def resource_keys(self) -> AbstractSet[str]:\n        return self._resource_defs.keys()\n\n    @property\n    def resources(self) -> Resources:\n        if self._resources_contain_cm and not self._cm_scope_entered:\n            raise DagsterInvariantViolationError(\n                "At least one provided resource is a generator, but attempting to access "\n                "resources outside of context manager scope. You can use the following syntax to "\n                "open a context manager: `with build_op_context(...) as context:`"\n            )\n        return self._resources\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        Example:\n        .. 
code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> dict:\n        raise DagsterInvalidPropertyError(_property_msg("run_config", "property"))\n\n    @property\n    def job_def(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("job_def", "property"))\n\n    @property\n    def job_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("job_name", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def node_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("solid_handle", "property"))\n\n    @property\n    def op(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("op", "property"))\n\n    @property\n    def solid(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("solid", "property"))\n\n    @property\n    def op_def(self) -> OpDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("op_def", "property"))\n\n    @property\n    def assets_def(self) -> AssetsDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("assets_def", "property"))\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    @property\n    def partition_key(self) -> str:\n        if 
self._partition_key:\n            return self._partition_key\n        check.failed("Tried to access partition_key for a non-partitioned run")\n\n    @property\n    def partition_key_range(self) -> PartitionKeyRange:\n        """The range of partition keys for the current run.\n\n        If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n        end. Raises an error if the current run is not a partitioned run.\n        """\n        if self._partition_key_range:\n            return self._partition_key_range\n        elif self._partition_key:\n            return PartitionKeyRange(self._partition_key, self._partition_key)\n        else:\n            check.failed("Tried to access partition_key range for a non-partitioned run")\n\n    def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n        return self.partition_key\n\n    def has_tag(self, key: str) -> bool:\n        raise DagsterInvalidPropertyError(_property_msg("has_tag", "method"))\n\n    def get_tag(self, key: str) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("get_tag", "method"))\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def bind(\n        self,\n        op_def: OpDefinition,\n        pending_invocation: Optional[PendingNodeInvocation[OpDefinition]],\n        assets_def: Optional[AssetsDefinition],\n        config_from_args: Optional[Mapping[str, Any]],\n        resources_from_args: Optional[Mapping[str, Any]],\n    ) -> "BoundOpExecutionContext":\n        from dagster._core.definitions.resource_invocation import resolve_bound_config\n\n        if resources_from_args:\n            if self._resource_defs:\n                raise DagsterInvalidInvocationError(\n                    "Cannot provide resources in both context and kwargs"\n                )\n            resource_defs = 
wrap_resources_for_execution(resources_from_args)\n            # add new resources context to the stack to be cleared on exit\n            resources = self._exit_stack.enter_context(\n                build_resources(resource_defs, self.instance)\n            )\n        elif assets_def and assets_def.resource_defs:\n            for key in sorted(list(assets_def.resource_defs.keys())):\n                if key in self._resource_defs:\n                    raise DagsterInvalidInvocationError(\n                        f"Error when invoking {assets_def!s} resource '{key}' "\n                        "provided on both the definition and invocation context. Please "\n                        "provide on only one or the other."\n                    )\n            resource_defs = wrap_resources_for_execution(\n                {**self._resource_defs, **assets_def.resource_defs}\n            )\n            # add new resources context to the stack to be cleared on exit\n            resources = self._exit_stack.enter_context(\n                build_resources(resource_defs, self.instance, self._resources_config)\n            )\n        else:\n            resources = self.resources\n            resource_defs = self._resource_defs\n\n        _validate_resource_requirements(resource_defs, op_def)\n\n        if self.op_config and config_from_args:\n            raise DagsterInvalidInvocationError("Cannot provide config in both context and kwargs")\n        op_config = resolve_bound_config(config_from_args or self.op_config, op_def)\n\n        return BoundOpExecutionContext(\n            op_def=op_def,\n            op_config=op_config,\n            resources=resources,\n            resources_config=self._resources_config,\n            instance=self.instance,\n            log_manager=self.log,\n            pdb=self.pdb,\n            tags=(\n                pending_invocation.tags\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n          
  ),\n            hook_defs=(\n                pending_invocation.hook_defs\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n            ),\n            alias=(\n                pending_invocation.given_alias\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n            ),\n            user_events=self._user_events,\n            output_metadata=self._output_metadata,\n            mapping_key=self._mapping_key,\n            partition_key=self._partition_key,\n            partition_key_range=self._partition_key_range,\n            assets_def=assets_def,\n        )\n\n    def get_events(self) -> Sequence[UserEvent]:\n        """Retrieve the list of user-generated events that were logged via the context.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import op, build_op_context, AssetMaterialization, ExpectationResult\n\n            @op\n            def my_op(context):\n                ...\n\n            def test_my_op():\n                context = build_op_context()\n                my_op(context)\n                all_user_events = context.get_events()\n                materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n                expectation_results = [event for event in all_user_events if isinstance(event, ExpectationResult)]\n                ...\n        """\n        return self._user_events\n\n    def get_output_metadata(\n        self, output_name: str, mapping_key: Optional[str] = None\n    ) -> Optional[Mapping[str, Any]]:\n        """Retrieve metadata that was logged for an output and mapping_key, if it exists.\n\n        If metadata cannot be found for the particular output_name/mapping_key combination, None will be returned.\n\n        Args:\n            output_name (str): The name of the output to retrieve logged metadata for.\n            mapping_key 
(Optional[str]): The mapping key to retrieve metadata for (only applies when using dynamic outputs).\n\n        Returns:\n            Optional[Mapping[str, Any]]: The metadata values present for the output_name/mapping_key combination, if present.\n        """\n        metadata = self._output_metadata.get(output_name)\n        if mapping_key and metadata:\n            return metadata.get(mapping_key)\n        return metadata\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n\ndef _validate_resource_requirements(\n    resource_defs: Mapping[str, ResourceDefinition], op_def: OpDefinition\n) -> None:\n    """Validate correctness of resources against required resource keys."""\n    if cast(DecoratedOpFunction, op_def.compute_fn).has_context_arg():\n        for requirement in op_def.get_resource_requirements():\n            if not requirement.is_io_manager_requirement:\n                ensure_requirements_satisfied(resource_defs, [requirement])\n\n\nclass BoundOpExecutionContext(OpExecutionContext):\n    """The op execution context that is passed to the compute function during invocation.\n\n    This context is bound to a specific op definition, for which the resources and config have\n    been validated.\n    """\n\n    _op_def: OpDefinition\n    _op_config: Any\n    _resources: "Resources"\n    _resources_config: Mapping[str, Any]\n    _instance: DagsterInstance\n    _log_manager: DagsterLogManager\n    _pdb: Optional[ForkedPdb]\n    _tags: Mapping[str, str]\n    _hook_defs: Optional[AbstractSet[HookDefinition]]\n    _alias: str\n    _user_events: List[UserEvent]\n    _seen_outputs: Dict[str, Union[str, Set[str]]]\n    _output_metadata: Dict[str, Any]\n    _mapping_key: Optional[str]\n    _partition_key: Optional[str]\n    _partition_key_range: Optional[PartitionKeyRange]\n    _assets_def: Optional[AssetsDefinition]\n\n    def __init__(\n        self,\n        op_def: OpDefinition,\n        op_config: Any,\n        resources: 
"Resources",\n        resources_config: Mapping[str, Any],\n        instance: DagsterInstance,\n        log_manager: DagsterLogManager,\n        pdb: Optional[ForkedPdb],\n        tags: Optional[Mapping[str, str]],\n        hook_defs: Optional[AbstractSet[HookDefinition]],\n        alias: Optional[str],\n        user_events: List[UserEvent],\n        output_metadata: Dict[str, Any],\n        mapping_key: Optional[str],\n        partition_key: Optional[str],\n        partition_key_range: Optional[PartitionKeyRange],\n        assets_def: Optional[AssetsDefinition],\n    ):\n        self._op_def = op_def\n        self._op_config = op_config\n        self._resources = resources\n        self._instance = instance\n        self._log = log_manager\n        self._pdb = pdb\n        self._tags = merge_dicts(self._op_def.tags, tags) if tags else self._op_def.tags\n        self._hook_defs = hook_defs\n        self._alias = alias if alias else self._op_def.name\n        self._resources_config = resources_config\n        self._user_events = user_events\n        self._seen_outputs = {}\n        self._output_metadata = output_metadata\n        self._mapping_key = mapping_key\n        self._partition_key = partition_key\n        self._partition_key_range = partition_key_range\n        self._assets_def = assets_def\n        self._requires_typed_event_stream = False\n        self._typed_event_stream_error_message = None\n\n    @property\n    def op_config(self) -> Any:\n        return self._op_config\n\n    @property\n    def resources(self) -> Resources:\n        return self._resources\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        
Example:\n        .. code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        run_config: Dict[str, object] = {}\n        if self._op_config:\n            run_config["ops"] = {self._op_def.name: {"config": self._op_config}}\n        run_config["resources"] = self._resources_config\n        return run_config\n\n    @property\n    def job_def(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("job_def", "property"))\n\n    @property\n    def job_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("job_name", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def node_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("node_handle", "property"))\n\n    @property\n    def op(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("op", "property"))\n\n    @property\n    def op_def(self) -> OpDefinition:\n        return self._op_def\n\n    @property\n    def has_assets_def(self) -> bool:\n        return self._assets_def is not None\n\n    @property\n    def assets_def(self) -> AssetsDefinition:\n        if self._assets_def is None:\n            raise DagsterInvalidPropertyError(\n                f"Op {self.op_def.name} does not have an assets definition."\n  
          )\n        return self._assets_def\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    def has_tag(self, key: str) -> bool:\n        return key in self._tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        return self._tags.get(key)\n\n    @property\n    def alias(self) -> str:\n        return self._alias\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def for_type(self, dagster_type: DagsterType) -> TypeCheckContext:\n        resources = cast(NamedTuple, self.resources)\n        return TypeCheckContext(\n            self.run_id,\n            self.log,\n            ScopedResourcesBuilder(resources._asdict()),\n            dagster_type,\n        )\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n    def describe_op(self) -> str:\n        if isinstance(self.op_def, OpDefinition):\n            return f'op "{self.op_def.name}"'\n\n        return f'solid "{self.op_def.name}"'\n\n    def log_event(self, event: UserEvent) -> None:\n        check.inst_param(\n            event,\n            "event",\n            (AssetMaterialization, AssetObservation, ExpectationResult),\n        )\n        self._user_events.append(event)\n\n    def observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n        if mapping_key:\n            if output_name not in self._seen_outputs:\n                self._seen_outputs[output_name] = set()\n            cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n        else:\n            self._seen_outputs[output_name] = "seen"\n\n    def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n        if mapping_key:\n            return (\n                output_name in self._seen_outputs and mapping_key in 
self._seen_outputs[output_name]\n            )\n        return output_name in self._seen_outputs\n\n    @property\n    def partition_key(self) -> str:\n        if self._partition_key is not None:\n            return self._partition_key\n        check.failed("Tried to access partition_key for a non-partitioned asset")\n\n    @property\n    def partition_key_range(self) -> PartitionKeyRange:\n        """The range of partition keys for the current run.\n\n        If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n        end. Raises an error if the current run is not a partitioned run.\n        """\n        if self._partition_key_range:\n            return self._partition_key_range\n        elif self._partition_key:\n            return PartitionKeyRange(self._partition_key, self._partition_key)\n        else:\n            check.failed("Tried to access partition_key range for a non-partitioned run")\n\n    def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n        return self.partition_key\n\n    def asset_partitions_time_window_for_output(self, output_name: str = "result") -> TimeWindow:\n        partitions_def = self.assets_def.partitions_def\n        if partitions_def is None:\n            check.failed("Tried to access partition_key for a non-partitioned asset")\n\n        if not has_one_dimension_time_window_partitioning(partitions_def=partitions_def):\n            raise DagsterInvariantViolationError(\n                "Expected a TimeWindowPartitionsDefinition or MultiPartitionsDefinition with a"\n                f" single time dimension, but instead found {type(partitions_def)}"\n            )\n\n        return cast(\n            Union[MultiPartitionsDefinition, TimeWindowPartitionsDefinition], partitions_def\n        ).time_window_for_partition_key(self.partition_key)\n\n    def add_output_metadata(\n        self,\n        metadata: Mapping[str, Any],\n        output_name: Optional[str] = 
None,\n        mapping_key: Optional[str] = None,\n    ) -> None:\n        """Add metadata to one of the outputs of an op.\n\n        This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.\n\n        Args:\n            metadata (Mapping[str, Any]): The metadata to attach to the output\n            output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import Out, op\n            from typing import Tuple\n\n            @op\n            def add_metadata(context):\n                context.add_output_metadata({"foo", "bar"})\n                return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n            @op(out={"a": Out(), "b": Out()})\n            def add_metadata_two_outputs(context) -> Tuple[str, int]:\n                context.add_output_metadata({"foo": "bar"}, output_name="b")\n                context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n                return ("dog", 5)\n\n        """\n        metadata = check.mapping_param(metadata, "metadata", key_type=str)\n        output_name = check.opt_str_param(output_name, "output_name")\n        mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n        if output_name is None and len(self.op_def.output_defs) == 1:\n            output_def = self.op_def.output_defs[0]\n            output_name = output_def.name\n        elif output_name is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to log metadata without providing output_name, but multiple outputs"\n                " exist. 
Please provide an output_name to the invocation of"\n                " `context.add_output_metadata`."\n            )\n        else:\n            output_def = self.op_def.output_def_named(output_name)\n\n        if self.has_seen_output(output_name, mapping_key):\n            output_desc = (\n                f"output '{output_def.name}'"\n                if not mapping_key\n                else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n            )\n            raise DagsterInvariantViolationError(\n                f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log output"\n                f" metadata for {output_desc} which has already been yielded. Metadata must be"\n                " logged before the output is yielded."\n            )\n        if output_def.is_dynamic and not mapping_key:\n            raise DagsterInvariantViolationError(\n                f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log metadata"\n                f" for dynamic output '{output_def.name}' without providing a mapping key. 
When"\n                " logging metadata for a dynamic output, it is necessary to provide a mapping key."\n            )\n\n        output_name = output_def.name\n        if output_name in self._output_metadata:\n            if not mapping_key or mapping_key in self._output_metadata[output_name]:\n                raise DagsterInvariantViolationError(\n                    f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log"\n                    f" metadata for output '{output_name}' more than once."\n                )\n        if mapping_key:\n            if output_name not in self._output_metadata:\n                self._output_metadata[output_name] = {}\n            self._output_metadata[output_name][mapping_key] = metadata\n\n        else:\n            self._output_metadata[output_name] = metadata\n\n    # In this mode no conversion is done on returned values and missing but expected outputs are not\n    # allowed.\n    @property\n    def requires_typed_event_stream(self) -> bool:\n        return self._requires_typed_event_stream\n\n    @property\n    def typed_event_stream_error_message(self) -> Optional[str]:\n        return self._typed_event_stream_error_message\n\n    def set_requires_typed_event_stream(self, *, error_message: Optional[str]) -> None:\n        self._requires_typed_event_stream = True\n        self._typed_event_stream_error_message = error_message\n\n\n
[docs]def build_op_context(\n resources: Optional[Mapping[str, Any]] = None,\n op_config: Any = None,\n resources_config: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n config: Any = None,\n partition_key: Optional[str] = None,\n partition_key_range: Optional[PartitionKeyRange] = None,\n mapping_key: Optional[str] = None,\n _assets_def: Optional[AssetsDefinition] = None,\n) -> UnboundOpExecutionContext:\n """Builds op execution context from provided parameters.\n\n ``build_op_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_op_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking a op.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n op_config (Optional[Mapping[str, Any]]): The config to provide to the op.\n resources_config (Optional[Mapping[str, Any]]): The config to provide to the resources.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n mapping_key (Optional[str]): A key representing the mapping key from an upstream dynamic\n output. Can be accessed using ``context.get_mapping_key()``.\n partition_key (Optional[str]): String value representing partition key to execute with.\n partition_key_range (Optional[PartitionKeyRange]): Partition key range to execute with.\n _assets_def (Optional[AssetsDefinition]): Internal argument that populates the op's assets\n definition, not meant to be populated by users.\n\n Examples:\n .. 
code-block:: python\n\n context = build_op_context()\n op_to_invoke(context)\n\n with build_op_context(resources={"foo": context_manager_resource}) as context:\n op_to_invoke(context)\n """\n if op_config and config:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke ``build_op_context`` with both ``op_config``, and its "\n "legacy version, ``config``. Please provide one or the other."\n )\n\n op_config = op_config if op_config else config\n return UnboundOpExecutionContext(\n resources_dict=check.opt_mapping_param(resources, "resources", key_type=str),\n resources_config=check.opt_mapping_param(\n resources_config, "resources_config", key_type=str\n ),\n op_config=op_config,\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n partition_key_range=check.opt_inst_param(\n partition_key_range, "partition_key_range", PartitionKeyRange\n ),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n assets_def=check.opt_inst_param(_assets_def, "_assets_def", AssetsDefinition),\n )
\n\n\n
[docs]def build_asset_context(\n resources: Optional[Mapping[str, Any]] = None,\n resources_config: Optional[Mapping[str, Any]] = None,\n asset_config: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n partition_key_range: Optional[PartitionKeyRange] = None,\n):\n """Builds asset execution context from provided parameters.\n\n ``build_asset_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_asset_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking an asset.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n resources_config (Optional[Mapping[str, Any]]): The config to provide to the resources.\n asset_config (Optional[Mapping[str, Any]]): The config to provide to the asset.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n partition_key (Optional[str]): String value representing partition key to execute with.\n partition_key_range (Optional[PartitionKeyRange]): Partition key range to execute with.\n\n Examples:\n .. code-block:: python\n\n context = build_asset_context()\n asset_to_invoke(context)\n\n with build_asset_context(resources={"foo": context_manager_resource}) as context:\n asset_to_invoke(context)\n """\n return build_op_context(\n op_config=asset_config,\n resources=resources,\n resources_config=resources_config,\n partition_key=partition_key,\n partition_key_range=partition_key_range,\n instance=instance,\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/invocation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.invocation"}, "logger": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.logger

\nfrom typing import Any, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nfrom .output import RUN_ID_PLACEHOLDER\n\n\n
[docs]class InitLoggerContext:\n """The context object available as the argument to the initialization function of a :py:class:`dagster.LoggerDefinition`.\n\n Users should not instantiate this object directly. To construct an\n `InitLoggerContext` for testing purposes, use :py:func:`dagster.\n build_init_logger_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import logger, InitLoggerContext\n\n @logger\n def hello_world(init_context: InitLoggerContext):\n ...\n\n """\n\n def __init__(\n self,\n logger_config: Any,\n logger_def: Optional[LoggerDefinition] = None,\n job_def: Optional[JobDefinition] = None,\n run_id: Optional[str] = None,\n ):\n self._logger_config = logger_config\n self._job_def = check.opt_inst_param(job_def, "job_def", JobDefinition)\n self._logger_def = check.opt_inst_param(logger_def, "logger_def", LoggerDefinition)\n self._run_id = check.opt_str_param(run_id, "run_id")\n\n @public\n @property\n def logger_config(self) -> Any:\n """The configuration data provided by the run config. The\n schema for this data is defined by ``config_schema`` on the :py:class:`LoggerDefinition`.\n """\n return self._logger_config\n\n @property\n def job_def(self) -> Optional[JobDefinition]:\n """The job definition currently being executed."""\n return self._job_def\n\n @public\n @property\n def logger_def(self) -> Optional[LoggerDefinition]:\n """The logger definition for the logger being constructed."""\n return self._logger_def\n\n @public\n @property\n def run_id(self) -> Optional[str]:\n """The ID for this run of the job."""\n return self._run_id
\n\n\nclass UnboundInitLoggerContext(InitLoggerContext):\n """Logger initialization context outputted by ``build_init_logger_context``.\n\n Represents a context whose config has not yet been validated against a logger definition, hence\n the inability to access the `logger_def` attribute. When an instance of\n ``UnboundInitLoggerContext`` is passed to ``LoggerDefinition.initialize``, config is validated,\n and it is subsumed into an `InitLoggerContext`, which contains the logger_def validated against.\n """\n\n def __init__(self, logger_config: Any, job_def: Optional[JobDefinition]):\n super(UnboundInitLoggerContext, self).__init__(\n logger_config, logger_def=None, job_def=job_def, run_id=None\n )\n\n @property\n def logger_def(self) -> LoggerDefinition:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def run_id(self) -> Optional[str]:\n return RUN_ID_PLACEHOLDER\n
", "current_page_name": "_modules/dagster/_core/execution/context/logger", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.logger"}, "output": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.output

\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    ContextManager,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_layer import AssetOutputInfo\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    CoercibleToAssetKey,\n)\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n)\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.errors import DagsterInvalidMetadata, DagsterInvariantViolationError\nfrom dagster._core.execution.plan.utils import build_resources_for_manager\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import JobDefinition, PartitionsDefinition\n    from dagster._core.definitions.op_definition import OpDefinition\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.execution.plan.outputs import StepOutputHandle\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.log_manager import DagsterLogManager\n    from dagster._core.system_config.objects import ResolvedRunConfig\n    from dagster._core.types.dagster_type import DagsterType\n\nRUN_ID_PLACEHOLDER = "__EPHEMERAL_RUN_ID"\n\n\n
[docs]class OutputContext:\n """The context object that is available to the `handle_output` method of an :py:class:`IOManager`.\n\n Users should not instantiate this object directly. To construct an\n `OutputContext` for testing an IO Manager's `handle_output` method, use\n :py:func:`dagster.build_output_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import IOManager, OutputContext\n\n class MyIOManager(IOManager):\n def handle_output(self, context: OutputContext, obj):\n ...\n """\n\n _step_key: Optional[str]\n _name: Optional[str]\n _job_name: Optional[str]\n _run_id: Optional[str]\n _metadata: ArbitraryMetadataMapping\n _user_generated_metadata: Mapping[str, MetadataValue]\n _mapping_key: Optional[str]\n _config: object\n _op_def: Optional["OpDefinition"]\n _dagster_type: Optional["DagsterType"]\n _log: Optional["DagsterLogManager"]\n _version: Optional[str]\n _resource_config: Optional[Mapping[str, object]]\n _step_context: Optional["StepExecutionContext"]\n _asset_info: Optional[AssetOutputInfo]\n _warn_on_step_context_use: bool\n _resources: Optional["Resources"]\n _resources_cm: Optional[ContextManager["Resources"]]\n _resources_contain_cm: Optional[bool]\n _cm_scope_entered: Optional[bool]\n _events: List["DagsterEvent"]\n _user_events: List[Union[AssetMaterialization, AssetObservation]]\n\n def __init__(\n self,\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n job_name: Optional[str] = None,\n run_id: Optional[str] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n mapping_key: Optional[str] = None,\n config: object = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Mapping[str, object]] = None,\n resources: Optional[Union["Resources", Mapping[str, object]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_info: 
Optional[AssetOutputInfo] = None,\n warn_on_step_context_use: bool = False,\n partition_key: Optional[str] = None,\n ):\n from dagster._core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster._core.execution.build_resources import build_resources\n\n self._step_key = step_key\n self._name = name\n self._job_name = job_name\n self._run_id = run_id\n self._metadata = metadata or {}\n self._mapping_key = mapping_key\n self._config = config\n self._op_def = op_def\n self._dagster_type = dagster_type\n self._log = log_manager\n self._version = version\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_info = asset_info\n self._warn_on_step_context_use = warn_on_step_context_use\n if self._step_context and self._step_context.has_partition_key:\n self._partition_key: Optional[str] = self._step_context.partition_key\n else:\n self._partition_key = partition_key\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_mapping_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events = []\n self._user_events = []\n self._user_generated_metadata = {}\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if (\n hasattr(self, "_resources_cm")\n and self._resources_cm\n and self._resources_contain_cm\n and not self._cm_scope_entered\n ):\n self._resources_cm.__exit__(None, None, None)\n\n @public\n @property\n def step_key(self) -> str:\n """The step_key for the compute step that produced the output."""\n if self._step_key is None:\n raise DagsterInvariantViolationError(\n "Attempting 
to access step_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._step_key\n\n @public\n @property\n def name(self) -> str:\n """The name of the output that produced the output."""\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._name\n\n @property\n def job_name(self) -> str:\n if self._job_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access pipeline_name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._job_name\n\n @public\n @property\n def run_id(self) -> str:\n """The id of the run that produced the output."""\n if self._run_id is None:\n raise DagsterInvariantViolationError(\n "Attempting to access run_id, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._run_id\n\n @public\n @property\n def metadata(self) -> Optional[ArbitraryMetadataMapping]:\n """A dict of the metadata that is assigned to the OutputDefinition that produced\n the output.\n """\n return self._metadata\n\n @public\n @property\n def mapping_key(self) -> Optional[str]:\n """The key that identifies a unique mapped output. 
None for regular outputs."""\n return self._mapping_key\n\n @public\n @property\n def config(self) -> Any:\n """The configuration for the output."""\n return self._config\n\n @public\n @property\n def op_def(self) -> "OpDefinition":\n """The definition of the op that produced the output."""\n from dagster._core.definitions import OpDefinition\n\n if self._op_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return cast(OpDefinition, self._op_def)\n\n @public\n @property\n def dagster_type(self) -> "DagsterType":\n """The type of this output."""\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._dagster_type\n\n @public\n @property\n def log(self) -> "DagsterLogManager":\n """The log manager to use for this output."""\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._log\n\n @public\n @property\n def version(self) -> Optional[str]:\n """(Experimental) The version of the output."""\n return self._version\n\n @public\n @property\n def resource_config(self) -> Optional[Mapping[str, object]]:\n """The config associated with the resource that initializes the InputManager."""\n return self._resource_config\n\n @public\n @property\n def resources(self) -> Any:\n """The resources required by the output manager, specified by the `required_resource_keys`\n parameter.\n """\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the OutputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided 
resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_output_context(...) as context:`"\n )\n return self._resources\n\n @property\n def asset_info(self) -> Optional[AssetOutputInfo]:\n """(Experimental) Asset info corresponding to the output."""\n return self._asset_info\n\n @public\n @property\n def has_asset_key(self) -> bool:\n """Returns True if an asset is being stored, otherwise returns False. A return value of False\n indicates that an output from an op is being stored.\n """\n return self._asset_info is not None\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The ``AssetKey`` of the asset that is being stored as an output."""\n if self._asset_info is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._asset_info.key\n\n @public\n @property\n def asset_partitions_def(self) -> "PartitionsDefinition":\n """The PartitionsDefinition on the asset corresponding to this output."""\n asset_key = self.asset_key\n result = self.step_context.job_def.asset_layer.partitions_def_for_asset(asset_key)\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.step_context"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the 
OutputContext"\n )\n\n return self._step_context\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self._partition_key is not None\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._partition_key is None:\n check.failed(\n "Tried to access partition_key on a non-partitioned run.",\n )\n\n return self._partition_key\n\n @public\n @property\n def has_asset_partitions(self) -> bool:\n """Returns True if the asset being stored is partitioned."""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_asset_partitions"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is not None:\n return self._step_context.has_asset_partitions_for_output(self.name)\n else:\n return False\n\n @public\n @property\n def asset_partition_key(self) -> str:\n """The partition key for output asset.\n\n Raises an error if the output asset has no partitioning, or if the run covers a partition\n range for the output 
asset.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_for_output(self.name)\n\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key_range"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_range_for_output(self.name)\n\n @public\n @property\n def asset_partition_keys(self) -> Sequence[str]:\n """The partition keys for the output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_keys"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.asset_partitions_def.get_partition_keys_in_range(\n self.step_context.asset_partition_key_range_for_output(self.name),\n dynamic_partitions_store=self.step_context.instance,\n )\n\n @public\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset 
is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partitions_time_window"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partitions_time_window_for_output(self.name)\n\n def get_run_scoped_output_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n The unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n Returns:\n Sequence[str, ...]: A list of identifiers, i.e. run id, step key, and output name\n """\n warnings.warn(\n "`OutputContext.get_run_scoped_output_identifier` is deprecated. 
Use "\n "`OutputContext.get_identifier` instead."\n )\n # if run_id is None and this is a re-execution, it means we failed to find its source run id\n check.invariant(\n self.run_id is not None,\n "Unable to find the run scoped output identifier: run_id is None on OutputContext.",\n )\n check.invariant(\n self.step_key is not None,\n "Unable to find the run scoped output identifier: step_key is None on OutputContext.",\n )\n check.invariant(\n self.name is not None,\n "Unable to find the run scoped output identifier: name is None on OutputContext.",\n )\n run_id = cast(str, self.run_id)\n step_key = cast(str, self.step_key)\n name = cast(str, self.name)\n\n if self.mapping_key:\n return [run_id, step_key, name, self.mapping_key]\n\n return [run_id, step_key, name]\n\n
[docs] @public\n def get_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n Sequence[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n version = self.version\n step_key = self.step_key\n name = self.name\n if version is not None:\n check.invariant(\n self.mapping_key is None,\n f"Mapping key and version both provided for output '{name}' of step"\n f" '{step_key}'. Dynamic mapping is not supported when using versioning.",\n )\n identifier = ["versioned_outputs", version, step_key, name]\n else:\n run_id = self.run_id\n identifier = [run_id, step_key, name]\n if self.mapping_key:\n identifier.append(self.mapping_key)\n\n return identifier
\n\n def get_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_output_identifier` is deprecated. Use "\n "`OutputContext.get_identifier` instead."\n )\n\n return self.get_identifier()\n\n
[docs] @public\n def get_asset_identifier(self) -> Sequence[str]:\n """The sequence of strings making up the AssetKey for the asset being stored as an output.\n If the asset is partitioned, the identifier contains the partition key as the final element in the\n sequence. For example, for the asset key ``AssetKey(["foo", "bar", "baz"])`` materialized with\n partition key "2023-06-01", ``get_asset_identifier`` will return ``["foo", "bar", "baz", "2023-06-01"]``.\n """\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return [*self.asset_key.path, self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset output identifier for an output with no asset key")
\n\n def get_asset_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_asset_output_identifier` is deprecated. Use "\n "`OutputContext.get_asset_identifier` instead."\n )\n\n return self.get_asset_identifier()\n\n
[docs] @public\n def log_event(self, event: Union[AssetObservation, AssetMaterialization]) -> None:\n """Log an AssetMaterialization or AssetObservation from within the body of an io manager's `handle_output` method.\n\n Events logged with this method will appear in the event log.\n\n Args:\n event (Union[AssetMaterialization, AssetObservation]): The event to log.\n\n Examples:\n .. code-block:: python\n\n from dagster import IOManager, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.log_event(AssetMaterialization("foo"))\n """\n from dagster._core.events import DagsterEvent\n\n if isinstance(event, (AssetMaterialization)):\n if self._step_context:\n self._events.append(DagsterEvent.asset_materialization(self._step_context, event))\n self._user_events.append(event)\n elif isinstance(event, AssetObservation):\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, event))\n self._user_events.append(event)\n else:\n check.failed(f"Unexpected event {event}")
\n\n def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_output`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n def get_logged_events(\n self,\n ) -> Sequence[Union[AssetMaterialization, AssetObservation]]:\n """Retrieve the list of user-generated events that were logged via the context.\n\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import IOManager, build_output_context, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n ...\n\n def test_handle_output():\n mgr = MyIOManager()\n context = build_output_context()\n mgr.handle_output(context)\n all_user_events = context.get_logged_events()\n materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n ...\n """\n return self._user_events\n\n
[docs] @public\n def add_output_metadata(self, metadata: Mapping[str, RawMetadataValue]) -> None:\n """Add a dictionary of metadata to the handled output.\n\n Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.\n\n Args:\n metadata (Mapping[str, RawMetadataValue]): A metadata dictionary to log\n\n Examples:\n .. code-block:: python\n\n from dagster import IOManager\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.add_output_metadata({"foo": "bar"})\n """\n from dagster._core.definitions.metadata import normalize_metadata\n\n overlapping_labels = set(self._user_generated_metadata.keys()) & metadata.keys()\n if overlapping_labels:\n raise DagsterInvalidMetadata(\n f"Tried to add metadata for key(s) that already have metadata: {overlapping_labels}"\n )\n\n self._user_generated_metadata = {\n **self._user_generated_metadata,\n **normalize_metadata(metadata),\n }
\n\n def get_logged_metadata(\n self,\n ) -> Mapping[str, MetadataValue]:\n """Get the mapping of metadata entries that have been logged for use with this output."""\n return self._user_generated_metadata\n\n def consume_logged_metadata(\n self,\n ) -> Mapping[str, MetadataValue]:\n """Pops and yields all user-generated metadata entries that have been recorded from this context.\n\n If consume_logged_metadata has not yet been called, this will yield all logged events since\n the call to `handle_output`. If consume_logged_metadata has been called, it will yield all\n events since the last time consume_logged_metadata_entries was called. Designed for internal\n use. Users should never need to invoke this method.\n """\n result = self._user_generated_metadata\n self._user_generated_metadata = {}\n return result or {}
\n\n\ndef get_output_context(\n execution_plan: "ExecutionPlan",\n job_def: "JobDefinition",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n run_id: Optional[str],\n log_manager: Optional["DagsterLogManager"],\n step_context: Optional["StepExecutionContext"],\n resources: Optional["Resources"],\n version: Optional[str],\n warn_on_step_context_use: bool = False,\n) -> "OutputContext":\n """Args:\n run_id (str): The run ID of the run that produced the output, not necessarily the run that\n the context will be used in.\n """\n step = execution_plan.get_step_by_key(step_output_handle.step_key)\n # get config\n op_config = resolved_run_config.ops[step.node_handle.to_string()]\n outputs_config = op_config.outputs\n\n if outputs_config:\n output_config = outputs_config.get_output_manager_config(step_output_handle.output_name)\n else:\n output_config = None\n\n step_output = execution_plan.get_step_output(step_output_handle)\n output_def = job_def.get_node(step_output.node_handle).output_def_named(step_output.name)\n\n io_manager_key = output_def.io_manager_key\n resource_config = resolved_run_config.resources[io_manager_key].config\n\n node_handle = execution_plan.get_step_by_key(step.key).node_handle\n asset_info = job_def.asset_layer.asset_info_for_output(\n node_handle=node_handle, output_name=step_output.name\n )\n if asset_info is not None:\n metadata = job_def.asset_layer.metadata_for_asset(asset_info.key) or output_def.metadata\n else:\n metadata = output_def.metadata\n\n if step_context:\n check.invariant(\n not resources,\n "Expected either resources or step context to be set, but "\n "received both. 
If step context is provided, resources for IO manager will be "\n "retrieved off of that.",\n )\n resources = build_resources_for_manager(io_manager_key, step_context)\n\n return OutputContext(\n step_key=step_output_handle.step_key,\n name=step_output_handle.output_name,\n job_name=job_def.name,\n run_id=run_id,\n metadata=metadata,\n mapping_key=step_output_handle.mapping_key,\n config=output_config,\n op_def=job_def.get_node(step.node_handle).definition, # type: ignore # (should be OpDefinition not NodeDefinition)\n dagster_type=output_def.dagster_type,\n log_manager=log_manager,\n version=version,\n step_context=step_context,\n resource_config=resource_config,\n resources=resources,\n asset_info=asset_info,\n warn_on_step_context_use=warn_on_step_context_use,\n )\n\n\ndef step_output_version(\n job_def: "JobDefinition",\n execution_plan: "ExecutionPlan",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n) -> Optional[str]:\n from dagster._core.execution.resolve_versions import resolve_step_output_versions\n\n step_output_versions = resolve_step_output_versions(\n job_def, execution_plan, resolved_run_config\n )\n return (\n step_output_versions[step_output_handle]\n if step_output_handle in step_output_versions\n else None\n )\n\n\n
[docs]def build_output_context(\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n run_id: Optional[str] = None,\n mapping_key: Optional[str] = None,\n config: Optional[Any] = None,\n dagster_type: Optional["DagsterType"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Mapping[str, object]] = None,\n resources: Optional[Mapping[str, object]] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_key: Optional[CoercibleToAssetKey] = None,\n partition_key: Optional[str] = None,\n) -> "OutputContext":\n """Builds output context from provided parameters.\n\n ``build_output_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_output_context`` must be used as a\n context manager.\n\n Args:\n step_key (Optional[str]): The step_key for the compute step that produced the output.\n name (Optional[str]): The name of the output that produced the output.\n metadata (Optional[Mapping[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n version (Optional[str]): (Experimental) The version of the output.\n resource_config (Optional[Mapping[str, Any]]): The resource config to make available from the\n input context. 
This usually corresponds to the config provided to the resource that\n loads the output manager.\n resources (Optional[Resources]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n op_def (Optional[OpDefinition]): The definition of the op that produced the output.\n asset_key: Optional[Union[AssetKey, Sequence[str], str]]: The asset key corresponding to the\n output.\n partition_key: Optional[str]: String value representing partition key to execute with.\n\n Examples:\n .. code-block:: python\n\n build_output_context()\n\n with build_output_context(resources={"foo": context_manager_resource}) as context:\n do_something\n\n """\n from dagster._core.definitions import OpDefinition\n from dagster._core.execution.context_creation_job import initialize_console_manager\n from dagster._core.types.dagster_type import DagsterType\n\n step_key = check.opt_str_param(step_key, "step_key")\n name = check.opt_str_param(name, "name")\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n run_id = check.opt_str_param(run_id, "run_id", default=RUN_ID_PLACEHOLDER)\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n version = check.opt_str_param(version, "version")\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n asset_key = AssetKey.from_coercible(asset_key) if asset_key else None\n partition_key = check.opt_str_param(partition_key, "partition_key")\n\n return OutputContext(\n step_key=step_key,\n name=name,\n job_name=None,\n run_id=run_id,\n metadata=metadata,\n mapping_key=mapping_key,\n config=config,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n 
version=version,\n resource_config=resource_config,\n resources=resources,\n step_context=None,\n op_def=op_def,\n asset_info=AssetOutputInfo(key=asset_key) if asset_key else None,\n partition_key=partition_key,\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/output", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.output"}, "system": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.system

\n"""This module contains the execution context objects that are internal to the system.\nNot every property on these should be exposed to random Jane or Joe dagster user\nso we have a different layer of objects that encode the explicit public API\nin the user_context module.\n"""\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom hashlib import sha256\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.data_version import (\n    DATA_VERSION_TAG,\n    SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD,\n    extract_data_version_from_entry,\n)\nfrom dagster._core.definitions.dependency import OpNode\nfrom dagster._core.definitions.events import AssetKey, AssetLineageInfo\nfrom dagster._core.definitions.hook_definition import HookDefinition\nfrom dagster._core.definitions.job_base import IJob\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition, PartitionsSubset\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.partition_mapping import (\n    PartitionMapping,\n    infer_partition_mapping,\n)\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    
has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.execution.plan.step import ExecutionStep\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.io_manager import IOManager\nfrom dagster._core.storage.tags import (\n    ASSET_PARTITION_RANGE_END_TAG,\n    ASSET_PARTITION_RANGE_START_TAG,\n    MULTIDIMENSIONAL_PARTITION_PREFIX,\n    PARTITION_NAME_TAG,\n)\nfrom dagster._core.system_config.objects import ResolvedRunConfig\nfrom dagster._core.types.dagster_type import DagsterType\n\nfrom .input import InputContext\nfrom .output import OutputContext, get_output_context\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.data_version import (\n        DataVersion,\n    )\n    from dagster._core.definitions.dependency import NodeHandle\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.event_api import EventLogRecord\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.execution.plan.state import KnownExecutionState\n    from dagster._core.instance import DagsterInstance\n\n    from .hook import HookContext\n\n\ndef is_iterable(obj: Any) -> bool:\n    try:\n        iter(obj)\n    except:\n        return False\n    return True\n\n\nclass IPlanContext(ABC):\n    """Context interface to represent run information that does not require access to user code.\n\n    The information available via this interface is accessible to the system throughout a run.\n    """\n\n    @property\n    @abstractmethod\n    def plan_data(self) -> "PlanData":\n        raise 
NotImplementedError()\n\n    @property\n    def job(self) -> IJob:\n        return self.plan_data.job\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        return self.plan_data.dagster_run\n\n    @property\n    def run_id(self) -> str:\n        return self.dagster_run.run_id\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        return self.dagster_run.run_config\n\n    @property\n    def job_name(self) -> str:\n        return self.dagster_run.job_name\n\n    @property\n    def instance(self) -> "DagsterInstance":\n        return self.plan_data.instance\n\n    @property\n    def raise_on_error(self) -> bool:\n        return self.plan_data.raise_on_error\n\n    @property\n    def retry_mode(self) -> RetryMode:\n        return self.plan_data.retry_mode\n\n    @property\n    def execution_plan(self) -> "ExecutionPlan":\n        return self.plan_data.execution_plan\n\n    @property\n    @abstractmethod\n    def output_capture(self) -> Optional[Mapping[StepOutputHandle, Any]]:\n        raise NotImplementedError()\n\n    @property\n    def log(self) -> DagsterLogManager:\n        raise NotImplementedError()\n\n    @property\n    def logging_tags(self) -> Mapping[str, str]:\n        return self.log.logging_metadata.all_tags()\n\n    @property\n    def event_tags(self) -> Mapping[str, str]:\n        return self.log.logging_metadata.event_tags()\n\n    def has_tag(self, key: str) -> bool:\n        check.str_param(key, "key")\n        return key in self.dagster_run.tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        check.str_param(key, "key")\n        return self.dagster_run.tags.get(key)\n\n    @property\n    def run_tags(self) -> Mapping[str, str]:\n        return self.dagster_run.tags\n\n\nclass PlanData(NamedTuple):\n    """The data about a run that is available during both orchestration and execution.\n\n    This object does not contain any information that requires access to user code, such as the\n    pipeline 
definition and resources.\n    """\n\n    job: IJob\n    dagster_run: DagsterRun\n    instance: "DagsterInstance"\n    execution_plan: "ExecutionPlan"\n    raise_on_error: bool = False\n    retry_mode: RetryMode = RetryMode.DISABLED\n\n\nclass ExecutionData(NamedTuple):\n    """The data that is available to the system during execution.\n\n    This object contains information that requires access to user code, such as the pipeline\n    definition and resources.\n    """\n\n    scoped_resources_builder: ScopedResourcesBuilder\n    resolved_run_config: ResolvedRunConfig\n    job_def: JobDefinition\n\n\nclass IStepContext(IPlanContext):\n    """Interface to represent data to be available during either step orchestration or execution."""\n\n    @property\n    @abstractmethod\n    def step(self) -> ExecutionStep:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def node_handle(self) -> "NodeHandle":\n        raise NotImplementedError()\n\n\nclass PlanOrchestrationContext(IPlanContext):\n    """Context for the orchestration of a run.\n\n    This context assumes inability to run user code directly.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n        resume_from_failure: bool = False,\n    ):\n        self._plan_data = plan_data\n        self._log_manager = log_manager\n        self._executor = executor\n        self._output_capture = output_capture\n        self._resume_from_failure = resume_from_failure\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def reconstructable_job(self) -> ReconstructableJob:\n        if not isinstance(self.job, ReconstructableJob):\n            raise DagsterInvariantViolationError(\n                "reconstructable_pipeline property must be a ReconstructableJob"\n            )\n        
return self.job\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def executor(self) -> Executor:\n        return self._executor\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(self, step: ExecutionStep) -> "IStepContext":\n        return StepOrchestrationContext(\n            plan_data=self.plan_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            executor=self.executor,\n            step=step,\n            output_capture=self.output_capture,\n        )\n\n    @property\n    def resume_from_failure(self) -> bool:\n        return self._resume_from_failure\n\n\nclass StepOrchestrationContext(PlanOrchestrationContext, IStepContext):\n    """Context for the orchestration of a step.\n\n    This context assumes inability to run user code directly. Thus, it does not include any resource\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        step: ExecutionStep,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n    ):\n        super(StepOrchestrationContext, self).__init__(\n            plan_data, log_manager, executor, output_capture\n        )\n        self._step = step\n\n    @property\n    def step(self) -> ExecutionStep:\n        return self._step\n\n    @property\n    def node_handle(self) -> "NodeHandle":\n        return self.step.node_handle\n\n\nclass PlanExecutionContext(IPlanContext):\n    """Context for the execution of a plan.\n\n    This context assumes that user code can be run directly, and thus includes resource and\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        execution_data: ExecutionData,\n        log_manager: DagsterLogManager,\n        output_capture: 
Optional[Dict[StepOutputHandle, Any]] = None,\n    ):\n        self._plan_data = plan_data\n        self._execution_data = execution_data\n        self._log_manager = log_manager\n        self._output_capture = output_capture\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(\n        self,\n        step: ExecutionStep,\n        known_state: Optional["KnownExecutionState"] = None,\n    ) -> IStepContext:\n        return StepExecutionContext(\n            plan_data=self.plan_data,\n            execution_data=self._execution_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            step=step,\n            output_capture=self.output_capture,\n            known_state=known_state,\n        )\n\n    @property\n    def job_def(self) -> JobDefinition:\n        return self._execution_data.job_def\n\n    @property\n    def resolved_run_config(self) -> ResolvedRunConfig:\n        return self._execution_data.resolved_run_config\n\n    @property\n    def scoped_resources_builder(self) -> ScopedResourcesBuilder:\n        return self._execution_data.scoped_resources_builder\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def partitions_def(self) -> Optional[PartitionsDefinition]:\n        from dagster._core.definitions.job_definition import JobDefinition\n\n        job_def = self._execution_data.job_def\n        if not isinstance(job_def, JobDefinition):\n            check.failed(\n                "Can only call 'partitions_def', when using jobs, not legacy pipelines",\n            )\n        partitions_def = job_def.partitions_def\n        return partitions_def\n\n    @property\n    def has_partitions(self) -> bool:\n        tags = self._plan_data.dagster_run.tags\n        return bool(\n            
PARTITION_NAME_TAG in tags\n            or any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()])\n            or (\n                tags.get(ASSET_PARTITION_RANGE_START_TAG)\n                and tags.get(ASSET_PARTITION_RANGE_END_TAG)\n            )\n        )\n\n    @property\n    def partition_key(self) -> str:\n        from dagster._core.definitions.multi_dimensional_partitions import (\n            MultiPartitionsDefinition,\n            get_multipartition_key_from_tags,\n        )\n\n        if not self.has_partitions:\n            raise DagsterInvariantViolationError(\n                "Cannot access partition_key for a non-partitioned run"\n            )\n\n        tags = self._plan_data.dagster_run.tags\n        if any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()]):\n            return get_multipartition_key_from_tags(tags)\n        elif PARTITION_NAME_TAG in tags:\n            return tags[PARTITION_NAME_TAG]\n        else:\n            range_start = tags[ASSET_PARTITION_RANGE_START_TAG]\n            range_end = tags[ASSET_PARTITION_RANGE_END_TAG]\n\n            if range_start != range_end:\n                raise DagsterInvariantViolationError(\n                    "Cannot access partition_key for a partitioned run with a range of partitions."\n                    " Call partition_key_range instead."\n                )\n            else:\n                if isinstance(self.partitions_def, MultiPartitionsDefinition):\n                    return self.partitions_def.get_partition_key_from_str(cast(str, range_start))\n                return cast(str, range_start)\n\n    @property\n    def asset_partition_key_range(self) -> PartitionKeyRange:\n        from dagster._core.definitions.multi_dimensional_partitions import (\n            MultiPartitionsDefinition,\n            get_multipartition_key_from_tags,\n        )\n\n        if not self.has_partitions:\n            raise DagsterInvariantViolationError(\n     
           "Cannot access partition_key for a non-partitioned run"\n            )\n\n        tags = self._plan_data.dagster_run.tags\n        if any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()]):\n            multipartition_key = get_multipartition_key_from_tags(tags)\n            return PartitionKeyRange(multipartition_key, multipartition_key)\n        elif PARTITION_NAME_TAG in tags:\n            partition_key = tags[PARTITION_NAME_TAG]\n            return PartitionKeyRange(partition_key, partition_key)\n        else:\n            partition_key_range_start = tags[ASSET_PARTITION_RANGE_START_TAG]\n            if partition_key_range_start is not None:\n                if isinstance(self.partitions_def, MultiPartitionsDefinition):\n                    return PartitionKeyRange(\n                        self.partitions_def.get_partition_key_from_str(partition_key_range_start),\n                        self.partitions_def.get_partition_key_from_str(\n                            tags[ASSET_PARTITION_RANGE_END_TAG]\n                        ),\n                    )\n            return PartitionKeyRange(partition_key_range_start, tags[ASSET_PARTITION_RANGE_END_TAG])\n\n    @property\n    def partition_time_window(self) -> TimeWindow:\n        partitions_def = self.partitions_def\n\n        if partitions_def is None:\n            raise DagsterInvariantViolationError("Partitions definition is not defined")\n\n        if not has_one_dimension_time_window_partitioning(partitions_def=partitions_def):\n            raise DagsterInvariantViolationError(\n                "Expected a TimeWindowPartitionsDefinition or MultiPartitionsDefinition with a"\n                f" single time dimension, but instead found {type(partitions_def)}"\n            )\n\n        if self.has_partition_key:\n            return cast(\n                Union[MultiPartitionsDefinition, TimeWindowPartitionsDefinition], partitions_def\n            
).time_window_for_partition_key(self.partition_key)\n        elif self.has_partition_key_range:\n            partition_key_range = self.asset_partition_key_range\n            partitions_def = cast(\n                Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def\n            )\n            return TimeWindow(\n                partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n                partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n            )\n\n        else:\n            check.failed(\n                "Has a PartitionsDefinition, so should either have a partition key or a partition"\n                " key range"\n            )\n\n    @property\n    def has_partition_key(self) -> bool:\n        return PARTITION_NAME_TAG in self._plan_data.dagster_run.tags\n\n    @property\n    def has_partition_key_range(self) -> bool:\n        return ASSET_PARTITION_RANGE_START_TAG in self._plan_data.dagster_run.tags\n\n    def for_type(self, dagster_type: DagsterType) -> "TypeCheckContext":\n        return TypeCheckContext(\n            self.run_id, self.log, self._execution_data.scoped_resources_builder, dagster_type\n        )\n\n\n@dataclass\nclass InputAssetVersionInfo:\n    # This is the storage id of the last materialization of any partition of an asset. Thus it is\n    # computed the same way for both partitioned and non-partitioned assets.\n    storage_id: int\n\n    # If the input asset is partitioned, this is a hash of the sorted data versions of each dependency\n    # partition. If the input asset is not partitioned, this is the data version of the asset. It\n    # can be none if we are sourcing a materialization from before data versions.\n    data_version: Optional["DataVersion"]\n\n    # This is the run_id on the event that the storage_id references\n    run_id: str\n\n    # This is the timestamp on the event that the storage_id references\n    timestamp: float\n\n\n
[docs]class StepExecutionContext(PlanExecutionContext, IStepContext):\n """Context for the execution of a step. Users should not instantiate this class directly.\n\n This context assumes that user code can be run directly, and thus includes resource and information.\n """\n\n def __init__(\n self,\n plan_data: PlanData,\n execution_data: ExecutionData,\n log_manager: DagsterLogManager,\n step: ExecutionStep,\n output_capture: Optional[Dict[StepOutputHandle, Any]],\n known_state: Optional["KnownExecutionState"],\n ):\n from dagster._core.execution.resources_init import get_required_resource_keys_for_step\n\n super(StepExecutionContext, self).__init__(\n plan_data=plan_data,\n execution_data=execution_data,\n log_manager=log_manager,\n output_capture=output_capture,\n )\n self._step = step\n self._required_resource_keys = get_required_resource_keys_for_step(\n plan_data.job.get_definition(),\n step,\n plan_data.execution_plan,\n )\n self._resources = execution_data.scoped_resources_builder.build(\n self._required_resource_keys\n )\n self._known_state = known_state\n self._input_lineage: List[AssetLineageInfo] = []\n\n resources_iter = cast(Iterable, self._resources)\n\n step_launcher_resources = [\n resource for resource in resources_iter if isinstance(resource, StepLauncher)\n ]\n\n self._step_launcher: Optional[StepLauncher] = None\n if len(step_launcher_resources) > 1:\n raise DagsterInvariantViolationError(\n "Multiple required resources for {described_op} have inherited StepLauncher"\n "There should be at most one step launcher resource per {node_type}.".format(\n described_op=self.describe_op(), node_type=self.op_def.node_type_str\n )\n )\n elif len(step_launcher_resources) == 1:\n self._step_launcher = step_launcher_resources[0]\n\n self._step_exception: Optional[BaseException] = None\n\n self._step_output_capture: Optional[Dict[StepOutputHandle, Any]] = None\n # Enable step output capture if there are any hooks which will receive them.\n # Expect in the 
future that hooks may control whether or not they get outputs,\n # but for now presence of any will cause output capture.\n if self.job_def.get_all_hooks_for_handle(self.node_handle):\n self._step_output_capture = {}\n\n self._output_metadata: Dict[str, Any] = {}\n self._seen_outputs: Dict[str, Union[str, Set[str]]] = {}\n\n self._input_asset_version_info: Dict[AssetKey, Optional["InputAssetVersionInfo"]] = {}\n self._is_external_input_asset_version_info_loaded = False\n self._data_version_cache: Dict[AssetKey, "DataVersion"] = {}\n\n self._requires_typed_event_stream = False\n self._typed_event_stream_error_message = None\n\n # In this mode no conversion is done on returned values and missing but expected outputs are not\n # allowed.\n @property\n def requires_typed_event_stream(self) -> bool:\n return self._requires_typed_event_stream\n\n @property\n def typed_event_stream_error_message(self) -> Optional[str]:\n return self._typed_event_stream_error_message\n\n # Error message will be appended to the default error message.\n def set_requires_typed_event_stream(self, *, error_message: Optional[str] = None):\n self._requires_typed_event_stream = True\n self._typed_event_stream_error_message = error_message\n\n @property\n def step(self) -> ExecutionStep:\n return self._step\n\n @property\n def node_handle(self) -> "NodeHandle":\n return self.step.node_handle\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n return self._step_launcher\n\n @property\n def op_def(self) -> OpDefinition:\n return self.op.definition\n\n @property\n def job_def(self) -> "JobDefinition":\n return self._execution_data.job_def\n\n @property\n def op(self) -> OpNode:\n return self.job_def.get_op(self._step.node_handle)\n\n @property\n def op_retry_policy(self) -> Optional[RetryPolicy]:\n 
return self.job_def.get_retry_policy_for_handle(self.node_handle)\n\n def describe_op(self) -> str:\n return f'op "{self.node_handle}"'\n\n def get_io_manager(self, step_output_handle: StepOutputHandle) -> IOManager:\n step_output = self.execution_plan.get_step_output(step_output_handle)\n io_manager_key = (\n self.job_def.get_node(step_output.node_handle)\n .output_def_named(step_output.name)\n .io_manager_key\n )\n\n output_manager = getattr(self.resources, io_manager_key)\n return check.inst(output_manager, IOManager)\n\n def get_output_context(self, step_output_handle: StepOutputHandle) -> OutputContext:\n return get_output_context(\n self.execution_plan,\n self.job_def,\n self.resolved_run_config,\n step_output_handle,\n self._get_source_run_id(step_output_handle),\n log_manager=self.log,\n step_context=self,\n resources=None,\n version=self.execution_plan.get_version_for_step_output_handle(step_output_handle),\n )\n\n def for_input_manager(\n self,\n name: str,\n config: Any,\n metadata: Any,\n dagster_type: DagsterType,\n source_handle: Optional[StepOutputHandle] = None,\n resource_config: Any = None,\n resources: Optional["Resources"] = None,\n artificial_output_context: Optional["OutputContext"] = None,\n ) -> InputContext:\n if source_handle and artificial_output_context:\n check.failed("Cannot specify both source_handle and artificial_output_context.")\n\n upstream_output: Optional[OutputContext] = None\n\n if source_handle is not None:\n version = self.execution_plan.get_version_for_step_output_handle(source_handle)\n\n # NOTE: this is using downstream step_context for upstream OutputContext. 
step_context\n # will be set to None for 0.15 release.\n upstream_output = get_output_context(\n self.execution_plan,\n self.job_def,\n self.resolved_run_config,\n source_handle,\n self._get_source_run_id(source_handle),\n log_manager=self.log,\n step_context=self,\n resources=None,\n version=version,\n warn_on_step_context_use=True,\n )\n else:\n upstream_output = artificial_output_context\n\n asset_key = self.job_def.asset_layer.asset_key_for_input(\n node_handle=self.node_handle, input_name=name\n )\n asset_partitions_subset = (\n self.asset_partitions_subset_for_input(name)\n if self.has_asset_partitions_for_input(name)\n else None\n )\n\n asset_partitions_def = (\n self.job_def.asset_layer.partitions_def_for_asset(asset_key) if asset_key else None\n )\n return InputContext(\n job_name=self.job_def.name,\n name=name,\n op_def=self.op_def,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=self.log,\n step_context=self,\n resource_config=resource_config,\n resources=resources,\n asset_key=asset_key,\n asset_partitions_subset=asset_partitions_subset,\n asset_partitions_def=asset_partitions_def,\n instance=self.instance,\n )\n\n def for_hook(self, hook_def: HookDefinition) -> "HookContext":\n from .hook import HookContext\n\n return HookContext(self, hook_def)\n\n def get_known_state(self) -> "KnownExecutionState":\n if not self._known_state:\n check.failed(\n "Attempted to access KnownExecutionState but it was not provided at context"\n " creation"\n )\n return self._known_state\n\n def can_load(\n self,\n step_output_handle: StepOutputHandle,\n ) -> bool:\n # can load from upstream in the same run\n if step_output_handle in self.get_known_state().ready_outputs:\n return True\n\n if (\n self._should_load_from_previous_runs(step_output_handle)\n # should and can load from a previous run\n and self._get_source_run_id_from_logs(step_output_handle)\n ):\n return True\n\n return False\n\n def 
observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n if mapping_key:\n if output_name not in self._seen_outputs:\n self._seen_outputs[output_name] = set()\n cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n else:\n self._seen_outputs[output_name] = "seen"\n\n def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n if mapping_key:\n return (\n output_name in self._seen_outputs and mapping_key in self._seen_outputs[output_name]\n )\n return output_name in self._seen_outputs\n\n def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n if output_name is None and len(self.op_def.output_defs) == 1:\n output_def = self.op_def.output_defs[0]\n output_name = output_def.name\n elif output_name is None:\n raise DagsterInvariantViolationError(\n "Attempted to log metadata without providing output_name, but multiple outputs"\n " exist. Please provide an output_name to the invocation of"\n " `context.add_output_metadata`."\n )\n else:\n output_def = self.op_def.output_def_named(output_name)\n\n if self.has_seen_output(output_name, mapping_key):\n output_desc = (\n f"output '{output_def.name}'"\n if not mapping_key\n else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n )\n raise DagsterInvariantViolationError(\n f"In {self.op_def.node_type_str} '{self.op.name}', attempted to log output"\n f" metadata for {output_desc} which has already been yielded. Metadata must be"\n " logged before the output is yielded."\n )\n if output_def.is_dynamic and not mapping_key:\n raise DagsterInvariantViolationError(\n f"In {self.op_def.node_type_str} '{self.op.name}', attempted to log metadata"\n f" for dynamic output '{output_def.name}' without providing a mapping key. 
When"\n " logging metadata for a dynamic output, it is necessary to provide a mapping key."\n )\n\n if mapping_key:\n if output_name not in self._output_metadata:\n self._output_metadata[output_name] = {}\n if mapping_key in self._output_metadata[output_name]:\n self._output_metadata[output_name][mapping_key].update(metadata)\n else:\n self._output_metadata[output_name][mapping_key] = metadata\n else:\n if output_name in self._output_metadata:\n self._output_metadata[output_name].update(metadata)\n else:\n self._output_metadata[output_name] = metadata\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n metadata = self._output_metadata.get(output_name)\n if mapping_key and metadata:\n return metadata.get(mapping_key)\n return metadata\n\n def _get_source_run_id_from_logs(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n # walk through event logs to find the right run_id based on the run lineage\n\n parent_state = self.get_known_state().parent_state\n while parent_state:\n # if the parent run has yielded an StepOutput event for the given step output,\n # we find the source run id\n if step_output_handle in parent_state.produced_outputs:\n return parent_state.run_id\n\n # else, keep looking backwards\n parent_state = parent_state.get_parent_state()\n\n # When a fixed path is provided via io manager, it's able to run step subset using an execution\n # plan when the ascendant outputs were not previously created by dagster-controlled\n # computations. for example, in backfills, with fixed path io manager, we allow users to\n # "re-execute" runs with steps where the outputs weren't previously stored by dagster.\n\n # Warn about this special case because it will also reach here when all previous runs have\n # skipped yielding this output. 
From the logs, we have no easy way to differentiate the fixed\n # path case and the skipping case, until we record the skipping info in KnownExecutionState,\n # i.e. resolve https://github.com/dagster-io/dagster/issues/3511\n self.log.warning(\n f"No previously stored outputs found for source {step_output_handle}. "\n "This is either because you are using an IO Manager that does not depend on run ID, "\n "or because all the previous runs have skipped the output in conditional execution."\n )\n return None\n\n def _should_load_from_previous_runs(self, step_output_handle: StepOutputHandle) -> bool:\n # should not load if not a re-execution\n if self.dagster_run.parent_run_id is None:\n return False\n # should not load if re-executing the entire pipeline\n if self.dagster_run.step_keys_to_execute is None:\n return False\n\n # should not load if the entire dynamic step is being executed in the current run\n handle = StepHandle.parse_from_key(step_output_handle.step_key)\n if (\n isinstance(handle, ResolvedFromDynamicStepHandle)\n and handle.unresolved_form.to_key() in self.dagster_run.step_keys_to_execute\n ):\n return False\n\n # should not load if this step is being executed in the current run\n return step_output_handle.step_key not in self.dagster_run.step_keys_to_execute\n\n def _get_source_run_id(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n if self._should_load_from_previous_runs(step_output_handle):\n return self._get_source_run_id_from_logs(step_output_handle)\n else:\n return self.dagster_run.run_id\n\n def capture_step_exception(self, exception: BaseException):\n self._step_exception = check.inst_param(exception, "exception", BaseException)\n\n @property\n def step_exception(self) -> Optional[BaseException]:\n return self._step_exception\n\n @property\n def step_output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n return self._step_output_capture\n\n @property\n def previous_attempt_count(self) -> int:\n return 
self.get_known_state().get_retry_state().get_attempt_count(self._step.key)\n\n @property\n def op_config(self) -> Any:\n op_config = self.resolved_run_config.ops.get(str(self.node_handle))\n return op_config.config if op_config else None\n\n @property\n def is_op_in_graph(self) -> bool:\n """Whether this step corresponds to an op within a graph (either @graph, or @graph_asset)."""\n return self.step.node_handle.parent is not None\n\n @property\n def is_sda_step(self) -> bool:\n """Whether this step corresponds to a software define asset, inferred by presence of asset info on outputs.\n\n note: ops can materialize assets as well.\n """\n for output in self.step.step_outputs:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, output.name\n )\n if asset_info is not None:\n return True\n return False\n\n def set_data_version(self, asset_key: AssetKey, data_version: "DataVersion") -> None:\n self._data_version_cache[asset_key] = data_version\n\n def has_data_version(self, asset_key: AssetKey) -> bool:\n return asset_key in self._data_version_cache\n\n def get_data_version(self, asset_key: AssetKey) -> "DataVersion":\n return self._data_version_cache[asset_key]\n\n @property\n def input_asset_records(self) -> Optional[Mapping[AssetKey, Optional["InputAssetVersionInfo"]]]:\n return self._input_asset_version_info\n\n @property\n def is_external_input_asset_version_info_loaded(self) -> bool:\n return self._is_external_input_asset_version_info_loaded\n\n def get_input_asset_version_info(self, key: AssetKey) -> Optional["InputAssetVersionInfo"]:\n if key not in self._input_asset_version_info:\n self._fetch_input_asset_version_info(key)\n return self._input_asset_version_info[key]\n\n # "external" refers to records for inputs generated outside of this step\n def fetch_external_input_asset_version_info(self) -> None:\n output_keys = self.get_output_asset_keys()\n\n all_dep_keys: List[AssetKey] = []\n for output_key in output_keys:\n if output_key 
not in self.job_def.asset_layer.asset_deps:\n continue\n dep_keys = self.job_def.asset_layer.upstream_assets_for_asset(output_key)\n for key in dep_keys:\n if key not in all_dep_keys and key not in output_keys:\n all_dep_keys.append(key)\n\n self._input_asset_version_info = {}\n for key in all_dep_keys:\n self._fetch_input_asset_version_info(key)\n self._is_external_input_asset_version_info_loaded = True\n\n def _fetch_input_asset_version_info(self, key: AssetKey) -> None:\n from dagster._core.definitions.data_version import (\n extract_data_version_from_entry,\n )\n\n event = self._get_input_asset_event(key)\n if event is None:\n self._input_asset_version_info[key] = None\n else:\n storage_id = event.storage_id\n # Input name will be none if this is an internal dep\n input_name = self.job_def.asset_layer.input_for_asset_key(self.node_handle, key)\n # Exclude AllPartitionMapping for now to avoid huge queries\n if input_name and self.has_asset_partitions_for_input(input_name):\n subset = self.asset_partitions_subset_for_input(\n input_name, require_valid_partitions=False\n )\n input_keys = list(subset.get_partition_keys())\n\n # This check represents a temporary constraint that prevents huge query results for upstream\n # partition data versions from timing out runs. If a partitioned dependency (a) uses an\n # AllPartitionMapping; and (b) has greater than or equal to\n # SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD dependency partitions, then we\n # process it as a non-partitioned dependency (note that this was the behavior for\n # all partition dependencies prior to 2023-08). This means that stale status\n # results cannot be accurately computed for the dependency, and there is thus\n # corresponding logic in the CachingStaleStatusResolver to account for this. 
This\n # constraint should be removed when we have thoroughly examined the performance of\n # the data version retrieval query and can guarantee decent performance.\n if len(input_keys) < SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD:\n data_version = self._get_partitions_data_version_from_keys(key, input_keys)\n else:\n data_version = extract_data_version_from_entry(event.event_log_entry)\n else:\n data_version = extract_data_version_from_entry(event.event_log_entry)\n self._input_asset_version_info[key] = InputAssetVersionInfo(\n storage_id, data_version, event.run_id, event.timestamp\n )\n\n def partition_mapping_for_input(self, input_name: str) -> Optional[PartitionMapping]:\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n if upstream_asset_key:\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n assets_def = asset_layer.assets_def_for_node(self.node_handle)\n partitions_def = assets_def.partitions_def if assets_def else None\n explicit_partition_mapping = self.job_def.asset_layer.partition_mapping_for_node_input(\n self.node_handle, upstream_asset_key\n )\n return infer_partition_mapping(\n explicit_partition_mapping,\n partitions_def,\n upstream_asset_partitions_def,\n )\n else:\n return None\n\n def _get_input_asset_event(self, key: AssetKey) -> Optional["EventLogRecord"]:\n event = self.instance.get_latest_data_version_record(key)\n if event:\n self._check_input_asset_event(key, event)\n return event\n\n def _check_input_asset_event(self, key: AssetKey, event: "EventLogRecord") -> None:\n assert event.event_log_entry\n event_data_version = extract_data_version_from_entry(event.event_log_entry)\n if key in self._data_version_cache and self._data_version_cache[key] != event_data_version:\n self.log.warning(\n f"Data version mismatch for asset {key}. 
Data version from materialization within"\n f" current step is `{self._data_version_cache[key]}`. Data version from most recent"\n f" materialization is `{event_data_version}`. Most recent materialization will be"\n " used for provenance tracking."\n )\n\n def _get_partitions_data_version_from_keys(\n self, key: AssetKey, partition_keys: Sequence[str]\n ) -> "DataVersion":\n from dagster._core.definitions.data_version import (\n DataVersion,\n )\n from dagster._core.events import DagsterEventType\n\n # TODO: this needs to account for observations also\n event_type = DagsterEventType.ASSET_MATERIALIZATION\n tags_by_partition = (\n self.instance._event_storage.get_latest_tags_by_partition( # noqa: SLF001\n key, event_type, [DATA_VERSION_TAG], asset_partitions=list(partition_keys)\n )\n )\n partition_data_versions = [\n pair[1][DATA_VERSION_TAG]\n for pair in sorted(tags_by_partition.items(), key=lambda x: x[0])\n ]\n hash_sig = sha256()\n hash_sig.update(bytearray("".join(partition_data_versions), "utf8"))\n return DataVersion(hash_sig.hexdigest())\n\n # Call this to clear the cache for an input asset record. This is necessary when an old\n # materialization for an asset was loaded during `fetch_external_input_asset_records` because an\n # intrastep asset is not required, but then that asset is materialized during the step. 
If we\n # don't clear the cache for this asset, then we won't use the most up-to-date asset record.\n def wipe_input_asset_version_info(self, key: AssetKey) -> None:\n if key in self._input_asset_version_info:\n del self._input_asset_version_info[key]\n\n def get_output_asset_keys(self) -> AbstractSet[AssetKey]:\n output_keys: Set[AssetKey] = set()\n for step_output in self.step.step_outputs:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, step_output.name\n )\n if asset_info is None or not asset_info.is_required:\n continue\n output_keys.add(asset_info.key)\n return output_keys\n\n def has_asset_partitions_for_input(self, input_name: str) -> bool:\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n return (\n upstream_asset_key is not None\n and asset_layer.partitions_def_for_asset(upstream_asset_key) is not None\n )\n\n def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n subset = self.asset_partitions_subset_for_input(input_name)\n partition_key_ranges = subset.get_partition_key_ranges(\n dynamic_partitions_store=self.instance\n )\n\n if len(partition_key_ranges) != 1:\n check.failed(\n "Tried to access asset partition key range, but there are "\n f"({len(partition_key_ranges)}) key ranges associated with this input.",\n )\n\n return partition_key_ranges[0]\n\n def asset_partitions_subset_for_input(\n self, input_name: str, *, require_valid_partitions: bool = True\n ) -> PartitionsSubset:\n asset_layer = self.job_def.asset_layer\n assets_def = asset_layer.assets_def_for_node(self.node_handle)\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n if upstream_asset_key is not None:\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n\n if upstream_asset_partitions_def is not None:\n partitions_def = assets_def.partitions_def if assets_def else 
None\n partitions_subset = (\n partitions_def.empty_subset().with_partition_key_range(\n self.asset_partition_key_range, dynamic_partitions_store=self.instance\n )\n if partitions_def\n else None\n )\n partition_mapping = infer_partition_mapping(\n asset_layer.partition_mapping_for_node_input(\n self.node_handle, upstream_asset_key\n ),\n partitions_def,\n upstream_asset_partitions_def,\n )\n mapped_partitions_result = (\n partition_mapping.get_upstream_mapped_partitions_result_for_partitions(\n partitions_subset,\n upstream_asset_partitions_def,\n dynamic_partitions_store=self.instance,\n )\n )\n\n if (\n require_valid_partitions\n and mapped_partitions_result.required_but_nonexistent_partition_keys\n ):\n raise DagsterInvariantViolationError(\n f"Partition key range {self.asset_partition_key_range} in"\n f" {self.node_handle.name} depends on invalid partition keys"\n f" {mapped_partitions_result.required_but_nonexistent_partition_keys} in"\n f" upstream asset {upstream_asset_key}"\n )\n\n return mapped_partitions_result.partitions_subset\n\n check.failed("The input has no asset partitions")\n\n def asset_partition_key_for_input(self, input_name: str) -> str:\n start, end = self.asset_partition_key_range_for_input(input_name)\n if start == end:\n return start\n else:\n check.failed(\n f"Tried to access partition key for input '{input_name}' of step '{self.step.key}',"\n f" but the step input has a partition range: '{start}' to '{end}'."\n )\n\n def _partitions_def_for_output(self, output_name: str) -> Optional[PartitionsDefinition]:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n node_handle=self.node_handle, output_name=output_name\n )\n if asset_info:\n return asset_info.partitions_def\n else:\n return None\n\n def partitions_def_for_output(self, output_name: str) -> Optional[PartitionsDefinition]:\n return self._partitions_def_for_output(output_name)\n\n def has_asset_partitions_for_output(self, output_name: str) -> bool:\n return 
self._partitions_def_for_output(output_name) is not None\n\n def asset_partition_key_range_for_output(self, output_name: str) -> PartitionKeyRange:\n if self._partitions_def_for_output(output_name) is not None:\n return self.asset_partition_key_range\n\n check.failed("The output has no asset partitions")\n\n def asset_partition_key_for_output(self, output_name: str) -> str:\n start, end = self.asset_partition_key_range_for_output(output_name)\n if start == end:\n return start\n else:\n check.failed(\n f"Tried to access partition key for output '{output_name}' of step"\n f" '{self.step.key}', but the step output has a partition range: '{start}' to"\n f" '{end}'."\n )\n\n def asset_partitions_time_window_for_output(self, output_name: str) -> TimeWindow:\n """The time window for the partitions of the asset correponding to the given output.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n partitions_def = self._partitions_def_for_output(output_name)\n\n if not partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an output that does not correspond to a "\n "partitioned asset."\n )\n\n if not has_one_dimension_time_window_partitioning(partitions_def):\n raise ValueError(\n "Tried to get asset partitions for an output that correponds to a partitioned "\n "asset that is not time-partitioned."\n )\n\n partitions_def = cast(\n Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def\n )\n partition_key_range = self.asset_partition_key_range_for_output(output_name)\n return TimeWindow(\n # mypy thinks partitions_def is <nothing> here because ????\n partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n )\n\n def 
asset_partitions_time_window_for_input(self, input_name: str) -> TimeWindow:\n """The time window for the partitions of the asset correponding to the given input.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n if upstream_asset_key is None:\n raise ValueError("The input has no corresponding asset")\n\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n\n if not upstream_asset_partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an input that does not correspond to a "\n "partitioned asset."\n )\n\n if not has_one_dimension_time_window_partitioning(upstream_asset_partitions_def):\n raise ValueError(\n "Tried to get asset partitions for an input that correponds to a partitioned "\n "asset that is not time-partitioned."\n )\n\n upstream_asset_partitions_def = cast(\n Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition],\n upstream_asset_partitions_def,\n )\n partition_key_range = self.asset_partition_key_range_for_input(input_name)\n\n return TimeWindow(\n upstream_asset_partitions_def.time_window_for_partition_key(\n partition_key_range.start\n ).start,\n upstream_asset_partitions_def.time_window_for_partition_key(\n partition_key_range.end\n ).end,\n )\n\n def get_type_loader_context(self) -> "DagsterTypeLoaderContext":\n return DagsterTypeLoaderContext(\n plan_data=self.plan_data,\n execution_data=self._execution_data,\n log_manager=self._log_manager,\n step=self.step,\n output_capture=self._output_capture,\n known_state=self._known_state,\n )\n\n def output_observes_source_asset(self, output_name: str) -> bool:\n """Returns True if this step observes a source 
asset."""\n asset_layer = self.job_def.asset_layer\n if asset_layer is None:\n return False\n asset_key = asset_layer.asset_key_for_output(self.node_handle, output_name)\n if asset_key is None:\n return False\n return asset_layer.is_observable_for_asset(asset_key)
\n\n\n
[docs]class TypeCheckContext:\n """The ``context`` object available to a type check function on a DagsterType."""\n\n def __init__(\n self,\n run_id: str,\n log_manager: DagsterLogManager,\n scoped_resources_builder: ScopedResourcesBuilder,\n dagster_type: DagsterType,\n ):\n self._run_id = run_id\n self._log = log_manager\n self._resources = scoped_resources_builder.build(dagster_type.required_resource_keys)\n\n @public\n @property\n def resources(self) -> "Resources":\n """An object whose attributes contain the resources available to this op."""\n return self._resources\n\n @public\n @property\n def run_id(self) -> str:\n """The id of this job run."""\n return self._run_id\n\n @public\n @property\n def log(self) -> DagsterLogManager:\n """Centralized log dispatch from user code."""\n return self._log
\n\n\n
[docs]class DagsterTypeLoaderContext(StepExecutionContext):\n """The context object provided to a :py:class:`@dagster_type_loader <dagster_type_loader>`-decorated function during execution.\n\n Users should not construct this object directly.\n """\n\n @public\n @property\n def resources(self) -> "Resources":\n """The resources available to the type loader, specified by the `required_resource_keys` argument of the decorator."""\n return super(DagsterTypeLoaderContext, self).resources\n\n @public\n @property\n def job_def(self) -> "JobDefinition":\n """The underlying job definition being executed."""\n return super(DagsterTypeLoaderContext, self).job_def\n\n @public\n @property\n def op_def(self) -> "OpDefinition":\n """The op for which type loading is occurring."""\n return super(DagsterTypeLoaderContext, self).op_def
\n
", "current_page_name": "_modules/dagster/_core/execution/context/system", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.system"}}, "execute_in_process_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.execute_in_process_result

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import JobDefinition, NodeHandle\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nfrom .execution_result import ExecutionResult\n\n\n
[docs]class ExecuteInProcessResult(ExecutionResult):\n """Result object returned by in-process testing APIs.\n\n Users should not instantiate this object directly. Used for retrieving run success, events, and outputs from execution methods that return this object.\n\n This object is returned by:\n - :py:meth:`dagster.GraphDefinition.execute_in_process`\n - :py:meth:`dagster.JobDefinition.execute_in_process`\n - :py:meth:`dagster.materialize_to_memory`\n - :py:meth:`dagster.materialize`\n """\n\n _handle: NodeHandle\n _event_list: Sequence[DagsterEvent]\n _dagster_run: DagsterRun\n _output_capture: Mapping[StepOutputHandle, Any]\n _job_def: JobDefinition\n\n def __init__(\n self,\n event_list: Sequence[DagsterEvent],\n dagster_run: DagsterRun,\n output_capture: Optional[Mapping[StepOutputHandle, Any]],\n job_def: JobDefinition,\n ):\n self._job_def = job_def\n\n self._event_list = event_list\n self._dagster_run = dagster_run\n\n self._output_capture = check.opt_mapping_param(\n output_capture, "output_capture", key_type=StepOutputHandle\n )\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The job definition that was executed."""\n return self._job_def\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: The Dagster run that was executed."""\n return self._dagster_run\n\n @public\n @property\n def all_events(self) -> Sequence[DagsterEvent]:\n """List[DagsterEvent]: All dagster events emitted during execution."""\n return self._event_list\n\n @public\n @property\n def run_id(self) -> str:\n """str: The run ID of the executed :py:class:`DagsterRun`."""\n return self.dagster_run.run_id\n\n def _get_output_for_handle(self, handle: NodeHandle, output_name: str) -> Any:\n mapped_outputs = {}\n step_key = str(handle)\n output_found = False\n for step_output_handle, value in self._output_capture.items():\n # For the mapped output case, where step keys are in the format\n # "step_key[upstream_mapped_output_name]" 
within the step output handle.\n if (\n step_output_handle.step_key.startswith(f"{step_key}[")\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n key_start = step_output_handle.step_key.find("[")\n key_end = step_output_handle.step_key.find("]")\n upstream_mapped_output_name = step_output_handle.step_key[key_start + 1 : key_end]\n mapped_outputs[upstream_mapped_output_name] = value\n\n # For all other cases, search for exact match.\n elif (\n step_key == step_output_handle.step_key\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n if not step_output_handle.mapping_key:\n return self._output_capture[step_output_handle]\n mapped_outputs[step_output_handle.mapping_key] = value\n\n if not output_found:\n raise DagsterInvariantViolationError(\n f"No outputs found for output '{output_name}' from node '{handle}'."\n )\n return mapped_outputs\n\n
[docs] @public\n def output_for_node(self, node_str: str, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the in-process run of the job.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(ExecuteInProcessResult, self).output_for_node(\n node_str, output_name=output_name\n )
\n\n
[docs] @public\n def asset_value(self, asset_key: CoercibleToAssetKey) -> Any:\n """Retrieves the value of an asset that was materialized during the execution of the job.\n\n Args:\n asset_key (CoercibleToAssetKey): The key of the asset to retrieve.\n\n Returns:\n Any: The value of the retrieved asset.\n """\n node_output_handle = self._job_def.asset_layer.node_output_handle_for_asset(\n AssetKey.from_coercible(asset_key)\n )\n return self.output_for_node(\n node_str=str(node_output_handle.node_handle), output_name=node_output_handle.output_name\n )
\n\n
[docs] @public\n def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(ExecuteInProcessResult, self).output_value(output_name=output_name)
\n
", "current_page_name": "_modules/dagster/_core/execution/execute_in_process_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.execute_in_process_result"}, "job_execution_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.job_execution_result

\nfrom typing import Any, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import JobDefinition, NodeHandle\nfrom dagster._core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.plan.utils import build_resources_for_manager\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nfrom .execution_result import ExecutionResult\n\n\n
[docs]class JobExecutionResult(ExecutionResult):\n """Result object returned by :py:func:`dagster.execute_job`.\n\n Used for retrieving run success, events, and outputs from `execute_job`.\n Users should not directly instantiate this class.\n\n Events and run information can be retrieved off of the object directly. In\n order to access outputs, the `ExecuteJobResult` object needs to be opened\n as a context manager, which will re-initialize the resources from\n execution.\n """\n\n def __init__(self, job_def, reconstruct_context, event_list, dagster_run):\n self._job_def = job_def\n self._reconstruct_context = reconstruct_context\n self._context = None\n self._event_list = event_list\n self._dagster_run = dagster_run\n\n def __enter__(self) -> "JobExecutionResult":\n context = self._reconstruct_context.__enter__()\n self._context = context\n return self\n\n def __exit__(self, *exc):\n exit_result = self._reconstruct_context.__exit__(*exc)\n self._context = None\n return exit_result\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The job definition that was executed."""\n return self._job_def\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: The Dagster run that was executed."""\n return self._dagster_run\n\n @public\n @property\n def all_events(self) -> Sequence[DagsterEvent]:\n """Sequence[DagsterEvent]: List of all events yielded by the job execution."""\n return self._event_list\n\n @public\n @property\n def run_id(self) -> str:\n """str: The id of the Dagster run that was executed."""\n return self.dagster_run.run_id\n\n
[docs] @public\n def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n In order to use this method, the `ExecuteJobResult` object must be opened as a context manager. If this method is used without opening the context manager, it will result in a :py:class:`DagsterInvariantViolationError`. If the top-level job has no output, calling this method will also result in a :py:class:`DagsterInvariantViolationError`.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(JobExecutionResult, self).output_value(output_name=output_name)
\n\n
[docs] @public\n def output_for_node(self, node_str: str, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the run of the job.\n\n In order to use this method, the `ExecuteJobResult` object must be opened as a context manager. If this method is used without opening the context manager, it will result in a :py:class:`DagsterInvariantViolationError`.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(JobExecutionResult, self).output_for_node(node_str, output_name=output_name)
\n\n def _get_output_for_handle(self, handle: NodeHandle, output_name: str) -> Any:\n if not self._context:\n raise DagsterInvariantViolationError(\n "In order to access output objects, the result of `execute_job` must be opened as a"\n " context manager: 'with execute_job(...) as result:"\n )\n found = False\n result = None\n for compute_step_event in self.compute_events_for_handle(handle):\n if (\n compute_step_event.is_successful_output\n and compute_step_event.step_output_data.output_name == output_name\n ):\n found = True\n output = compute_step_event.step_output_data\n step = self._context.execution_plan.get_step_by_key(compute_step_event.step_key)\n dagster_type = (\n self.job_def.get_node(handle).output_def_named(output_name).dagster_type\n )\n value = self._get_value(self._context.for_step(step), output, dagster_type)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if result is None:\n result = {mapping_key: value}\n else:\n result[mapping_key] = (\n value # pylint:disable=unsupported-assignment-operation\n )\n else:\n result = value\n\n if found:\n return result\n\n node = self.job_def.get_node(handle)\n raise DagsterInvariantViolationError(\n f"Did not find result {output_name} in {node.describe_node()}"\n )\n\n def _get_value(self, context, step_output_data, dagster_type):\n step_output_handle = step_output_data.step_output_handle\n manager = context.get_io_manager(step_output_handle)\n manager_key = context.execution_plan.get_manager_key(step_output_handle, self.job_def)\n res = manager.load_input(\n context.for_input_manager(\n name=None,\n config=None,\n metadata=None,\n dagster_type=dagster_type,\n source_handle=step_output_handle,\n resource_config=context.resolved_run_config.resources[manager_key].config,\n resources=build_resources_for_manager(manager_key, context),\n 
)\n )\n return res
\n
", "current_page_name": "_modules/dagster/_core/execution/job_execution_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.job_execution_result"}, "validate_run_config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.validate_run_config

\nfrom typing import Any, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions import JobDefinition\nfrom dagster._core.definitions.run_config import RunConfig, convert_config_input\nfrom dagster._core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]def validate_run_config(\n job_def: JobDefinition,\n run_config: Optional[Union[Mapping[str, Any], RunConfig]] = None,\n) -> Mapping[str, Any]:\n """Function to validate a provided run config blob against a given job.\n\n If validation is successful, this function will return a dictionary representation of the\n validated config actually used during execution.\n\n Args:\n job_def (JobDefinition): The job definition to validate run\n config against\n run_config (Optional[Dict[str, Any]]): The run config to validate\n\n Returns:\n Dict[str, Any]: A dictionary representation of the validated config.\n """\n check.inst_param(job_def, "job_def", JobDefinition)\n run_config = check.opt_mapping_param(\n convert_config_input(run_config), "run_config", key_type=str\n )\n\n return ResolvedRunConfig.build(job_def, run_config).to_dict()
\n
", "current_page_name": "_modules/dagster/_core/execution/validate_run_config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.validate_run_config"}, "with_resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.with_resources

\nfrom typing import Any, Iterable, List, Mapping, Optional, Sequence, TypeVar, cast\n\nfrom dagster import _check as check\nfrom dagster._core.execution.build_resources import wrap_resources_for_execution\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..._config import Shape\nfrom ..definitions.resource_requirement import ResourceAddable\nfrom ..definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom ..errors import DagsterInvalidConfigError, DagsterInvalidInvocationError\n\nT = TypeVar("T", bound=ResourceAddable)\n\n\n
[docs]def with_resources(\n definitions: Iterable[T],\n resource_defs: Mapping[str, object],\n resource_config_by_key: Optional[Mapping[str, Any]] = None,\n) -> Sequence[T]:\n """Adds dagster resources to copies of resource-requiring dagster definitions.\n\n An error will be thrown if any provided definitions have a conflicting\n resource definition provided for a key provided to resource_defs. Resource\n config can be provided, with keys in the config dictionary corresponding to\n the keys for each resource definition. If any definition has unsatisfied\n resource keys after applying with_resources, an error will be thrown.\n\n Args:\n definitions (Iterable[ResourceAddable]): Dagster definitions to provide resources to.\n resource_defs (Mapping[str, object]):\n Mapping of resource keys to objects to satisfy\n resource requirements of provided dagster definitions.\n resource_config_by_key (Optional[Mapping[str, Any]]):\n Specifies config for provided resources. The key in this dictionary\n corresponds to configuring the same key in the resource_defs\n dictionary.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import asset, resource, with_resources\n\n @resource(config_schema={"bar": str})\n def foo_resource():\n ...\n\n @asset(required_resource_keys={"foo"})\n def asset1(context):\n foo = context.resources.foo\n ...\n\n @asset(required_resource_keys={"foo"})\n def asset2(context):\n foo = context.resources.foo\n ...\n\n asset1_with_foo, asset2_with_foo = with_resources(\n [the_asset, other_asset],\n resource_config_by_key={\n "foo": {\n "config": {"bar": ...}\n }\n }\n )\n """\n from dagster._config import validate_config\n from dagster._core.definitions.job_definition import (\n default_job_io_manager_with_fs_io_manager_schema,\n )\n\n check.mapping_param(resource_defs, "resource_defs")\n resource_config_by_key = check.opt_mapping_param(\n resource_config_by_key, "resource_config_by_key"\n )\n\n resource_defs = wrap_resources_for_execution(\n merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: default_job_io_manager_with_fs_io_manager_schema},\n resource_defs,\n )\n )\n\n for key, resource_def in resource_defs.items():\n if key in resource_config_by_key:\n resource_config = resource_config_by_key[key]\n if not isinstance(resource_config, dict) or "config" not in resource_config:\n raise DagsterInvalidInvocationError(\n f"Error with config for resource key '{key}': Expected a "\n "dictionary of the form {'config': ...}, but received "\n f"{resource_config}"\n )\n\n outer_config_shape = Shape({"config": resource_def.get_config_field()})\n config_evr = validate_config(outer_config_shape, resource_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error when applying config for resource with key '{key}' ",\n config_evr.errors,\n resource_config,\n )\n resource_defs[key] = resource_defs[key].configured(resource_config["config"])\n\n transformed_defs: List[T] = []\n for definition in definitions:\n transformed_defs.append(cast(T, definition.with_resources(resource_defs)))\n\n return transformed_defs
\n
", "current_page_name": "_modules/dagster/_core/execution/with_resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.with_resources"}}, "executor": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.executor.base

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Iterator\n\nfrom dagster._annotations import public\nfrom dagster._core.execution.retries import RetryMode\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import PlanOrchestrationContext\n    from dagster._core.execution.plan.plan import ExecutionPlan\n\n\n
[docs]class Executor(ABC):\n
[docs] @public\n @abstractmethod\n def execute(\n self, plan_context: "PlanOrchestrationContext", execution_plan: "ExecutionPlan"\n ) -> Iterator["DagsterEvent"]:\n """For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.\n\n Args:\n plan_context (PlanOrchestrationContext): The plan's orchestration context.\n execution_plan (ExecutionPlan): The plan to execute.\n\n Returns:\n A stream of dagster events.\n """
\n\n @public\n @property\n @abstractmethod\n def retries(self) -> RetryMode:\n """Whether retries are enabled or disabled for this instance of the executor.\n\n Executors should allow this to be controlled via configuration if possible.\n\n Returns: RetryMode\n """
\n
", "current_page_name": "_modules/dagster/_core/executor/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.executor.base"}, "init": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.executor.init

\nfrom typing import Mapping, NamedTuple\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions import ExecutorDefinition, IJob\nfrom dagster._core.instance import DagsterInstance\n\n\n
[docs]class InitExecutorContext(\n NamedTuple(\n "InitExecutorContext",\n [\n ("job", PublicAttr[IJob]),\n ("executor_def", PublicAttr[ExecutorDefinition]),\n ("executor_config", PublicAttr[Mapping[str, object]]),\n ("instance", PublicAttr[DagsterInstance]),\n ],\n )\n):\n """Executor-specific initialization context.\n\n Attributes:\n job (IJob): The job to be executed.\n executor_def (ExecutorDefinition): The definition of the executor currently being\n constructed.\n executor_config (dict): The parsed config passed to the executor.\n instance (DagsterInstance): The current instance.\n """\n\n def __new__(\n cls,\n job: IJob,\n executor_def: ExecutorDefinition,\n executor_config: Mapping[str, object],\n instance: DagsterInstance,\n ):\n return super(InitExecutorContext, cls).__new__(\n cls,\n job=check.inst_param(job, "job", IJob),\n executor_def=check.inst_param(executor_def, "executor_def", ExecutorDefinition),\n executor_config=check.mapping_param(executor_config, "executor_config", key_type=str),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )
\n
", "current_page_name": "_modules/dagster/_core/executor/init", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.executor.init"}}, "instance": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance

\nimport logging\nimport logging.config\nimport os\nimport sys\nimport time\nimport weakref\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom tempfile import TemporaryDirectory\nfrom types import TracebackType\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport yaml\nfrom typing_extensions import Protocol, Self, TypeAlias, TypeVar, runtime_checkable\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.data_version import extract_data_provenance_from_entry\nfrom dagster._core.definitions.events import AssetKey, AssetObservation\nfrom dagster._core.errors import (\n    DagsterHomeNotSetError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunConflict,\n)\nfrom dagster._core.log_manager import DagsterLogRecord\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import (\n    IN_PROGRESS_RUN_STATUSES,\n    DagsterRun,\n    DagsterRunStatsSnapshot,\n    DagsterRunStatus,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster._core.storage.tags import (\n    ASSET_PARTITION_RANGE_END_TAG,\n    ASSET_PARTITION_RANGE_START_TAG,\n    PARENT_RUN_ID_TAG,\n    PARTITION_NAME_TAG,\n    RESUME_RETRY_TAG,\n    ROOT_RUN_ID_TAG,\n)\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._seven import get_current_datetime_in_utc\nfrom dagster._utils import PrintFn, traced\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.merger import 
merge_dicts\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    experimental_warning,\n)\n\nfrom .config import (\n    DAGSTER_CONFIG_YAML_FILENAME,\n    DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT,\n    get_default_tick_retention_settings,\n    get_tick_retention_settings,\n)\nfrom .ref import InstanceRef\n\n# 'airflow_execution_date' and 'is_airflow_ingest_pipeline' are hardcoded tags used in the\n# airflow ingestion logic (see: dagster_pipeline_factory.py). 'airflow_execution_date' stores the\n# 'execution_date' used in Airflow operator execution and 'is_airflow_ingest_pipeline' determines\n# whether 'airflow_execution_date' is needed.\n# https://github.com/dagster-io/dagster/issues/2403\nAIRFLOW_EXECUTION_DATE_STR = "airflow_execution_date"\nIS_AIRFLOW_INGEST_PIPELINE_STR = "is_airflow_ingest_pipeline"\n\n# Our internal guts can handle empty strings for job name and run id\n# However making these named constants for documentation, to encode where we are making the assumption,\n# and to allow us to change this more easily in the future, provided we are disciplined about\n# actually using this constants.\nRUNLESS_RUN_ID = ""\nRUNLESS_JOB_NAME = ""\n\nif TYPE_CHECKING:\n    from dagster._core.debug import DebugRunPayload\n    from dagster._core.definitions.asset_check_spec import AssetCheckKey\n    from dagster._core.definitions.job_definition import (\n        JobDefinition,\n    )\n    from dagster._core.definitions.partition import PartitionsDefinition\n    from dagster._core.definitions.repository_definition.repository_definition import (\n        RepositoryLoadData,\n    )\n    from dagster._core.definitions.run_request import InstigatorType\n    from dagster._core.event_api import EventHandlerFn\n    from dagster._core.events import (\n        AssetMaterialization,\n        DagsterEvent,\n        DagsterEventType,\n        EngineEventData,\n    )\n    from dagster._core.events.log import EventLogEntry\n    from 
dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.execution.plan.resume_retry import ReexecutionStrategy\n    from dagster._core.execution.stats import RunStepKeyStatsSnapshot\n    from dagster._core.host_representation import (\n        CodeLocation,\n        ExternalJob,\n        ExternalJobOrigin,\n        ExternalSensor,\n        HistoricalJob,\n    )\n    from dagster._core.host_representation.external import ExternalSchedule\n    from dagster._core.launcher import RunLauncher\n    from dagster._core.run_coordinator import RunCoordinator\n    from dagster._core.scheduler import Scheduler, SchedulerDebugInfo\n    from dagster._core.scheduler.instigation import (\n        InstigatorState,\n        InstigatorStatus,\n        InstigatorTick,\n        TickData,\n        TickStatus,\n    )\n    from dagster._core.secrets import SecretsLoader\n    from dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\n    from dagster._core.storage.asset_check_execution_record import AssetCheckInstanceSupport\n    from dagster._core.storage.compute_log_manager import ComputeLogManager\n    from dagster._core.storage.daemon_cursor import DaemonCursorStorage\n    from dagster._core.storage.event_log import EventLogStorage\n    from dagster._core.storage.event_log.base import (\n        AssetRecord,\n        EventLogConnection,\n        EventLogRecord,\n        EventRecordsFilter,\n    )\n    from dagster._core.storage.partition_status_cache import (\n        AssetPartitionStatus,\n        AssetStatusCacheValue,\n    )\n    from dagster._core.storage.root import LocalArtifactStorage\n    from dagster._core.storage.runs import RunStorage\n    from dagster._core.storage.schedules import ScheduleStorage\n    from dagster._core.storage.sql import AlembicVersion\n    from dagster._core.workspace.workspace import IWorkspace\n    from dagster._daemon.types import 
DaemonHeartbeat, DaemonStatus\n\n\nDagsterInstanceOverrides: TypeAlias = Mapping[str, Any]\n\n\ndef _check_run_equality(\n    pipeline_run: DagsterRun, candidate_run: DagsterRun\n) -> Mapping[str, Tuple[Any, Any]]:\n    field_diff: Dict[str, Tuple[Any, Any]] = {}\n    for field in pipeline_run._fields:\n        expected_value = getattr(pipeline_run, field)\n        candidate_value = getattr(candidate_run, field)\n        if expected_value != candidate_value:\n            field_diff[field] = (expected_value, candidate_value)\n\n    return field_diff\n\n\ndef _format_field_diff(field_diff: Mapping[str, Tuple[Any, Any]]) -> str:\n    return "\\n".join(\n        [\n            (\n                "    {field_name}:\\n"\n                + "        Expected: {expected_value}\\n"\n                + "        Received: {candidate_value}"\n            ).format(\n                field_name=field_name,\n                expected_value=expected_value,\n                candidate_value=candidate_value,\n            )\n            for field_name, (\n                expected_value,\n                candidate_value,\n            ) in field_diff.items()\n        ]\n    )\n\n\nclass _EventListenerLogHandler(logging.Handler):\n    def __init__(self, instance: "DagsterInstance"):\n        self._instance = instance\n        super(_EventListenerLogHandler, self).__init__()\n\n    def emit(self, record: DagsterLogRecord) -> None:\n        from dagster._core.events import EngineEventData\n        from dagster._core.events.log import StructuredLoggerMessage, construct_event_record\n\n        event = construct_event_record(\n            StructuredLoggerMessage(\n                name=record.name,\n                message=record.msg,\n                level=record.levelno,\n                meta=record.dagster_meta,  # type: ignore\n                record=record,\n            )\n        )\n\n        try:\n            self._instance.handle_new_event(event)\n        except Exception as e:\n           
 sys.stderr.write(f"Exception while writing logger call to event log: {e}\\n")\n            if event.dagster_event:\n                # Swallow user-generated log failures so that the entire step/run doesn't fail, but\n                # raise failures writing system-generated log events since they are the source of\n                # truth for the state of the run\n                raise\n            elif event.run_id:\n                self._instance.report_engine_event(\n                    "Exception while writing logger call to event log",\n                    job_name=event.job_name,\n                    run_id=event.run_id,\n                    step_key=event.step_key,\n                    engine_event_data=EngineEventData(\n                        error=serializable_error_info_from_exc_info(sys.exc_info()),\n                    ),\n                )\n\n\nclass InstanceType(Enum):\n    PERSISTENT = "PERSISTENT"\n    EPHEMERAL = "EPHEMERAL"\n\n\nT_DagsterInstance = TypeVar("T_DagsterInstance", bound="DagsterInstance", default="DagsterInstance")\n\n\nclass MayHaveInstanceWeakref(Generic[T_DagsterInstance]):\n    """Mixin for classes that can have a weakref back to a Dagster instance."""\n\n    _instance_weakref: "Optional[weakref.ReferenceType[T_DagsterInstance]]"\n\n    def __init__(self):\n        self._instance_weakref = None\n\n    @property\n    def has_instance(self) -> bool:\n        return hasattr(self, "_instance_weakref") and (self._instance_weakref is not None)\n\n    @property\n    def _instance(self) -> T_DagsterInstance:\n        instance = (\n            self._instance_weakref()\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            if (hasattr(self, "_instance_weakref") and self._instance_weakref is not None)\n            else None\n        )\n        if instance is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to 
resolve undefined DagsterInstance weakref."\n            )\n        else:\n            return instance\n\n    def register_instance(self, instance: T_DagsterInstance) -> None:\n        check.invariant(\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            (not hasattr(self, "_instance_weakref") or self._instance_weakref is None),\n            "Must only call initialize once",\n        )\n\n        # Store a weakref to avoid a circular reference / enable GC\n        self._instance_weakref = weakref.ref(instance)\n\n\n@runtime_checkable\nclass DynamicPartitionsStore(Protocol):\n    @abstractmethod\n    def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]: ...\n\n    @abstractmethod\n    def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool: ...\n\n\n
[docs]class DagsterInstance(DynamicPartitionsStore):\n """Core abstraction for managing Dagster's access to storage and other resources.\n\n Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\n the values in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Alternatively, DagsterInstance.ephemeral() can use used which provides a set of\n transient in-memory components.\n\n Configuration of this class should be done by setting values in ``$DAGSTER_HOME/dagster.yaml``.\n For example, to use Postgres for dagster storage, you can write a ``dagster.yaml`` such as the\n following:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :language: YAML\n\n Args:\n instance_type (InstanceType): Indicates whether the instance is ephemeral or persistent.\n Users should not attempt to set this value directly or in their ``dagster.yaml`` files.\n local_artifact_storage (LocalArtifactStorage): The local artifact storage is used to\n configure storage for any artifacts that require a local disk, such as schedules, or\n when using the filesystem system storage to manage files and intermediates. By default,\n this will be a :py:class:`dagster._core.storage.root.LocalArtifactStorage`. Configurable\n in ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass`\n machinery.\n run_storage (RunStorage): The run storage is used to store metadata about ongoing and past\n pipeline runs. By default, this will be a\n :py:class:`dagster._core.storage.runs.SqliteRunStorage`. Configurable in ``dagster.yaml``\n using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n event_storage (EventLogStorage): Used to store the structured event logs generated by\n pipeline runs. By default, this will be a\n :py:class:`dagster._core.storage.event_log.SqliteEventLogStorage`. 
Configurable in\n ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n compute_log_manager (Optional[ComputeLogManager]): The compute log manager handles stdout\n and stderr logging for op compute functions. By default, this will be a\n :py:class:`dagster._core.storage.local_compute_log_manager.LocalComputeLogManager`.\n Configurable in ``dagster.yaml`` using the\n :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n run_coordinator (Optional[RunCoordinator]): A runs coordinator may be used to manage the execution\n of pipeline runs.\n run_launcher (Optional[RunLauncher]): Optionally, a run launcher may be used to enable\n a Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\n addition to running them locally.\n settings (Optional[Dict]): Specifies certain per-instance settings,\n such as feature flags. These are set in the ``dagster.yaml`` under a set of whitelisted\n keys.\n ref (Optional[InstanceRef]): Used by internal machinery to pass instances across process\n boundaries.\n """\n\n # Stores TemporaryDirectory instances that were created for DagsterInstance.local_temp() calls\n # to be removed once the instance is garbage collected.\n _TEMP_DIRS: "weakref.WeakKeyDictionary[DagsterInstance, TemporaryDirectory]" = (\n weakref.WeakKeyDictionary()\n )\n\n def __init__(\n self,\n instance_type: InstanceType,\n local_artifact_storage: "LocalArtifactStorage",\n run_storage: "RunStorage",\n event_storage: "EventLogStorage",\n run_coordinator: Optional["RunCoordinator"],\n compute_log_manager: Optional["ComputeLogManager"],\n run_launcher: Optional["RunLauncher"],\n scheduler: Optional["Scheduler"] = None,\n schedule_storage: Optional["ScheduleStorage"] = None,\n settings: Optional[Mapping[str, Any]] = None,\n secrets_loader: Optional["SecretsLoader"] = None,\n ref: Optional[InstanceRef] = None,\n **_kwargs: Any, # we accept kwargs for forward-compat of custom instances\n ):\n from 
dagster._core.launcher import RunLauncher\n from dagster._core.run_coordinator import RunCoordinator\n from dagster._core.scheduler import Scheduler\n from dagster._core.secrets import SecretsLoader\n from dagster._core.storage.captured_log_manager import CapturedLogManager\n from dagster._core.storage.compute_log_manager import ComputeLogManager\n from dagster._core.storage.event_log import EventLogStorage\n from dagster._core.storage.root import LocalArtifactStorage\n from dagster._core.storage.runs import RunStorage\n from dagster._core.storage.schedules import ScheduleStorage\n\n self._instance_type = check.inst_param(instance_type, "instance_type", InstanceType)\n self._local_artifact_storage = check.inst_param(\n local_artifact_storage, "local_artifact_storage", LocalArtifactStorage\n )\n self._event_storage = check.inst_param(event_storage, "event_storage", EventLogStorage)\n self._event_storage.register_instance(self)\n\n self._run_storage = check.inst_param(run_storage, "run_storage", RunStorage)\n self._run_storage.register_instance(self)\n\n if compute_log_manager:\n self._compute_log_manager = check.inst_param(\n compute_log_manager, "compute_log_manager", ComputeLogManager\n )\n if not isinstance(self._compute_log_manager, CapturedLogManager):\n deprecation_warning(\n "ComputeLogManager",\n "1.2.0",\n "Implement the CapturedLogManager interface instead.",\n )\n self._compute_log_manager.register_instance(self)\n else:\n check.invariant(\n ref, "Compute log manager must be provided if instance is not from a ref"\n )\n self._compute_log_manager = None\n\n self._scheduler = check.opt_inst_param(scheduler, "scheduler", Scheduler)\n\n self._schedule_storage = check.opt_inst_param(\n schedule_storage, "schedule_storage", ScheduleStorage\n )\n if self._schedule_storage:\n self._schedule_storage.register_instance(self)\n\n if run_coordinator:\n self._run_coordinator = check.inst_param(\n run_coordinator, "run_coordinator", RunCoordinator\n )\n 
self._run_coordinator.register_instance(self)\n else:\n check.invariant(ref, "Run coordinator must be provided if instance is not from a ref")\n self._run_coordinator = None\n\n if run_launcher:\n self._run_launcher: Optional[RunLauncher] = check.inst_param(\n run_launcher, "run_launcher", RunLauncher\n )\n run_launcher.register_instance(self)\n else:\n check.invariant(ref, "Run launcher must be provided if instance is not from a ref")\n self._run_launcher = None\n\n self._settings = check.opt_mapping_param(settings, "settings")\n\n self._secrets_loader = check.opt_inst_param(secrets_loader, "secrets_loader", SecretsLoader)\n\n if self._secrets_loader:\n self._secrets_loader.register_instance(self)\n\n self._ref = check.opt_inst_param(ref, "ref", InstanceRef)\n\n self._subscribers: Dict[str, List[Callable]] = defaultdict(list)\n\n run_monitoring_enabled = self.run_monitoring_settings.get("enabled", False)\n self._run_monitoring_enabled = run_monitoring_enabled\n if self.run_monitoring_enabled and self.run_monitoring_max_resume_run_attempts:\n check.invariant(\n self.run_launcher.supports_resume_run,\n "The configured run launcher does not support resuming runs. Set"\n " max_resume_run_attempts to 0 to use run monitoring. Any runs with a failed"\n " run worker will be marked as failed, but will not be resumed.",\n )\n\n if self.run_retries_enabled:\n check.invariant(\n self.event_log_storage.supports_event_consumer_queries(),\n "Run retries are enabled, but the configured event log storage does not support"\n " them. Consider switching to Postgres or Mysql.",\n )\n\n # ctors\n\n
[docs] @public\n @staticmethod\n def ephemeral(\n tempdir: Optional[str] = None,\n preload: Optional[Sequence["DebugRunPayload"]] = None,\n settings: Optional[Dict] = None,\n ) -> "DagsterInstance":\n """Create a `DagsterInstance` suitable for ephemeral execution, useful in test contexts. An\n ephemeral instance uses mostly in-memory components. Use `local_temp` to create a test\n instance that is fully persistent.\n\n Args:\n tempdir (Optional[str]): The path of a directory to be used for local artifact storage.\n preload (Optional[Sequence[DebugRunPayload]]): A sequence of payloads to load into the\n instance's run storage. Useful for debugging.\n settings (Optional[Dict]): Settings for the instance.\n\n Returns:\n DagsterInstance: An ephemeral DagsterInstance.\n """\n from dagster._core.launcher.sync_in_memory_run_launcher import SyncInMemoryRunLauncher\n from dagster._core.run_coordinator import DefaultRunCoordinator\n from dagster._core.storage.event_log import InMemoryEventLogStorage\n from dagster._core.storage.noop_compute_log_manager import NoOpComputeLogManager\n from dagster._core.storage.root import LocalArtifactStorage, TemporaryLocalArtifactStorage\n from dagster._core.storage.runs import InMemoryRunStorage\n\n if tempdir is not None:\n local_storage = LocalArtifactStorage(tempdir)\n else:\n local_storage = TemporaryLocalArtifactStorage()\n\n return DagsterInstance(\n instance_type=InstanceType.EPHEMERAL,\n local_artifact_storage=local_storage,\n run_storage=InMemoryRunStorage(preload=preload),\n event_storage=InMemoryEventLogStorage(preload=preload),\n compute_log_manager=NoOpComputeLogManager(),\n run_coordinator=DefaultRunCoordinator(),\n run_launcher=SyncInMemoryRunLauncher(),\n settings=settings,\n )
\n\n
[docs] @public\n @staticmethod\n def get() -> "DagsterInstance":\n """Get the current `DagsterInstance` as specified by the ``DAGSTER_HOME`` environment variable.\n\n Returns:\n DagsterInstance: The current DagsterInstance.\n """\n dagster_home_path = os.getenv("DAGSTER_HOME")\n\n if not dagster_home_path:\n raise DagsterHomeNotSetError(\n "The environment variable $DAGSTER_HOME is not set. \\nDagster requires this"\n " environment variable to be set to an existing directory in your filesystem. This"\n " directory is used to store metadata across sessions, or load the dagster.yaml"\n " file which can configure storing metadata in an external database.\\nYou can"\n " resolve this error by exporting the environment variable. For example, you can"\n " run the following command in your shell or include it in your shell configuration"\n ' file:\\n\\texport DAGSTER_HOME=~"/dagster_home"\\nor PowerShell\\n$env:DAGSTER_HOME'\n " = ($home + '\\\\dagster_home')or batchset"\n " DAGSTER_HOME=%UserProfile%/dagster_homeAlternatively, DagsterInstance.ephemeral()"\n " can be used for a transient instance.\\n"\n )\n\n dagster_home_path = os.path.expanduser(dagster_home_path)\n\n if not os.path.isabs(dagster_home_path):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" must be an absolute path. Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem."\n ).format(dagster_home_path)\n )\n\n if not (os.path.exists(dagster_home_path) and os.path.isdir(dagster_home_path)):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" is not a directory or does not exist. Dagster requires this'\n " environment variable to be set to an existing directory in your filesystem"\n ).format(dagster_home_path)\n )\n\n return DagsterInstance.from_config(dagster_home_path)
\n\n
[docs] @public\n @staticmethod\n def local_temp(\n tempdir: Optional[str] = None,\n overrides: Optional[DagsterInstanceOverrides] = None,\n ) -> "DagsterInstance":\n """Create a DagsterInstance that uses a temporary directory for local storage. This is a\n regular, fully persistent instance. Use `ephemeral` to get an ephemeral instance with\n in-memory components.\n\n Args:\n tempdir (Optional[str]): The path of a directory to be used for local artifact storage.\n overrides (Optional[DagsterInstanceOverrides]): Override settings for the instance.\n\n Returns:\n DagsterInstance\n """\n if tempdir is None:\n created_dir = TemporaryDirectory()\n i = DagsterInstance.from_ref(\n InstanceRef.from_dir(created_dir.name, overrides=overrides)\n )\n DagsterInstance._TEMP_DIRS[i] = created_dir\n return i\n\n return DagsterInstance.from_ref(InstanceRef.from_dir(tempdir, overrides=overrides))
\n\n @staticmethod\n def from_config(\n config_dir: str,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n ) -> "DagsterInstance":\n instance_ref = InstanceRef.from_dir(config_dir, config_filename=config_filename)\n return DagsterInstance.from_ref(instance_ref)\n\n @staticmethod\n def from_ref(instance_ref: InstanceRef) -> "DagsterInstance":\n check.inst_param(instance_ref, "instance_ref", InstanceRef)\n\n # DagsterInstance doesn't implement ConfigurableClass, but we may still sometimes want to\n # have custom subclasses of DagsterInstance. This machinery allows for those custom\n # subclasses to receive additional keyword arguments passed through the config YAML.\n klass = instance_ref.custom_instance_class or DagsterInstance\n kwargs = instance_ref.custom_instance_class_config\n\n unified_storage = instance_ref.storage\n run_storage = unified_storage.run_storage if unified_storage else instance_ref.run_storage\n event_storage = (\n unified_storage.event_log_storage if unified_storage else instance_ref.event_storage\n )\n schedule_storage = (\n unified_storage.schedule_storage if unified_storage else instance_ref.schedule_storage\n )\n\n return klass(\n instance_type=InstanceType.PERSISTENT,\n local_artifact_storage=instance_ref.local_artifact_storage,\n run_storage=run_storage, # type: ignore # (possible none)\n event_storage=event_storage, # type: ignore # (possible none)\n schedule_storage=schedule_storage,\n compute_log_manager=None, # lazy load\n scheduler=instance_ref.scheduler,\n run_coordinator=None, # lazy load\n run_launcher=None, # lazy load\n settings=instance_ref.settings,\n secrets_loader=instance_ref.secrets_loader,\n ref=instance_ref,\n **kwargs,\n )\n\n # flags\n\n @property\n def is_persistent(self) -> bool:\n return self._instance_type == InstanceType.PERSISTENT\n\n @property\n def is_ephemeral(self) -> bool:\n return self._instance_type == InstanceType.EPHEMERAL\n\n def get_ref(self) -> InstanceRef:\n if self._ref:\n return self._ref\n\n 
check.failed(\n "Attempted to prepare an ineligible DagsterInstance ({inst_type}) for cross "\n "process communication.{dagster_home_msg}".format(\n inst_type=self._instance_type,\n dagster_home_msg=(\n "\\nDAGSTER_HOME environment variable is not set, set it to "\n "a directory on the filesystem for dagster to use for storage and cross "\n "process coordination."\n if os.getenv("DAGSTER_HOME") is None\n else ""\n ),\n )\n )\n\n @property\n def root_directory(self) -> str:\n return self._local_artifact_storage.base_dir\n\n def _info(self, component: object) -> Union[str, Mapping[Any, Any]]:\n # ConfigurableClass may not have inst_data if it's a direct instantiation\n # which happens for ephemeral instances\n if isinstance(component, ConfigurableClass) and component.inst_data:\n return component.inst_data.info_dict()\n if type(component) is dict:\n return component\n return component.__class__.__name__\n\n def _info_str_for_component(self, component_name: str, component: object) -> str:\n return yaml.dump(\n {component_name: self._info(component)}, default_flow_style=False, sort_keys=False\n )\n\n def info_dict(self) -> Mapping[str, object]:\n settings: Mapping[str, object] = self._settings if self._settings else {}\n\n ret = {\n "local_artifact_storage": self._info(self._local_artifact_storage),\n "run_storage": self._info(self._run_storage),\n "event_log_storage": self._info(self._event_storage),\n "compute_logs": self._info(self._compute_log_manager),\n "schedule_storage": self._info(self._schedule_storage),\n "scheduler": self._info(self._scheduler),\n "run_coordinator": self._info(self._run_coordinator),\n "run_launcher": self._info(self.run_launcher),\n }\n ret.update(\n {\n settings_key: self._info(settings_value)\n for settings_key, settings_value in settings.items()\n }\n )\n\n return ret\n\n def info_str(self) -> str:\n return yaml.dump(self.info_dict(), default_flow_style=False, sort_keys=False)\n\n def schema_str(self) -> str:\n def 
_schema_dict(alembic_version: "AlembicVersion") -> Optional[Mapping[str, object]]:\n if not alembic_version:\n return None\n db_revision, head_revision = alembic_version\n return {\n "current": db_revision,\n "latest": head_revision,\n }\n\n return yaml.dump(\n {\n "schema": {\n "event_log_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n "run_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n "schedule_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n }\n },\n default_flow_style=False,\n sort_keys=False,\n )\n\n @property\n def run_storage(self) -> "RunStorage":\n return self._run_storage\n\n @property\n def event_log_storage(self) -> "EventLogStorage":\n return self._event_storage\n\n @property\n def daemon_cursor_storage(self) -> "DaemonCursorStorage":\n return self._run_storage\n\n # schedule storage\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n return self._schedule_storage\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n return self._scheduler\n\n @property\n def scheduler_class(self) -> Optional[str]:\n return self.scheduler.__class__.__name__ if self.scheduler else None\n\n # run coordinator\n\n @property\n def run_coordinator(self) -> "RunCoordinator":\n # Lazily load in case the run coordinator requires dependencies that are not available\n # everywhere that loads the instance\n if not self._run_coordinator:\n check.invariant(\n self._ref, "Run coordinator not provided, and no instance ref available"\n )\n run_coordinator = cast(InstanceRef, self._ref).run_coordinator\n check.invariant(run_coordinator, "Run coordinator not configured in instance ref")\n self._run_coordinator = cast("RunCoordinator", run_coordinator)\n self._run_coordinator.register_instance(self)\n return self._run_coordinator\n\n # run launcher\n\n @property\n def run_launcher(self) -> "RunLauncher":\n 
# Lazily load in case the launcher requires dependencies that are not available everywhere\n # that loads the instance (e.g. The EcsRunLauncher requires boto3)\n if not self._run_launcher:\n check.invariant(self._ref, "Run launcher not provided, and no instance ref available")\n launcher = cast(InstanceRef, self._ref).run_launcher\n check.invariant(launcher, "Run launcher not configured in instance ref")\n self._run_launcher = cast("RunLauncher", launcher)\n self._run_launcher.register_instance(self)\n return self._run_launcher\n\n # compute logs\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n if not self._compute_log_manager:\n check.invariant(\n self._ref, "Compute log manager not provided, and no instance ref available"\n )\n compute_log_manager = cast(InstanceRef, self._ref).compute_log_manager\n check.invariant(\n compute_log_manager, "Compute log manager not configured in instance ref"\n )\n self._compute_log_manager = cast("ComputeLogManager", compute_log_manager)\n self._compute_log_manager.register_instance(self)\n return self._compute_log_manager\n\n def get_settings(self, settings_key: str) -> Any:\n check.str_param(settings_key, "settings_key")\n if self._settings and settings_key in self._settings:\n return self._settings.get(settings_key)\n return {}\n\n @property\n def telemetry_enabled(self) -> bool:\n if self.is_ephemeral:\n return False\n\n dagster_telemetry_enabled_default = True\n\n telemetry_settings = self.get_settings("telemetry")\n\n if not telemetry_settings:\n return dagster_telemetry_enabled_default\n\n if "enabled" in telemetry_settings:\n return telemetry_settings["enabled"]\n else:\n return dagster_telemetry_enabled_default\n\n @property\n def nux_enabled(self) -> bool:\n if self.is_ephemeral:\n return False\n\n nux_enabled_by_default = True\n\n nux_settings = self.get_settings("nux")\n if not nux_settings:\n return nux_enabled_by_default\n\n if "enabled" in nux_settings:\n return nux_settings["enabled"]\n 
else:\n return nux_enabled_by_default\n\n # run monitoring\n\n @property\n def run_monitoring_enabled(self) -> bool:\n return self._run_monitoring_enabled\n\n @property\n def run_monitoring_settings(self) -> Any:\n return self.get_settings("run_monitoring")\n\n @property\n def run_monitoring_start_timeout_seconds(self) -> int:\n return self.run_monitoring_settings.get("start_timeout_seconds", 180)\n\n @property\n def run_monitoring_cancel_timeout_seconds(self) -> int:\n return self.run_monitoring_settings.get("cancel_timeout_seconds", 180)\n\n @property\n def code_server_settings(self) -> Any:\n return self.get_settings("code_servers")\n\n @property\n def code_server_process_startup_timeout(self) -> int:\n return self.code_server_settings.get(\n "local_startup_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT\n )\n\n @property\n def code_server_reload_timeout(self) -> int:\n return self.code_server_settings.get(\n "reload_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT\n )\n\n @property\n def wait_for_local_code_server_processes_on_shutdown(self) -> bool:\n return self.code_server_settings.get("wait_for_local_processes_on_shutdown", False)\n\n @property\n def run_monitoring_max_resume_run_attempts(self) -> int:\n return self.run_monitoring_settings.get("max_resume_run_attempts", 0)\n\n @property\n def run_monitoring_poll_interval_seconds(self) -> int:\n return self.run_monitoring_settings.get("poll_interval_seconds", 120)\n\n @property\n def cancellation_thread_poll_interval_seconds(self) -> int:\n return self.get_settings("run_monitoring").get(\n "cancellation_thread_poll_interval_seconds", 10\n )\n\n @property\n def run_retries_enabled(self) -> bool:\n return self.get_settings("run_retries").get("enabled", False)\n\n @property\n def run_retries_max_retries(self) -> int:\n return self.get_settings("run_retries").get("max_retries")\n\n @property\n def auto_materialize_enabled(self) -> bool:\n return self.get_settings("auto_materialize").get("enabled", 
True)\n\n @property\n def auto_materialize_minimum_interval_seconds(self) -> int:\n return self.get_settings("auto_materialize").get("minimum_interval_seconds")\n\n @property\n def auto_materialize_run_tags(self) -> Dict[str, str]:\n return self.get_settings("auto_materialize").get("run_tags", {})\n\n @property\n def auto_materialize_respect_materialization_data_versions(self) -> bool:\n return self.get_settings("auto_materialize").get(\n "respect_materialization_data_versions", False\n )\n\n # python logs\n\n @property\n def managed_python_loggers(self) -> Sequence[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n loggers: Sequence[str] = python_log_settings.get("managed_python_loggers", [])\n return loggers\n\n @property\n def python_log_level(self) -> Optional[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n return python_log_settings.get("python_log_level")\n\n def upgrade(self, print_fn: Optional[PrintFn] = None) -> None:\n from dagster._core.storage.migration.utils import upgrading_instance\n\n with upgrading_instance(self):\n if print_fn:\n print_fn("Updating run storage...")\n self._run_storage.upgrade() # type: ignore # (unknown method on run storage)\n self._run_storage.migrate(print_fn)\n\n if print_fn:\n print_fn("Updating event storage...")\n self._event_storage.upgrade()\n self._event_storage.reindex_assets(print_fn=print_fn)\n\n if print_fn:\n print_fn("Updating schedule storage...")\n self._schedule_storage.upgrade() # type: ignore # (possible none)\n self._schedule_storage.migrate(print_fn) # type: ignore # (possible none)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n if self._schedule_storage:\n self._schedule_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n self._run_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n 
self._event_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n\n def reindex(self, print_fn: PrintFn = lambda _: None) -> None:\n print_fn("Checking for reindexing...")\n self._event_storage.reindex_events(print_fn)\n self._event_storage.reindex_assets(print_fn)\n self._run_storage.optimize(print_fn)\n self._schedule_storage.optimize(print_fn) # type: ignore # (possible none)\n print_fn("Done.")\n\n def dispose(self) -> None:\n self._local_artifact_storage.dispose()\n self._run_storage.dispose()\n if self._run_coordinator:\n self._run_coordinator.dispose()\n if self._run_launcher:\n self._run_launcher.dispose()\n self._event_storage.dispose()\n if self._compute_log_manager:\n self._compute_log_manager.dispose()\n if self._secrets_loader:\n self._secrets_loader.dispose()\n\n if self in DagsterInstance._TEMP_DIRS:\n DagsterInstance._TEMP_DIRS[self].cleanup()\n del DagsterInstance._TEMP_DIRS[self]\n\n # run storage\n
[docs] @public\n def get_run_by_id(self, run_id: str) -> Optional[DagsterRun]:\n """Get a :py:class:`DagsterRun` matching the provided `run_id`.\n\n Args:\n run_id (str): The id of the run to retrieve.\n\n Returns:\n Optional[DagsterRun]: The run corresponding to the given id. If no run matching the id\n is found, return `None`.\n """\n record = self.get_run_record_by_id(run_id)\n if record is None:\n return None\n return record.dagster_run
\n\n
[docs] @public\n @traced\n def get_run_record_by_id(self, run_id: str) -> Optional[RunRecord]:\n """Get a :py:class:`RunRecord` matching the provided `run_id`.\n\n Args:\n run_id (str): The id of the run record to retrieve.\n\n Returns:\n Optional[RunRecord]: The run record corresponding to the given id. If no run matching\n the id is found, return `None`.\n """\n records = self._run_storage.get_run_records(RunsFilter(run_ids=[run_id]))\n if not records:\n return None\n return records[0]
\n\n @traced\n def get_job_snapshot(self, snapshot_id: str) -> "JobSnapshot":\n return self._run_storage.get_job_snapshot(snapshot_id)\n\n @traced\n def has_job_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_job_snapshot(snapshot_id)\n\n @traced\n def has_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_snapshot(snapshot_id)\n\n @traced\n def get_historical_job(self, snapshot_id: str) -> "HistoricalJob":\n from dagster._core.host_representation import HistoricalJob\n\n snapshot = self._run_storage.get_job_snapshot(snapshot_id)\n parent_snapshot = (\n self._run_storage.get_job_snapshot(snapshot.lineage_snapshot.parent_snapshot_id)\n if snapshot.lineage_snapshot\n else None\n )\n return HistoricalJob(snapshot, snapshot_id, parent_snapshot)\n\n @traced\n def has_historical_job(self, snapshot_id: str) -> bool:\n return self._run_storage.has_job_snapshot(snapshot_id)\n\n @traced\n def get_execution_plan_snapshot(self, snapshot_id: str) -> "ExecutionPlanSnapshot":\n return self._run_storage.get_execution_plan_snapshot(snapshot_id)\n\n @traced\n def get_run_stats(self, run_id: str) -> DagsterRunStatsSnapshot:\n return self._event_storage.get_stats_for_run(run_id)\n\n @traced\n def get_run_step_stats(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence["RunStepKeyStatsSnapshot"]:\n return self._event_storage.get_step_stats_for_run(run_id, step_keys)\n\n @traced\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n return self._run_storage.get_run_tags(\n tag_keys=tag_keys, value_prefix=value_prefix, limit=limit\n )\n\n @traced\n def get_run_tag_keys(self) -> Sequence[str]:\n return self._run_storage.get_run_tag_keys()\n\n @traced\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Sequence[DagsterRun]]]:\n return self._run_storage.get_run_group(run_id)\n\n def 
create_run_for_job(\n self,\n job_def: "JobDefinition",\n execution_plan: Optional["ExecutionPlan"] = None,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n resolved_op_selection: Optional[AbstractSet[str]] = None,\n status: Optional[Union[DagsterRunStatus, str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n op_selection: Optional[Sequence[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n repository_load_data: Optional["RepositoryLoadData"] = None,\n ) -> DagsterRun:\n from dagster._core.definitions.job_definition import JobDefinition\n from dagster._core.execution.api import create_execution_plan\n from dagster._core.execution.plan.plan import ExecutionPlan\n from dagster._core.snap import snapshot_from_execution_plan\n\n check.inst_param(job_def, "pipeline_def", JobDefinition)\n check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)\n\n # note that op_selection is required to execute the solid subset, which is the\n # frozenset version of the previous solid_subset.\n # op_selection is not required and will not be converted to op_selection here.\n # i.e. 
this function doesn't handle solid queries.\n # op_selection is only used to pass the user queries further down.\n check.opt_set_param(resolved_op_selection, "resolved_op_selection", of_type=str)\n check.opt_list_param(op_selection, "op_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n # op_selection never provided\n if asset_selection or op_selection:\n # for cases when `create_run_for_pipeline` is directly called\n job_def = job_def.get_subset(\n asset_selection=asset_selection,\n op_selection=op_selection,\n )\n step_keys_to_execute = None\n\n if execution_plan:\n step_keys_to_execute = execution_plan.step_keys_to_execute\n\n else:\n execution_plan = create_execution_plan(\n job=job_def,\n run_config=run_config,\n instance_ref=self.get_ref() if self.is_persistent else None,\n tags=tags,\n repository_load_data=repository_load_data,\n )\n\n return self.create_run(\n job_name=job_def.name,\n run_id=run_id,\n run_config=run_config,\n op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=None,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus(status) if status else None,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_def.get_job_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan,\n job_def.get_job_snapshot_id(),\n ),\n parent_job_snapshot=job_def.get_parent_job_snapshot(),\n external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n )\n\n def _construct_run_with_snapshots(\n self,\n job_name: str,\n run_id: str,\n run_config: Optional[Mapping[str, object]],\n resolved_op_selection: Optional[AbstractSet[str]],\n step_keys_to_execute: Optional[Sequence[str]],\n status: Optional[DagsterRunStatus],\n tags: Mapping[str, str],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n job_snapshot: 
Optional["JobSnapshot"],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet["AssetCheckKey"]] = None,\n op_selection: Optional[Sequence[str]] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n ) -> DagsterRun:\n # https://github.com/dagster-io/dagster/issues/2403\n if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:\n if AIRFLOW_EXECUTION_DATE_STR not in tags:\n tags = {\n **tags,\n AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat(),\n }\n\n check.invariant(\n not (not job_snapshot and execution_plan_snapshot),\n "It is illegal to have an execution plan snapshot and not have a pipeline snapshot."\n " It is possible to have no execution plan snapshot since we persist runs that do"\n " not successfully compile execution plans in the scheduled case.",\n )\n\n job_snapshot_id = (\n self._ensure_persisted_job_snapshot(job_snapshot, parent_job_snapshot)\n if job_snapshot\n else None\n )\n\n execution_plan_snapshot_id = (\n self._ensure_persisted_execution_plan_snapshot(\n execution_plan_snapshot, job_snapshot_id, step_keys_to_execute\n )\n if execution_plan_snapshot and job_snapshot_id\n else None\n )\n\n return DagsterRun(\n job_name=job_name,\n run_id=run_id,\n run_config=run_config,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot_id=job_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n has_repository_load_data=execution_plan_snapshot is not None\n and 
execution_plan_snapshot.repository_load_data is not None,\n )\n\n def _ensure_persisted_job_snapshot(\n self,\n job_snapshot: "JobSnapshot",\n parent_job_snapshot: "Optional[JobSnapshot]",\n ) -> str:\n from dagster._core.snap import JobSnapshot, create_job_snapshot_id\n\n check.inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_inst_param(parent_job_snapshot, "parent_job_snapshot", JobSnapshot)\n\n if job_snapshot.lineage_snapshot:\n if not self._run_storage.has_job_snapshot(\n job_snapshot.lineage_snapshot.parent_snapshot_id\n ):\n check.invariant(\n create_job_snapshot_id(parent_job_snapshot) # type: ignore # (possible none)\n == job_snapshot.lineage_snapshot.parent_snapshot_id,\n "Parent pipeline snapshot id out of sync with passed parent pipeline snapshot",\n )\n\n returned_job_snapshot_id = self._run_storage.add_job_snapshot(\n parent_job_snapshot # type: ignore # (possible none)\n )\n check.invariant(\n job_snapshot.lineage_snapshot.parent_snapshot_id == returned_job_snapshot_id\n )\n\n job_snapshot_id = create_job_snapshot_id(job_snapshot)\n if not self._run_storage.has_job_snapshot(job_snapshot_id):\n returned_job_snapshot_id = self._run_storage.add_job_snapshot(job_snapshot)\n check.invariant(job_snapshot_id == returned_job_snapshot_id)\n\n return job_snapshot_id\n\n def _ensure_persisted_execution_plan_snapshot(\n self,\n execution_plan_snapshot: "ExecutionPlanSnapshot",\n job_snapshot_id: str,\n step_keys_to_execute: Optional[Sequence[str]],\n ) -> str:\n from dagster._core.snap.execution_plan_snapshot import (\n ExecutionPlanSnapshot,\n create_execution_plan_snapshot_id,\n )\n\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.str_param(job_snapshot_id, "job_snapshot_id")\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n check.invariant(\n execution_plan_snapshot.job_snapshot_id == job_snapshot_id,\n "Snapshot mismatch: Snapshot ID in 
execution plan snapshot is "\n f'"{execution_plan_snapshot.job_snapshot_id}" and snapshot_id created in memory is '\n f'"{job_snapshot_id}"',\n )\n\n execution_plan_snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n if not self._run_storage.has_execution_plan_snapshot(execution_plan_snapshot_id):\n returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(\n execution_plan_snapshot\n )\n\n check.invariant(execution_plan_snapshot_id == returned_execution_plan_snapshot_id)\n\n return execution_plan_snapshot_id\n\n def _log_asset_planned_events(\n self, dagster_run: DagsterRun, execution_plan_snapshot: "ExecutionPlanSnapshot"\n ) -> None:\n from dagster._core.events import (\n AssetMaterializationPlannedData,\n DagsterEvent,\n DagsterEventType,\n )\n\n job_name = dagster_run.job_name\n\n for step in execution_plan_snapshot.steps:\n if step.key in execution_plan_snapshot.step_keys_to_execute:\n for output in step.outputs:\n asset_key = check.not_none(output.properties).asset_key\n if asset_key:\n # Logs and stores asset_materialization_planned event\n partition_tag = dagster_run.tags.get(PARTITION_NAME_TAG)\n partition_range_start, partition_range_end = dagster_run.tags.get(\n ASSET_PARTITION_RANGE_START_TAG\n ), dagster_run.tags.get(ASSET_PARTITION_RANGE_END_TAG)\n\n if partition_tag and (partition_range_start or partition_range_end):\n raise DagsterInvariantViolationError(\n f"Cannot have {ASSET_PARTITION_RANGE_START_TAG} or"\n f" {ASSET_PARTITION_RANGE_END_TAG} set along with"\n f" {PARTITION_NAME_TAG}"\n )\n\n if partition_range_start or partition_range_end:\n if not partition_range_start or not partition_range_end:\n raise DagsterInvariantViolationError(\n f"Cannot have {ASSET_PARTITION_RANGE_START_TAG} or"\n f" {ASSET_PARTITION_RANGE_END_TAG} set without the other"\n )\n\n # TODO: resolve which partitions are in the range, and emit an event for each\n\n partition = (\n partition_tag\n if 
check.not_none(output.properties).is_asset_partitioned\n else None\n )\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value,\n job_name=job_name,\n message=(\n f"{job_name} intends to materialize asset {asset_key.to_string()}"\n ),\n event_specific_data=AssetMaterializationPlannedData(\n asset_key, partition=partition\n ),\n step_key=step.key,\n )\n self.report_dagster_event(event, dagster_run.run_id, logging.DEBUG)\n\n if check.not_none(output.properties).asset_check_key:\n asset_check_key = check.not_none(\n check.not_none(output.properties).asset_check_key\n )\n target_asset_key = asset_check_key.asset_key\n check_name = asset_check_key.name\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED.value,\n job_name=job_name,\n message=(\n f"{job_name} intends to execute asset check {check_name} on"\n f" asset {target_asset_key.to_string()}"\n ),\n event_specific_data=AssetCheckEvaluationPlanned(\n target_asset_key,\n check_name=check_name,\n ),\n step_key=step.key,\n )\n self.report_dagster_event(event, dagster_run.run_id, logging.DEBUG)\n\n def create_run(\n self,\n *,\n job_name: str,\n run_id: Optional[str],\n run_config: Optional[Mapping[str, object]],\n status: Optional[DagsterRunStatus],\n tags: Optional[Mapping[str, Any]],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n step_keys_to_execute: Optional[Sequence[str]],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n job_snapshot: Optional["JobSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n asset_selection: Optional[AbstractSet[AssetKey]],\n asset_check_selection: Optional[AbstractSet["AssetCheckKey"]],\n resolved_op_selection: Optional[AbstractSet[str]],\n op_selection: Optional[Sequence[str]],\n external_job_origin: Optional["ExternalJobOrigin"],\n job_code_origin: Optional[JobPythonOrigin],\n ) -> DagsterRun:\n from dagster._core.definitions.asset_check_spec import AssetCheckKey\n 
from dagster._core.definitions.utils import validate_tags\n from dagster._core.host_representation.origin import ExternalJobOrigin\n from dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\n\n check.str_param(job_name, "job_name")\n check.opt_str_param(\n run_id, "run_id"\n ) # will be assigned to make_new_run_id() lower in callstack\n check.opt_mapping_param(run_config, "run_config", key_type=str)\n\n check.opt_inst_param(status, "status", DagsterRunStatus)\n check.opt_mapping_param(tags, "tags", key_type=str)\n\n validated_tags = validate_tags(tags)\n\n check.opt_str_param(root_run_id, "root_run_id")\n check.opt_str_param(parent_run_id, "parent_run_id")\n\n # If step_keys_to_execute is None, then everything is executed. In some cases callers\n # are still exploding and sending the full list of step keys even though that is\n # unnecessary.\n\n check.opt_sequence_param(step_keys_to_execute, "step_keys_to_execute")\n check.opt_inst_param(\n execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot\n )\n\n if root_run_id or parent_run_id:\n check.invariant(\n root_run_id and parent_run_id,\n "If root_run_id or parent_run_id is passed, this is a re-execution scenario and"\n " root_run_id and parent_run_id must both be passed.",\n )\n\n # The job_snapshot should always be set in production scenarios. In tests\n # we have sometimes omitted it out of convenience.\n\n check.opt_inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_inst_param(parent_job_snapshot, "parent_job_snapshot", JobSnapshot)\n\n if parent_job_snapshot:\n check.invariant(\n job_snapshot,\n "If parent_job_snapshot is set, job_snapshot should also be.",\n )\n\n # op_selection is a sequence of selection queries assigned by the user.\n # *Most* callers expand the op_selection into an explicit set of\n # resolved_op_selection via accessing external_job.resolved_op_selection\n # but not all do. 
Some (launch execution mutation in graphql and backfill run\n # creation, for example) actually pass the solid *selection* into the\n # resolved_op_selection parameter, but just as a frozen set, rather than\n # fully resolving the selection, as the daemon launchers do. Given the\n # state of callers we just check to ensure that the arguments are well-formed.\n #\n # asset_selection adds another dimension to this lovely dance. op_selection\n # and asset_selection are mutually exclusive and should never both be set.\n # This is invariant is checked in a sporadic fashion around\n # the codebase, but is never enforced in a typed fashion.\n #\n # Additionally, the way that callsites currently behave *if* asset selection\n # is set (i.e., not None) then *neither* op_selection *nor*\n # resolved_op_selection is passed. In the asset selection case resolving\n # the set of assets into the canonical resolved_op_selection is done in\n # the user process, and the exact resolution is never persisted in the run.\n # We are asserting that invariant here to maintain that behavior.\n #\n # Finally, asset_check_selection can be passed along with asset_selection. It\n # is mutually exclusive with op_selection and resolved_op_selection. A `None`\n # value will include any asset checks that target selected assets. 
An empty set\n # will include no asset checks.\n\n check.opt_set_param(resolved_op_selection, "resolved_op_selection", of_type=str)\n check.opt_sequence_param(op_selection, "op_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n check.opt_set_param(asset_check_selection, "asset_check_selection", of_type=AssetCheckKey)\n\n if asset_selection is not None or asset_check_selection is not None:\n check.invariant(\n op_selection is None,\n "Cannot pass op_selection with either of asset_selection or asset_check_selection",\n )\n\n check.invariant(\n resolved_op_selection is None,\n "Cannot pass resolved_op_selection with either of asset_selection or"\n " asset_check_selection",\n )\n\n # The "python origin" arguments exist so a job can be reconstructed in memory\n # after a DagsterRun has been fetched from the database.\n #\n # There are cases (notably in _logged_execute_job with Reconstructable jobs)\n # where job_code_origin and is not. In some cloud test cases only\n # external_job_origin is passed But they are almost always passed together.\n # If these are not set the created run will never be able to be relaunched from\n # the information just in the run or in another process.\n\n check.opt_inst_param(external_job_origin, "external_job_origin", ExternalJobOrigin)\n check.opt_inst_param(job_code_origin, "job_code_origin", JobPythonOrigin)\n\n dagster_run = self._construct_run_with_snapshots(\n job_name=job_name,\n run_id=run_id, # type: ignore # (possible none)\n run_config=run_config,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=validated_tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_job_snapshot=parent_job_snapshot,\n 
external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n )\n\n dagster_run = self._run_storage.add_run(dagster_run)\n\n if execution_plan_snapshot:\n self._log_asset_planned_events(dagster_run, execution_plan_snapshot)\n\n return dagster_run\n\n def create_reexecuted_run(\n self,\n *,\n parent_run: DagsterRun,\n code_location: "CodeLocation",\n external_job: "ExternalJob",\n strategy: "ReexecutionStrategy",\n extra_tags: Optional[Mapping[str, Any]] = None,\n run_config: Optional[Mapping[str, Any]] = None,\n use_parent_run_tags: bool = False,\n ) -> DagsterRun:\n from dagster._core.execution.plan.resume_retry import (\n ReexecutionStrategy,\n )\n from dagster._core.execution.plan.state import KnownExecutionState\n from dagster._core.host_representation import CodeLocation, ExternalJob\n\n check.inst_param(parent_run, "parent_run", DagsterRun)\n check.inst_param(code_location, "code_location", CodeLocation)\n check.inst_param(external_job, "external_job", ExternalJob)\n check.inst_param(strategy, "strategy", ReexecutionStrategy)\n check.opt_mapping_param(extra_tags, "extra_tags", key_type=str)\n check.opt_mapping_param(run_config, "run_config", key_type=str)\n\n check.bool_param(use_parent_run_tags, "use_parent_run_tags")\n\n root_run_id = parent_run.root_run_id or parent_run.run_id\n parent_run_id = parent_run.run_id\n\n tags = merge_dicts(\n external_job.tags,\n (\n # these can differ from external_job.tags if tags were added at launch time\n parent_run.tags\n if use_parent_run_tags\n else {}\n ),\n extra_tags or {},\n {\n PARENT_RUN_ID_TAG: parent_run_id,\n ROOT_RUN_ID_TAG: root_run_id,\n },\n )\n\n run_config = run_config if run_config is not None else parent_run.run_config\n\n if strategy == ReexecutionStrategy.FROM_FAILURE:\n check.invariant(\n parent_run.status == DagsterRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n\n (\n step_keys_to_execute,\n known_state,\n ) = 
KnownExecutionState.build_resume_retry_reexecution(\n self,\n parent_run=parent_run,\n )\n tags[RESUME_RETRY_TAG] = "true"\n elif strategy == ReexecutionStrategy.ALL_STEPS:\n step_keys_to_execute = None\n known_state = None\n else:\n raise DagsterInvariantViolationError(f"Unknown reexecution strategy: {strategy}")\n\n external_execution_plan = code_location.get_external_execution_plan(\n external_job,\n run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance=self,\n )\n\n return self.create_run(\n job_name=parent_run.job_name,\n run_id=None,\n run_config=run_config,\n resolved_op_selection=parent_run.resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus.NOT_STARTED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=external_job.job_snapshot,\n execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,\n parent_job_snapshot=external_job.parent_job_snapshot,\n op_selection=parent_run.op_selection,\n asset_selection=parent_run.asset_selection,\n asset_check_selection=parent_run.asset_check_selection,\n external_job_origin=external_job.get_external_origin(),\n job_code_origin=external_job.get_python_origin(),\n )\n\n def register_managed_run(\n self,\n job_name: str,\n run_id: str,\n run_config: Optional[Mapping[str, object]],\n resolved_op_selection: Optional[AbstractSet[str]],\n step_keys_to_execute: Optional[Sequence[str]],\n tags: Mapping[str, str],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n job_snapshot: Optional["JobSnapshot"],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n op_selection: Optional[Sequence[str]] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n ) -> DagsterRun:\n # The usage of this method is limited to dagster-airflow, specifically in Dagster\n # Operators that are executed in Airflow. 
Because a common workflow in Airflow is to\n # retry dags from arbitrary tasks, we need any node to be capable of creating a\n # DagsterRun.\n #\n # The try-except DagsterRunAlreadyExists block handles the race when multiple "root" tasks\n # simultaneously execute self._run_storage.add_run(dagster_run). When this happens, only\n # one task succeeds in creating the run, while the others get DagsterRunAlreadyExists\n # error; at this point, the failed tasks try again to fetch the existing run.\n # https://github.com/dagster-io/dagster/issues/2412\n\n dagster_run = self._construct_run_with_snapshots(\n job_name=job_name,\n run_id=run_id,\n run_config=run_config,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus.MANAGED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_job_snapshot=parent_job_snapshot,\n job_code_origin=job_code_origin,\n )\n\n def get_run() -> DagsterRun:\n candidate_run = self.get_run_by_id(dagster_run.run_id)\n\n field_diff = _check_run_equality(dagster_run, candidate_run) # type: ignore # (possible none)\n\n if field_diff:\n raise DagsterRunConflict(\n "Found conflicting existing run with same id {run_id}. 
Runs differ in:"\n "\\n{field_diff}".format(\n run_id=dagster_run.run_id,\n field_diff=_format_field_diff(field_diff),\n ),\n )\n return candidate_run # type: ignore # (possible none)\n\n if self.has_run(dagster_run.run_id):\n return get_run()\n\n try:\n return self._run_storage.add_run(dagster_run)\n except DagsterRunAlreadyExists:\n return get_run()\n\n @traced\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n return self._run_storage.add_run(dagster_run)\n\n @traced\n def add_snapshot(\n self,\n snapshot: Union["JobSnapshot", "ExecutionPlanSnapshot"],\n snapshot_id: Optional[str] = None,\n ) -> None:\n return self._run_storage.add_snapshot(snapshot, snapshot_id)\n\n @traced\n def handle_run_event(self, run_id: str, event: "DagsterEvent") -> None:\n return self._run_storage.handle_run_event(run_id, event)\n\n @traced\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n return self._run_storage.add_run_tags(run_id, new_tags)\n\n @traced\n def has_run(self, run_id: str) -> bool:\n return self._run_storage.has_run(run_id)\n\n @traced\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n return self._run_storage.get_runs(filters, cursor, limit, bucket_by)\n\n @traced\n def get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n return self._run_storage.get_run_ids(filters, cursor=cursor, limit=limit)\n\n @traced\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n return self._run_storage.get_runs_count(filters)\n\n
[docs] @public\n @traced\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n return self._run_storage.get_run_records(\n filters, limit, order_by, ascending, cursor, bucket_by\n )
\n\n @traced\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n return self._run_storage.get_run_partition_data(runs_filter)\n\n def wipe(self) -> None:\n self._run_storage.wipe()\n self._event_storage.wipe()\n\n
[docs] @public\n @traced\n def delete_run(self, run_id: str) -> None:\n """Delete a run and all events generated by that from storage.\n\n Args:\n run_id (str): The id of the run to delete.\n """\n self._run_storage.delete_run(run_id)\n self._event_storage.delete_events(run_id)
\n\n # event storage\n @traced\n def logs_after(\n self,\n run_id: str,\n cursor: Optional[int] = None,\n of_type: Optional["DagsterEventType"] = None,\n limit: Optional[int] = None,\n ) -> Sequence["EventLogEntry"]:\n return self._event_storage.get_logs_for_run(\n run_id,\n cursor=cursor,\n of_type=of_type,\n limit=limit,\n )\n\n @traced\n def all_logs(\n self,\n run_id: str,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n ) -> Sequence["EventLogEntry"]:\n return self._event_storage.get_logs_for_run(run_id, of_type=of_type)\n\n @traced\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> "EventLogConnection":\n return self._event_storage.get_records_for_run(run_id, cursor, of_type, limit, ascending)\n\n def watch_event_logs(self, run_id: str, cursor: Optional[str], cb: "EventHandlerFn") -> None:\n return self._event_storage.watch(run_id, cursor, cb)\n\n def end_watch_event_logs(self, run_id: str, cb: "EventHandlerFn") -> None:\n return self._event_storage.end_watch(run_id, cb)\n\n # asset storage\n\n @traced\n def can_cache_asset_status_data(self) -> bool:\n return self._event_storage.can_cache_asset_status_data()\n\n @traced\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n self._event_storage.update_asset_cached_status_data(asset_key, cache_values)\n\n @traced\n def wipe_asset_cached_status(self, asset_keys: Sequence[AssetKey]) -> None:\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset_cached_status(asset_key)\n\n @traced\n def all_asset_keys(self) -> Sequence[AssetKey]:\n return self._event_storage.all_asset_keys()\n\n
[docs] @public\n @traced\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n """Return a filtered subset of asset keys managed by this instance.\n\n Args:\n prefix (Optional[Sequence[str]]): Return only assets having this key prefix.\n limit (Optional[int]): Maximum number of keys to return.\n cursor (Optional[str]): Cursor to use for pagination.\n\n Returns:\n Sequence[AssetKey]: List of asset keys.\n """\n return self._event_storage.get_asset_keys(prefix=prefix, limit=limit, cursor=cursor)
\n\n
[docs] @public\n @traced\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n """Return true if this instance manages the given asset key.\n\n Args:\n asset_key (AssetKey): Asset key to check.\n """\n return self._event_storage.has_asset_key(asset_key)
\n\n @traced\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n return self._event_storage.get_latest_materialization_events(asset_keys)\n\n
[docs] @public\n @traced\n def get_latest_materialization_event(self, asset_key: AssetKey) -> Optional["EventLogEntry"]:\n """Fetch the latest materialization event for the given asset key.\n\n Args:\n asset_key (AssetKey): Asset key to return materialization for.\n\n Returns:\n Optional[AssetMaterialization]: The latest materialization event for the given asset\n key, or `None` if the asset has not been materialized.\n """\n return self._event_storage.get_latest_materialization_events([asset_key]).get(asset_key)
\n\n
[docs] @public\n @traced\n def get_event_records(\n self,\n event_records_filter: "EventRecordsFilter",\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence["EventLogRecord"]:\n """Return a list of event records stored in the event log storage.\n\n Args:\n event_records_filter (Optional[EventRecordsFilter]): the filter by which to filter event\n records.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[EventLogRecord]: List of event log records stored in the event log storage.\n """\n return self._event_storage.get_event_records(event_records_filter, limit, ascending)
\n\n
[docs] @public\n @traced\n def get_status_by_partition(\n self,\n asset_key: AssetKey,\n partition_keys: Sequence[str],\n partitions_def: "PartitionsDefinition",\n ) -> Optional[Mapping[str, "AssetPartitionStatus"]]:\n """Get the current status of provided partition_keys for the provided asset.\n\n Args:\n asset_key (AssetKey): The asset to get per-partition status for.\n partition_keys (Sequence[str]): The partitions to get status for.\n partitions_def (PartitionsDefinition): The PartitionsDefinition of the asset to get\n per-partition status for.\n\n Returns:\n Optional[Mapping[str, AssetPartitionStatus]]: status for each partition key\n\n """\n from dagster._core.storage.partition_status_cache import (\n AssetPartitionStatus,\n AssetStatusCacheValue,\n get_and_update_asset_status_cache_value,\n )\n\n cached_value = get_and_update_asset_status_cache_value(self, asset_key, partitions_def)\n\n if isinstance(cached_value, AssetStatusCacheValue):\n materialized_partitions = cached_value.deserialize_materialized_partition_subsets(\n partitions_def\n )\n failed_partitions = cached_value.deserialize_failed_partition_subsets(partitions_def)\n in_progress_partitions = cached_value.deserialize_in_progress_partition_subsets(\n partitions_def\n )\n\n status_by_partition = {}\n\n for partition_key in partition_keys:\n if partition_key in in_progress_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.IN_PROGRESS\n elif partition_key in failed_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.FAILED\n elif partition_key in materialized_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.MATERIALIZED\n else:\n status_by_partition[partition_key] = None\n\n return status_by_partition
\n\n
[docs] @public\n @traced\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence["AssetRecord"]:\n """Return an `AssetRecord` for each of the given asset keys.\n\n Args:\n asset_keys (Optional[Sequence[AssetKey]]): List of asset keys to retrieve records for.\n\n Returns:\n Sequence[AssetRecord]: List of asset records.\n """\n return self._event_storage.get_asset_records(asset_keys)
\n\n @traced\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n """Fetches asset event tags for the given asset key.\n\n If filter_tags is provided, searches for events containing all of the filter tags. Then,\n returns all tags for those events. This enables searching for multipartitioned asset\n partition tags with a fixed dimension value, e.g. all of the tags for events where\n "country" == "US".\n\n If filter_event_id is provided, searches for the event with the provided event_id.\n\n Returns a list of dicts, where each dict is a mapping of tag key to tag value for a\n single event.\n """\n return self._event_storage.get_event_tags_for_asset(asset_key, filter_tags, filter_event_id)\n\n
[docs] @public\n @traced\n def wipe_assets(self, asset_keys: Sequence[AssetKey]) -> None:\n """Wipes asset event history from the event log for the given asset keys.\n\n Args:\n asset_keys (Sequence[AssetKey]): Asset keys to wipe.\n """\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset(asset_key)
\n\n @traced\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey], after_cursor: Optional[int] = None\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n return self._event_storage.get_materialization_count_by_partition(asset_keys, after_cursor)\n\n @traced\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n return self._event_storage.get_materialized_partitions(\n asset_key, before_cursor=before_cursor, after_cursor=after_cursor\n )\n\n @traced\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: "DagsterEventType"\n ) -> Mapping[str, int]:\n """Fetch the latest materialzation storage id for each partition for a given asset key.\n\n Returns a mapping of partition to storage id.\n """\n return self._event_storage.get_latest_storage_id_by_partition(asset_key, event_type)\n\n
[docs] @public\n @traced\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the set of partition keys for the specified :py:class:`DynamicPartitionsDefinition`.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n return self._event_storage.get_dynamic_partitions(partitions_def_name)
\n\n
[docs] @public\n @traced\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n """Add partitions to the specified :py:class:`DynamicPartitionsDefinition` idempotently.\n Does not add any partitions that already exist.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_keys (Sequence[str]): Partition keys to add.\n """\n from dagster._core.definitions.partition import (\n raise_error_on_invalid_partition_key_substring,\n )\n\n check.str_param(partitions_def_name, "partitions_def_name")\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n if isinstance(partition_keys, str):\n # Guard against a single string being passed in `partition_keys`\n raise DagsterInvalidInvocationError("partition_keys must be a sequence of strings")\n raise_error_on_invalid_partition_key_substring(partition_keys)\n return self._event_storage.add_dynamic_partitions(partitions_def_name, partition_keys)
\n\n
[docs] @public\n @traced\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n """Delete a partition for the specified :py:class:`DynamicPartitionsDefinition`.\n If the partition does not exist, exits silently.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_key (Sequence[str]): Partition key to delete.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n check.sequence_param(partition_key, "partition_key", of_type=str)\n self._event_storage.delete_dynamic_partition(partitions_def_name, partition_key)
\n\n
[docs] @public\n @traced\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n """Check if a partition key exists for the :py:class:`DynamicPartitionsDefinition`.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_key (Sequence[str]): Partition key to check.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n check.str_param(partition_key, "partition_key")\n return self._event_storage.has_dynamic_partition(partitions_def_name, partition_key)
\n\n # event subscriptions\n\n def _get_yaml_python_handlers(self) -> Sequence[logging.Handler]:\n if self._settings:\n logging_config = self.get_settings("python_logs").get("dagster_handler_config", {})\n\n if logging_config:\n experimental_warning("Handling yaml-defined logging configuration")\n\n # Handlers can only be retrieved from dictConfig configuration if they are attached\n # to a logger. We add a dummy logger to the configuration that allows us to access user\n # defined handlers.\n handler_names = logging_config.get("handlers", {}).keys()\n\n dagster_dummy_logger_name = "dagster_dummy_logger"\n\n processed_dict_conf = {\n "version": 1,\n "disable_existing_loggers": False,\n "loggers": {dagster_dummy_logger_name: {"handlers": handler_names}},\n }\n processed_dict_conf.update(logging_config)\n\n logging.config.dictConfig(processed_dict_conf)\n\n dummy_logger = logging.getLogger(dagster_dummy_logger_name)\n return dummy_logger.handlers\n return []\n\n def _get_event_log_handler(self) -> _EventListenerLogHandler:\n event_log_handler = _EventListenerLogHandler(self)\n event_log_handler.setLevel(10)\n return event_log_handler\n\n def get_handlers(self) -> Sequence[logging.Handler]:\n handlers: List[logging.Handler] = [self._get_event_log_handler()]\n handlers.extend(self._get_yaml_python_handlers())\n return handlers\n\n def store_event(self, event: "EventLogEntry") -> None:\n self._event_storage.store_event(event)\n\n def handle_new_event(self, event: "EventLogEntry") -> None:\n run_id = event.run_id\n\n self._event_storage.store_event(event)\n\n if event.is_dagster_event and event.get_dagster_event().is_job_event:\n self._run_storage.handle_run_event(run_id, event.get_dagster_event())\n\n for sub in self._subscribers[run_id]:\n sub(event)\n\n def add_event_listener(self, run_id: str, cb) -> None:\n self._subscribers[run_id].append(cb)\n\n def report_engine_event(\n self,\n message: str,\n dagster_run: Optional[DagsterRun] = None,\n engine_event_data: 
Optional["EngineEventData"] = None,\n cls: Optional[Type[object]] = None,\n step_key: Optional[str] = None,\n job_name: Optional[str] = None,\n run_id: Optional[str] = None,\n ) -> "DagsterEvent":\n """Report a EngineEvent that occurred outside of a job execution context."""\n from dagster._core.events import DagsterEvent, DagsterEventType, EngineEventData\n\n check.opt_class_param(cls, "cls")\n check.str_param(message, "message")\n check.opt_inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(run_id, "run_id")\n check.opt_str_param(job_name, "job_name")\n\n check.invariant(\n dagster_run or (job_name and run_id),\n "Must include either dagster_run or job_name and run_id",\n )\n\n run_id = run_id if run_id else dagster_run.run_id # type: ignore\n job_name = job_name if job_name else dagster_run.job_name # type: ignore\n\n engine_event_data = check.opt_inst_param(\n engine_event_data,\n "engine_event_data",\n EngineEventData,\n EngineEventData({}),\n )\n\n if cls:\n message = f"[{cls.__name__}] {message}"\n\n log_level = logging.INFO\n if engine_event_data and engine_event_data.error:\n log_level = logging.ERROR\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n job_name=job_name,\n message=message,\n event_specific_data=engine_event_data,\n step_key=step_key,\n )\n self.report_dagster_event(dagster_event, run_id=run_id, log_level=log_level)\n return dagster_event\n\n def report_dagster_event(\n self,\n dagster_event: "DagsterEvent",\n run_id: str,\n log_level: Union[str, int] = logging.INFO,\n ) -> None:\n """Takes a DagsterEvent and stores it in persistent storage for the corresponding DagsterRun."""\n from dagster._core.events.log import EventLogEntry\n\n event_record = EventLogEntry(\n user_message="",\n level=log_level,\n job_name=dagster_event.job_name,\n run_id=run_id,\n error_info=None,\n timestamp=time.time(),\n step_key=dagster_event.step_key,\n dagster_event=dagster_event,\n )\n 
self.handle_new_event(event_record)\n\n def report_run_canceling(self, run: DagsterRun, message: Optional[str] = None):\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(run, "run", DagsterRun)\n message = check.opt_str_param(\n message,\n "message",\n "Sending run termination request.",\n )\n canceling_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELING.value,\n job_name=run.job_name,\n message=message,\n )\n self.report_dagster_event(canceling_event, run_id=run.run_id)\n\n def report_run_canceled(\n self,\n dagster_run: DagsterRun,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n message = check.opt_str_param(\n message,\n "mesage",\n "This run has been marked as canceled from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELED.value,\n job_name=dagster_run.job_name,\n message=message,\n )\n self.report_dagster_event(dagster_event, run_id=dagster_run.run_id, log_level=logging.ERROR)\n return dagster_event\n\n def report_run_failed(\n self, dagster_run: DagsterRun, message: Optional[str] = None\n ) -> "DagsterEvent":\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n message = check.opt_str_param(\n message,\n "message",\n "This run has been marked as failed from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_FAILURE.value,\n job_name=dagster_run.job_name,\n message=message,\n )\n self.report_dagster_event(dagster_event, run_id=dagster_run.run_id, log_level=logging.ERROR)\n return dagster_event\n\n # directories\n\n def file_manager_directory(self, run_id: str) -> str:\n return self._local_artifact_storage.file_manager_dir(run_id)\n\n def 
storage_directory(self) -> str:\n return self._local_artifact_storage.storage_dir\n\n def schedules_directory(self) -> str:\n return self._local_artifact_storage.schedules_dir\n\n # Runs coordinator\n\n def submit_run(self, run_id: str, workspace: "IWorkspace") -> DagsterRun:\n """Submit a pipeline run to the coordinator.\n\n This method delegates to the ``RunCoordinator``, configured on the instance, and will\n call its implementation of ``RunCoordinator.submit_run()`` to send the run to the\n coordinator for execution. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.NOT_STARTED`` state. They also must have a non-null\n ExternalPipelineOrigin.\n\n Args:\n run_id (str): The id of the run.\n """\n from dagster._core.host_representation import ExternalJobOrigin\n from dagster._core.run_coordinator import SubmitRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to submit_run"\n )\n\n check.inst(\n run.external_job_origin,\n ExternalJobOrigin,\n "External pipeline origin must be set for submitted runs",\n )\n check.inst(\n run.job_code_origin,\n JobPythonOrigin,\n "Python origin must be set for submitted runs",\n )\n\n try:\n submitted_run = self.run_coordinator.submit_run(\n SubmitRunContext(run, workspace=workspace)\n )\n except:\n from dagster._core.events import EngineEventData\n\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return submitted_run\n\n # Run launcher\n\n def launch_run(self, run_id: str, workspace: "IWorkspace") -> DagsterRun:\n """Launch a pipeline run.\n\n This method is typically called using `instance.submit_run` rather than being invoked\n directly. 
This method delegates to the ``RunLauncher``, if any, configured on the instance,\n and will call its implementation of ``RunLauncher.launch_run()`` to begin the execution of\n the specified run. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and should be in the\n ``PipelineRunStatus.NOT_STARTED`` state.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n from dagster._core.events import DagsterEvent, DagsterEventType, EngineEventData\n from dagster._core.launcher import LaunchRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to launch_run"\n )\n\n launch_started_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_STARTING.value,\n job_name=run.job_name,\n )\n self.report_dagster_event(launch_started_event, run_id=run.run_id)\n\n run = self.get_run_by_id(run_id)\n if run is None:\n check.failed(f"Failed to reload run {run_id}")\n\n try:\n self.run_launcher.launch_run(LaunchRunContext(dagster_run=run, workspace=workspace))\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n def resume_run(self, run_id: str, workspace: "IWorkspace", attempt_number: int) -> DagsterRun:\n """Resume a pipeline run.\n\n This method should be called on runs which have already been launched, but whose run workers\n have died.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n from dagster._core.events import EngineEventData\n from dagster._core.launcher import ResumeRunContext\n from dagster._daemon.monitoring import RESUME_RUN_LOG_MESSAGE\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to resume_run"\n )\n 
if run.status not in IN_PROGRESS_RUN_STATUSES:\n raise DagsterInvariantViolationError(\n f"Run {run_id} is not in a state that can be resumed"\n )\n\n self.report_engine_event(\n RESUME_RUN_LOG_MESSAGE,\n run,\n )\n\n try:\n self.run_launcher.resume_run(\n ResumeRunContext(\n dagster_run=run,\n workspace=workspace,\n resume_attempt_number=attempt_number,\n )\n )\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n def count_resume_run_attempts(self, run_id: str) -> int:\n from dagster._daemon.monitoring import count_resume_run_attempts\n\n return count_resume_run_attempts(self, run_id)\n\n def run_will_resume(self, run_id: str) -> bool:\n if not self.run_monitoring_enabled:\n return False\n return self.count_resume_run_attempts(run_id) < self.run_monitoring_max_resume_run_attempts\n\n # Scheduler\n\n def start_schedule(self, external_schedule: "ExternalSchedule") -> "InstigatorState":\n return self._scheduler.start_schedule(self, external_schedule) # type: ignore\n\n def stop_schedule(\n self,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional["ExternalSchedule"],\n ) -> "InstigatorState":\n return self._scheduler.stop_schedule( # type: ignore\n self, schedule_origin_id, schedule_selector_id, external_schedule\n )\n\n def scheduler_debug_info(self) -> "SchedulerDebugInfo":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler import SchedulerDebugInfo\n\n errors = []\n\n schedules: List[str] = []\n for schedule_state in self.all_instigator_state(instigator_type=InstigatorType.SCHEDULE):\n schedule_info: Mapping[str, Mapping[str, object]] = {\n schedule_state.instigator_name: {\n "status": schedule_state.status.value,\n "cron_schedule": schedule_state.instigator_data.cron_schedule,\n "schedule_origin_id": 
schedule_state.instigator_origin_id,\n "repository_origin_id": schedule_state.repository_origin_id,\n }\n }\n\n schedules.append(yaml.safe_dump(schedule_info, default_flow_style=False))\n\n return SchedulerDebugInfo(\n scheduler_config_info=self._info_str_for_component("Scheduler", self.scheduler),\n scheduler_info=self.scheduler.debug_info(), # type: ignore\n schedule_storage=schedules,\n errors=errors,\n )\n\n # Schedule / Sensor Storage\n\n def start_sensor(self, external_sensor: "ExternalSensor") -> "InstigatorState":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(\n external_sensor.get_external_origin_id(), external_sensor.selector_id\n )\n\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n if not stored_state:\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.RUNNING,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.RUNNING))\n\n def stop_sensor(\n self,\n instigator_origin_id: str,\n selector_id: str,\n external_sensor: Optional["ExternalSensor"],\n ) -> "InstigatorState":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(instigator_origin_id, selector_id)\n computed_state: InstigatorState\n if external_sensor:\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n else:\n computed_state = check.not_none(stored_state)\n\n if not computed_state.is_running:\n return computed_state\n\n 
if not stored_state:\n assert external_sensor\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.STOPPED,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.STOPPED))\n\n @traced\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional["InstigatorType"] = None,\n instigator_statuses: Optional[Set["InstigatorStatus"]] = None,\n ):\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.all_instigator_state(\n repository_origin_id, repository_selector_id, instigator_type, instigator_statuses\n )\n\n @traced\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional["InstigatorState"]:\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.get_instigator_state(origin_id, selector_id)\n\n def add_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.add_instigator_state(state)\n\n def update_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.update_instigator_state(state)\n\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n return self._schedule_storage.delete_instigator_state(origin_id, selector_id) # type: ignore # (possible none)\n\n @property\n def supports_batch_tick_queries(self) -> bool:\n return self._schedule_storage and self._schedule_storage.supports_batch_queries # type: ignore # (possible none)\n\n @traced\n def 
get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Mapping[str, Sequence["InstigatorTick"]]:\n if not self._schedule_storage:\n return {}\n return self._schedule_storage.get_batch_ticks(selector_ids, limit, statuses)\n\n @traced\n def get_tick(\n self, origin_id: str, selector_id: str, timestamp: float\n ) -> Optional["InstigatorTick"]:\n matches = self._schedule_storage.get_ticks( # type: ignore # (possible none)\n origin_id, selector_id, before=timestamp + 1, after=timestamp - 1, limit=1\n )\n return matches[0] if len(matches) else None\n\n @traced\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Sequence["InstigatorTick"]:\n return self._schedule_storage.get_ticks( # type: ignore # (possible none)\n origin_id, selector_id, before=before, after=after, limit=limit, statuses=statuses\n )\n\n def create_tick(self, tick_data: "TickData") -> "InstigatorTick":\n return check.not_none(self._schedule_storage).create_tick(tick_data)\n\n def update_tick(self, tick: "InstigatorTick"):\n return check.not_none(self._schedule_storage).update_tick(tick)\n\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> None:\n self._schedule_storage.purge_ticks(origin_id, selector_id, before, tick_statuses) # type: ignore # (possible none)\n\n def wipe_all_schedules(self) -> None:\n if self._scheduler:\n self._scheduler.wipe(self) # type: ignore # (possible none)\n\n self._schedule_storage.wipe() # type: ignore # (possible none)\n\n def logs_path_for_schedule(self, schedule_origin_id: str) -> str:\n return self._scheduler.get_logs_path(self, schedule_origin_id) # type: ignore # (possible none)\n\n def __enter__(self) -> Self:\n return 
self\n\n def __exit__(\n self,\n exception_type: Optional[Type[BaseException]],\n exception_value: Optional[BaseException],\n traceback: Optional[TracebackType],\n ) -> None:\n self.dispose()\n\n # dagster daemon\n def add_daemon_heartbeat(self, daemon_heartbeat: "DaemonHeartbeat") -> None:\n """Called on a regular interval by the daemon."""\n self._run_storage.add_daemon_heartbeat(daemon_heartbeat)\n\n def get_daemon_heartbeats(self) -> Mapping[str, "DaemonHeartbeat"]:\n """Latest heartbeats of all daemon types."""\n return self._run_storage.get_daemon_heartbeats()\n\n def wipe_daemon_heartbeats(self) -> None:\n self._run_storage.wipe_daemon_heartbeats()\n\n def get_required_daemon_types(self) -> Sequence[str]:\n from dagster._core.run_coordinator import QueuedRunCoordinator\n from dagster._core.scheduler import DagsterDaemonScheduler\n from dagster._daemon.asset_daemon import AssetDaemon\n from dagster._daemon.auto_run_reexecution.event_log_consumer import EventLogConsumerDaemon\n from dagster._daemon.daemon import (\n BackfillDaemon,\n MonitoringDaemon,\n SchedulerDaemon,\n SensorDaemon,\n )\n from dagster._daemon.run_coordinator.queued_run_coordinator_daemon import (\n QueuedRunCoordinatorDaemon,\n )\n\n if self.is_ephemeral:\n return []\n\n daemons = [SensorDaemon.daemon_type(), BackfillDaemon.daemon_type()]\n if isinstance(self.scheduler, DagsterDaemonScheduler):\n daemons.append(SchedulerDaemon.daemon_type())\n if isinstance(self.run_coordinator, QueuedRunCoordinator):\n daemons.append(QueuedRunCoordinatorDaemon.daemon_type())\n if self.run_monitoring_enabled:\n daemons.append(MonitoringDaemon.daemon_type())\n if self.run_retries_enabled:\n daemons.append(EventLogConsumerDaemon.daemon_type())\n if self.auto_materialize_enabled:\n daemons.append(AssetDaemon.daemon_type())\n return daemons\n\n def get_daemon_statuses(\n self, daemon_types: Optional[Sequence[str]] = None\n ) -> Mapping[str, "DaemonStatus"]:\n """Get the current status of the daemons. 
If daemon_types aren't provided, defaults to all\n required types. Returns a dict of daemon type to status.\n """\n from dagster._daemon.controller import get_daemon_statuses\n\n check.opt_sequence_param(daemon_types, "daemon_types", of_type=str)\n return get_daemon_statuses(\n self, daemon_types=daemon_types or self.get_required_daemon_types(), ignore_errors=True\n )\n\n @property\n def daemon_skip_heartbeats_without_errors(self) -> bool:\n # If enabled, daemon threads won't write heartbeats unless they encounter an error. This is\n # enabled in cloud, where we don't need to use heartbeats to check if daemons are running, but\n # do need to surface errors to users. This is an optimization to reduce DB writes.\n return False\n\n # backfill\n def get_backfills(\n self,\n status: Optional["BulkActionStatus"] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence["PartitionBackfill"]:\n return self._run_storage.get_backfills(status=status, cursor=cursor, limit=limit)\n\n def get_backfill(self, backfill_id: str) -> Optional["PartitionBackfill"]:\n return self._run_storage.get_backfill(backfill_id)\n\n def add_backfill(self, partition_backfill: "PartitionBackfill") -> None:\n self._run_storage.add_backfill(partition_backfill)\n\n def update_backfill(self, partition_backfill: "PartitionBackfill") -> None:\n self._run_storage.update_backfill(partition_backfill)\n\n @property\n def should_start_background_run_thread(self) -> bool:\n """Gate on an experimental feature to start a thread that monitors for if the run should be canceled."""\n return False\n\n def get_tick_retention_settings(\n self, instigator_type: "InstigatorType"\n ) -> Mapping["TickStatus", int]:\n from dagster._core.definitions.run_request import InstigatorType\n\n retention_settings = self.get_settings("retention")\n\n if instigator_type == InstigatorType.SCHEDULE:\n tick_settings = retention_settings.get("schedule")\n elif instigator_type == InstigatorType.SENSOR:\n 
tick_settings = retention_settings.get("sensor")\n elif instigator_type == InstigatorType.AUTO_MATERIALIZE:\n tick_settings = retention_settings.get("auto_materialize")\n else:\n raise Exception(f"Unexpected instigator type {instigator_type}")\n\n default_tick_settings = get_default_tick_retention_settings(instigator_type)\n return get_tick_retention_settings(tick_settings, default_tick_settings)\n\n def inject_env_vars(self, location_name: Optional[str]) -> None:\n if not self._secrets_loader:\n return\n\n new_env = self._secrets_loader.get_secrets_for_environment(location_name)\n for k, v in new_env.items():\n os.environ[k] = v\n\n def get_latest_data_version_record(\n self,\n key: AssetKey,\n is_source: Optional[bool] = None,\n partition_key: Optional[str] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Optional["EventLogRecord"]:\n from dagster._core.event_api import EventRecordsFilter\n from dagster._core.events import DagsterEventType\n\n # When we cant don't know whether the requested key corresponds to a source or regular\n # asset, we need to retrieve both the latest observation and materialization for all assets.\n # If there is a materialization, it's a regular asset and we can ignore the observation.\n\n observation: Optional[EventLogRecord] = None\n if is_source or is_source is None:\n observations = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n asset_key=key,\n asset_partitions=[partition_key] if partition_key else None,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n ),\n limit=1,\n )\n observation = next(iter(observations), None)\n\n materialization: Optional[EventLogRecord] = None\n if not is_source:\n materializations = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=key,\n asset_partitions=[partition_key] if partition_key else None,\n before_cursor=before_cursor,\n 
after_cursor=after_cursor,\n ),\n limit=1,\n )\n materialization = next(iter(materializations), None)\n\n return materialization or observation\n\n
[docs] @public\n def get_latest_materialization_code_versions(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional[str]]:\n """Returns the code version used for the latest materialization of each of the provided\n assets.\n\n Args:\n asset_keys (Iterable[AssetKey]): The asset keys to find latest materialization code\n versions for.\n\n Returns:\n Mapping[AssetKey, Optional[str]]: A dictionary with a key for each of the provided asset\n keys. The values will be None if the asset has no materializations. If an asset does\n not have a code version explicitly assigned to its definitions, but was\n materialized, Dagster assigns the run ID as its code version.\n """\n result: Dict[AssetKey, Optional[str]] = {}\n latest_materialization_events = self.get_latest_materialization_events(asset_keys)\n for asset_key in asset_keys:\n event_log_entry = latest_materialization_events.get(asset_key)\n if event_log_entry is None:\n result[asset_key] = None\n else:\n data_provenance = extract_data_provenance_from_entry(event_log_entry)\n result[asset_key] = data_provenance.code_version if data_provenance else None\n\n return result
\n\n @experimental\n def report_runless_asset_event(\n self,\n asset_event: Union["AssetMaterialization", "AssetObservation", "AssetCheckEvaluation"],\n ):\n """Record an event log entry related to assets that does not belong to a Dagster run."""\n from dagster._core.events import (\n AssetMaterialization,\n AssetObservationData,\n DagsterEvent,\n DagsterEventType,\n StepMaterializationData,\n )\n\n if isinstance(asset_event, AssetMaterialization):\n event_type_value = DagsterEventType.ASSET_MATERIALIZATION.value\n data_payload = StepMaterializationData(asset_event)\n elif isinstance(asset_event, AssetCheckEvaluation):\n event_type_value = DagsterEventType.ASSET_CHECK_EVALUATION.value\n data_payload = asset_event\n elif isinstance(asset_event, AssetObservation):\n event_type_value = DagsterEventType.ASSET_OBSERVATION.value\n data_payload = AssetObservationData(asset_event)\n else:\n raise DagsterInvariantViolationError(\n f"Received unexpected asset event type {asset_event}, expected"\n " AssetMaterialization, AssetObservation or AssetCheckEvaluation"\n )\n\n return self.report_dagster_event(\n run_id=RUNLESS_RUN_ID,\n dagster_event=DagsterEvent(\n event_type_value=event_type_value,\n event_specific_data=data_payload,\n job_name=RUNLESS_JOB_NAME,\n ),\n )\n\n def get_asset_check_support(self) -> "AssetCheckInstanceSupport":\n from dagster._core.storage.asset_check_execution_record import AssetCheckInstanceSupport\n\n return (\n AssetCheckInstanceSupport.SUPPORTED\n if self.event_log_storage.supports_asset_checks\n else AssetCheckInstanceSupport.NEEDS_MIGRATION\n )
\n
", "current_page_name": "_modules/dagster/_core/instance", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "ref": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance.ref

\nimport os\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Sequence, Type\n\nimport yaml\n\nimport dagster._check as check\nfrom dagster._serdes import ConfigurableClassData, class_from_code_pointer, whitelist_for_serdes\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME, dagster_instance_config\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance, DagsterInstanceOverrides\n    from dagster._core.launcher.base import RunLauncher\n    from dagster._core.run_coordinator.base import RunCoordinator\n    from dagster._core.scheduler.scheduler import Scheduler\n    from dagster._core.secrets.loader import SecretsLoader\n    from dagster._core.storage.base_storage import DagsterStorage\n    from dagster._core.storage.compute_log_manager import ComputeLogManager\n    from dagster._core.storage.event_log.base import EventLogStorage\n    from dagster._core.storage.root import LocalArtifactStorage\n    from dagster._core.storage.runs.base import RunStorage\n    from dagster._core.storage.schedules.base import ScheduleStorage\n\n\ndef compute_logs_directory(base: str) -> str:\n    return os.path.join(base, "storage")\n\n\ndef _runs_directory(base: str) -> str:\n    return os.path.join(base, "history", "")\n\n\ndef _event_logs_directory(base: str) -> str:\n    return os.path.join(base, "history", "runs", "")\n\n\ndef _schedule_directory(base: str) -> str:\n    return os.path.join(base, "schedules")\n\n\ndef configurable_class_data(config_field: Mapping[str, Any]) -> ConfigurableClassData:\n    return ConfigurableClassData(\n        check.str_elem(config_field, "module"),\n        check.str_elem(config_field, "class"),\n        yaml.dump(check.opt_dict_elem(config_field, "config"), default_flow_style=False),\n    )\n\n\ndef configurable_class_data_or_default(\n    config_value: Mapping[str, Any], field_name: str, default: Optional[ConfigurableClassData]\n) -> Optional[ConfigurableClassData]:\n    return (\n        
configurable_class_data(config_value[field_name])\n        if config_value.get(field_name)\n        else default\n    )\n\n\ndef configurable_secrets_loader_data(\n    config_field: Mapping[str, Any], default: Optional[ConfigurableClassData]\n) -> Optional[ConfigurableClassData]:\n    if not config_field:\n        return default\n    elif "custom" in config_field:\n        return configurable_class_data(config_field["custom"])\n    else:\n        return None\n\n\ndef configurable_storage_data(\n    config_field: Mapping[str, Any], defaults: Mapping[str, Optional[ConfigurableClassData]]\n) -> Sequence[Optional[ConfigurableClassData]]:\n    storage_data: ConfigurableClassData\n    run_storage_data: Optional[ConfigurableClassData]\n    event_storage_data: Optional[ConfigurableClassData]\n    schedule_storage_data: Optional[ConfigurableClassData]\n\n    if not config_field:\n        storage_data = check.not_none(defaults.get("storage"))\n        run_storage_data = check.not_none(defaults.get("run_storage"))\n        event_storage_data = check.not_none(defaults.get("event_log_storage"))\n        schedule_storage_data = check.not_none(defaults.get("schedule_storage"))\n    elif "postgres" in config_field:\n        config_yaml = yaml.dump(config_field["postgres"], default_flow_style=False)\n        storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="DagsterPostgresStorage",\n            config_yaml=config_yaml,\n        )\n        # for backwards compatibility\n        run_storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="PostgresRunStorage",\n            config_yaml=config_yaml,\n        )\n        event_storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="PostgresEventLogStorage",\n            config_yaml=config_yaml,\n        )\n        schedule_storage_data = ConfigurableClassData(\n            
module_name="dagster_postgres",\n            class_name="PostgresScheduleStorage",\n            config_yaml=config_yaml,\n        )\n\n    elif "mysql" in config_field:\n        config_yaml = yaml.dump(config_field["mysql"], default_flow_style=False)\n        storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="DagsterMySQLStorage",\n            config_yaml=config_yaml,\n        )\n        # for backwards compatibility\n        run_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLRunStorage",\n            config_yaml=config_yaml,\n        )\n        event_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLEventLogStorage",\n            config_yaml=config_yaml,\n        )\n        schedule_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLScheduleStorage",\n            config_yaml=config_yaml,\n        )\n\n    elif "sqlite" in config_field:\n        base_dir = config_field["sqlite"]["base_dir"]\n        storage_data = ConfigurableClassData(\n            "dagster._core.storage.sqlite_storage",\n            "DagsterSqliteStorage",\n            yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n        )\n\n        # Back-compat fo the legacy storage field only works if the base_dir is a string\n        # (env var doesn't work since each storage has a different value for the base_dir field)\n        if isinstance(base_dir, str):\n            run_storage_data = ConfigurableClassData(\n                "dagster._core.storage.runs",\n                "SqliteRunStorage",\n                yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n            )\n\n            event_storage_data = ConfigurableClassData(\n                "dagster._core.storage.event_log",\n                "SqliteEventLogStorage",\n      
          yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n            )\n\n            schedule_storage_data = ConfigurableClassData(\n                "dagster._core.storage.schedules",\n                "SqliteScheduleStorage",\n                yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n            )\n        else:\n            run_storage_data = None\n            event_storage_data = None\n            schedule_storage_data = None\n    else:\n        storage_data = configurable_class_data(config_field["custom"])\n        storage_config_yaml = yaml.dump(\n            {\n                "module_name": storage_data.module_name,\n                "class_name": storage_data.class_name,\n                "config_yaml": storage_data.config_yaml,\n            },\n            default_flow_style=False,\n        )\n        run_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyRunStorage", storage_config_yaml\n        )\n        event_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyEventLogStorage", storage_config_yaml\n        )\n        schedule_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyScheduleStorage", storage_config_yaml\n        )\n\n    return [storage_data, run_storage_data, event_storage_data, schedule_storage_data]\n\n\n
[docs]@whitelist_for_serdes\nclass InstanceRef(\n NamedTuple(\n "_InstanceRef",\n [\n ("local_artifact_storage_data", ConfigurableClassData),\n ("compute_logs_data", ConfigurableClassData),\n ("scheduler_data", Optional[ConfigurableClassData]),\n ("run_coordinator_data", Optional[ConfigurableClassData]),\n ("run_launcher_data", Optional[ConfigurableClassData]),\n ("settings", Mapping[str, object]),\n # Required for backwards compatibility, but going forward will be unused by new versions\n # of DagsterInstance, which instead will instead grab the constituent storages from the\n # unified `storage_data`, if it is populated.\n ("run_storage_data", Optional[ConfigurableClassData]),\n ("event_storage_data", Optional[ConfigurableClassData]),\n ("schedule_storage_data", Optional[ConfigurableClassData]),\n ("custom_instance_class_data", Optional[ConfigurableClassData]),\n # unified storage field\n ("storage_data", Optional[ConfigurableClassData]),\n ("secrets_loader_data", Optional[ConfigurableClassData]),\n ],\n )\n):\n """Serializable representation of a :py:class:`DagsterInstance`.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n local_artifact_storage_data: ConfigurableClassData,\n compute_logs_data: ConfigurableClassData,\n scheduler_data: Optional[ConfigurableClassData],\n run_coordinator_data: Optional[ConfigurableClassData],\n run_launcher_data: Optional[ConfigurableClassData],\n settings: Mapping[str, object],\n run_storage_data: Optional[ConfigurableClassData],\n event_storage_data: Optional[ConfigurableClassData],\n schedule_storage_data: Optional[ConfigurableClassData],\n custom_instance_class_data: Optional[ConfigurableClassData] = None,\n storage_data: Optional[ConfigurableClassData] = None,\n secrets_loader_data: Optional[ConfigurableClassData] = None,\n ):\n return super(cls, InstanceRef).__new__(\n cls,\n local_artifact_storage_data=check.inst_param(\n local_artifact_storage_data, "local_artifact_storage_data", 
ConfigurableClassData\n ),\n compute_logs_data=check.inst_param(\n compute_logs_data, "compute_logs_data", ConfigurableClassData\n ),\n scheduler_data=check.opt_inst_param(\n scheduler_data, "scheduler_data", ConfigurableClassData\n ),\n run_coordinator_data=check.opt_inst_param(\n run_coordinator_data, "run_coordinator_data", ConfigurableClassData\n ),\n run_launcher_data=check.opt_inst_param(\n run_launcher_data, "run_launcher_data", ConfigurableClassData\n ),\n settings=check.opt_mapping_param(settings, "settings", key_type=str),\n run_storage_data=check.opt_inst_param(\n run_storage_data, "run_storage_data", ConfigurableClassData\n ),\n event_storage_data=check.opt_inst_param(\n event_storage_data, "event_storage_data", ConfigurableClassData\n ),\n schedule_storage_data=check.opt_inst_param(\n schedule_storage_data, "schedule_storage_data", ConfigurableClassData\n ),\n custom_instance_class_data=check.opt_inst_param(\n custom_instance_class_data,\n "instance_class",\n ConfigurableClassData,\n ),\n storage_data=check.opt_inst_param(storage_data, "storage_data", ConfigurableClassData),\n secrets_loader_data=check.opt_inst_param(\n secrets_loader_data, "secrets_loader_data", ConfigurableClassData\n ),\n )\n\n @staticmethod\n def config_defaults(base_dir: str) -> Mapping[str, Optional[ConfigurableClassData]]:\n default_run_storage_data = ConfigurableClassData(\n "dagster._core.storage.runs",\n "SqliteRunStorage",\n yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n )\n default_event_log_storage_data = ConfigurableClassData(\n "dagster._core.storage.event_log",\n "SqliteEventLogStorage",\n yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n )\n default_schedule_storage_data = ConfigurableClassData(\n "dagster._core.storage.schedules",\n "SqliteScheduleStorage",\n yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n )\n\n return {\n "local_artifact_storage": 
ConfigurableClassData(\n "dagster._core.storage.root",\n "LocalArtifactStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "storage": ConfigurableClassData(\n "dagster._core.storage.sqlite_storage",\n "DagsterSqliteStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "compute_logs": ConfigurableClassData(\n "dagster._core.storage.local_compute_log_manager",\n "LocalComputeLogManager",\n yaml.dump({"base_dir": compute_logs_directory(base_dir)}, default_flow_style=False),\n ),\n "scheduler": ConfigurableClassData(\n "dagster._core.scheduler",\n "DagsterDaemonScheduler",\n yaml.dump({}),\n ),\n "run_coordinator": ConfigurableClassData(\n "dagster._core.run_coordinator", "DefaultRunCoordinator", yaml.dump({})\n ),\n "run_launcher": ConfigurableClassData(\n "dagster",\n "DefaultRunLauncher",\n yaml.dump({}),\n ),\n # For back-compat, the default is actually set in the secrets_loader property above,\n # so that old clients loading new config don't try to load a class that they\n # don't recognize\n "secrets": None,\n # LEGACY DEFAULTS\n "run_storage": default_run_storage_data,\n "event_log_storage": default_event_log_storage_data,\n "schedule_storage": default_schedule_storage_data,\n }\n\n @staticmethod\n def from_dir(\n base_dir: str,\n *,\n config_dir: Optional[str] = None,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n overrides: Optional["DagsterInstanceOverrides"] = None,\n ) -> "InstanceRef":\n if config_dir is None:\n config_dir = base_dir\n\n overrides = check.opt_mapping_param(overrides, "overrides")\n config_value, custom_instance_class = dagster_instance_config(\n config_dir, config_filename=config_filename, overrides=overrides\n )\n\n if custom_instance_class:\n config_keys = set(custom_instance_class.config_schema().keys()) # type: ignore # (undefined method)\n custom_instance_class_config = {\n key: val for key, val in config_value.items() if key in config_keys\n }\n 
custom_instance_class_data = ConfigurableClassData(\n config_value["instance_class"]["module"],\n config_value["instance_class"]["class"],\n yaml.dump(custom_instance_class_config, default_flow_style=False),\n )\n defaults = custom_instance_class.config_defaults(base_dir) # type: ignore # (undefined method)\n else:\n custom_instance_class_data = None\n defaults = InstanceRef.config_defaults(base_dir)\n\n local_artifact_storage_data = configurable_class_data_or_default(\n config_value, "local_artifact_storage", defaults["local_artifact_storage"]\n )\n\n compute_logs_data = configurable_class_data_or_default(\n config_value,\n "compute_logs",\n defaults["compute_logs"],\n )\n\n if (\n config_value.get("run_storage")\n or config_value.get("event_log_storage")\n or config_value.get("schedule_storage")\n ):\n # using legacy config, specifying config for each of the constituent storages, make sure\n # to create a composite storage\n run_storage_data = configurable_class_data_or_default(\n config_value, "run_storage", defaults["run_storage"]\n )\n event_storage_data = configurable_class_data_or_default(\n config_value, "event_log_storage", defaults["event_log_storage"]\n )\n schedule_storage_data = configurable_class_data_or_default(\n config_value, "schedule_storage", defaults["schedule_storage"]\n )\n storage_data = ConfigurableClassData(\n module_name="dagster._core.storage.legacy_storage",\n class_name="CompositeStorage",\n config_yaml=yaml.dump(\n {\n "run_storage": {\n "module_name": run_storage_data.module_name, # type: ignore # (possible none)\n "class_name": run_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": run_storage_data.config_yaml, # type: ignore # (possible none)\n },\n "event_log_storage": {\n "module_name": event_storage_data.module_name, # type: ignore # (possible none)\n "class_name": event_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": event_storage_data.config_yaml, # type: ignore # (possible 
none)\n },\n "schedule_storage": {\n "module_name": schedule_storage_data.module_name, # type: ignore # (possible none)\n "class_name": schedule_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": schedule_storage_data.config_yaml, # type: ignore # (possible none)\n },\n },\n default_flow_style=False,\n ),\n )\n\n else:\n [\n storage_data,\n run_storage_data,\n event_storage_data,\n schedule_storage_data,\n ] = configurable_storage_data(\n config_value.get("storage"), defaults # type: ignore # (possible none)\n )\n\n scheduler_data = configurable_class_data_or_default(\n config_value, "scheduler", defaults["scheduler"]\n )\n\n if config_value.get("run_queue"):\n run_coordinator_data = configurable_class_data(\n {\n "module": "dagster.core.run_coordinator",\n "class": "QueuedRunCoordinator",\n "config": config_value["run_queue"],\n }\n )\n else:\n run_coordinator_data = configurable_class_data_or_default(\n config_value,\n "run_coordinator",\n defaults["run_coordinator"],\n )\n\n run_launcher_data = configurable_class_data_or_default(\n config_value,\n "run_launcher",\n defaults["run_launcher"],\n )\n\n secrets_loader_data = configurable_secrets_loader_data(\n config_value.get("secrets"), defaults["secrets"] # type: ignore # (possible none)\n )\n\n settings_keys = {\n "telemetry",\n "python_logs",\n "run_monitoring",\n "run_retries",\n "code_servers",\n "retention",\n "sensors",\n "schedules",\n "nux",\n "auto_materialize",\n }\n settings = {key: config_value.get(key) for key in settings_keys if config_value.get(key)}\n\n return InstanceRef(\n local_artifact_storage_data=local_artifact_storage_data, # type: ignore # (possible none)\n run_storage_data=run_storage_data,\n event_storage_data=event_storage_data,\n compute_logs_data=compute_logs_data, # type: ignore # (possible none)\n schedule_storage_data=schedule_storage_data,\n scheduler_data=scheduler_data,\n run_coordinator_data=run_coordinator_data,\n run_launcher_data=run_launcher_data,\n 
settings=settings,\n custom_instance_class_data=custom_instance_class_data,\n storage_data=storage_data,\n secrets_loader_data=secrets_loader_data,\n )\n\n @staticmethod\n def from_dict(instance_ref_dict):\n def value_for_ref_item(k, v):\n if v is None:\n return None\n if k == "settings":\n return v\n return ConfigurableClassData(*v)\n\n return InstanceRef(**{k: value_for_ref_item(k, v) for k, v in instance_ref_dict.items()})\n\n @property\n def local_artifact_storage(self) -> "LocalArtifactStorage":\n from dagster._core.storage.root import LocalArtifactStorage\n\n return self.local_artifact_storage_data.rehydrate(as_type=LocalArtifactStorage)\n\n @property\n def storage(self) -> Optional["DagsterStorage"]:\n from dagster._core.storage.base_storage import DagsterStorage\n\n return self.storage_data.rehydrate(as_type=DagsterStorage) if self.storage_data else None\n\n @property\n def run_storage(self) -> Optional["RunStorage"]:\n from dagster._core.storage.runs.base import RunStorage\n\n return (\n self.run_storage_data.rehydrate(as_type=RunStorage) if self.run_storage_data else None\n )\n\n @property\n def event_storage(self) -> Optional["EventLogStorage"]:\n from dagster._core.storage.event_log.base import EventLogStorage\n\n return (\n self.event_storage_data.rehydrate(as_type=EventLogStorage)\n if self.event_storage_data\n else None\n )\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n from dagster._core.storage.schedules.base import ScheduleStorage\n\n return (\n self.schedule_storage_data.rehydrate(as_type=ScheduleStorage)\n if self.schedule_storage_data\n else None\n )\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n from dagster._core.storage.compute_log_manager import ComputeLogManager\n\n return self.compute_logs_data.rehydrate(as_type=ComputeLogManager)\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n from dagster._core.scheduler.scheduler import Scheduler\n\n return 
self.scheduler_data.rehydrate(as_type=Scheduler) if self.scheduler_data else None\n\n @property\n def run_coordinator(self) -> Optional["RunCoordinator"]:\n from dagster._core.run_coordinator.base import RunCoordinator\n\n return (\n self.run_coordinator_data.rehydrate(as_type=RunCoordinator)\n if self.run_coordinator_data\n else None\n )\n\n @property\n def run_launcher(self) -> Optional["RunLauncher"]:\n from dagster._core.launcher.base import RunLauncher\n\n return (\n self.run_launcher_data.rehydrate(as_type=RunLauncher)\n if self.run_launcher_data\n else None\n )\n\n @property\n def secrets_loader(self) -> Optional["SecretsLoader"]:\n from dagster._core.secrets.loader import SecretsLoader\n\n # Defining a default here rather than in stored config to avoid\n # back-compat issues when loading the config on older versions where\n # EnvFileLoader was not defined\n return (\n self.secrets_loader_data.rehydrate(as_type=SecretsLoader)\n if self.secrets_loader_data\n else None\n )\n\n @property\n def custom_instance_class(self) -> Type["DagsterInstance"]:\n return ( # type: ignore # (ambiguous return type)\n class_from_code_pointer(\n self.custom_instance_class_data.module_name,\n self.custom_instance_class_data.class_name,\n )\n if self.custom_instance_class_data\n else None\n )\n\n @property\n def custom_instance_class_config(self) -> Mapping[str, Any]:\n return (\n self.custom_instance_class_data.config_dict if self.custom_instance_class_data else {}\n )\n\n def to_dict(self) -> Mapping[str, Any]:\n return self._asdict()
\n
", "current_page_name": "_modules/dagster/_core/instance/ref", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.instance"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance.ref"}, "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance"}, "instance_for_test": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance_for_test

\nimport os\nimport sys\nimport tempfile\nfrom contextlib import ExitStack, contextmanager\nfrom typing import Any, Iterator, Mapping, Optional\n\nimport yaml\n\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .._utils.env import environ\nfrom .._utils.merger import merge_dicts\nfrom .instance import DagsterInstance\n\n\n
[docs]@contextmanager\ndef instance_for_test(\n overrides: Optional[Mapping[str, Any]] = None,\n set_dagster_home: bool = True,\n temp_dir: Optional[str] = None,\n) -> Iterator[DagsterInstance]:\n """Creates a persistent :py:class:`~dagster.DagsterInstance` available within a context manager.\n\n When a context manager is opened, if no `temp_dir` parameter is set, a new\n temporary directory will be created for the duration of the context\n manager's opening. If the `set_dagster_home` parameter is set to True\n (True by default), the `$DAGSTER_HOME` environment variable will be\n overridden to be this directory (or the directory passed in by `temp_dir`)\n for the duration of the context manager being open.\n\n Args:\n overrides (Optional[Mapping[str, Any]]):\n Config to provide to instance (config format follows that typically found in an `instance.yaml` file).\n set_dagster_home (Optional[bool]):\n If set to True, the `$DAGSTER_HOME` environment variable will be\n overridden to be the directory used by this instance for the\n duration that the context manager is open. Upon the context\n manager closing, the `$DAGSTER_HOME` variable will be re-set to the original value. (Defaults to True).\n temp_dir (Optional[str]):\n The directory to use for storing local artifacts produced by the\n instance. 
If not set, a temporary directory will be created for\n the duration of the context manager being open, and all artifacts\n will be torn down afterward.\n """\n with ExitStack() as stack:\n if not temp_dir:\n temp_dir = stack.enter_context(tempfile.TemporaryDirectory())\n\n # wait for any grpc processes that created runs during test disposal to finish,\n # since they might also be using this instance's tempdir (and to keep each test\n # isolated / avoid race conditions in newer versions of grpcio when servers are\n # shutting down and spinning up at the same time)\n instance_overrides = merge_dicts(\n {\n "telemetry": {"enabled": False},\n "code_servers": {"wait_for_local_processes_on_shutdown": True},\n },\n (overrides if overrides else {}),\n )\n\n if set_dagster_home:\n stack.enter_context(\n environ({"DAGSTER_HOME": temp_dir, "DAGSTER_DISABLE_TELEMETRY": "yes"})\n )\n\n with open(os.path.join(temp_dir, "dagster.yaml"), "w", encoding="utf8") as fd:\n yaml.dump(instance_overrides, fd, default_flow_style=False)\n\n with DagsterInstance.from_config(temp_dir) as instance:\n try:\n yield instance\n except:\n sys.stderr.write(\n "Test raised an exception, attempting to clean up instance:"\n + serializable_error_info_from_exc_info(sys.exc_info()).to_string()\n + "\\n"\n )\n raise\n finally:\n cleanup_test_instance(instance)
\n\n\ndef cleanup_test_instance(instance: DagsterInstance) -> None:\n # To avoid filesystem contention when we close the temporary directory, wait for\n # all runs to reach a terminal state, and close any subprocesses or threads\n # that might be accessing the run history DB.\n\n # Since launcher is lazy loaded, we don't need to do anyting if it's None\n if instance._run_launcher: # noqa: SLF001\n instance._run_launcher.join() # noqa: SLF001\n
", "current_page_name": "_modules/dagster/_core/instance_for_test", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance_for_test"}, "launcher": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.launcher.base

\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.workspace.workspace import IWorkspace\nfrom dagster._serdes import whitelist_for_serdes\n\n\nclass LaunchRunContext(NamedTuple):\n    """Context available within a run launcher's launch_run call."""\n\n    dagster_run: DagsterRun\n    workspace: Optional[IWorkspace]\n\n    @property\n    def job_code_origin(self) -> Optional[JobPythonOrigin]:\n        return self.dagster_run.job_code_origin\n\n\nclass ResumeRunContext(NamedTuple):\n    """Context available within a run launcher's resume_run call."""\n\n    dagster_run: DagsterRun\n    workspace: Optional[IWorkspace]\n    resume_attempt_number: Optional[int] = None\n\n    @property\n    def job_code_origin(self) -> Optional[JobPythonOrigin]:\n        return self.dagster_run.job_code_origin\n\n\n@whitelist_for_serdes\nclass WorkerStatus(Enum):\n    RUNNING = "RUNNING"\n    NOT_FOUND = "NOT_FOUND"\n    FAILED = "FAILED"\n    SUCCESS = "SUCCESS"\n    UNKNOWN = "UNKNOWN"\n\n\nclass CheckRunHealthResult(NamedTuple):\n    """Result of a check_run_worker_health call."""\n\n    status: WorkerStatus\n    msg: Optional[str] = None\n    transient: Optional[bool] = None\n    run_worker_id: Optional[str] = None  # Identifier for a particular run worker\n\n    def __str__(self) -> str:\n        return f"{self.status.value}: '{self.msg}'"\n\n\n
[docs]class RunLauncher(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n @abstractmethod\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run.\n\n This method should begin the execution of the specified run, and may emit engine events.\n Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.STARTING`` state. Typically, this method will\n not be invoked directly, but should be invoked through ``DagsterInstance.launch_run()``.\n\n Args:\n context (LaunchRunContext): information about the launch - every run launcher\n will need the PipelineRun, and some run launchers may need information from the\n IWorkspace from which the run was launched.\n """\n\n @abstractmethod\n def terminate(self, run_id: str) -> bool:\n """Terminates a process.\n\n Returns False is the process was already terminated. Returns true if\n the process was alive and was successfully terminated\n """\n\n def dispose(self) -> None:\n """Do any resource cleanup that should happen when the DagsterInstance is\n cleaning itself up.\n """\n\n def join(self, timeout: int = 30) -> None:\n pass\n\n @property\n def supports_check_run_worker_health(self) -> bool:\n """Whether the run launcher supports check_run_worker_health."""\n return False\n\n def check_run_worker_health(self, run: DagsterRun) -> CheckRunHealthResult:\n raise NotImplementedError(\n "This run launcher does not support run monitoring. Please disable it on your instance."\n )\n\n def get_run_worker_debug_info(self, run: DagsterRun) -> Optional[str]:\n return None\n\n @property\n def supports_resume_run(self) -> bool:\n """Whether the run launcher supports resume_run."""\n return False\n\n def resume_run(self, context: ResumeRunContext) -> None:\n raise NotImplementedError(\n "This run launcher does not support resuming runs. If using "\n "run monitoring, set max_resume_run_attempts to 0."\n )
\n
", "current_page_name": "_modules/dagster/_core/launcher/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.launcher.base"}, "default_run_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.launcher.default_run_launcher

\nimport time\nfrom typing import TYPE_CHECKING, Any, Mapping, Optional, cast\n\nfrom typing_extensions import Self\n\nimport dagster._seven as seven\nfrom dagster import (\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.errors import (\n    DagsterInvariantViolationError,\n    DagsterLaunchFailedError,\n    DagsterUserCodeProcessError,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import GRPC_INFO_TAG\nfrom dagster._serdes import (\n    ConfigurableClass,\n    deserialize_value,\n)\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._utils.merger import merge_dicts\n\nfrom .base import LaunchRunContext, RunLauncher\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance\n    from dagster._grpc.client import DagsterGrpcClient\n\n\n# note: this class is a top level export, so we defer many imports til use for performance\n
[docs]class DefaultRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs against running GRPC servers."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = inst_data\n\n self._run_ids = set()\n\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DefaultRunLauncher(inst_data=inst_data)\n\n @staticmethod\n def launch_run_from_grpc_client(\n instance: "DagsterInstance", run: DagsterRun, grpc_client: "DagsterGrpcClient"\n ):\n # defer for perf\n from dagster._grpc.types import ExecuteExternalJobArgs, StartRunResult\n\n instance.add_run_tags(\n run.run_id,\n {\n GRPC_INFO_TAG: seven.json.dumps(\n merge_dicts(\n {"host": grpc_client.host},\n (\n {"port": grpc_client.port}\n if grpc_client.port\n else {"socket": grpc_client.socket}\n ),\n ({"use_ssl": True} if grpc_client.use_ssl else {}),\n )\n )\n },\n )\n\n res = deserialize_value(\n grpc_client.start_run(\n ExecuteExternalJobArgs(\n job_origin=run.external_job_origin, # type: ignore # (possible none)\n run_id=run.run_id,\n instance_ref=instance.get_ref(),\n )\n ),\n StartRunResult,\n )\n if not res.success:\n raise (\n DagsterLaunchFailedError(\n res.message, serializable_error_info=res.serializable_error_info\n )\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n # defer for perf\n from dagster._core.host_representation.code_location import (\n GrpcServerCodeLocation,\n )\n\n run = context.dagster_run\n\n check.inst_param(run, "run", DagsterRun)\n\n if not context.workspace:\n raise DagsterInvariantViolationError(\n "DefaultRunLauncher requires a workspace to be included in its LaunchRunContext"\n )\n\n external_job_origin = check.not_none(run.external_job_origin)\n 
code_location = context.workspace.get_code_location(\n external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n check.inst(\n code_location,\n GrpcServerCodeLocation,\n "DefaultRunLauncher: Can't launch runs for pipeline not loaded from a GRPC server",\n )\n\n DefaultRunLauncher.launch_run_from_grpc_client(\n self._instance, run, cast(GrpcServerCodeLocation, code_location).client\n )\n\n self._run_ids.add(run.run_id)\n\n def _get_grpc_client_for_termination(self, run_id):\n # defer for perf\n from dagster._grpc.client import DagsterGrpcClient\n\n if not self.has_instance:\n return None\n\n run = self._instance.get_run_by_id(run_id)\n if not run or run.is_finished:\n return None\n\n tags = run.tags\n\n if GRPC_INFO_TAG not in tags:\n return None\n\n grpc_info = seven.json.loads(tags.get(GRPC_INFO_TAG))\n\n return DagsterGrpcClient(\n port=grpc_info.get("port"),\n socket=grpc_info.get("socket"),\n host=grpc_info.get("host"),\n use_ssl=bool(grpc_info.get("use_ssl", False)),\n )\n\n def terminate(self, run_id):\n # defer for perf\n from dagster._grpc.types import CancelExecutionRequest, CancelExecutionResult\n\n check.str_param(run_id, "run_id")\n if not self.has_instance:\n return False\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n client = self._get_grpc_client_for_termination(run_id)\n\n if not client:\n self._instance.report_engine_event(\n message="Unable to get grpc client to send termination request to.",\n dagster_run=run,\n cls=self.__class__,\n )\n return False\n\n res = deserialize_value(\n client.cancel_execution(CancelExecutionRequest(run_id=run_id)), CancelExecutionResult\n )\n\n if res.serializable_error_info:\n raise DagsterUserCodeProcessError.from_error_info(res.serializable_error_info)\n\n return res.success\n\n def join(self, timeout=30):\n # If this hasn't been initialized at all, we can just do a noop\n if not self.has_instance:\n 
return\n\n total_time = 0\n interval = 0.01\n\n while True:\n active_run_ids = [\n run_id\n for run_id in self._run_ids\n if (\n self._instance.get_run_by_id(run_id)\n and not self._instance.get_run_by_id(run_id).is_finished\n )\n ]\n\n if len(active_run_ids) == 0:\n return\n\n if total_time >= timeout:\n raise Exception(f"Timed out waiting for these runs to finish: {active_run_ids!r}")\n\n total_time += interval\n time.sleep(interval)\n interval = interval * 2
\n
", "current_page_name": "_modules/dagster/_core/launcher/default_run_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.launcher.default_run_launcher"}}, "log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.log_manager

\nimport datetime\nimport logging\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Sequence, Union, cast\n\nfrom typing_extensions import Protocol\n\nimport dagster._check as check\nfrom dagster._core.utils import coerce_valid_log_level, make_new_run_id\nfrom dagster._utils.log import get_dagster_logger\n\nif TYPE_CHECKING:\n    from dagster import DagsterInstance\n    from dagster._core.events import DagsterEvent\n    from dagster._core.storage.dagster_run import DagsterRun\n\nDAGSTER_META_KEY = "dagster_meta"\n\n\nclass IDagsterMeta(Protocol):\n    @property\n    def dagster_meta(self) -> "DagsterLoggingMetadata": ...\n\n\n# The type-checker complains here that DagsterLogRecord does not implement the `dagster_meta`\n# property of `IDagsterMeta`. We ignore this error because we don't need to implement this method--\n# `DagsterLogRecord` is a stub class that is never instantiated. We only ever cast\n# `logging.LogRecord` objects to `DagsterLogRecord`, because it gives us typed access to the\n# `dagster_meta` property. 
`dagster_meta` itself is set on these `logging.LogRecord` objects via the\n# `extra` argument to `logging.Logger.log` (see `DagsterLogManager.log_dagster_event`), but\n# `logging.LogRecord` has no way of exposing to the type-checker the attributes that are dynamically\n# defined via `extra`.\nclass DagsterLogRecord(logging.LogRecord, IDagsterMeta):  # type: ignore\n    pass\n\n\nclass DagsterMessageProps(\n    NamedTuple(\n        "_DagsterMessageProps",\n        [\n            ("orig_message", Optional[str]),\n            ("log_message_id", Optional[str]),\n            ("log_timestamp", Optional[str]),\n            ("dagster_event", Optional[Any]),\n        ],\n    )\n):\n    """Internal class used to represent specific attributes about a logged message."""\n\n    def __new__(\n        cls,\n        orig_message: str,\n        log_message_id: Optional[str] = None,\n        log_timestamp: Optional[str] = None,\n        dagster_event: Optional["DagsterEvent"] = None,\n    ):\n        return super().__new__(\n            cls,\n            orig_message=check.str_param(orig_message, "orig_message"),\n            log_message_id=check.opt_str_param(\n                log_message_id, "log_message_id", default=make_new_run_id()\n            ),\n            log_timestamp=check.opt_str_param(\n                log_timestamp,\n                "log_timestamp",\n                default=datetime.datetime.utcnow().isoformat(),\n            ),\n            dagster_event=dagster_event,\n        )\n\n    @property\n    def error_str(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n\n        event_specific_data = self.dagster_event.event_specific_data\n        if not event_specific_data:\n            return None\n\n        error = getattr(event_specific_data, "error", None)\n        if error:\n            return f'\\n\\n{getattr(event_specific_data, "error_display_string", error.to_string())}'\n        return None\n\n    @property\n    def 
pid(self) -> Optional[str]:\n        if self.dagster_event is None or self.dagster_event.pid is None:\n            return None\n        return str(self.dagster_event.pid)\n\n    @property\n    def step_key(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.step_key\n\n    @property\n    def event_type_value(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.event_type_value\n\n\nclass DagsterLoggingMetadata(\n    NamedTuple(\n        "_DagsterLoggingMetadata",\n        [\n            ("run_id", Optional[str]),\n            ("job_name", Optional[str]),\n            ("job_tags", Mapping[str, str]),\n            ("step_key", Optional[str]),\n            ("op_name", Optional[str]),\n            ("resource_name", Optional[str]),\n            ("resource_fn_name", Optional[str]),\n        ],\n    )\n):\n    """Internal class used to represent the context in which a given message was logged (i.e. 
the\n    step, pipeline run, resource, etc.).\n    """\n\n    def __new__(\n        cls,\n        run_id: Optional[str] = None,\n        job_name: Optional[str] = None,\n        job_tags: Optional[Mapping[str, str]] = None,\n        step_key: Optional[str] = None,\n        op_name: Optional[str] = None,\n        resource_name: Optional[str] = None,\n        resource_fn_name: Optional[str] = None,\n    ):\n        return super().__new__(\n            cls,\n            run_id=run_id,\n            job_name=job_name,\n            job_tags=job_tags or {},\n            step_key=step_key,\n            op_name=op_name,\n            resource_name=resource_name,\n            resource_fn_name=resource_fn_name,\n        )\n\n    @property\n    def log_source(self) -> str:\n        if self.resource_name is None:\n            return self.job_name or "system"\n        return f"resource:{self.resource_name}"\n\n    def all_tags(self) -> Mapping[str, str]:\n        # converts all values into strings\n        return {k: str(v) for k, v in self._asdict().items()}\n\n    def event_tags(self) -> Mapping[str, str]:\n        # Exclude pipeline_tags since it can be quite large and can be found on the run\n        return {k: str(v) for k, v in self._asdict().items() if k != "job_tags"}\n\n\ndef construct_log_string(\n    logging_metadata: DagsterLoggingMetadata, message_props: DagsterMessageProps\n) -> str:\n    from dagster._core.events import EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n\n    event_type_str = (\n        EVENT_TYPE_VALUE_TO_DISPLAY_STRING[message_props.event_type_value]\n        if message_props.event_type_value in EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n        else message_props.event_type_value\n    )\n    return " - ".join(\n        filter(\n            None,\n            (\n                logging_metadata.log_source,\n                logging_metadata.run_id,\n                message_props.pid,\n                logging_metadata.step_key,\n                event_type_str,\n         
       message_props.orig_message,\n            ),\n        )\n    ) + (message_props.error_str or "")\n\n\ndef get_dagster_meta_dict(\n    logging_metadata: DagsterLoggingMetadata, dagster_message_props: DagsterMessageProps\n) -> Mapping[str, object]:\n    # combine all dagster meta information into a single dictionary\n    meta_dict = {\n        **logging_metadata._asdict(),\n        **dagster_message_props._asdict(),\n    }\n    # step-level events can be logged from a pipeline context. for these cases, pull the step\n    # key from the underlying DagsterEvent\n    if meta_dict["step_key"] is None:\n        meta_dict["step_key"] = dagster_message_props.step_key\n\n    return meta_dict\n\n\nclass DagsterLogHandler(logging.Handler):\n    """Internal class used to turn regular logs into Dagster logs by adding Dagster-specific\n    metadata (such as pipeline_name or step_key), as well as reformatting the underlying message.\n\n    Note: The `loggers` argument will be populated with the set of @loggers supplied to the current\n    pipeline run. 
These essentially work as handlers (they do not create their own log messages,\n    they simply re-log messages that are created from context.log.x() calls), which is why they are\n    referenced from within this handler class.\n    """\n\n    def __init__(\n        self,\n        logging_metadata: DagsterLoggingMetadata,\n        loggers: Sequence[logging.Logger],\n        handlers: Sequence[logging.Handler],\n    ):\n        self._logging_metadata = logging_metadata\n        self._loggers = loggers\n        self._handlers = handlers\n        self._should_capture = True\n        super().__init__()\n\n    @property\n    def logging_metadata(self) -> DagsterLoggingMetadata:\n        return self._logging_metadata\n\n    def with_tags(self, **new_tags: str) -> "DagsterLogHandler":\n        return DagsterLogHandler(\n            logging_metadata=self.logging_metadata._replace(**new_tags),\n            loggers=self._loggers,\n            handlers=self._handlers,\n        )\n\n    def _extract_extra(self, record: logging.LogRecord) -> Mapping[str, Any]:\n        """In the logging.Logger log() implementation, the elements of the `extra` dictionary\n        argument are smashed into the __dict__ of the underlying logging.LogRecord.\n        This function figures out what the original `extra` values of the log call were by\n        comparing the set of attributes in the received record to those of a default record.\n        """\n        ref_attrs = list(logging.makeLogRecord({}).__dict__.keys()) + [\n            "message",\n            "asctime",\n        ]\n        return {k: v for k, v in record.__dict__.items() if k not in ref_attrs}\n\n    def _convert_record(self, record: logging.LogRecord) -> DagsterLogRecord:\n        # we store the originating DagsterEvent in the DAGSTER_META_KEY field, if applicable\n        dagster_meta = getattr(record, DAGSTER_META_KEY, None)\n\n        # generate some properties for this specific record\n        dagster_message_props = 
DagsterMessageProps(\n            orig_message=record.getMessage(), dagster_event=dagster_meta\n        )\n\n        # set the dagster meta info for the record\n        setattr(\n            record,\n            DAGSTER_META_KEY,\n            get_dagster_meta_dict(self._logging_metadata, dagster_message_props),\n        )\n\n        # update the message to be formatted like other dagster logs\n        record.msg = construct_log_string(self._logging_metadata, dagster_message_props)\n        record.args = ()\n\n        # DagsterLogRecord is a LogRecord with a `dagster_meta` field\n        return cast(DagsterLogRecord, record)\n\n    def filter(self, record: logging.LogRecord) -> bool:\n        """If you list multiple levels of a python logging hierarchy as managed loggers, and do not\n        set the propagate attribute to False, this will result in that record getting logged\n        multiple times, as the DagsterLogHandler will be invoked at each level of the hierarchy as\n        the message is propagated. 
This filter prevents this from happening.\n        """\n        return self._should_capture and not isinstance(\n            getattr(record, DAGSTER_META_KEY, None), dict\n        )\n\n    def emit(self, record: logging.LogRecord) -> None:\n        """For any received record, add Dagster metadata, and have handlers handle it."""\n        try:\n            # to prevent the potential for infinite loops in which a handler produces log messages\n            # which are then captured and then handled by that same handler (etc.), do not capture\n            # any log messages while one is currently being emitted\n            self._should_capture = False\n            dagster_record = self._convert_record(record)\n            # built-in handlers\n            for handler in self._handlers:\n                if dagster_record.levelno >= handler.level:\n                    handler.handle(dagster_record)\n            # user-defined @loggers\n            for logger in self._loggers:\n                logger.log(\n                    dagster_record.levelno,\n                    dagster_record.msg,\n                    exc_info=dagster_record.exc_info,\n                    extra=self._extract_extra(record),\n                )\n        finally:\n            self._should_capture = True\n\n\n
[docs]class DagsterLogManager(logging.Logger):\n """Centralized dispatch for logging from user code.\n\n Handles the construction of uniform structured log messages and passes them through to the\n underlying loggers/handlers.\n\n An instance of the log manager is made available to ops as ``context.log``. Users should not\n initialize instances of the log manager directly. To configure custom loggers, set the\n ``logger_defs`` argument in an `@job` decorator or when calling the `to_job()` method on a\n :py:class:`GraphDefinition`.\n\n The log manager inherits standard convenience methods like those exposed by the Python standard\n library :py:mod:`python:logging` module (i.e., within the body of an op,\n ``context.log.{debug, info, warning, warn, error, critical, fatal}``).\n\n The underlying integer API can also be called directly using, e.g.\n ``context.log.log(5, msg)``, and the log manager will delegate to the ``log`` method\n defined on each of the loggers it manages.\n\n User-defined custom log levels are not supported, and calls to, e.g.,\n ``context.log.trace`` or ``context.log.notice`` will result in hard exceptions **at runtime**.\n """\n\n def __init__(\n self,\n dagster_handler: DagsterLogHandler,\n level: int = logging.NOTSET,\n managed_loggers: Optional[Sequence[logging.Logger]] = None,\n ):\n super().__init__(name="dagster", level=coerce_valid_log_level(level))\n self._managed_loggers = check.opt_sequence_param(\n managed_loggers, "managed_loggers", of_type=logging.Logger\n )\n self._dagster_handler = dagster_handler\n self.addHandler(dagster_handler)\n\n @classmethod\n def create(\n cls,\n loggers: Sequence[logging.Logger],\n handlers: Optional[Sequence[logging.Handler]] = None,\n instance: Optional["DagsterInstance"] = None,\n dagster_run: Optional["DagsterRun"] = None,\n ) -> "DagsterLogManager":\n """Create a DagsterLogManager with a set of subservient loggers."""\n handlers = check.opt_sequence_param(handlers, "handlers", 
of_type=logging.Handler)\n\n managed_loggers = [get_dagster_logger()]\n python_log_level = logging.NOTSET\n\n if instance:\n handlers = [*handlers, *instance.get_handlers()]\n managed_loggers += [\n logging.getLogger(lname) if lname != "root" else logging.getLogger()\n for lname in instance.managed_python_loggers\n ]\n if instance.python_log_level is not None:\n python_log_level = coerce_valid_log_level(instance.python_log_level)\n\n # set all loggers to the declared logging level\n for logger in managed_loggers:\n logger.setLevel(python_log_level)\n\n if dagster_run:\n logging_metadata = DagsterLoggingMetadata(\n run_id=dagster_run.run_id,\n job_name=dagster_run.job_name,\n job_tags=dagster_run.tags,\n )\n else:\n logging_metadata = DagsterLoggingMetadata()\n\n return cls(\n dagster_handler=DagsterLogHandler(\n logging_metadata=logging_metadata,\n loggers=loggers,\n handlers=handlers,\n ),\n level=python_log_level,\n managed_loggers=managed_loggers,\n )\n\n @property\n def logging_metadata(self) -> DagsterLoggingMetadata:\n return self._dagster_handler.logging_metadata\n\n def begin_python_log_capture(self) -> None:\n for logger in self._managed_loggers:\n logger.addHandler(self._dagster_handler)\n\n def end_python_log_capture(self) -> None:\n for logger in self._managed_loggers:\n logger.removeHandler(self._dagster_handler)\n\n def log_dagster_event(\n self, level: Union[str, int], msg: str, dagster_event: "DagsterEvent"\n ) -> None:\n """Log a DagsterEvent at the given level. 
Attributes about the context it was logged in\n (such as the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): message describing the event\n dagster_event (DagsterEvent): DagsterEvent that will be logged\n """\n self.log(level=level, msg=msg, extra={DAGSTER_META_KEY: dagster_event})\n\n def log(self, level: Union[str, int], msg: object, *args: Any, **kwargs: Any) -> None:\n """Log a message at the given level. Attributes about the context it was logged in (such as\n the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): the message to be logged\n *args: the logged message will be msg % args\n """\n level = coerce_valid_log_level(level)\n # log DagsterEvents regardless of level\n if self.isEnabledFor(level) or ("extra" in kwargs and DAGSTER_META_KEY in kwargs["extra"]):\n self._log(level, msg, args, **kwargs)\n\n def with_tags(self, **new_tags: str) -> "DagsterLogManager":\n """Add new tags in "new_tags" to the set of tags attached to this log manager instance, and\n return a new DagsterLogManager with the merged set of tags.\n\n Args:\n new_tags (Dict[str,str]): Dictionary of tags\n\n Returns:\n DagsterLogManager: a new DagsterLogManager namedtuple with updated tags for the same\n run ID and loggers.\n """\n return DagsterLogManager(\n dagster_handler=self._dagster_handler.with_tags(**new_tags),\n managed_loggers=self._managed_loggers,\n level=self.level,\n )
\n
", "current_page_name": "_modules/dagster/_core/log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.log_manager"}, "pipes": {"client": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.pipes.client

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Iterator, List, Optional, Sequence\n\nfrom dagster_pipes import (\n    DagsterPipesError,\n    PipesContextData,\n    PipesExtras,\n    PipesParams,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.result import MaterializeResult\nfrom dagster._core.execution.context.compute import OpExecutionContext\n\nfrom .context import PipesExecutionResult\n\nif TYPE_CHECKING:\n    from .context import PipesMessageHandler\n\n\n
[docs]@experimental\nclass PipesClient(ABC):\n """Pipes client base class.\n\n Pipes clients for specific external environments should subclass this.\n """\n\n
[docs] @public\n @abstractmethod\n def run(\n self,\n *,\n context: OpExecutionContext,\n extras: Optional[PipesExtras] = None,\n **kwargs,\n ) -> "PipesClientCompletedInvocation":\n """Synchronously execute an external process with the pipes protocol. Derived\n clients must have `context` and `extras` arguments, but also can add arbitrary\n arguments that are appropriate for their own implementation.\n\n Args:\n context (OpExecutionContext): The context from the executing op/asset.\n extras (Optional[PipesExtras]): Arbitrary data to pass to the external environment.\n\n Returns:\n PipesClientCompletedInvocation: Wrapper containing results reported by the external\n process.\n """
\n\n\n@experimental\nclass PipesClientCompletedInvocation:\n def __init__(self, results: Sequence["PipesExecutionResult"]):\n self._results = results\n\n def get_results(self) -> Sequence["PipesExecutionResult"]:\n """Get the stream of results as a Sequence of a completed pipes\n client invocation. For each "report" call in the external process,\n one result object will be in the list.\n\n Returns: Sequence[PipesExecutionResult]\n """\n return tuple(self._results)\n\n def get_materialize_result(self) -> MaterializeResult:\n """Get a single materialize result for a pipes invocation. This coalesces\n the materialization result and any separately reported asset check results from\n the external process.\n\n This does not work on invocations that materialize multiple assets and will fail\n in that case. For multiple assets use `get_results` instead to get the result stream.\n\n Returns: MaterializeResult\n """\n return materialize_result_from_pipes_results(self.get_results())\n\n def get_asset_check_result(self) -> AssetCheckResult:\n """Get a single asset check result for a pipes invocation.\n\n This does not work on invocations that have anything except a single asset check result.\n Use `get_results` instead to get the result stream in those cases.\n\n Returns: AssetCheckResult\n """\n return _check_result_from_pipes_results(self.get_results())\n\n\n
[docs]@experimental\nclass PipesContextInjector(ABC):\n @abstractmethod\n @contextmanager\n def inject_context(self, context_data: "PipesContextData") -> Iterator[PipesParams]:\n """A `@contextmanager` that injects context data into the external process.\n\n This method should write the context data to a location accessible to the external\n process. It should yield parameters that the external process can use to locate and load the\n context data.\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A JSON-serializable dict of parameters to be used used by the external\n process to locate and load the injected context data.\n """\n\n @abstractmethod\n def no_messages_debug_text(self) -> str:\n """A message to be displayed when no messages are received from the external process to aid with debugging.\n\n Example: "Attempted to inject context using a magic portal. Expected PipesMagicPortalContextLoader to be\n explicitly passed to open_dagster_pipes in the external process."\n """
\n\n\n
[docs]@experimental\nclass PipesMessageReader(ABC):\n @abstractmethod\n @contextmanager\n def read_messages(self, handler: "PipesMessageHandler") -> Iterator[PipesParams]:\n """A `@contextmanager` that reads messages reported by an external process.\n\n This method should start a thread to continuously read messages from some location\n accessible to the external process. It should yield parameters that the external process\n can use to direct its message output.\n\n Args:\n handler (PipesMessageHandler): The message handler to use to process messages read from\n the external process.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to determine\n where to write messages.\n """\n\n @abstractmethod\n def no_messages_debug_text(self) -> str:\n """A message to be displayed when no messages are received from the external process to aid with\n debugging.\n\n Example: "Attempted to read messages using a magic portal. Expected PipesMagicPortalMessageWriter\n to be explicitly passed to open_dagster_pipes in the external process."\n """
\n\n\ndef materialize_result_from_pipes_results(\n all_results: Sequence[PipesExecutionResult],\n) -> MaterializeResult:\n mat_results: List[MaterializeResult] = [\n mat_result for mat_result in all_results if isinstance(mat_result, MaterializeResult)\n ]\n check_results: List[AssetCheckResult] = [\n check_result for check_result in all_results if isinstance(check_result, AssetCheckResult)\n ]\n\n check.invariant(len(mat_results) > 0, "No materialization results received. Internal error?")\n if len(mat_results) > 1:\n raise DagsterPipesError(\n "Multiple materialize results returned with asset keys"\n f" {sorted([check.not_none(mr.asset_key).to_user_string() for mr in mat_results])}."\n " If you are materializing multiple assets in a pipes invocation, use"\n " get_results() instead.",\n )\n mat_result = next(iter(mat_results))\n for check_result in check_results:\n if check_result.asset_key:\n check.invariant(\n mat_result.asset_key == check_result.asset_key,\n "Check result specified an asset key that is not part of the returned"\n " materialization. 
If this was deliberate, use get_results() instead.",\n )\n\n if check_results:\n return mat_result._replace(\n check_results=[*(mat_result.check_results or []), *check_results]\n )\n else:\n return mat_result\n\n\ndef _check_result_from_pipes_results(\n all_results: Sequence[PipesExecutionResult],\n) -> AssetCheckResult:\n mat_results: List[MaterializeResult] = [\n mat_result for mat_result in all_results if isinstance(mat_result, MaterializeResult)\n ]\n check_results: List[AssetCheckResult] = [\n check_result for check_result in all_results if isinstance(check_result, AssetCheckResult)\n ]\n\n # return the single asset check result if thats what we got\n if len(mat_results) == 0 and len(check_results) == 1:\n return next(iter(check_results))\n\n # otherwise error\n raise DagsterPipesError(\n f"Did not find singular AssetCheckResult, got {len(mat_results)} MaterializeResults and"\n f" {len(check_results)} AssetCheckResults. Correct the reported results or use"\n " get_results() instead.",\n )\n
", "current_page_name": "_modules/dagster/_core/pipes/client", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.pipes.client"}, "context": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.pipes.context

\nfrom contextlib import contextmanager\nfrom dataclasses import dataclass\nfrom queue import Queue\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, Mapping, Optional, Set, Union\n\nfrom dagster_pipes import (\n    DAGSTER_PIPES_CONTEXT_ENV_VAR,\n    DAGSTER_PIPES_MESSAGES_ENV_VAR,\n    PIPES_METADATA_TYPE_INFER,\n    PipesContextData,\n    PipesDataProvenance,\n    PipesExtras,\n    PipesMessage,\n    PipesMetadataType,\n    PipesMetadataValue,\n    PipesParams,\n    PipesTimeWindow,\n    encode_env_var,\n)\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSeverity\nfrom dagster._core.definitions.data_version import DataProvenance, DataVersion\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.metadata import MetadataValue, normalize_metadata_value\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.result import MaterializeResult\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import DagsterPipesExecutionError\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dagster._core.execution.context.invocation import BoundOpExecutionContext\n\nif TYPE_CHECKING:\n    from dagster._core.pipes.client import PipesMessageReader\n\nPipesExecutionResult: TypeAlias = Union[MaterializeResult, AssetCheckResult]\n\n\n
[docs]@experimental\nclass PipesMessageHandler:\n """Class to process :py:obj:`PipesMessage` objects received from a pipes process.\n\n Args:\n context (OpExecutionContext): The context for the executing op/asset.\n """\n\n def __init__(self, context: OpExecutionContext) -> None:\n self._context = context\n # Queue is thread-safe\n self._result_queue: Queue[PipesExecutionResult] = Queue()\n # Only read by the main thread after all messages are handled, so no need for a lock\n self._unmaterialized_assets: Set[AssetKey] = set(context.selected_asset_keys)\n self._received_any_msg = False\n self._received_closed_msg = False\n\n @contextmanager\n def handle_messages(self, message_reader: "PipesMessageReader") -> Iterator[PipesParams]:\n with message_reader.read_messages(self) as params:\n yield params\n for key in self._unmaterialized_assets:\n self._result_queue.put(MaterializeResult(asset_key=key))\n\n def clear_result_queue(self) -> Iterator[PipesExecutionResult]:\n while not self._result_queue.empty():\n yield self._result_queue.get()\n\n @property\n def received_any_message(self) -> bool:\n return self._received_any_msg\n\n @property\n def received_closed_message(self) -> bool:\n return self._received_closed_msg\n\n def _resolve_metadata(\n self, metadata: Mapping[str, PipesMetadataValue]\n ) -> Mapping[str, MetadataValue]:\n return {\n k: self._resolve_metadata_value(v["raw_value"], v["type"]) for k, v in metadata.items()\n }\n\n def _resolve_metadata_value(\n self, value: Any, metadata_type: PipesMetadataType\n ) -> MetadataValue:\n if metadata_type == PIPES_METADATA_TYPE_INFER:\n return normalize_metadata_value(value)\n elif metadata_type == "text":\n return MetadataValue.text(value)\n elif metadata_type == "url":\n return MetadataValue.url(value)\n elif metadata_type == "path":\n return MetadataValue.path(value)\n elif metadata_type == "notebook":\n return MetadataValue.notebook(value)\n elif metadata_type == "json":\n return MetadataValue.json(value)\n elif 
metadata_type == "md":\n return MetadataValue.md(value)\n elif metadata_type == "float":\n return MetadataValue.float(value)\n elif metadata_type == "int":\n return MetadataValue.int(value)\n elif metadata_type == "bool":\n return MetadataValue.bool(value)\n elif metadata_type == "dagster_run":\n return MetadataValue.dagster_run(value)\n elif metadata_type == "asset":\n return MetadataValue.asset(AssetKey.from_user_string(value))\n elif metadata_type == "table":\n return MetadataValue.table(value)\n elif metadata_type == "null":\n return MetadataValue.null()\n else:\n check.failed(f"Unexpected metadata type {metadata_type}")\n\n # Type ignores because we currently validate in individual handlers\n def handle_message(self, message: PipesMessage) -> None:\n if self._received_closed_msg:\n self._context.log.warn(f"[pipes] unexpected message received after closed: `{message}`")\n\n if not self._received_any_msg:\n self._received_any_msg = True\n self._context.log.info("[pipes] external process successfully opened dagster pipes.")\n\n if message["method"] == "opened":\n pass\n elif message["method"] == "closed":\n self._handle_closed()\n elif message["method"] == "report_asset_materialization":\n self._handle_report_asset_materialization(**message["params"]) # type: ignore\n elif message["method"] == "report_asset_check":\n self._handle_report_asset_check(**message["params"]) # type: ignore\n elif message["method"] == "log":\n self._handle_log(**message["params"]) # type: ignore\n else:\n raise DagsterPipesExecutionError(f"Unknown message method: {message['method']}")\n\n def _handle_closed(self) -> None:\n self._received_closed_msg = True\n\n def _handle_report_asset_materialization(\n self,\n asset_key: str,\n metadata: Optional[Mapping[str, PipesMetadataValue]],\n data_version: Optional[str],\n ) -> None:\n check.str_param(asset_key, "asset_key")\n check.opt_str_param(data_version, "data_version")\n metadata = check.opt_mapping_param(metadata, "metadata", 
key_type=str)\n resolved_asset_key = AssetKey.from_user_string(asset_key)\n resolved_metadata = self._resolve_metadata(metadata)\n resolved_data_version = None if data_version is None else DataVersion(data_version)\n result = MaterializeResult(\n asset_key=resolved_asset_key,\n metadata=resolved_metadata,\n data_version=resolved_data_version,\n )\n self._result_queue.put(result)\n self._unmaterialized_assets.remove(resolved_asset_key)\n\n def _handle_report_asset_check(\n self,\n asset_key: str,\n check_name: str,\n passed: bool,\n severity: str,\n metadata: Mapping[str, PipesMetadataValue],\n ) -> None:\n check.str_param(asset_key, "asset_key")\n check.str_param(check_name, "check_name")\n check.bool_param(passed, "passed")\n check.literal_param(severity, "severity", [x.value for x in AssetCheckSeverity])\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n resolved_asset_key = AssetKey.from_user_string(asset_key)\n resolved_metadata = self._resolve_metadata(metadata)\n resolved_severity = AssetCheckSeverity(severity)\n result = AssetCheckResult(\n asset_key=resolved_asset_key,\n check_name=check_name,\n passed=passed,\n severity=resolved_severity,\n metadata=resolved_metadata,\n )\n self._result_queue.put(result)\n\n def _handle_log(self, message: str, level: str = "info") -> None:\n check.str_param(message, "message")\n self._context.log.log(level, message)
\n\n\n
[docs]@experimental\n@dataclass\nclass PipesSession:\n """Object representing a pipes session.\n\n A pipes session is defined by a pair of :py:class:`PipesContextInjector` and\n :py:class:`PipesMessageReader` objects. At the opening of the session, the context injector\n writes context data to an externally accessible location, and the message reader starts\n monitoring an externally accessible location. These locations are encoded in parameters stored\n on a `PipesSession` object.\n\n During the session, an external process should be started and the parameters injected into its\n environment. The typical way to do this is to call :py:meth:`PipesSession.get_bootstrap_env_vars`\n and pass the result as environment variables.\n\n During execution, results (e.g. asset materializations) are reported by the external process and\n buffered on the `PipesSession` object. The buffer can periodically be cleared and yielded to\n Dagster machinery by calling `yield from PipesSession.get_results()`.\n\n When the external process exits, the session can be closed. Closing consists of handling any\n unprocessed messages written by the external process and cleaning up any resources used for\n context injection and message reading.\n\n Args:\n context_data (PipesContextData): The context for the executing op/asset.\n message_handler (PipesMessageHandler): The message handler to use for processing messages\n context_injector_params (PipesParams): Parameters yielded by the context injector,\n indicating the location from which the external process should load context data.\n message_reader_params (PipesParams): Parameters yielded by the message reader, indicating\n the location to which the external process should write messages.\n """\n\n context_data: PipesContextData\n message_handler: PipesMessageHandler\n context_injector_params: PipesParams\n message_reader_params: PipesParams\n\n
[docs] @public\n def get_bootstrap_env_vars(self) -> Dict[str, str]:\n """Encode context injector and message reader params as environment variables.\n\n Passing environment variables is the typical way to expose the pipes I/O parameters\n to a pipes process.\n\n Returns:\n Mapping[str, str]: Environment variables to pass to the external process. The values are\n serialized as json, compressed with gzip, and then base-64-encoded.\n """\n return {\n param_name: encode_env_var(param_value)\n for param_name, param_value in self.get_bootstrap_params().items()\n }
\n\n
[docs] @public\n def get_bootstrap_params(self) -> Dict[str, Any]:\n """Get the params necessary to bootstrap a launched pipes process. These parameters are typically\n are as environment variable. See `get_bootstrap_env_vars`. It is the context injector's\n responsibility to decide how to pass these parameters to the external environment.\n\n Returns:\n Mapping[str, str]: Parameters to pass to the external process and their corresponding\n values that must be passed by the context injector.\n """\n return {\n DAGSTER_PIPES_CONTEXT_ENV_VAR: self.context_injector_params,\n DAGSTER_PIPES_MESSAGES_ENV_VAR: self.message_reader_params,\n }
\n\n
[docs] @public\n def get_results(self) -> Iterator[PipesExecutionResult]:\n """Iterator over buffered :py:class:`PipesExecutionResult` objects received from the\n external process.\n\n When this is called it clears the results buffer.\n\n Yields:\n ExtResult: Result reported by external process.\n """\n yield from self.message_handler.clear_result_queue()
\n\n\ndef build_external_execution_context_data(\n context: OpExecutionContext,\n extras: Optional[PipesExtras],\n) -> "PipesContextData":\n asset_keys = (\n [_convert_asset_key(key) for key in sorted(context.selected_asset_keys)]\n if context.has_assets_def\n else None\n )\n code_version_by_asset_key = (\n {\n _convert_asset_key(key): context.assets_def.code_versions_by_key[key]\n for key in context.selected_asset_keys\n }\n if context.has_assets_def\n else None\n )\n provenance_by_asset_key = (\n {\n _convert_asset_key(key): _convert_data_provenance(context.get_asset_provenance(key))\n for key in context.selected_asset_keys\n }\n if context.has_assets_def\n else None\n )\n partition_key = context.partition_key if context.has_partition_key else None\n partition_key_range = context.partition_key_range if context.has_partition_key else None\n partition_time_window = (\n context.partition_time_window\n if context.has_partition_key\n and has_one_dimension_time_window_partitioning(\n context.get_step_execution_context().partitions_def\n )\n else None\n )\n return PipesContextData(\n asset_keys=asset_keys,\n code_version_by_asset_key=code_version_by_asset_key,\n provenance_by_asset_key=provenance_by_asset_key,\n partition_key=partition_key,\n partition_key_range=(\n _convert_partition_key_range(partition_key_range) if partition_key_range else None\n ),\n partition_time_window=(\n _convert_time_window(partition_time_window) if partition_time_window else None\n ),\n run_id=context.run_id,\n job_name=None if isinstance(context, BoundOpExecutionContext) else context.job_name,\n retry_number=0 if isinstance(context, BoundOpExecutionContext) else context.retry_number,\n extras=extras or {},\n )\n\n\ndef _convert_asset_key(asset_key: AssetKey) -> str:\n return asset_key.to_user_string()\n\n\ndef _convert_data_provenance(\n provenance: Optional[DataProvenance],\n) -> Optional["PipesDataProvenance"]:\n return (\n None\n if provenance is None\n else PipesDataProvenance(\n 
code_version=provenance.code_version,\n input_data_versions={\n _convert_asset_key(k): v.value for k, v in provenance.input_data_versions.items()\n },\n is_user_provided=provenance.is_user_provided,\n )\n )\n\n\ndef _convert_time_window(\n time_window: TimeWindow,\n) -> "PipesTimeWindow":\n return PipesTimeWindow(\n start=time_window.start.isoformat(),\n end=time_window.end.isoformat(),\n )\n\n\ndef _convert_partition_key_range(\n partition_key_range: PartitionKeyRange,\n) -> "PipesTimeWindow":\n return PipesTimeWindow(\n start=partition_key_range.start,\n end=partition_key_range.end,\n )\n
", "current_page_name": "_modules/dagster/_core/pipes/context", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.pipes.context"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.pipes.utils

\nimport datetime\nimport json\nimport os\nimport sys\nimport tempfile\nimport time\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom threading import Event, Thread\nfrom typing import Iterator, Optional, TextIO\n\nfrom dagster_pipes import (\n    PIPES_PROTOCOL_VERSION_FIELD,\n    PipesContextData,\n    PipesDefaultContextLoader,\n    PipesDefaultMessageWriter,\n    PipesExtras,\n    PipesParams,\n)\n\nfrom dagster import (\n    OpExecutionContext,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom dagster._core.pipes.client import (\n    PipesContextInjector,\n    PipesMessageReader,\n)\nfrom dagster._core.pipes.context import (\n    PipesMessageHandler,\n    PipesSession,\n    build_external_execution_context_data,\n)\nfrom dagster._utils import tail_file\n\n_CONTEXT_INJECTOR_FILENAME = "context"\n_MESSAGE_READER_FILENAME = "messages"\n\n\n
[docs]@experimental\nclass PipesFileContextInjector(PipesContextInjector):\n """Context injector that injects context data into the external process by writing it to a\n specified file.\n\n Args:\n path (str): The path of a file to which to write context data. The file will be deleted on\n close of the pipes session.\n """\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @contextmanager\n def inject_context(self, context_data: "PipesContextData") -> Iterator[PipesParams]:\n """Inject context to external environment by writing it to a file as JSON and exposing the\n path to the file.\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to locate and\n load the injected context data.\n """\n with open(self._path, "w") as input_stream:\n json.dump(context_data, input_stream)\n try:\n yield {PipesDefaultContextLoader.FILE_PATH_KEY: self._path}\n finally:\n if os.path.exists(self._path):\n os.remove(self._path)\n\n def no_messages_debug_text(self) -> str:\n return f"Attempted to inject context via file {self._path}"
\n\n\n
[docs]@experimental\nclass PipesTempFileContextInjector(PipesContextInjector):\n """Context injector that injects context data into the external process by writing it to an\n automatically-generated temporary file.\n """\n\n @contextmanager\n def inject_context(self, context: "PipesContextData") -> Iterator[PipesParams]:\n """Inject context to external environment by writing it to an automatically-generated\n temporary file as JSON and exposing the path to the file.\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to locate and\n load the injected context data.\n """\n with tempfile.TemporaryDirectory() as tempdir:\n with PipesFileContextInjector(\n os.path.join(tempdir, _CONTEXT_INJECTOR_FILENAME)\n ).inject_context(context) as params:\n yield params\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to inject context via a temporary file."
\n\n\n
[docs]class PipesEnvContextInjector(PipesContextInjector):\n """Context injector that injects context data into the external process by injecting it directly into the external process environment."""\n\n @contextmanager\n def inject_context(\n self,\n context_data: "PipesContextData",\n ) -> Iterator[PipesParams]:\n """Inject context to external environment by embedding directly in the parameters that will\n be passed to the external process (typically as environment variables).\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to locate and\n load the injected context data.\n """\n yield {PipesDefaultContextLoader.DIRECT_KEY: context_data}\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to inject context directly, typically as an environment variable."
\n\n\n
[docs]@experimental\nclass PipesFileMessageReader(PipesMessageReader):\n """Message reader that reads messages by tailing a specified file.\n\n Args:\n path (str): The path of the file to which messages will be written. The file will be deleted\n on close of the pipes session.\n """\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @contextmanager\n def read_messages(\n self,\n handler: "PipesMessageHandler",\n ) -> Iterator[PipesParams]:\n """Set up a thread to read streaming messages from the external process by tailing the\n target file.\n\n Args:\n handler (PipesMessageHandler): object to process incoming messages\n\n Yields:\n PipesParams: A dict of parameters that specifies where a pipes process should write\n pipes protocol messages.\n """\n is_task_complete = Event()\n thread = None\n try:\n open(self._path, "w").close() # create file\n thread = Thread(\n target=self._reader_thread, args=(handler, is_task_complete), daemon=True\n )\n thread.start()\n yield {PipesDefaultMessageWriter.FILE_PATH_KEY: self._path}\n finally:\n is_task_complete.set()\n if os.path.exists(self._path):\n os.remove(self._path)\n if thread:\n thread.join()\n\n def _reader_thread(self, handler: "PipesMessageHandler", is_resource_complete: Event) -> None:\n for line in tail_file(self._path, lambda: is_resource_complete.is_set()):\n message = json.loads(line)\n handler.handle_message(message)\n\n def no_messages_debug_text(self) -> str:\n return f"Attempted to read messages from file {self._path}."
\n\n\n
[docs]@experimental\nclass PipesTempFileMessageReader(PipesMessageReader):\n """Message reader that reads messages by tailing an automatically-generated temporary file."""\n\n @contextmanager\n def read_messages(\n self,\n handler: "PipesMessageHandler",\n ) -> Iterator[PipesParams]:\n """Set up a thread to read streaming messages from the external process by an\n automatically-generated temporary file.\n\n Args:\n handler (PipesMessageHandler): object to process incoming messages\n\n Yields:\n PipesParams: A dict of parameters that specifies where a pipes process should write\n pipes protocol messages.\n """\n with tempfile.TemporaryDirectory() as tempdir:\n with PipesFileMessageReader(\n os.path.join(tempdir, _MESSAGE_READER_FILENAME)\n ).read_messages(handler) as params:\n yield params\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to read messages from a local temporary file."
\n\n\n# Number of seconds to wait after an external process has completed for stdio logs to become\n# available. If this is exceeded, proceed with exiting without picking up logs.\nWAIT_FOR_STDIO_LOGS_TIMEOUT = 60\n\n\n
[docs]@experimental\nclass PipesBlobStoreMessageReader(PipesMessageReader):\n """Message reader that reads a sequence of message chunks written by an external process into a\n blob store such as S3, Azure blob storage, or GCS.\n\n The reader maintains a counter, starting at 1, that is synchronized with a message writer in\n some pipes process. The reader starts a thread that periodically attempts to read a chunk\n indexed by the counter at some location expected to be written by the pipes process. The chunk\n should be a file with each line corresponding to a JSON-encoded pipes message. When a chunk is\n successfully read, the messages are processed and the counter is incremented. The\n :py:class:`PipesBlobStoreMessageWriter` on the other end is expected to similarly increment a\n counter (starting from 1) on successful write, keeping counters on the read and write end in\n sync.\n\n If `stdout_reader` or `stderr_reader` are passed, this reader will also start them when\n `read_messages` is called. 
If they are not passed, then the reader performs no stdout/stderr\n forwarding.\n\n Args:\n interval (float): interval in seconds between attempts to download a chunk\n stdout_reader (Optional[PipesBlobStoreStdioReader]): A reader for reading stdout logs.\n stderr_reader (Optional[PipesBlobStoreStdioReader]): A reader for reading stderr logs.\n """\n\n interval: float\n counter: int\n stdout_reader: "PipesBlobStoreStdioReader"\n stderr_reader: "PipesBlobStoreStdioReader"\n\n def __init__(\n self,\n interval: float = 10,\n stdout_reader: Optional["PipesBlobStoreStdioReader"] = None,\n stderr_reader: Optional["PipesBlobStoreStdioReader"] = None,\n ):\n self.interval = interval\n self.counter = 1\n self.stdout_reader = (\n check.opt_inst_param(stdout_reader, "stdout_reader", PipesBlobStoreStdioReader)\n or PipesNoOpStdioReader()\n )\n self.stderr_reader = (\n check.opt_inst_param(stderr_reader, "stderr_reader", PipesBlobStoreStdioReader)\n or PipesNoOpStdioReader()\n )\n\n @contextmanager\n def read_messages(\n self,\n handler: "PipesMessageHandler",\n ) -> Iterator[PipesParams]:\n """Set up a thread to read streaming messages by periodically reading message chunks from a\n target location.\n\n Args:\n handler (PipesMessageHandler): object to process incoming messages\n\n Yields:\n PipesParams: A dict of parameters that specifies where a pipes process should write\n pipes protocol message chunks.\n """\n with self.get_params() as params:\n is_task_complete = Event()\n messages_thread = None\n try:\n messages_thread = Thread(\n target=self._messages_thread, args=(handler, params, is_task_complete)\n )\n messages_thread.start()\n self.stdout_reader.start(params, is_task_complete)\n self.stderr_reader.start(params, is_task_complete)\n yield params\n finally:\n self.wait_for_stdio_logs(params)\n is_task_complete.set()\n if messages_thread:\n messages_thread.join()\n self.stdout_reader.stop()\n self.stderr_reader.stop()\n\n # In cases where we are forwarding logs, in some 
cases the logs might not be written out until\n # after the run completes. We wait for them to exist.\n def wait_for_stdio_logs(self, params):\n start_or_last_download = datetime.datetime.now()\n while (\n datetime.datetime.now() - start_or_last_download\n ).seconds <= WAIT_FOR_STDIO_LOGS_TIMEOUT and (\n (self.stdout_reader and not self.stdout_reader.is_ready(params))\n or (self.stderr_reader and not self.stderr_reader.is_ready(params))\n ):\n time.sleep(5)\n\n @abstractmethod\n @contextmanager\n def get_params(self) -> Iterator[PipesParams]:\n """Yield a set of parameters to be passed to a message writer in a pipes process.\n\n Yields:\n PipesParams: A dict of parameters that specifies where a pipes process should write\n pipes protocol message chunks.\n """\n\n @abstractmethod\n def download_messages_chunk(self, index: int, params: PipesParams) -> Optional[str]: ...\n\n def _messages_thread(\n self,\n handler: "PipesMessageHandler",\n params: PipesParams,\n is_task_complete: Event,\n ) -> None:\n start_or_last_download = datetime.datetime.now()\n while True:\n now = datetime.datetime.now()\n if (now - start_or_last_download).seconds > self.interval or is_task_complete.is_set():\n start_or_last_download = now\n chunk = self.download_messages_chunk(self.counter, params)\n if chunk:\n for line in chunk.split("\\n"):\n message = json.loads(line)\n handler.handle_message(message)\n self.counter += 1\n elif is_task_complete.is_set():\n break\n time.sleep(1)
\n\n\nclass PipesBlobStoreStdioReader(ABC):\n @abstractmethod\n def start(self, params: PipesParams, is_task_complete: Event) -> None: ...\n\n @abstractmethod\n def stop(self) -> None: ...\n\n @abstractmethod\n def is_ready(self, params: PipesParams) -> bool: ...\n\n\n@experimental\nclass PipesChunkedStdioReader(PipesBlobStoreStdioReader):\n """Reader for reading stdout/stderr logs from a blob store such as S3, Azure blob storage, or GCS.\n\n Args:\n interval (float): interval in seconds between attempts to download a chunk.\n target_stream (TextIO): The stream to which to write the logs. Typcially `sys.stdout` or `sys.stderr`.\n """\n\n def __init__(self, *, interval: float = 10, target_stream: TextIO):\n self.interval = interval\n self.target_stream = target_stream\n self.thread: Optional[Thread] = None\n\n @abstractmethod\n def download_log_chunk(self, params: PipesParams) -> Optional[str]: ...\n\n def start(self, params: PipesParams, is_task_complete: Event) -> None:\n self.thread = Thread(target=self._reader_thread, args=(params, is_task_complete))\n self.thread.start()\n\n def stop(self) -> None:\n if self.thread:\n self.thread.join()\n\n def _reader_thread(\n self,\n params: PipesParams,\n is_task_complete: Event,\n ) -> None:\n start_or_last_download = datetime.datetime.now()\n while True:\n now = datetime.datetime.now()\n if (\n (now - start_or_last_download).seconds > self.interval or is_task_complete.is_set()\n ) and self.is_ready(params):\n start_or_last_download = now\n chunk = self.download_log_chunk(params)\n if chunk:\n self.target_stream.write(chunk)\n elif is_task_complete.is_set():\n break\n time.sleep(self.interval)\n\n\nclass PipesNoOpStdioReader(PipesBlobStoreStdioReader):\n """Default implementation for a pipes stdio reader that does nothing."""\n\n def start(self, params: PipesParams, is_task_complete: Event) -> None:\n pass\n\n def stop(self) -> None:\n pass\n\n def is_ready(self, params: PipesParams) -> bool:\n return True\n\n\ndef 
extract_message_or_forward_to_stdout(handler: "PipesMessageHandler", log_line: str):\n # exceptions as control flow, you love to see it\n try:\n message = json.loads(log_line)\n if PIPES_PROTOCOL_VERSION_FIELD in message.keys():\n handler.handle_message(message)\n else:\n sys.stdout.writelines((log_line, "\\n"))\n except Exception:\n # move non-message logs in to stdout for compute log capture\n sys.stdout.writelines((log_line, "\\n"))\n\n\n_FAIL_TO_YIELD_ERROR_MESSAGE = (\n "Did you forget to `yield from pipes_session.get_results()` or `return"\n " <PipesClient>.run(...).get_results`? If using `open_pipes_session`,"\n " `pipes_session.get_results` should be called once after the `open_pipes_session` block has"\n " exited to yield any remaining buffered results via `<PipesSession>.get_results()`."\n " If using `<PipesClient>.run`, you should always return"\n " `<PipesClient>.run(...).get_results()` or `<PipesClient>.run(...).get_materialize_result()`."\n)\n\n\n
[docs]@experimental\n@contextmanager\ndef open_pipes_session(\n context: OpExecutionContext,\n context_injector: PipesContextInjector,\n message_reader: PipesMessageReader,\n extras: Optional[PipesExtras] = None,\n) -> Iterator[PipesSession]:\n """Context manager that opens and closes a pipes session.\n\n This context manager should be used to wrap the launch of an external process using the pipe\n protocol to report results back to Dagster. The yielded :py:class:`PipesSession` should be used\n to (a) obtain the environment variables that need to be provided to the external process; (b)\n access results streamed back from the external process.\n\n This method is an alternative to :py:class:`PipesClient` subclasses for users who want more\n control over how pipes processes are launched. When using `open_pipes_session`, it is the user's\n responsibility to inject the message reader and context injector parameters available on the\n yielded `PipesSession` and pass them to the appropriate API when launching the external process.\n Typically these parameters should be set as environment variables.\n\n\n Args:\n context (OpExecutionContext): The context for the current op/asset execution.\n context_injector (PipesContextInjector): The context injector to use to inject context into the external process.\n message_reader (PipesMessageReader): The message reader to use to read messages from the external process.\n extras (Optional[PipesExtras]): Optional extras to pass to the external process via the injected context.\n\n Yields:\n PipesSession: Interface for interacting with the external process.\n\n .. 
code-block:: python\n\n import subprocess\n from dagster import open_pipes_session\n\n extras = {"foo": "bar"}\n\n @asset\n def ext_asset(context: OpExecutionContext):\n with open_pipes_session(\n context=context,\n extras={"foo": "bar"},\n context_injector=ExtTempFileContextInjector(),\n message_reader=ExtTempFileMessageReader(),\n ) as pipes_session:\n subprocess.Popen(\n ["/bin/python", "/path/to/script.py"],\n env={**pipes_session.get_bootstrap_env_vars()}\n )\n while process.poll() is None:\n yield from pipes_session.get_results()\n\n yield from pipes_session.get_results()\n """\n context.set_requires_typed_event_stream(error_message=_FAIL_TO_YIELD_ERROR_MESSAGE)\n context_data = build_external_execution_context_data(context, extras)\n message_handler = PipesMessageHandler(context)\n try:\n with context_injector.inject_context(\n context_data\n ) as ci_params, message_handler.handle_messages(message_reader) as mr_params:\n yield PipesSession(\n context_data=context_data,\n message_handler=message_handler,\n context_injector_params=ci_params,\n message_reader_params=mr_params,\n )\n finally:\n if not message_handler.received_any_message:\n context.log.warn(\n "[pipes] did not receive any messages from external process. Check stdout / stderr"\n " logs from the external process if"\n f" possible.\\n{context_injector.__class__.__name__}:"\n f" {context_injector.no_messages_debug_text()}\\n{message_reader.__class__.__name__}:"\n f" {message_reader.no_messages_debug_text()}\\n"\n )\n elif not message_handler.received_closed_message:\n context.log.warn(\n "[pipes] did not receive closed message from external process. Buffered messages"\n " may have been discarded without being delivered. Use `open_dagster_pipes` as a"\n " context manager (a with block) to ensure that cleanup is successfully completed."\n " If that is not possible, manually call `PipesContext.close()` before process"\n " exit."\n )
\n
", "current_page_name": "_modules/dagster/_core/pipes/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.pipes.utils"}}, "run_coordinator": {"default_run_coordinator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.run_coordinator.default_run_coordinator

\nimport logging\nfrom typing import Mapping, Optional\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\n
[docs]class DefaultRunCoordinator(RunCoordinator, ConfigurableClass):\n """Immediately send runs to the run launcher."""\n\n def __init__(self, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._logger = logging.getLogger("dagster.run_coordinator.default_run_coordinator")\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: Mapping[str, object]\n ) -> Self:\n return cls(inst_data=inst_data, **config_value)\n\n def submit_run(self, context: SubmitRunContext) -> DagsterRun:\n dagster_run = context.dagster_run\n\n if dagster_run.status == DagsterRunStatus.NOT_STARTED:\n self._instance.launch_run(dagster_run.run_id, context.workspace)\n else:\n self._logger.warning(\n f"submit_run called for run {dagster_run.run_id} with status "\n f"{dagster_run.status.value}, skipping launch."\n )\n\n run = self._instance.get_run_by_id(dagster_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {dagster_run.run_id}")\n return run\n\n def cancel_run(self, run_id: str) -> bool:\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/_core/run_coordinator/default_run_coordinator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.run_coordinator.default_run_coordinator"}, "queued_run_coordinator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.run_coordinator.queued_run_coordinator

\nimport logging\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    IntSource,\n    String,\n    _check as check,\n)\nfrom dagster._builtins import Bool\nfrom dagster._config import Array, Field, Noneable, ScalarUnion, Shape\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.instance import T_DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\nclass RunQueueConfig(\n    NamedTuple(\n        "_RunQueueConfig",\n        [\n            ("max_concurrent_runs", int),\n            ("tag_concurrency_limits", Sequence[Mapping[str, Any]]),\n            ("max_user_code_failure_retries", int),\n            ("user_code_failure_retry_delay", int),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        max_concurrent_runs: int,\n        tag_concurrency_limits: Optional[Sequence[Mapping[str, Any]]],\n        max_user_code_failure_retries: int = 0,\n        user_code_failure_retry_delay: int = 60,\n    ):\n        return super(RunQueueConfig, cls).__new__(\n            cls,\n            check.int_param(max_concurrent_runs, "max_concurrent_runs"),\n            check.opt_sequence_param(tag_concurrency_limits, "tag_concurrency_limits"),\n            check.int_param(max_user_code_failure_retries, "max_user_code_failure_retries"),\n            check.int_param(user_code_failure_retry_delay, "user_code_failure_retry_delay"),\n        )\n\n\n
[docs]class QueuedRunCoordinator(RunCoordinator[T_DagsterInstance], ConfigurableClass):\n """Enqueues runs via the run storage, to be deqeueued by the Dagster Daemon process. Requires\n the Dagster Daemon process to be alive in order for runs to be launched.\n """\n\n def __init__(\n self,\n max_concurrent_runs: Optional[int] = None,\n tag_concurrency_limits: Optional[Sequence[Mapping[str, Any]]] = None,\n dequeue_interval_seconds: Optional[int] = None,\n dequeue_use_threads: Optional[bool] = None,\n dequeue_num_workers: Optional[int] = None,\n max_user_code_failure_retries: Optional[int] = None,\n user_code_failure_retry_delay: Optional[int] = None,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data: Optional[ConfigurableClassData] = check.opt_inst_param(\n inst_data, "inst_data", ConfigurableClassData\n )\n self._max_concurrent_runs: int = check.opt_int_param(\n max_concurrent_runs, "max_concurrent_runs", 10\n )\n check.invariant(\n self._max_concurrent_runs >= -1,\n "Negative values other than -1 (which disables the limit) for max_concurrent_runs"\n " are disallowed.",\n )\n self._tag_concurrency_limits: Sequence[Mapping[str, Any]] = check.opt_list_param(\n tag_concurrency_limits,\n "tag_concurrency_limits",\n )\n self._dequeue_interval_seconds: int = check.opt_int_param(\n dequeue_interval_seconds, "dequeue_interval_seconds", 5\n )\n self._dequeue_use_threads: bool = check.opt_bool_param(\n dequeue_use_threads, "dequeue_use_threads", False\n )\n self._dequeue_num_workers: Optional[int] = check.opt_int_param(\n dequeue_num_workers, "dequeue_num_workers"\n )\n self._max_user_code_failure_retries: int = check.opt_int_param(\n max_user_code_failure_retries, "max_user_code_failure_retries", 0\n )\n self._user_code_failure_retry_delay: int = check.opt_int_param(\n user_code_failure_retry_delay, "user_code_failure_retry_delay", 60\n )\n self._logger = logging.getLogger("dagster.run_coordinator.queued_run_coordinator")\n super().__init__()\n\n 
@property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n def get_run_queue_config(self) -> RunQueueConfig:\n return RunQueueConfig(\n max_concurrent_runs=self._max_concurrent_runs,\n tag_concurrency_limits=self._tag_concurrency_limits,\n max_user_code_failure_retries=self._max_user_code_failure_retries,\n user_code_failure_retry_delay=self._user_code_failure_retry_delay,\n )\n\n @property\n def dequeue_interval_seconds(self) -> int:\n return self._dequeue_interval_seconds\n\n @property\n def dequeue_use_threads(self) -> bool:\n return self._dequeue_use_threads\n\n @property\n def dequeue_num_workers(self) -> Optional[int]:\n return self._dequeue_num_workers\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {\n "max_concurrent_runs": Field(\n config=IntSource,\n is_required=False,\n description=(\n "The maximum number of runs that are allowed to be in progress at once."\n " Defaults to 10. Set to -1 to disable the limit. Set to 0 to stop any runs"\n " from launching. Any other negative values are disallowed."\n ),\n ),\n "tag_concurrency_limits": Field(\n config=Noneable(\n Array(\n Shape(\n {\n "key": String,\n "value": Field(\n ScalarUnion(\n scalar_type=String,\n non_scalar_schema=Shape({"applyLimitPerUniqueValue": Bool}),\n ),\n is_required=False,\n ),\n "limit": Field(int),\n }\n )\n )\n ),\n is_required=False,\n description=(\n "A set of limits that are applied to runs with particular tags. If a value is"\n " set, the limit is applied to only that key-value pair. If no value is set,"\n " the limit is applied across all values of that key. 
If the value is set to a"\n " dict with `applyLimitPerUniqueValue: true`, the limit will apply to the"\n " number of unique values for that key."\n ),\n ),\n "dequeue_interval_seconds": Field(\n config=IntSource,\n is_required=False,\n description=(\n "The interval in seconds at which the Dagster Daemon "\n "should periodically check the run queue for new runs to launch."\n ),\n ),\n "dequeue_use_threads": Field(\n config=bool,\n is_required=False,\n description=(\n "Whether or not to use threads for concurrency when launching dequeued runs."\n ),\n ),\n "dequeue_num_workers": Field(\n config=IntSource,\n is_required=False,\n description=(\n "If dequeue_use_threads is true, limit the number of concurrent worker threads."\n ),\n ),\n "max_user_code_failure_retries": Field(\n config=IntSource,\n is_required=False,\n default_value=0,\n description=(\n "If there is an error reaching a Dagster gRPC server while dequeuing the run,"\n " how many times to retry the dequeue before failing it. The only run launcher"\n " that requires the gRPC server to be running is the DefaultRunLauncher, so"\n " setting this will have no effect unless that run launcher is being used."\n ),\n ),\n "user_code_failure_retry_delay": Field(\n config=IntSource,\n is_required=False,\n default_value=60,\n description=(\n "If there is an error reaching a Dagster gRPC server while dequeuing the run,"\n " how long to wait before retrying any runs from that same code location. 
The"\n " only run launcher that requires the gRPC server to be running is the"\n " DefaultRunLauncher, so setting this will have no effect unless that run"\n " launcher is being used."\n ),\n ),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return cls(\n inst_data=inst_data,\n max_concurrent_runs=config_value.get("max_concurrent_runs"),\n tag_concurrency_limits=config_value.get("tag_concurrency_limits"),\n dequeue_interval_seconds=config_value.get("dequeue_interval_seconds"),\n dequeue_use_threads=config_value.get("dequeue_use_threads"),\n dequeue_num_workers=config_value.get("dequeue_num_workers"),\n max_user_code_failure_retries=config_value.get("max_user_code_failure_retries"),\n user_code_failure_retry_delay=config_value.get("user_code_failure_retry_delay"),\n )\n\n def submit_run(self, context: SubmitRunContext) -> DagsterRun:\n dagster_run = context.dagster_run\n\n if dagster_run.status == DagsterRunStatus.NOT_STARTED:\n enqueued_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_ENQUEUED.value,\n job_name=dagster_run.job_name,\n )\n self._instance.report_dagster_event(enqueued_event, run_id=dagster_run.run_id)\n else:\n # the run was already submitted, this is a no-op\n self._logger.warning(\n f"submit_run called for run {dagster_run.run_id} with status "\n f"{dagster_run.status.value}, skipping enqueue."\n )\n\n run = self._instance.get_run_by_id(dagster_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {dagster_run.run_id}")\n return run\n\n def cancel_run(self, run_id: str) -> bool:\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n # NOTE: possible race condition if the dequeuer acts on this run at the same time\n # https://github.com/dagster-io/dagster/issues/3323\n if run.status == DagsterRunStatus.QUEUED:\n self._instance.report_run_canceling(\n run,\n message="Canceling run from the queue.",\n )\n 
self._instance.report_run_canceled(run)\n return True\n else:\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/_core/run_coordinator/queued_run_coordinator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.run_coordinator.queued_run_coordinator"}}, "scheduler": {"scheduler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.scheduler.scheduler

\nimport abc\nimport os\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._config import Field, IntSource\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.errors import DagsterError\nfrom dagster._core.host_representation import ExternalSchedule\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.scheduler.instigation import (\n    InstigatorState,\n    InstigatorStatus,\n    ScheduleInstigatorData,\n)\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._seven import get_current_datetime_in_utc\nfrom dagster._utils import mkdir_p\n\n\nclass DagsterSchedulerError(DagsterError):\n    """Base class for all Dagster Scheduler errors."""\n\n\nclass DagsterScheduleDoesNotExist(DagsterSchedulerError):\n    """Errors raised when fetching a schedule."""\n\n\nclass SchedulerDebugInfo(\n    NamedTuple(\n        "SchedulerDebugInfo",\n        [\n            ("errors", Sequence[str]),\n            ("scheduler_config_info", str),\n            ("scheduler_info", str),\n            ("schedule_storage", Sequence[str]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        errors: Sequence[str],\n        scheduler_config_info: str,\n        scheduler_info: str,\n        schedule_storage: Sequence[str],\n    ):\n        return super(SchedulerDebugInfo, cls).__new__(\n            cls,\n            errors=check.sequence_param(errors, "errors", of_type=str),\n            scheduler_config_info=check.str_param(scheduler_config_info, "scheduler_config_info"),\n            scheduler_info=check.str_param(scheduler_info, "scheduler_info"),\n            schedule_storage=check.sequence_param(\n                schedule_storage, "schedule_storage", of_type=str\n            ),\n        )\n\n\n
[docs]class Scheduler(abc.ABC):\n """Abstract base class for a scheduler. This component is responsible for interfacing with\n an external system such as cron to ensure scheduled repeated execution according.\n """\n\n def start_schedule(\n self, instance: DagsterInstance, external_schedule: ExternalSchedule\n ) -> InstigatorState:\n """Updates the status of the given schedule to `InstigatorStatus.RUNNING` in schedule storage,.\n\n This should not be overridden by subclasses.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start\n\n """\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(\n external_schedule.get_external_origin_id(), external_schedule.selector_id\n )\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n new_instigator_data = ScheduleInstigatorData(\n external_schedule.cron_schedule,\n get_current_datetime_in_utc().timestamp(),\n )\n\n if not stored_state:\n started_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.RUNNING,\n new_instigator_data,\n )\n instance.add_instigator_state(started_state)\n else:\n started_state = stored_state.with_status(InstigatorStatus.RUNNING).with_data(\n new_instigator_data\n )\n instance.update_instigator_state(started_state)\n return started_state\n\n def stop_schedule(\n self,\n instance: DagsterInstance,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional[ExternalSchedule],\n ) -> InstigatorState:\n """Updates the status of the given schedule to `InstigatorStatus.STOPPED` in schedule storage,.\n\n This should not be overridden by subclasses.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to stop running.\n """\n 
check.str_param(schedule_origin_id, "schedule_origin_id")\n check.opt_inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(schedule_origin_id, schedule_selector_id)\n\n if not external_schedule:\n computed_state = stored_state\n else:\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n\n if computed_state and not computed_state.is_running:\n return computed_state\n\n if not stored_state:\n assert external_schedule\n stopped_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.STOPPED,\n ScheduleInstigatorData(\n external_schedule.cron_schedule,\n ),\n )\n instance.add_instigator_state(stopped_state)\n else:\n stopped_state = stored_state.with_status(InstigatorStatus.STOPPED).with_data(\n ScheduleInstigatorData(\n cron_schedule=computed_state.instigator_data.cron_schedule, # type: ignore\n )\n )\n instance.update_instigator_state(stopped_state)\n\n return stopped_state\n\n @abc.abstractmethod\n def debug_info(self) -> str:\n """Returns debug information about the scheduler."""\n\n @abc.abstractmethod\n def get_logs_path(self, instance: DagsterInstance, schedule_origin_id: str) -> str:\n """Get path to store logs for schedule.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to retrieve the log path for\n """
\n\n\nDEFAULT_MAX_CATCHUP_RUNS = 5\n\n\n
[docs]class DagsterDaemonScheduler(Scheduler, ConfigurableClass):\n """Default scheduler implementation that submits runs from the `dagster-daemon`\n long-lived process. Periodically checks each running schedule for execution times that don't\n have runs yet and launches them.\n """\n\n def __init__(\n self,\n max_catchup_runs: int = DEFAULT_MAX_CATCHUP_RUNS,\n max_tick_retries: int = 0,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self.max_catchup_runs = check.opt_int_param(\n max_catchup_runs, "max_catchup_runs", DEFAULT_MAX_CATCHUP_RUNS\n )\n self.max_tick_retries = check.opt_int_param(max_tick_retries, "max_tick_retries", 0)\n self._inst_data = inst_data\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "max_catchup_runs": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_MAX_CATCHUP_RUNS,\n description="""For partitioned schedules, controls the maximum number of past\n partitions for each schedule that will be considered when looking for missing\n runs . Generally this parameter will only come into play if the scheduler\n falls behind or launches after experiencing downtime. 
This parameter will not be checked for\n schedules without partition sets (for example, schedules created using the @schedule\n decorator) - only the most recent execution time will be considered for those schedules.\n\n Note that no matter what this value is, the scheduler will never launch a run from a time\n before the schedule was turned on (even if the start_date on the schedule is earlier) - if\n you want to launch runs for earlier partitions, launch a backfill.\n """,\n ),\n "max_tick_retries": Field(\n IntSource,\n default_value=0,\n is_required=False,\n description=(\n "For each schedule tick that raises an error, how many times to retry that tick"\n ),\n ),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DagsterDaemonScheduler(inst_data=inst_data, **config_value)\n\n def debug_info(self) -> str:\n return ""\n\n def wipe(self, instance: DagsterInstance) -> None:\n pass\n\n def _get_or_create_logs_directory(\n self, instance: DagsterInstance, schedule_origin_id: str\n ) -> str:\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)\n if not os.path.isdir(logs_directory):\n mkdir_p(logs_directory)\n\n return logs_directory\n\n def get_logs_path(self, instance: DagsterInstance, schedule_origin_id: str) -> str:\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n return os.path.join(logs_directory, "scheduler.log")
\n
", "current_page_name": "_modules/dagster/_core/scheduler/scheduler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.scheduler.scheduler"}}, "storage": {"asset_value_loader": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.asset_value_loader

\nfrom contextlib import ExitStack\nfrom typing import Any, Dict, Mapping, Optional, Type, cast\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.job_definition import (\n    default_job_io_manager_with_fs_io_manager_schema,\n)\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom dagster._core.execution.build_resources import build_resources, get_mapped_resource_config\nfrom dagster._core.execution.context.input import build_input_context\nfrom dagster._core.execution.context.output import build_output_context\nfrom dagster._core.execution.resources_init import get_transitive_required_resource_keys\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.config import is_dagster_home_set\nfrom dagster._core.types.dagster_type import resolve_dagster_type\nfrom dagster._utils.merger import merge_dicts\n\nfrom .io_manager import IOManager\n\n\n
[docs]class AssetValueLoader:\n """Caches resource definitions that are used to load asset values across multiple load\n invocations.\n\n Should not be instantiated directly. Instead, use\n :py:meth:`~dagster.RepositoryDefinition.get_asset_value_loader`.\n """\n\n def __init__(\n self,\n assets_defs_by_key: Mapping[AssetKey, AssetsDefinition],\n source_assets_by_key: Mapping[AssetKey, SourceAsset],\n instance: Optional[DagsterInstance] = None,\n ):\n self._assets_defs_by_key = assets_defs_by_key\n self._source_assets_by_key = source_assets_by_key\n self._resource_instance_cache: Dict[str, object] = {}\n self._exit_stack: ExitStack = ExitStack().__enter__()\n if not instance and is_dagster_home_set():\n self._instance = self._exit_stack.enter_context(DagsterInstance.get())\n else:\n self._instance = instance\n\n def _ensure_resource_instances_in_cache(\n self,\n resource_defs: Mapping[str, ResourceDefinition],\n resource_config: Optional[Mapping[str, Any]] = None,\n ):\n for built_resource_key, built_resource in (\n self._exit_stack.enter_context(\n build_resources(\n resources={\n resource_key: self._resource_instance_cache.get(resource_key, resource_def)\n for resource_key, resource_def in resource_defs.items()\n },\n instance=self._instance,\n resource_config=resource_config,\n )\n )\n ._asdict()\n .items()\n ):\n self._resource_instance_cache[built_resource_key] = built_resource\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type[object]] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n ) -> object:\n """Loads the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n resource_config (Optional[Any]): A dictionary of resource configurations to be passed\n to the :py:class:`IOManager`.\n\n Returns:\n The contents of an asset as a Python object.\n """\n asset_key = AssetKey.from_coercible(asset_key)\n resource_config = resource_config or {}\n output_metadata = {}\n\n if asset_key in self._assets_defs_by_key:\n assets_def = self._assets_defs_by_key[asset_key]\n\n resource_defs = merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: default_job_io_manager_with_fs_io_manager_schema},\n assets_def.resource_defs,\n )\n io_manager_key = assets_def.get_io_manager_key_for_asset_key(asset_key)\n io_manager_def = resource_defs[io_manager_key]\n name = assets_def.get_output_name_for_asset_key(asset_key)\n output_metadata = assets_def.metadata_by_key[asset_key]\n op_def = assets_def.get_op_def_for_asset_key(asset_key)\n asset_partitions_def = assets_def.partitions_def\n elif asset_key in self._source_assets_by_key:\n source_asset = self._source_assets_by_key[asset_key]\n\n resource_defs = merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: 
default_job_io_manager_with_fs_io_manager_schema},\n source_asset.resource_defs,\n )\n io_manager_key = source_asset.get_io_manager_key()\n io_manager_def = resource_defs[io_manager_key]\n name = asset_key.path[-1]\n output_metadata = source_asset.raw_metadata\n op_def = None\n asset_partitions_def = source_asset.partitions_def\n else:\n check.failed(f"Asset key {asset_key} not found")\n\n required_resource_keys = get_transitive_required_resource_keys(\n io_manager_def.required_resource_keys, resource_defs\n ) | {io_manager_key}\n\n self._ensure_resource_instances_in_cache(\n {k: v for k, v in resource_defs.items() if k in required_resource_keys},\n resource_config=resource_config,\n )\n io_manager = cast(IOManager, self._resource_instance_cache[io_manager_key])\n\n io_config = resource_config.get(io_manager_key)\n io_resource_config = {io_manager_key: io_config} if io_config else {}\n\n io_manager_config = get_mapped_resource_config(\n {io_manager_key: io_manager_def}, io_resource_config\n )\n\n input_context = build_input_context(\n name=None,\n asset_key=asset_key,\n dagster_type=resolve_dagster_type(python_type),\n upstream_output=build_output_context(\n name=name,\n metadata=output_metadata,\n asset_key=asset_key,\n op_def=op_def,\n resource_config=resource_config,\n ),\n resources=self._resource_instance_cache,\n resource_config=io_manager_config[io_manager_key].config,\n partition_key=partition_key,\n asset_partition_key_range=(\n PartitionKeyRange(partition_key, partition_key)\n if partition_key is not None\n else None\n ),\n asset_partitions_def=asset_partitions_def,\n instance=self._instance,\n metadata=metadata,\n )\n\n return io_manager.load_input(input_context)
\n\n def __enter__(self):\n return self\n\n def __exit__(self, *exc):\n self._exit_stack.close()
\n
", "current_page_name": "_modules/dagster/_core/storage/asset_value_loader", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.asset_value_loader"}, "base_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.base_storage

\nfrom abc import ABC, abstractmethod\n\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\n\nfrom .event_log.base import EventLogStorage\nfrom .runs.base import RunStorage\nfrom .schedules.base import ScheduleStorage\n\n\n
[docs]class DagsterStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for Dagster persistent storage, for reading and writing data for runs,\n events, and schedule/sensor state.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-daemon`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @property\n @abstractmethod\n def event_log_storage(self) -> EventLogStorage[T_DagsterInstance]:\n raise NotImplementedError()\n\n @property\n @abstractmethod\n def run_storage(self) -> RunStorage[T_DagsterInstance]:\n raise NotImplementedError()\n\n @property\n @abstractmethod\n def schedule_storage(self) -> ScheduleStorage[T_DagsterInstance]:\n raise NotImplementedError()
\n
", "current_page_name": "_modules/dagster/_core/storage/base_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.base_storage"}, "captured_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.captured_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import IO, Callable, Generator, Iterator, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Final, Self\n\nimport dagster._check as check\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\n\nMAX_BYTES_CHUNK_READ: Final = 4194304  # 4 MB\n\n\nclass CapturedLogContext(\n    NamedTuple(\n        "_CapturedLogContext",\n        [\n            ("log_key", Sequence[str]),\n            ("external_url", Optional[str]),\n            ("external_stdout_url", Optional[str]),\n            ("external_stderr_url", Optional[str]),\n        ],\n    )\n):\n    """Object representing the context in which logs are captured.  Can be used by external logging\n    sidecar implementations to point the Dagster UI to an external url to view compute logs instead of a\n    Dagster-managed location.\n    """\n\n    def __new__(\n        cls,\n        log_key: Sequence[str],\n        external_stdout_url: Optional[str] = None,\n        external_stderr_url: Optional[str] = None,\n        external_url: Optional[str] = None,\n    ):\n        if external_url and (external_stdout_url or external_stderr_url):\n            check.failed(\n                "Cannot specify both `external_url` and one of"\n                " `external_stdout_url`/`external_stderr_url`"\n            )\n\n        return super(CapturedLogContext, cls).__new__(\n            cls,\n            log_key,\n            external_stdout_url=external_stdout_url,\n            external_stderr_url=external_stderr_url,\n            external_url=external_url,\n        )\n\n\nclass CapturedLogData(\n    NamedTuple(\n        "_CapturedLogData",\n        [\n            ("log_key", Sequence[str]),\n            ("stdout", Optional[bytes]),\n            ("stderr", Optional[bytes]),\n            ("cursor", Optional[str]),\n        ],\n    )\n):\n    """Object representing captured log data, either a partial chunk of the log data 
or the full\n    capture.  Contains the raw bytes and optionally the cursor offset for the partial chunk.\n    """\n\n    def __new__(\n        cls,\n        log_key: Sequence[str],\n        stdout: Optional[bytes] = None,\n        stderr: Optional[bytes] = None,\n        cursor: Optional[str] = None,\n    ):\n        return super(CapturedLogData, cls).__new__(cls, log_key, stdout, stderr, cursor)\n\n\nclass CapturedLogMetadata(\n    NamedTuple(\n        "_CapturedLogMetadata",\n        [\n            ("stdout_location", Optional[str]),\n            ("stderr_location", Optional[str]),\n            ("stdout_download_url", Optional[str]),\n            ("stderr_download_url", Optional[str]),\n        ],\n    )\n):\n    """Object representing metadata info for the captured log data, containing a display string for\n    the location of the log data and a URL for direct download of the captured log data.\n    """\n\n    def __new__(\n        cls,\n        stdout_location: Optional[str] = None,\n        stderr_location: Optional[str] = None,\n        stdout_download_url: Optional[str] = None,\n        stderr_download_url: Optional[str] = None,\n    ):\n        return super(CapturedLogMetadata, cls).__new__(\n            cls,\n            stdout_location=stdout_location,\n            stderr_location=stderr_location,\n            stdout_download_url=stdout_download_url,\n            stderr_download_url=stderr_download_url,\n        )\n\n\nclass CapturedLogSubscription:\n    def __init__(\n        self, manager: "CapturedLogManager", log_key: Sequence[str], cursor: Optional[str]\n    ):\n        self._manager = manager\n        self._log_key = log_key\n        self._cursor = cursor\n        self._observer: Optional[Callable[[CapturedLogData], None]] = None\n        self.is_complete = False\n\n    def __call__(self, observer: Optional[Callable[[CapturedLogData], None]]) -> Self:\n        self._observer = observer\n        self.fetch()\n        if 
self._manager.is_capture_complete(self._log_key):\n            self.complete()\n        return self\n\n    @property\n    def log_key(self) -> Sequence[str]:\n        return self._log_key\n\n    def dispose(self) -> None:\n        self._observer = None\n        self._manager.unsubscribe(self)\n\n    def fetch(self) -> None:\n        if not self._observer:\n            return\n\n        should_fetch = True\n        while should_fetch:\n            log_data = self._manager.get_log_data(\n                self._log_key,\n                self._cursor,\n                max_bytes=MAX_BYTES_CHUNK_READ,\n            )\n            if not self._cursor or log_data.cursor != self._cursor:\n                self._observer(log_data)\n                self._cursor = log_data.cursor\n            should_fetch = _has_max_data(log_data.stdout) or _has_max_data(log_data.stderr)\n\n    def complete(self) -> None:\n        self.is_complete = True\n\n\ndef _has_max_data(chunk: Optional[bytes]) -> bool:\n    # function is used as predicate but does not actually return a boolean\n    return chunk and len(chunk) >= MAX_BYTES_CHUNK_READ  # type: ignore\n\n\n
[docs]class CapturedLogManager(ABC):\n """Abstract base class for capturing the unstructured logs (stdout/stderr) in the current\n process, stored / retrieved with a provided log_key.\n """\n\n @abstractmethod\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n """Context manager for capturing the stdout/stderr within the current process, and persisting\n it under the given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n """\n\n @abstractmethod\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Iterator[Optional[IO[bytes]]]:\n """Context manager for providing an IO stream that enables the caller to write to a log stream\n managed by the captured log manager, to be read later using the given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n """\n\n @abstractmethod\n def is_capture_complete(self, log_key: Sequence[str]) -> bool:\n """Flag indicating when the log capture for a given log key has completed.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def get_log_data(\n self,\n log_key: Sequence[str],\n cursor: Optional[str] = None,\n max_bytes: Optional[int] = None,\n ) -> CapturedLogData:\n """Returns a chunk of the captured stdout logs for a given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n cursor (Optional[str]): A cursor representing the position of the log chunk to fetch\n max_bytes (Optional[int]): A limit on the size of the log chunk to fetch\n\n Returns:\n CapturedLogData\n """\n\n @abstractmethod\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n """Returns the metadata of the captured logs for a given log key, including\n displayable information on where the logs are persisted.\n\n Args:\n log_key (List[String]): 
The log key identifying the captured logs\n\n Returns:\n CapturedLogMetadata\n """\n\n @abstractmethod\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ) -> None:\n """Deletes the captured logs for a given log key.\n\n Args:\n log_key(Optional[List[String]]): The log key of the logs to delete\n prefix(Optional[List[String]]): The prefix of the log keys to delete\n """\n\n @abstractmethod\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n """Registers an observable object for log data.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n cursor (Optional[String]): The string cursor marking the position within the log stream\n Returns:\n ComputeLogSubscription\n """\n\n @abstractmethod\n def unsubscribe(self, subscription: CapturedLogSubscription) -> None:\n """Deregisters an observable object from receiving log updates.\n\n Args:\n subscription (CapturedLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def build_log_key_for_run(self, run_id: str, step_key: str) -> Sequence[str]:\n """Legacy adapter to translate run_id/key to captured log manager-based log_key."""\n return [run_id, "compute_logs", step_key]
\n
", "current_page_name": "_modules/dagster/_core/storage/captured_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.captured_log_manager"}, "compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.compute_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom enum import Enum\nfrom typing import Callable, Iterator, NamedTuple, Optional\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nMAX_BYTES_FILE_READ = 33554432  # 32 MB\nMAX_BYTES_CHUNK_READ = 4194304  # 4 MB\n\n\nclass ComputeIOType(Enum):\n    STDOUT = "stdout"\n    STDERR = "stderr"\n\n\nclass ComputeLogFileData(\n    NamedTuple(\n        "ComputeLogFileData",\n        [\n            ("path", str),\n            ("data", Optional[str]),\n            ("cursor", int),\n            ("size", int),\n            ("download_url", Optional[str]),\n        ],\n    )\n):\n    """Representation of a chunk of compute execution log data."""\n\n    def __new__(\n        cls, path: str, data: Optional[str], cursor: int, size: int, download_url: Optional[str]\n    ):\n        return super(ComputeLogFileData, cls).__new__(\n            cls,\n            path=check.str_param(path, "path"),\n            data=check.opt_str_param(data, "data"),\n            cursor=check.int_param(cursor, "cursor"),\n            size=check.int_param(size, "size"),\n            download_url=check.opt_str_param(download_url, "download_url"),\n        )\n\n\n
[docs]class ComputeLogManager(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\n steps of pipeline solids.\n """\n\n @contextmanager\n def watch(self, dagster_run: DagsterRun, step_key: Optional[str] = None) -> Iterator[None]:\n """Watch the stdout/stderr for a given execution for a given run_id / step_key and persist it.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n\n if not self.enabled(dagster_run, step_key):\n yield\n return\n\n self.on_watch_start(dagster_run, step_key)\n with self._watch_logs(dagster_run, step_key):\n yield\n self.on_watch_finish(dagster_run, step_key)\n\n @contextmanager\n @abstractmethod\n def _watch_logs(\n self, dagster_run: DagsterRun, step_key: Optional[str] = None\n ) -> Iterator[None]:\n """Method to watch the stdout/stderr logs for a given run_id / step_key. Kept separate from\n blessed `watch` method, which triggers all the start/finish hooks that are necessary to\n implement the different remote implementations.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Get the local path of the logfile for a given execution step. This determines the\n location on the local filesystem to which stdout/stderr will be rerouted.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either ComputeIOType.STDOUT or\n ComputeIOType.STDERR\n\n Returns:\n str\n """\n ...\n\n @abstractmethod\n def is_watch_completed(self, run_id: str, key: str) -> bool:\n """Flag indicating when computation for a given execution step has completed.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def on_watch_start(self, dagster_run: DagsterRun, step_key: Optional[str]) -> None:\n """Hook called when starting to watch compute logs.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def on_watch_finish(self, dagster_run: DagsterRun, step_key: Optional[str]) -> None:\n """Hook called when computation for a given execution step is finished.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def download_url(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Get a URL where the logs can be downloaded.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n\n Returns:\n String\n """\n\n @abstractmethod\n def read_logs_file(\n self,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int = 0,\n max_bytes: int = MAX_BYTES_FILE_READ,\n ) -> ComputeLogFileData:\n """Get compute log data for a given compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n max_bytes (Optional[Int]): Maximum number of bytes to be read and returned\n\n Returns:\n ComputeLogFileData\n """\n\n def enabled(self, _dagster_run: DagsterRun, _step_key: Optional[str]) -> bool:\n """Hook for disabling compute log capture.\n\n Args:\n _step_key (Optional[String]): The step_key for a compute step\n\n Returns:\n Boolean\n """\n return True\n\n @abstractmethod\n def on_subscribe(self, subscription: "ComputeLogSubscription") -> None:\n """Hook for managing streaming subscriptions for log data from `dagster-webserver`.\n\n Args:\n subscription (ComputeLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def on_unsubscribe(self, subscription: "ComputeLogSubscription") -> None:\n pass\n\n def observable(\n self, run_id: str, key: str, io_type: ComputeIOType, cursor: Optional[str] = None\n ) -> "ComputeLogSubscription":\n """Return a ComputeLogSubscription which streams back log data from the execution logs for a given\n compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n\n Returns:\n Observable\n """\n check.str_param(run_id, "run_id")\n check.str_param(key, "key")\n check.inst_param(io_type, "io_type", ComputeIOType)\n check.opt_str_param(cursor, "cursor")\n\n if cursor:\n cursor = int(cursor) # type: ignore # (var reassigned diff type)\n else:\n cursor = 0 # type: ignore # (var reassigned diff type)\n\n subscription = ComputeLogSubscription(self, run_id, key, io_type, cursor) # type: ignore # (var reassigned diff type)\n self.on_subscribe(subscription)\n return subscription\n\n def dispose(self):\n pass
\n\n\nclass ComputeLogSubscription:\n """Observable object that generates ComputeLogFileData objects as compute step execution logs\n are written.\n """\n\n def __init__(\n self,\n manager: ComputeLogManager,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int,\n ):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.io_type = io_type\n self.cursor = cursor\n self.observer: Optional[Callable[[ComputeLogFileData], None]] = None\n self.is_complete = False\n\n def __call__(self, observer: Callable[[ComputeLogFileData], None]) -> Self:\n self.observer = observer\n self.fetch()\n if self.manager.is_watch_completed(self.run_id, self.key):\n self.complete()\n return self\n\n def dispose(self) -> None:\n # called when the connection gets closed, allowing the observer to get GC'ed\n self.observer = None\n self.manager.on_unsubscribe(self)\n\n def fetch(self) -> None:\n if not self.observer:\n return\n\n should_fetch = True\n while should_fetch:\n update = self.manager.read_logs_file(\n self.run_id,\n self.key,\n self.io_type,\n self.cursor,\n max_bytes=MAX_BYTES_CHUNK_READ,\n )\n if not self.cursor or update.cursor != self.cursor:\n self.observer(update)\n self.cursor = update.cursor\n should_fetch = update.data and len(update.data.encode("utf-8")) >= MAX_BYTES_CHUNK_READ\n\n def complete(self) -> None:\n self.is_complete = True\n if not self.observer:\n return\n
", "current_page_name": "_modules/dagster/_core/storage/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.compute_log_manager"}, "dagster_run": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.dagster_run

\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.tags import PARENT_RUN_ID_TAG, ROOT_RUN_ID_TAG\nfrom dagster._core.utils import make_new_run_id\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\n\nfrom .tags import (\n    BACKFILL_ID_TAG,\n    REPOSITORY_LABEL_TAG,\n    RESUME_RETRY_TAG,\n    SCHEDULE_NAME_TAG,\n    SENSOR_NAME_TAG,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.host_representation.external import ExternalSchedule, ExternalSensor\n    from dagster._core.host_representation.origin import ExternalJobOrigin\n\n\n
[docs]@whitelist_for_serdes(storage_name="PipelineRunStatus")\nclass DagsterRunStatus(Enum):\n """The status of run execution."""\n\n # Runs waiting to be launched by the Dagster Daemon.\n QUEUED = "QUEUED"\n\n # Runs that have been launched, but execution has not yet started."""\n NOT_STARTED = "NOT_STARTED"\n\n # Runs that are managed outside of the Dagster control plane.\n MANAGED = "MANAGED"\n\n # Runs that have been launched, but execution has not yet started.\n STARTING = "STARTING"\n\n # Runs that have been launched and execution has started.\n STARTED = "STARTED"\n\n # Runs that have successfully completed.\n SUCCESS = "SUCCESS"\n\n # Runs that have failed to complete.\n FAILURE = "FAILURE"\n\n # Runs that are in-progress and pending to be canceled.\n CANCELING = "CANCELING"\n\n # Runs that have been canceled before completion.\n CANCELED = "CANCELED"
\n\n\n# These statuses that indicate a run may be using compute resources\nIN_PROGRESS_RUN_STATUSES = [\n DagsterRunStatus.STARTING,\n DagsterRunStatus.STARTED,\n DagsterRunStatus.CANCELING,\n]\n\n# This serves as an explicit list of run statuses that indicate that the run is not using compute\n# resources. This and the enum above should cover all run statuses.\nNON_IN_PROGRESS_RUN_STATUSES = [\n DagsterRunStatus.QUEUED,\n DagsterRunStatus.NOT_STARTED,\n DagsterRunStatus.SUCCESS,\n DagsterRunStatus.FAILURE,\n DagsterRunStatus.MANAGED,\n DagsterRunStatus.CANCELED,\n]\n\nFINISHED_STATUSES = [\n DagsterRunStatus.SUCCESS,\n DagsterRunStatus.FAILURE,\n DagsterRunStatus.CANCELED,\n]\n\n# Run statuses for runs that can be safely canceled.\n# Does not include the other unfinished statuses for the following reasons:\n# STARTING: Control has been ceded to the run worker, which will eventually move the run to a STARTED.\n# NOT_STARTED: Mostly replaced with STARTING. Runs are only here in the the brief window between\n# creating the run and launching or enqueueing it.\nCANCELABLE_RUN_STATUSES = [DagsterRunStatus.STARTED, DagsterRunStatus.QUEUED]\n\n\n@whitelist_for_serdes(storage_name="PipelineRunStatsSnapshot")\nclass DagsterRunStatsSnapshot(\n NamedTuple(\n "_DagsterRunStatsSnapshot",\n [\n ("run_id", str),\n ("steps_succeeded", int),\n ("steps_failed", int),\n ("materializations", int),\n ("expectations", int),\n ("enqueued_time", Optional[float]),\n ("launch_time", Optional[float]),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n steps_succeeded: int,\n steps_failed: int,\n materializations: int,\n expectations: int,\n enqueued_time: Optional[float],\n launch_time: Optional[float],\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(DagsterRunStatsSnapshot, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n steps_succeeded=check.int_param(steps_succeeded, 
"steps_succeeded"),\n steps_failed=check.int_param(steps_failed, "steps_failed"),\n materializations=check.int_param(materializations, "materializations"),\n expectations=check.int_param(expectations, "expectations"),\n enqueued_time=check.opt_float_param(enqueued_time, "enqueued_time"),\n launch_time=check.opt_float_param(launch_time, "launch_time"),\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\nclass DagsterRunSerializer(NamedTupleSerializer["DagsterRun"]):\n # serdes log\n # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve\n # * added pipeline_snapshot_id\n # * renamed previous_run_id -> parent_run_id, added root_run_id\n # * added execution_plan_snapshot_id\n # * removed selector\n # * added solid_subset\n # * renamed solid_subset -> solid_selection, added solids_to_execute\n # * renamed environment_dict -> run_config\n # * added asset_selection\n # * added has_repository_load_data\n def before_unpack(self, context, unpacked_dict: Dict[str, Any]) -> Dict[str, Any]:\n # back compat for environment dict => run_config\n if "environment_dict" in unpacked_dict:\n check.invariant(\n unpacked_dict.get("run_config") is None,\n "Cannot set both run_config and environment_dict. 
Use run_config parameter.",\n )\n unpacked_dict["run_config"] = unpacked_dict["environment_dict"]\n del unpacked_dict["environment_dict"]\n\n # back compat for previous_run_id => parent_run_id, root_run_id\n if "previous_run_id" in unpacked_dict and not (\n "parent_run_id" in unpacked_dict and "root_run_id" in unpacked_dict\n ):\n unpacked_dict["parent_run_id"] = unpacked_dict["previous_run_id"]\n unpacked_dict["root_run_id"] = unpacked_dict["previous_run_id"]\n del unpacked_dict["previous_run_id"]\n\n # back compat for selector => pipeline_name, solids_to_execute\n if "selector" in unpacked_dict:\n selector = unpacked_dict["selector"]\n\n if not isinstance(selector, ExecutionSelector):\n check.failed(f"unexpected entry for 'select', {selector}")\n selector_name = selector.name\n selector_subset = selector.solid_subset\n\n job_name = unpacked_dict.get("pipeline_name")\n check.invariant(\n job_name is None or selector_name == job_name,\n f"Conflicting pipeline name {job_name} in arguments to PipelineRun: "\n f"selector was passed with pipeline {selector_name}",\n )\n if job_name is None:\n unpacked_dict["pipeline_name"] = selector_name\n\n solids_to_execute = unpacked_dict.get("solids_to_execute")\n check.invariant(\n solids_to_execute is None\n or (selector_subset and set(selector_subset) == solids_to_execute),\n f"Conflicting solids_to_execute {solids_to_execute} in arguments to"\n f" PipelineRun: selector was passed with subset {selector_subset}",\n )\n # for old runs that only have selector but no solids_to_execute\n if solids_to_execute is None:\n solids_to_execute = frozenset(selector_subset) if selector_subset else None\n\n # back compat for solid_subset => solids_to_execute\n if "solid_subset" in unpacked_dict:\n unpacked_dict["solids_to_execute"] = unpacked_dict["solid_subset"]\n del unpacked_dict["solid_subset"]\n\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n serializer=DagsterRunSerializer,\n # DagsterRun is serialized as PipelineRun so that it can be read by older (pre 0.13.x) version\n # of Dagster, but is read back in as a DagsterRun.\n storage_name="PipelineRun",\n old_fields={"mode": None},\n storage_field_names={\n "job_name": "pipeline_name",\n "job_snapshot_id": "pipeline_snapshot_id",\n "external_job_origin": "external_pipeline_origin",\n "job_code_origin": "pipeline_code_origin",\n "op_selection": "solid_selection",\n "resolved_op_selection": "solids_to_execute",\n },\n)\nclass DagsterRun(\n NamedTuple(\n "_DagsterRun",\n [\n ("job_name", PublicAttr[str]),\n ("run_id", str),\n ("run_config", Mapping[str, object]),\n ("asset_selection", Optional[AbstractSet[AssetKey]]),\n ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n ("op_selection", Optional[Sequence[str]]),\n ("resolved_op_selection", Optional[AbstractSet[str]]),\n ("step_keys_to_execute", Optional[Sequence[str]]),\n ("status", DagsterRunStatus),\n ("tags", Mapping[str, str]),\n ("root_run_id", Optional[str]),\n ("parent_run_id", Optional[str]),\n ("job_snapshot_id", Optional[str]),\n ("execution_plan_snapshot_id", Optional[str]),\n ("external_job_origin", Optional["ExternalJobOrigin"]),\n ("job_code_origin", Optional[JobPythonOrigin]),\n ("has_repository_load_data", bool),\n ],\n )\n):\n """Serializable internal representation of a dagster run, as stored in a\n :py:class:`~dagster._core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n job_name: str,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n op_selection: Optional[Sequence[str]] = None,\n resolved_op_selection: Optional[AbstractSet[str]] = None,\n step_keys_to_execute: Optional[Sequence[str]] = None,\n status: Optional[DagsterRunStatus] = None,\n tags: Optional[Mapping[str, 
str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n job_snapshot_id: Optional[str] = None,\n execution_plan_snapshot_id: Optional[str] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n has_repository_load_data: Optional[bool] = None,\n ):\n check.invariant(\n (root_run_id is not None and parent_run_id is not None)\n or (root_run_id is None and parent_run_id is None),\n "Must set both root_run_id and parent_run_id when creating a PipelineRun that "\n "belongs to a run group",\n )\n # a set which contains the names of the ops to execute\n resolved_op_selection = check.opt_nullable_set_param(\n resolved_op_selection, "resolved_op_selection", of_type=str\n )\n # a list of op queries provided by the user\n # possible to be None when resolved_op_selection is set by the user directly\n op_selection = check.opt_nullable_sequence_param(op_selection, "op_selection", of_type=str)\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n asset_selection = check.opt_nullable_set_param(\n asset_selection, "asset_selection", of_type=AssetKey\n )\n asset_check_selection = check.opt_nullable_set_param(\n asset_check_selection, "asset_check_selection", of_type=AssetCheckKey\n )\n\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n if status == DagsterRunStatus.QUEUED:\n check.inst_param(\n external_job_origin,\n "external_job_origin",\n ExternalJobOrigin,\n "external_job_origin is required for queued runs",\n )\n\n if run_id is None:\n run_id = make_new_run_id()\n\n return super(DagsterRun, cls).__new__(\n cls,\n job_name=check.str_param(job_name, "job_name"),\n run_id=check.str_param(run_id, "run_id"),\n run_config=check.opt_mapping_param(run_config, "run_config", key_type=str),\n 
op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=check.opt_inst_param(\n status, "status", DagsterRunStatus, DagsterRunStatus.NOT_STARTED\n ),\n tags=check.opt_mapping_param(tags, "tags", key_type=str, value_type=str),\n root_run_id=check.opt_str_param(root_run_id, "root_run_id"),\n parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"),\n job_snapshot_id=check.opt_str_param(job_snapshot_id, "job_snapshot_id"),\n execution_plan_snapshot_id=check.opt_str_param(\n execution_plan_snapshot_id, "execution_plan_snapshot_id"\n ),\n external_job_origin=check.opt_inst_param(\n external_job_origin, "external_job_origin", ExternalJobOrigin\n ),\n job_code_origin=check.opt_inst_param(\n job_code_origin, "job_code_origin", JobPythonOrigin\n ),\n has_repository_load_data=check.opt_bool_param(\n has_repository_load_data, "has_repository_load_data", default=False\n ),\n )\n\n def with_status(self, status: DagsterRunStatus) -> Self:\n if status == DagsterRunStatus.QUEUED:\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n check.inst(\n self.external_job_origin,\n ExternalJobOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n return self._replace(status=status)\n\n def with_job_origin(self, origin: "ExternalJobOrigin") -> Self:\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n check.inst_param(origin, "origin", ExternalJobOrigin)\n return self._replace(external_job_origin=origin)\n\n def with_tags(self, tags: Mapping[str, str]) -> Self:\n return self._replace(tags=tags)\n\n def get_root_run_id(self) -> Optional[str]:\n return self.tags.get(ROOT_RUN_ID_TAG)\n\n def get_parent_run_id(self) -> Optional[str]:\n return 
self.tags.get(PARENT_RUN_ID_TAG)\n\n def tags_for_storage(self) -> Mapping[str, str]:\n repository_tags = {}\n if self.external_job_origin:\n # tag the run with a label containing the repository name / location name, to allow for\n # per-repository filtering of runs from the Dagster UI.\n repository_tags[REPOSITORY_LABEL_TAG] = (\n self.external_job_origin.external_repository_origin.get_label()\n )\n\n if not self.tags:\n return repository_tags\n\n return {**repository_tags, **self.tags}\n\n @public\n @property\n def is_finished(self) -> bool:\n """bool: If this run has completely finished execution."""\n return self.status in FINISHED_STATUSES\n\n @public\n @property\n def is_success(self) -> bool:\n """bool: If this run has successfully finished executing."""\n return self.status == DagsterRunStatus.SUCCESS\n\n @public\n @property\n def is_failure(self) -> bool:\n """bool: If this run has failed."""\n return self.status == DagsterRunStatus.FAILURE\n\n @public\n @property\n def is_failure_or_canceled(self) -> bool:\n """bool: If this run has either failed or was canceled."""\n return self.status == DagsterRunStatus.FAILURE or self.status == DagsterRunStatus.CANCELED\n\n @public\n @property\n def is_resume_retry(self) -> bool:\n """bool: If this run was created from retrying another run from the point of failure."""\n return self.tags.get(RESUME_RETRY_TAG) == "true"\n\n @property\n def previous_run_id(self) -> Optional[str]:\n # Compat\n return self.parent_run_id\n\n @staticmethod\n def tags_for_schedule(schedule) -> Mapping[str, str]:\n return {SCHEDULE_NAME_TAG: schedule.name}\n\n @staticmethod\n def tags_for_sensor(sensor) -> Mapping[str, str]:\n return {SENSOR_NAME_TAG: sensor.name}\n\n @staticmethod\n def tags_for_backfill_id(backfill_id: str) -> Mapping[str, str]:\n return {BACKFILL_ID_TAG: backfill_id}
\n\n\nclass RunsFilterSerializer(NamedTupleSerializer["RunsFilter"]):\n def before_unpack(\n self,\n context,\n unpacked_dict: Dict[str, Any],\n ) -> Dict[str, Any]:\n # We store empty run ids as [] but only accept None\n if "run_ids" in unpacked_dict and unpacked_dict["run_ids"] == []:\n unpacked_dict["run_ids"] = None\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n serializer=RunsFilterSerializer,\n old_storage_names={"PipelineRunsFilter"},\n storage_field_names={"job_name": "pipeline_name"},\n)\nclass RunsFilter(\n NamedTuple(\n "_RunsFilter",\n [\n ("run_ids", Sequence[str]),\n ("job_name", Optional[str]),\n ("statuses", Sequence[DagsterRunStatus]),\n ("tags", Mapping[str, Union[str, Sequence[str]]]),\n ("snapshot_id", Optional[str]),\n ("updated_after", Optional[datetime]),\n ("updated_before", Optional[datetime]),\n ("created_after", Optional[datetime]),\n ("created_before", Optional[datetime]),\n ],\n )\n):\n """Defines a filter across job runs, for use when querying storage directly.\n\n Each field of the RunsFilter represents a logical AND with each other. For\n example, if you specify job_name and tags, then you will receive only runs\n with the specified job_name AND the specified tags. If left blank, then\n all values will be permitted for that field.\n\n Args:\n run_ids (Optional[List[str]]): A list of job run_id values.\n job_name (Optional[str]):\n Name of the job to query for. If blank, all job_names will be accepted.\n statuses (Optional[List[DagsterRunStatus]]):\n A list of run statuses to filter by. If blank, all run statuses will be allowed.\n tags (Optional[Dict[str, Union[str, List[str]]]]):\n A dictionary of run tags to query by. All tags specified here must be present for a given run to pass the filter.\n snapshot_id (Optional[str]): The ID of the job snapshot to query for. 
Intended for internal use.\n updated_after (Optional[DateTime]): Filter by runs that were last updated before this datetime.\n created_before (Optional[DateTime]): Filter by runs that were created before this datetime.\n\n """\n\n def __new__(\n cls,\n run_ids: Optional[Sequence[str]] = None,\n job_name: Optional[str] = None,\n statuses: Optional[Sequence[DagsterRunStatus]] = None,\n tags: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,\n snapshot_id: Optional[str] = None,\n updated_after: Optional[datetime] = None,\n updated_before: Optional[datetime] = None,\n created_after: Optional[datetime] = None,\n created_before: Optional[datetime] = None,\n ):\n check.invariant(run_ids != [], "When filtering on run ids, a non-empty list must be used.")\n\n return super(RunsFilter, cls).__new__(\n cls,\n run_ids=check.opt_sequence_param(run_ids, "run_ids", of_type=str),\n job_name=check.opt_str_param(job_name, "job_name"),\n statuses=check.opt_sequence_param(statuses, "statuses", of_type=DagsterRunStatus),\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n snapshot_id=check.opt_str_param(snapshot_id, "snapshot_id"),\n updated_after=check.opt_inst_param(updated_after, "updated_after", datetime),\n updated_before=check.opt_inst_param(updated_before, "updated_before", datetime),\n created_after=check.opt_inst_param(created_after, "created_after", datetime),\n created_before=check.opt_inst_param(created_before, "created_before", datetime),\n )\n\n @staticmethod\n def for_schedule(schedule: "ExternalSchedule") -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_schedule(schedule))\n\n @staticmethod\n def for_sensor(sensor: "ExternalSensor") -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_sensor(sensor))\n\n @staticmethod\n def for_backfill(backfill_id: str) -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_backfill_id(backfill_id))
\n\n\nclass JobBucket(NamedTuple):\n job_names: List[str]\n bucket_limit: Optional[int]\n\n\nclass TagBucket(NamedTuple):\n tag_key: str\n tag_values: List[str]\n bucket_limit: Optional[int]\n\n\n
[docs]class RunRecord(\n NamedTuple(\n "_RunRecord",\n [\n ("storage_id", int),\n ("dagster_run", DagsterRun),\n ("create_timestamp", datetime),\n ("update_timestamp", datetime),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n """Internal representation of a run record, as stored in a\n :py:class:`~dagster._core.storage.runs.RunStorage`.\n\n Users should not invoke this class directly.\n """\n\n def __new__(\n cls,\n storage_id: int,\n dagster_run: DagsterRun,\n create_timestamp: datetime,\n update_timestamp: datetime,\n start_time: Optional[float] = None,\n end_time: Optional[float] = None,\n ):\n return super(RunRecord, cls).__new__(\n cls,\n storage_id=check.int_param(storage_id, "storage_id"),\n dagster_run=check.inst_param(dagster_run, "dagster_run", DagsterRun),\n create_timestamp=check.inst_param(create_timestamp, "create_timestamp", datetime),\n update_timestamp=check.inst_param(update_timestamp, "update_timestamp", datetime),\n # start_time and end_time fields will be populated once the run has started and ended, respectively, but will be None beforehand.\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )
\n\n\n@whitelist_for_serdes\nclass RunPartitionData(\n NamedTuple(\n "_RunPartitionData",\n [\n ("run_id", str),\n ("partition", str),\n ("status", DagsterRunStatus),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n partition: str,\n status: DagsterRunStatus,\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(RunPartitionData, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n partition=check.str_param(partition, "partition"),\n status=check.inst_param(status, "status", DagsterRunStatus),\n start_time=check.opt_inst(start_time, float),\n end_time=check.opt_inst(end_time, float),\n )\n\n\n###################################################################################################\n# GRAVEYARD\n#\n# -|-\n# |\n# _-'~~~~~`-_\n# .' '.\n# | R I P |\n# | |\n# | Execution |\n# | Selector |\n# | |\n# | |\n###################################################################################################\n\n\n@whitelist_for_serdes\nclass ExecutionSelector(\n NamedTuple("_ExecutionSelector", [("name", str), ("solid_subset", Optional[Sequence[str]])])\n):\n """Kept here to maintain loading of PipelineRuns from when it was still alive."""\n\n def __new__(cls, name: str, solid_subset: Optional[Sequence[str]] = None):\n return super(ExecutionSelector, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n solid_subset=(\n None\n if solid_subset is None\n else check.sequence_param(solid_subset, "solid_subset", of_type=str)\n ),\n )\n
", "current_page_name": "_modules/dagster/_core/storage/dagster_run", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.dagster_run"}, "event_log": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.base

\nimport base64\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._core.assets import AssetDetails\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.event_api import EventHandlerFn, EventLogRecord, EventRecordsFilter\nfrom dagster._core.events import DagsterEventType\nfrom dagster._core.execution.stats import (\n    RunStepKeyStatsSnapshot,\n    build_run_stats_from_events,\n    build_run_step_stats_from_events,\n)\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.storage.asset_check_execution_record import AssetCheckExecutionRecord\nfrom dagster._core.storage.dagster_run import DagsterRunStatsSnapshot\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._seven import json\nfrom dagster._utils import PrintFn\nfrom dagster._utils.concurrency import ConcurrencyClaimStatus, ConcurrencyKeyInfo\n\nif TYPE_CHECKING:\n    from dagster._core.events.log import EventLogEntry\n    from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n\nclass EventLogConnection(NamedTuple):\n    records: Sequence[EventLogRecord]\n    cursor: str\n    has_more: bool\n\n\nclass EventLogCursorType(Enum):\n    OFFSET = "OFFSET"\n    STORAGE_ID = "STORAGE_ID"\n\n\nclass EventLogCursor(NamedTuple):\n    """Representation of an event record cursor, keeping track of the log query state."""\n\n    cursor_type: EventLogCursorType\n    value: int\n\n    def is_offset_cursor(self) -> bool:\n        return self.cursor_type == EventLogCursorType.OFFSET\n\n    def is_id_cursor(self) -> bool:\n        return self.cursor_type == EventLogCursorType.STORAGE_ID\n\n    def offset(self) -> int:\n        
check.invariant(self.cursor_type == EventLogCursorType.OFFSET)\n        return max(0, int(self.value))\n\n    def storage_id(self) -> int:\n        check.invariant(self.cursor_type == EventLogCursorType.STORAGE_ID)\n        return int(self.value)\n\n    def __str__(self) -> str:\n        return self.to_string()\n\n    def to_string(self) -> str:\n        raw = json.dumps({"type": self.cursor_type.value, "value": self.value})\n        return base64.b64encode(bytes(raw, encoding="utf-8")).decode("utf-8")\n\n    @staticmethod\n    def parse(cursor_str: str) -> "EventLogCursor":\n        raw = json.loads(base64.b64decode(cursor_str).decode("utf-8"))\n        return EventLogCursor(EventLogCursorType(raw["type"]), raw["value"])\n\n    @staticmethod\n    def from_offset(offset: int) -> "EventLogCursor":\n        return EventLogCursor(EventLogCursorType.OFFSET, offset)\n\n    @staticmethod\n    def from_storage_id(storage_id: int) -> "EventLogCursor":\n        return EventLogCursor(EventLogCursorType.STORAGE_ID, storage_id)\n\n\nclass AssetEntry(\n    NamedTuple(\n        "_AssetEntry",\n        [\n            ("asset_key", AssetKey),\n            ("last_materialization_record", Optional[EventLogRecord]),\n            ("last_run_id", Optional[str]),\n            ("asset_details", Optional[AssetDetails]),\n            ("cached_status", Optional["AssetStatusCacheValue"]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        asset_key: AssetKey,\n        last_materialization_record: Optional[EventLogRecord] = None,\n        last_run_id: Optional[str] = None,\n        asset_details: Optional[AssetDetails] = None,\n        cached_status: Optional["AssetStatusCacheValue"] = None,\n    ):\n        from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n        return super(AssetEntry, cls).__new__(\n            cls,\n            asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n            
last_materialization_record=check.opt_inst_param(\n                last_materialization_record, "last_materialization_record", EventLogRecord\n            ),\n            last_run_id=check.opt_str_param(last_run_id, "last_run_id"),\n            asset_details=check.opt_inst_param(asset_details, "asset_details", AssetDetails),\n            cached_status=check.opt_inst_param(\n                cached_status, "cached_status", AssetStatusCacheValue\n            ),\n        )\n\n    @property\n    def last_materialization(self) -> Optional["EventLogEntry"]:\n        if self.last_materialization_record is None:\n            return None\n        return self.last_materialization_record.event_log_entry\n\n    @property\n    def last_materialization_storage_id(self) -> Optional[int]:\n        if self.last_materialization_record is None:\n            return None\n        return self.last_materialization_record.storage_id\n\n\n
[docs]class AssetRecord(NamedTuple):\n """Internal representation of an asset record, as stored in a :py:class:`~dagster._core.storage.event_log.EventLogStorage`.\n\n Users should not invoke this class directly.\n """\n\n storage_id: int\n asset_entry: AssetEntry
\n\n\n
[docs]class EventLogStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for storing structured event logs from pipeline runs.\n\n Note that event log storages using SQL databases as backing stores should implement\n :py:class:`~dagster._core.storage.event_log.SqlEventLogStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n def get_logs_for_run(\n self,\n run_id: str,\n cursor: Optional[Union[str, int]] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> Sequence["EventLogEntry"]:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[Union[str, int]]): Cursor value to track paginated queries. 
Legacy\n support for integer offset cursors.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n if isinstance(cursor, int):\n cursor = EventLogCursor.from_offset(cursor + 1).to_string()\n records = self.get_records_for_run(\n run_id, cursor, of_type, limit, ascending=ascending\n ).records\n return [record.event_log_entry for record in records]\n\n @abstractmethod\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> EventLogConnection:\n """Get all of the event log records corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[str]): Cursor value to track paginated queries.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n\n def get_stats_for_run(self, run_id: str) -> DagsterRunStatsSnapshot:\n """Get a summary of events that have ocurred in a run."""\n return build_run_stats_from_events(run_id, self.get_logs_for_run(run_id))\n\n def get_step_stats_for_run(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence[RunStepKeyStatsSnapshot]:\n """Get per-step stats for a pipeline run."""\n logs = self.get_logs_for_run(run_id)\n if step_keys:\n logs = [\n event\n for event in logs\n if event.is_dagster_event and event.get_dagster_event().step_key in step_keys\n ]\n\n return build_run_step_stats_from_events(run_id, logs)\n\n @abstractmethod\n def store_event(self, event: "EventLogEntry") -> None:\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n\n @abstractmethod\n def delete_events(self, run_id: str) -> None:\n """Remove events for a given run id."""\n\n 
@abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def reindex_events(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the event_log tables."""\n\n @abstractmethod\n def reindex_assets(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the asset tables."""\n\n @abstractmethod\n def wipe(self) -> None:\n """Clear the log storage."""\n\n @abstractmethod\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n """Call this method to start watching."""\n\n @abstractmethod\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n """Call this method to stop watching."""\n\n @property\n @abstractmethod\n def is_persistent(self) -> bool:\n """bool: Whether the storage is persistent."""\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n @abstractmethod\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence[EventLogRecord]:\n pass\n\n def supports_event_consumer_queries(self) -> bool:\n return False\n\n def get_logs_for_all_runs_by_log_id(\n self,\n after_cursor: int = -1,\n dagster_event_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Mapping[int, "EventLogEntry"]:\n """Get event records across all runs. 
Only supported for non sharded sql storage."""\n raise NotImplementedError()\n\n def get_maximum_record_id(self) -> Optional[int]:\n """Get the current greatest record id in the event log. Only supported for non sharded sql storage."""\n raise NotImplementedError()\n\n @abstractmethod\n def can_cache_asset_status_data(self) -> bool:\n pass\n\n @abstractmethod\n def wipe_asset_cached_status(self, asset_key: AssetKey) -> None:\n pass\n\n @abstractmethod\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence[AssetRecord]:\n pass\n\n @abstractmethod\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n pass\n\n @abstractmethod\n def all_asset_keys(self) -> Sequence[AssetKey]:\n pass\n\n @abstractmethod\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n pass\n\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n # base implementation of get_asset_keys, using the existing `all_asset_keys` and doing the\n # filtering in-memory\n asset_keys = sorted(self.all_asset_keys(), key=str)\n if prefix:\n asset_keys = [\n asset_key for asset_key in asset_keys if asset_key.path[: len(prefix)] == prefix\n ]\n if cursor:\n cursor_asset = AssetKey.from_db_string(cursor)\n if cursor_asset and cursor_asset in asset_keys:\n idx = asset_keys.index(cursor_asset)\n asset_keys = asset_keys[idx + 1 :]\n if limit:\n asset_keys = asset_keys[:limit]\n return asset_keys\n\n @abstractmethod\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n pass\n\n def supports_add_asset_event_tags(self) -> bool:\n return False\n\n def add_asset_event_tags(\n self,\n event_id: int,\n event_timestamp: float,\n asset_key: AssetKey,\n new_tags: Mapping[str, str],\n ) -> None:\n raise NotImplementedError()\n\n 
@abstractmethod\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n pass\n\n @abstractmethod\n def wipe_asset(self, asset_key: AssetKey) -> None:\n """Remove asset index history from event log for given asset_key."""\n\n @abstractmethod\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n pass\n\n @abstractmethod\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey], after_cursor: Optional[int] = None\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n pass\n\n @abstractmethod\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: DagsterEventType\n ) -> Mapping[str, int]:\n pass\n\n @abstractmethod\n def get_latest_tags_by_partition(\n self,\n asset_key: AssetKey,\n event_type: DagsterEventType,\n tag_keys: Sequence[str],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Mapping[str, Mapping[str, str]]:\n pass\n\n @abstractmethod\n def get_latest_asset_partition_materialization_attempts_without_materializations(\n self, asset_key: AssetKey\n ) -> Mapping[str, Tuple[str, int]]:\n pass\n\n @abstractmethod\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the list of partition keys for a dynamic partitions definition."""\n raise NotImplementedError()\n\n @abstractmethod\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n """Check if a dynamic partition exists."""\n raise NotImplementedError()\n\n @abstractmethod\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n """Add a partition for the specified dynamic partitions definition."""\n raise 
NotImplementedError()\n\n @abstractmethod\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n """Delete a partition for the specified dynamic partitions definition."""\n raise NotImplementedError()\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n @property\n def is_run_sharded(self) -> bool:\n """Indicates that the EventLogStoarge is sharded."""\n return False\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n """Indicates that the EventLogStorage supports global concurrency limits."""\n return False\n\n @abstractmethod\n def set_concurrency_slots(self, concurrency_key: str, num: int) -> None:\n """Allocate concurrency slots for the given concurrency key."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_keys(self) -> Set[str]:\n """Get the set of concurrency limited keys."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_info(self, concurrency_key: str) -> ConcurrencyKeyInfo:\n """Get concurrency info for key."""\n raise NotImplementedError()\n\n @abstractmethod\n def claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str, priority: Optional[int] = None\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slots for step."""\n raise NotImplementedError()\n\n @abstractmethod\n def check_concurrency_claim(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slots for step."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_run_ids(self) -> Set[str]:\n """Get a list of run_ids that are occupying or waiting for a concurrency key slot."""\n raise NotImplementedError()\n\n @abstractmethod\n def free_concurrency_slots_for_run(self, run_id: str) -> None:\n """Frees concurrency slots for a given run."""\n raise NotImplementedError()\n\n @abstractmethod\n def free_concurrency_slot_for_step(self, run_id: str, step_key: str) -> 
None:\n """Frees concurrency slots for a given run/step."""\n raise NotImplementedError()\n\n @property\n def supports_asset_checks(self):\n return True\n\n @abstractmethod\n def get_asset_check_execution_history(\n self,\n check_key: AssetCheckKey,\n limit: int,\n cursor: Optional[int] = None,\n ) -> Sequence[AssetCheckExecutionRecord]:\n """Get executions for one asset check, sorted by recency."""\n pass\n\n @abstractmethod\n def get_latest_asset_check_execution_by_key(\n self, check_keys: Sequence[AssetCheckKey]\n ) -> Mapping[AssetCheckKey, AssetCheckExecutionRecord]:\n """Get the latest executions for a list of asset checks."""\n pass
\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.base"}, "sql_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sql_event_log

\nimport logging\nfrom abc import abstractmethod\nfrom collections import OrderedDict, defaultdict\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    ContextManager,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._core.assets import AssetDetails\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster._core.errors import (\n    DagsterEventLogInvalidForRun,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.event_api import RunShardedEventsCursor\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS, MARKER_EVENTS, DagsterEventType\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.execution.stats import RunStepKeyStatsSnapshot, build_run_step_stats_from_events\nfrom dagster._core.storage.asset_check_execution_record import (\n    AssetCheckExecutionRecord,\n    AssetCheckExecutionRecordStatus,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery, SqlAlchemyRow\nfrom dagster._core.storage.sqlalchemy_compat import (\n    db_case,\n    db_fetch_mappings,\n    db_select,\n    db_subquery,\n)\nfrom dagster._serdes import (\n    deserialize_value,\n    serialize_value,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._utils import (\n    PrintFn,\n    datetime_as_float,\n    utc_datetime_from_naive,\n    utc_datetime_from_timestamp,\n)\nfrom 
dagster._utils.concurrency import (\n    ConcurrencyClaimStatus,\n    ConcurrencyKeyInfo,\n    ConcurrencySlotStatus,\n)\n\nfrom ..dagster_run import DagsterRunStatsSnapshot\nfrom .base import (\n    AssetEntry,\n    AssetRecord,\n    EventLogConnection,\n    EventLogCursor,\n    EventLogRecord,\n    EventLogStorage,\n    EventRecordsFilter,\n)\nfrom .migration import ASSET_DATA_MIGRATIONS, ASSET_KEY_INDEX_COLS, EVENT_LOG_DATA_MIGRATIONS\nfrom .schema import (\n    AssetCheckExecutionsTable,\n    AssetEventTagsTable,\n    AssetKeyTable,\n    ConcurrencySlotsTable,\n    DynamicPartitionsTable,\n    PendingStepsTable,\n    SecondaryIndexMigrationTable,\n    SqlEventLogStorageTable,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\nMAX_CONCURRENCY_SLOTS = 1000\nMIN_ASSET_ROWS = 25\n\n# We are using third-party library objects for DB connections-- at this time, these libraries are\n# untyped. When/if we upgrade to typed variants, the `Any` here can be replaced or the alias as a\n# whole can be dropped.\nSqlDbConnection: TypeAlias = Any\n\n\n
[docs]class SqlEventLogStorage(EventLogStorage):\n """Base class for SQL backed event log storages.\n\n Distinguishes between run-based connections and index connections in order to support run-level\n sharding, while maintaining the ability to do cross-run queries\n """\n\n @abstractmethod\n def run_connection(self, run_id: Optional[str]) -> ContextManager[Connection]:\n """Context manager yielding a connection to access the event logs for a specific run.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def index_connection(self) -> ContextManager[Connection]:\n """Context manager yielding a connection to access cross-run indexed tables."""\n\n @abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def has_table(self, table_name: str) -> bool:\n """This method checks if a table exists in the database."""\n\n def prepare_insert_event(self, event):\n """Helper method for preparing the event log SQL insertion statement. Abstracted away to\n have a single place for the logical table representation of the event, while having a way\n for SQL backends to implement different execution implementations for `store_event`. 
See\n the `dagster-postgres` implementation which overrides the generic SQL implementation of\n `store_event`.\n """\n dagster_event_type = None\n asset_key_str = None\n partition = None\n step_key = event.step_key\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n step_key = event.dagster_event.step_key\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n if event.dagster_event.partition:\n partition = event.dagster_event.partition\n\n # https://stackoverflow.com/a/54386260/324449\n return SqlEventLogStorageTable.insert().values(\n run_id=event.run_id,\n event=serialize_value(event),\n dagster_event_type=dagster_event_type,\n # Postgres requires a datetime that is in UTC but has no timezone info set\n # in order to be stored correctly\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=step_key,\n asset_key=asset_key_str,\n partition=partition,\n )\n\n def has_asset_key_col(self, column_name: str) -> bool:\n with self.index_connection() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(AssetKeyTable.name)]\n return column_name in column_names\n\n def has_asset_key_index_cols(self) -> bool:\n return self.has_asset_key_col("last_materialization_timestamp")\n\n def store_asset_event(self, event: EventLogEntry, event_id: int):\n check.inst_param(event, "event", EventLogEntry)\n\n if not (event.dagster_event and event.dagster_event.asset_key):\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming 
`last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n values = self._get_asset_entry_values(event, event_id, self.has_asset_key_index_cols())\n insert_statement = AssetKeyTable.insert().values(\n asset_key=event.dagster_event.asset_key.to_string(), **values\n )\n update_statement = (\n AssetKeyTable.update()\n .values(**values)\n .where(\n AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),\n )\n )\n\n with self.index_connection() as conn:\n try:\n conn.execute(insert_statement)\n except db_exc.IntegrityError:\n conn.execute(update_statement)\n\n def _get_asset_entry_values(\n self, event: EventLogEntry, event_id: int, has_asset_key_index_cols: bool\n ) -> Dict[str, Any]:\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in the Dagster UI.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n entry_values: Dict[str, Any] = {}\n dagster_event = check.not_none(event.dagster_event)\n if dagster_event.is_step_materialization:\n entry_values.update(\n {\n "last_materialization": serialize_value(\n EventLogRecord(\n storage_id=event_id,\n event_log_entry=event,\n )\n ),\n "last_run_id": event.run_id,\n }\n )\n if has_asset_key_index_cols:\n 
entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif dagster_event.is_asset_materialization_planned:\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n entry_values.update({"last_run_id": event.run_id})\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif dagster_event.is_asset_observation:\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n\n return entry_values\n\n def supports_add_asset_event_tags(self) -> bool:\n return self.has_table(AssetEventTagsTable.name)\n\n def add_asset_event_tags(\n self,\n event_id: int,\n event_timestamp: float,\n asset_key: AssetKey,\n new_tags: Mapping[str, str],\n ) -> None:\n check.int_param(event_id, "event_id")\n check.float_param(event_timestamp, "event_timestamp")\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.mapping_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n if not self.supports_add_asset_event_tags():\n raise DagsterInvalidInvocationError(\n "In order to add asset event tags, you must run `dagster instance migrate` to "\n "create the AssetEventTags table."\n )\n\n current_tags_list = self.get_event_tags_for_asset(asset_key, filter_event_id=event_id)\n\n asset_key_str = asset_key.to_string()\n\n if len(current_tags_list) == 0:\n current_tags: Mapping[str, str] = {}\n else:\n current_tags = current_tags_list[0]\n\n with self.index_connection() as conn:\n current_tags_set = 
set(current_tags.keys())\n new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n AssetEventTagsTable.update()\n .where(\n db.and_(\n AssetEventTagsTable.c.event_id == event_id,\n AssetEventTagsTable.c.asset_key == asset_key_str,\n AssetEventTagsTable.c.key == tag,\n )\n )\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n AssetEventTagsTable.insert(),\n [\n dict(\n event_id=event_id,\n asset_key=asset_key_str,\n key=tag,\n value=new_tags[tag],\n # Postgres requires a datetime that is in UTC but has no timezone info\n # set in order to be stored correctly\n event_timestamp=datetime.utcfromtimestamp(event_timestamp),\n )\n for tag in added_tags\n ],\n )\n\n def store_asset_event_tags(self, event: EventLogEntry, event_id: int) -> None:\n check.inst_param(event, "event", EventLogEntry)\n check.int_param(event_id, "event_id")\n\n if event.dagster_event and event.dagster_event.asset_key:\n if event.dagster_event.is_step_materialization:\n tags = event.dagster_event.step_materialization_data.materialization.tags\n elif event.dagster_event.is_asset_observation:\n tags = event.dagster_event.asset_observation_data.asset_observation.tags\n else:\n tags = None\n\n if not tags or not self.has_table(AssetEventTagsTable.name):\n # If tags table does not exist, silently exit. 
This is to support OSS\n # users who have not yet run the migration to create the table.\n # On read, we will throw an error if the table does not exist.\n return\n\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n\n with self.index_connection() as conn:\n conn.execute(\n AssetEventTagsTable.insert(),\n [\n dict(\n event_id=event_id,\n asset_key=asset_key_str,\n key=key,\n value=value,\n # Postgres requires a datetime that is in UTC but has no timezone info\n # set in order to be stored correctly\n event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n )\n for key, value in tags.items()\n ],\n )\n\n def store_event(self, event: EventLogEntry) -> None:\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n event_id = None\n\n with self.run_connection(run_id) as conn:\n result = conn.execute(insert_event_statement)\n event_id = result.inserted_primary_key[0]\n\n if (\n event.is_dagster_event\n and event.dagster_event_type in ASSET_EVENTS\n and event.dagster_event.asset_key # type: ignore\n ):\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, event_id)\n\n def get_records_for_run(\n self,\n run_id,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> EventLogConnection:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n 
cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,\n i.e., if cursor is -1, all logs will be returned. (default: -1)\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): the maximum number of events to fetch\n """\n check.str_param(run_id, "run_id")\n check.opt_str_param(cursor, "cursor")\n\n check.invariant(not of_type or isinstance(of_type, (DagsterEventType, frozenset, set)))\n\n dagster_event_types = (\n {of_type}\n if isinstance(of_type, DagsterEventType)\n else check.opt_set_param(of_type, "dagster_event_type", of_type=DagsterEventType)\n )\n\n query = (\n db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .order_by(\n SqlEventLogStorageTable.c.id.asc()\n if ascending\n else SqlEventLogStorageTable.c.id.desc()\n )\n )\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n # adjust 0 based index cursor to SQL offset\n if cursor is not None:\n cursor_obj = EventLogCursor.parse(cursor)\n if cursor_obj.is_offset_cursor():\n query = query.offset(cursor_obj.offset())\n elif cursor_obj.is_id_cursor():\n if ascending:\n query = query.where(SqlEventLogStorageTable.c.id > cursor_obj.storage_id())\n else:\n query = query.where(SqlEventLogStorageTable.c.id < cursor_obj.storage_id())\n\n if limit:\n query = query.limit(limit)\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n last_record_id = None\n try:\n records = []\n for (\n record_id,\n json_str,\n ) in results:\n records.append(\n EventLogRecord(\n storage_id=record_id,\n event_log_entry=deserialize_value(json_str, EventLogEntry),\n )\n )\n last_record_id = record_id\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) 
from err\n\n if last_record_id is not None:\n next_cursor = EventLogCursor.from_storage_id(last_record_id).to_string()\n elif cursor:\n # record fetch returned no new logs, return the same cursor\n next_cursor = cursor\n else:\n # rely on the fact that all storage ids will be positive integers\n next_cursor = EventLogCursor.from_storage_id(-1).to_string()\n\n return EventLogConnection(\n records=records,\n cursor=next_cursor,\n has_more=bool(limit and len(results) == limit),\n )\n\n def get_stats_for_run(self, run_id: str) -> DagsterRunStatsSnapshot:\n check.str_param(run_id, "run_id")\n\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n db.func.count().label("n_events_of_type"),\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("last_event_timestamp"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.run_id == run_id,\n SqlEventLogStorageTable.c.dagster_event_type != None, # noqa: E711\n )\n )\n .group_by("dagster_event_type")\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n try:\n counts = {}\n times = {}\n for result in results:\n (dagster_event_type, n_events_of_type, last_event_timestamp) = result\n check.invariant(dagster_event_type is not None)\n counts[dagster_event_type] = n_events_of_type\n times[dagster_event_type] = last_event_timestamp\n\n enqueued_time = times.get(DagsterEventType.PIPELINE_ENQUEUED.value, None)\n launch_time = times.get(DagsterEventType.PIPELINE_STARTING.value, None)\n start_time = times.get(DagsterEventType.PIPELINE_START.value, None)\n end_time = times.get(\n DagsterEventType.PIPELINE_SUCCESS.value,\n times.get(\n DagsterEventType.PIPELINE_FAILURE.value,\n times.get(DagsterEventType.PIPELINE_CANCELED.value, None),\n ),\n )\n\n return DagsterRunStatsSnapshot(\n run_id=run_id,\n steps_succeeded=counts.get(DagsterEventType.STEP_SUCCESS.value, 0),\n steps_failed=counts.get(DagsterEventType.STEP_FAILURE.value, 0),\n 
materializations=counts.get(DagsterEventType.ASSET_MATERIALIZATION.value, 0),\n expectations=counts.get(DagsterEventType.STEP_EXPECTATION_RESULT.value, 0),\n enqueued_time=datetime_as_float(enqueued_time) if enqueued_time else None,\n launch_time=datetime_as_float(launch_time) if launch_time else None,\n start_time=datetime_as_float(start_time) if start_time else None,\n end_time=datetime_as_float(end_time) if end_time else None,\n )\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def get_step_stats_for_run(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence[RunStepKeyStatsSnapshot]:\n check.str_param(run_id, "run_id")\n check.opt_list_param(step_keys, "step_keys", of_type=str)\n\n # Originally, this was two different queries:\n # 1) one query which aggregated top-level step stats by grouping by event type / step_key in\n # a single query, using pure SQL (e.g. start_time, end_time, status, attempt counts).\n # 2) one query which fetched all the raw events for a specific event type and then inspected\n # the deserialized event object to aggregate stats derived from sequences of events.\n # (e.g. marker events, materializations, expectations resuls, attempts timing, etc.)\n #\n # For simplicity, we now just do the second type of query and derive the stats in Python\n # from the raw events. This has the benefit of being easier to read and also the benefit of\n # being able to share code with the in-memory event log storage implementation. 
We may\n # choose to revisit this in the future, especially if we are able to do JSON-column queries\n # in SQL as a way of bypassing the serdes layer in all cases.\n raw_event_query = (\n db_select([SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.step_key != None) # noqa: E711\n .where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [\n DagsterEventType.STEP_START.value,\n DagsterEventType.STEP_SUCCESS.value,\n DagsterEventType.STEP_SKIPPED.value,\n DagsterEventType.STEP_FAILURE.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.ASSET_MATERIALIZATION.value,\n DagsterEventType.STEP_EXPECTATION_RESULT.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.STEP_UP_FOR_RETRY.value,\n ]\n + [marker_event.value for marker_event in MARKER_EVENTS]\n )\n )\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n if step_keys:\n raw_event_query = raw_event_query.where(\n SqlEventLogStorageTable.c.step_key.in_(step_keys)\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(raw_event_query).fetchall()\n\n try:\n records = [deserialize_value(json_str, EventLogEntry) for (json_str,) in results]\n return build_run_step_stats_from_events(run_id, records)\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def _apply_migration(self, migration_name, migration_fn, print_fn, force):\n if self.has_secondary_index(migration_name):\n if not force:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n return\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.enable_secondary_index(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def reindex_events(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any 
data migrations across the event_log table."""\n for migration_name, migration_fn in EVENT_LOG_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def reindex_assets(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the asset_keys table."""\n for migration_name, migration_fn in ASSET_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def wipe(self) -> None:\n """Clears the event log storage."""\n # Should be overridden by SqliteEventLogStorage and other storages that shard based on\n # run_id\n\n # https://stackoverflow.com/a/54386260/324449\n with self.run_connection(run_id=None) as conn:\n conn.execute(SqlEventLogStorageTable.delete())\n conn.execute(AssetKeyTable.delete())\n\n if self.has_table("asset_event_tags"):\n conn.execute(AssetEventTagsTable.delete())\n\n if self.has_table("dynamic_partitions"):\n conn.execute(DynamicPartitionsTable.delete())\n\n if self.has_table("concurrency_slots"):\n conn.execute(ConcurrencySlotsTable.delete())\n\n if self.has_table("pending_steps"):\n conn.execute(PendingStepsTable.delete())\n\n if self.has_table("asset_check_executions"):\n conn.execute(AssetCheckExecutionsTable.delete())\n\n self._wipe_index()\n\n def _wipe_index(self):\n with self.index_connection() as conn:\n conn.execute(SqlEventLogStorageTable.delete())\n conn.execute(AssetKeyTable.delete())\n\n if self.has_table("asset_event_tags"):\n conn.execute(AssetEventTagsTable.delete())\n\n if self.has_table("dynamic_partitions"):\n conn.execute(DynamicPartitionsTable.delete())\n\n if self.has_table("concurrency_slots"):\n conn.execute(ConcurrencySlotsTable.delete())\n\n if self.has_table("pending_steps"):\n conn.execute(PendingStepsTable.delete())\n\n if self.has_table("asset_check_executions"):\n conn.execute(AssetCheckExecutionsTable.delete())\n\n def delete_events(self, run_id: str) -> None:\n 
with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n self.free_concurrency_slots_for_run(run_id)\n\n def delete_events_for_run(self, conn: Connection, run_id: str) -> None:\n check.str_param(run_id, "run_id")\n conn.execute(\n SqlEventLogStorageTable.delete().where(SqlEventLogStorageTable.c.run_id == run_id)\n )\n\n @property\n def is_persistent(self) -> bool:\n return True\n\n def update_event_log_record(self, record_id: int, event: EventLogEntry) -> None:\n """Utility method for migration scripts to update SQL representation of event records."""\n check.int_param(record_id, "record_id")\n check.inst_param(event, "event", EventLogEntry)\n dagster_event_type = None\n asset_key_str = None\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value # type: ignore\n if event.dagster_event.asset_key: # type: ignore\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey) # type: ignore\n asset_key_str = event.dagster_event.asset_key.to_string() # type: ignore\n\n with self.run_connection(run_id=event.run_id) as conn:\n conn.execute(\n SqlEventLogStorageTable.update()\n .where(SqlEventLogStorageTable.c.id == record_id)\n .values(\n event=serialize_value(event),\n dagster_event_type=dagster_event_type,\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=event.step_key,\n asset_key=asset_key_str,\n )\n )\n\n def get_event_log_table_data(self, run_id: str, record_id: int) -> Optional[SqlAlchemyRow]:\n """Utility method to test representation of the record in the SQL table. 
Returns all of\n the columns stored in the event log storage (as opposed to the deserialized `EventLogEntry`).\n This allows checking that certain fields are extracted to support performant lookups (e.g.\n extracting `step_key` for fast filtering).\n """\n with self.run_connection(run_id=run_id) as conn:\n query = (\n db_select([SqlEventLogStorageTable])\n .where(SqlEventLogStorageTable.c.id == record_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n return conn.execute(query).fetchone()\n\n def has_secondary_index(self, name: str) -> bool:\n """This method uses a checkpoint migration table to see if summary data has been constructed\n in a secondary index table. Can be used to checkpoint event_log data migrations.\n """\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def enable_secondary_index(self, name: str) -> None:\n """This method marks an event_log data migration as complete, to indicate that a summary\n data migration is complete.\n """\n query = SecondaryIndexMigrationTable.insert().values(\n name=name,\n migration_completed=datetime.now(),\n )\n with self.index_connection() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == name)\n .values(migration_completed=datetime.now())\n )\n\n def _apply_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n event_records_filter: EventRecordsFilter,\n asset_details: Optional[AssetDetails] = None,\n apply_cursor_filters: bool = True,\n ) -> SqlAlchemyQuery:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type == event_records_filter.event_type.value\n )\n\n if event_records_filter.asset_key:\n query = query.where(\n 
SqlEventLogStorageTable.c.asset_key == event_records_filter.asset_key.to_string(),\n )\n\n if event_records_filter.asset_partitions:\n query = query.where(\n SqlEventLogStorageTable.c.partition.in_(event_records_filter.asset_partitions)\n )\n\n if asset_details and asset_details.last_wipe_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if apply_cursor_filters:\n # allow the run-sharded sqlite implementation to disable this cursor filtering so that\n # it can implement its own custom cursor logic, as cursor ids are not unique across run\n # shards\n if event_records_filter.before_cursor is not None:\n before_cursor_id = (\n event_records_filter.before_cursor.id\n if isinstance(event_records_filter.before_cursor, RunShardedEventsCursor)\n else event_records_filter.before_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor_id)\n\n if event_records_filter.after_cursor is not None:\n after_cursor_id = (\n event_records_filter.after_cursor.id\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else event_records_filter.after_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor_id)\n\n if event_records_filter.before_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n < datetime.utcfromtimestamp(event_records_filter.before_timestamp)\n )\n\n if event_records_filter.after_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(event_records_filter.after_timestamp)\n )\n\n if event_records_filter.storage_ids:\n query = query.where(SqlEventLogStorageTable.c.id.in_(event_records_filter.storage_ids))\n\n if event_records_filter.tags and self.has_table(AssetEventTagsTable.name):\n # If we don't have the tags table, we'll filter the results after the query\n check.invariant(\n isinstance(event_records_filter.asset_key, AssetKey),\n "Asset key 
must be set in event records filter to filter by tags.",\n )\n if self.supports_intersect:\n intersections = [\n db_select([AssetEventTagsTable.c.event_id]).where(\n db.and_(\n AssetEventTagsTable.c.asset_key\n == event_records_filter.asset_key.to_string(), # type: ignore # (bad sig?)\n AssetEventTagsTable.c.key == key,\n (\n AssetEventTagsTable.c.value == value\n if isinstance(value, str)\n else AssetEventTagsTable.c.value.in_(value)\n ),\n )\n )\n for key, value in event_records_filter.tags.items()\n ]\n query = query.where(SqlEventLogStorageTable.c.id.in_(db.intersect(*intersections)))\n\n return query\n\n def _apply_tags_table_joins(\n self,\n table: db.Table,\n tags: Mapping[str, Union[str, Sequence[str]]],\n asset_key: Optional[AssetKey],\n ) -> db.Table:\n event_id_col = table.c.id if table == SqlEventLogStorageTable else table.c.event_id\n i = 0\n for key, value in tags.items():\n i += 1\n tags_table = db_subquery(\n db_select([AssetEventTagsTable]), f"asset_event_tags_subquery_{i}"\n )\n table = table.join(\n tags_table,\n db.and_(\n event_id_col == tags_table.c.event_id,\n not asset_key or tags_table.c.asset_key == asset_key.to_string(),\n tags_table.c.key == key,\n (\n tags_table.c.value == value\n if isinstance(value, str)\n else tags_table.c.value.in_(value)\n ),\n ),\n )\n return table\n\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence[EventLogRecord]:\n """Returns a list of (record_id, record)."""\n check.inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n if event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if (\n event_records_filter.tags\n and not self.supports_intersect\n and self.has_table(AssetEventTagsTable.name)\n ):\n table = 
self._apply_tags_table_joins(\n SqlEventLogStorageTable, event_records_filter.tags, event_records_filter.asset_key\n )\n else:\n table = SqlEventLogStorageTable\n\n query = db_select(\n [SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event]\n ).select_from(table)\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n )\n if limit:\n query = query.limit(limit)\n\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.id.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.id.desc())\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n event_records = []\n for row_id, json_str in results:\n try:\n event_record = deserialize_value(json_str, NamedTuple)\n if not isinstance(event_record, EventLogEntry):\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n continue\n\n if event_records_filter.tags and not self.has_table(AssetEventTagsTable.name):\n # If we can't filter tags via the tags table, filter the returned records\n if limit is not None:\n raise DagsterInvalidInvocationError(\n "Cannot filter events on tags with a limit, without the asset event "\n "tags table. 
To fix, run `dagster instance migrate`."\n )\n\n event_record_tags = event_record.tags\n if not event_record_tags or any(\n event_record_tags.get(k) != v for k, v in event_records_filter.tags.items()\n ):\n continue\n\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n return event_records\n\n def supports_event_consumer_queries(self) -> bool:\n return True\n\n @property\n def supports_intersect(self) -> bool:\n return True\n\n def get_logs_for_all_runs_by_log_id(\n self,\n after_cursor: int = -1,\n dagster_event_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Mapping[int, EventLogEntry]:\n check.int_param(after_cursor, "after_cursor")\n check.invariant(\n after_cursor >= -1,\n f"Don't know what to do with negative cursor {after_cursor}",\n )\n dagster_event_types = (\n {dagster_event_type}\n if isinstance(dagster_event_type, DagsterEventType)\n else check.opt_set_param(\n dagster_event_type, "dagster_event_type", of_type=DagsterEventType\n )\n )\n\n query = (\n db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.id > after_cursor)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n if limit:\n query = query.limit(limit)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n events = {}\n record_id = None\n try:\n for (\n record_id,\n json_str,\n ) in results:\n events[record_id] = deserialize_value(json_str, EventLogEntry)\n except (seven.JSONDecodeError, DeserializationError):\n logging.warning("Could not parse event record id `%s`.", record_id)\n\n return events\n\n def 
get_maximum_record_id(self) -> Optional[int]:\n with self.index_connection() as conn:\n result = conn.execute(db_select([db.func.max(SqlEventLogStorageTable.c.id)])).fetchone()\n return result[0] # type: ignore\n\n def _construct_asset_record_from_row(\n self,\n row,\n last_materialization_record: Optional[EventLogRecord],\n can_cache_asset_status_data: bool,\n ) -> AssetRecord:\n from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if asset_key:\n return AssetRecord(\n storage_id=row["id"],\n asset_entry=AssetEntry(\n asset_key=asset_key,\n last_materialization_record=last_materialization_record,\n last_run_id=row["last_run_id"],\n asset_details=AssetDetails.from_db_string(row["asset_details"]),\n cached_status=(\n AssetStatusCacheValue.from_db_string(row["cached_status_data"])\n if can_cache_asset_status_data\n else None\n ),\n ),\n )\n else:\n check.failed("Row did not contain asset key.")\n\n def _get_latest_materialization_records(\n self, raw_asset_rows\n ) -> Mapping[AssetKey, Optional[EventLogRecord]]:\n # Given a list of raw asset rows, returns a mapping of asset key to latest asset materialization\n # event log entry. 
Fetches backcompat EventLogEntry records when the last_materialization\n # in the raw asset row is an AssetMaterialization.\n to_backcompat_fetch = set()\n results: Dict[AssetKey, Optional[EventLogRecord]] = {}\n for row in raw_asset_rows:\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if not asset_key:\n continue\n event_or_materialization = (\n deserialize_value(row["last_materialization"], NamedTuple)\n if row["last_materialization"]\n else None\n )\n if isinstance(event_or_materialization, EventLogRecord):\n results[asset_key] = event_or_materialization\n else:\n to_backcompat_fetch.add(asset_key)\n\n latest_event_subquery = db_subquery(\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.id).label("id"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in to_backcompat_fetch]\n ),\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key),\n "latest_event_subquery",\n )\n backcompat_query = db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.id,\n SqlEventLogStorageTable.c.event,\n ]\n ).select_from(\n latest_event_subquery.join(\n SqlEventLogStorageTable,\n db.and_(\n SqlEventLogStorageTable.c.asset_key == latest_event_subquery.c.asset_key,\n SqlEventLogStorageTable.c.id == latest_event_subquery.c.id,\n ),\n )\n )\n with self.index_connection() as conn:\n event_rows = db_fetch_mappings(conn, backcompat_query)\n\n for row in event_rows:\n asset_key = AssetKey.from_db_string(cast(Optional[str], row["asset_key"]))\n if asset_key:\n results[asset_key] = EventLogRecord(\n storage_id=cast(int, row["id"]),\n event_log_entry=deserialize_value(cast(str, row["event"]), EventLogEntry),\n )\n return results\n\n def can_cache_asset_status_data(self) -> bool:\n return self.has_asset_key_col("cached_status_data")\n\n def 
wipe_asset_cached_status(self, asset_key: AssetKey) -> None:\n if self.can_cache_asset_status_data():\n check.inst_param(asset_key, "asset_key", AssetKey)\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .values(dict(cached_status_data=None))\n .where(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence[AssetRecord]:\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n latest_materialization_records = self._get_latest_materialization_records(rows)\n can_cache_asset_status_data = self.can_cache_asset_status_data()\n\n asset_records: List[AssetRecord] = []\n for row in rows:\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if asset_key:\n asset_records.append(\n self._construct_asset_record_from_row(\n row,\n latest_materialization_records.get(asset_key),\n can_cache_asset_status_data,\n )\n )\n\n return asset_records\n\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n check.inst_param(asset_key, "asset_key", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=[asset_key])\n return bool(rows)\n\n def all_asset_keys(self):\n rows = self._fetch_asset_rows()\n asset_keys = [\n AssetKey.from_db_string(row["asset_key"])\n for row in sorted(rows, key=lambda x: x["asset_key"])\n ]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n rows = self._fetch_asset_rows(prefix=prefix, limit=limit, cursor=cursor)\n asset_keys = [\n AssetKey.from_db_string(row["asset_key"])\n for row in sorted(rows, key=lambda x: x["asset_key"])\n ]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n 
check.iterable_param(asset_keys, "asset_keys", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n return {\n asset_key: event_log_record.event_log_entry if event_log_record is not None else None\n for asset_key, event_log_record in self._get_latest_materialization_records(\n rows\n ).items()\n }\n\n def _fetch_asset_rows(\n self,\n asset_keys=None,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[SqlAlchemyRow]:\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments.\n #\n # Differs from _fetch_raw_asset_rows, in that it loops through to make sure enough rows are\n # returned to satisfy the limit.\n #\n # returns a list of rows where each row is a tuple of serialized asset_key, materialization,\n # and asset_details\n should_query = True\n current_cursor = cursor\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # if we have migrated, we can limit using SQL\n fetch_limit = limit\n else:\n # if we haven't migrated, overfetch in case the first N results are wiped\n fetch_limit = max(limit, MIN_ASSET_ROWS) if limit else None\n result = []\n\n while should_query:\n rows, has_more, current_cursor = self._fetch_raw_asset_rows(\n asset_keys=asset_keys, prefix=prefix, limit=fetch_limit, cursor=current_cursor\n )\n result.extend(rows)\n should_query = bool(has_more) and bool(limit) and len(result) < cast(int, limit)\n\n is_partial_query = asset_keys is not None or bool(prefix) or bool(limit) or bool(cursor)\n if not is_partial_query and self._can_mark_assets_as_migrated(rows): # type: ignore\n self.enable_secondary_index(ASSET_KEY_INDEX_COLS)\n\n return result[:limit] if limit else result\n\n def _fetch_raw_asset_rows(\n self,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor=None,\n ) -> 
Tuple[Iterable[SqlAlchemyRow], bool, Optional[str]]:\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments. Does not guarantee that the number of\n # rows returned will match the limit specified. This helper function is used to fetch a\n # chunk of asset key rows, which may or may not be wiped.\n #\n # Returns a tuple of (rows, has_more, cursor), where each row is a tuple of serialized\n # asset_key, materialization, and asset_details\n # TODO update comment\n\n columns = [\n AssetKeyTable.c.id,\n AssetKeyTable.c.asset_key,\n AssetKeyTable.c.last_materialization,\n AssetKeyTable.c.last_run_id,\n AssetKeyTable.c.asset_details,\n ]\n if self.can_cache_asset_status_data():\n columns.extend([AssetKeyTable.c.cached_status_data])\n\n is_partial_query = asset_keys is not None or bool(prefix) or bool(limit) or bool(cursor)\n if self.has_asset_key_index_cols() and not is_partial_query:\n # if the schema has been migrated, fetch the last_materialization_timestamp to see if\n # we can lazily migrate the data table\n columns.append(AssetKeyTable.c.last_materialization_timestamp)\n columns.append(AssetKeyTable.c.wipe_timestamp)\n\n query = db_select(columns).order_by(AssetKeyTable.c.asset_key.asc())\n query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n query = query.where(\n db.or_(\n AssetKeyTable.c.wipe_timestamp.is_(None),\n AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,\n )\n )\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n return rows, False, None\n\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n wiped_timestamps_by_asset_key: Dict[AssetKey, float] = {}\n row_by_asset_key: Dict[AssetKey, SqlAlchemyRow] = OrderedDict()\n\n for row in rows:\n asset_key = AssetKey.from_db_string(cast(str, 
row["asset_key"]))\n if not asset_key:\n continue\n asset_details = AssetDetails.from_db_string(row["asset_details"])\n if not asset_details or not asset_details.last_wipe_timestamp:\n row_by_asset_key[asset_key] = row\n continue\n materialization_or_event_or_record = (\n deserialize_value(cast(str, row["last_materialization"]), NamedTuple)\n if row["last_materialization"]\n else None\n )\n if isinstance(materialization_or_event_or_record, (EventLogRecord, EventLogEntry)):\n if isinstance(materialization_or_event_or_record, EventLogRecord):\n event_timestamp = materialization_or_event_or_record.event_log_entry.timestamp\n else:\n event_timestamp = materialization_or_event_or_record.timestamp\n\n if asset_details.last_wipe_timestamp > event_timestamp:\n # this asset has not been materialized since being wiped, skip\n continue\n else:\n # add the key\n row_by_asset_key[asset_key] = row\n else:\n row_by_asset_key[asset_key] = row\n wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp\n\n if wiped_timestamps_by_asset_key:\n materialization_times = self._fetch_backcompat_materialization_times(\n wiped_timestamps_by_asset_key.keys() # type: ignore\n )\n for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():\n materialization_time = materialization_times.get(asset_key)\n if not materialization_time or utc_datetime_from_naive(\n materialization_time\n ) < utc_datetime_from_timestamp(wiped_timestamp):\n # remove rows that have not been materialized since being wiped\n row_by_asset_key.pop(asset_key)\n\n has_more = limit and len(rows) == limit\n new_cursor = rows[-1]["id"] if rows else None\n\n return row_by_asset_key.values(), has_more, new_cursor # type: ignore\n\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n if self.can_cache_asset_status_data():\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .where(\n 
AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n .values(cached_status_data=serialize_value(cache_values))\n )\n\n def _fetch_backcompat_materialization_times(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, datetime]:\n # fetches the latest materialization timestamp for the given asset_keys. Uses the (slower)\n # raw event log table.\n backcompat_query = (\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("timestamp"),\n ]\n )\n .where(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key)\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())\n )\n with self.index_connection() as conn:\n backcompat_rows = db_fetch_mappings(conn, backcompat_query)\n return {AssetKey.from_db_string(row["asset_key"]): row["timestamp"] for row in backcompat_rows} # type: ignore\n\n def _can_mark_assets_as_migrated(self, rows):\n if not self.has_asset_key_index_cols():\n return False\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # we have already migrated\n return False\n\n for row in rows:\n if not _get_from_row(row, "last_materialization_timestamp"):\n return False\n\n if _get_from_row(row, "asset_details") and not _get_from_row(row, "wipe_timestamp"):\n return False\n\n return True\n\n def _apply_asset_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n prefix=None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> SqlAlchemyQuery:\n if asset_keys is not None:\n query = query.where(\n AssetKeyTable.c.asset_key.in_([asset_key.to_string() for asset_key in asset_keys])\n )\n\n if prefix:\n prefix_str = seven.dumps(prefix)[:-1]\n query = query.where(AssetKeyTable.c.asset_key.startswith(prefix_str))\n\n if cursor:\n query = query.where(AssetKeyTable.c.asset_key > cursor)\n\n if limit:\n query = 
query.limit(limit)\n return query\n\n def _get_assets_details(\n self, asset_keys: Sequence[AssetKey]\n ) -> Sequence[Optional[AssetDetails]]:\n check.sequence_param(asset_keys, "asset_key", AssetKey)\n rows = None\n with self.index_connection() as conn:\n rows = db_fetch_mappings(\n conn,\n db_select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details]).where(\n AssetKeyTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n ),\n )\n\n asset_key_to_details = {\n cast(str, row["asset_key"]): (\n deserialize_value(cast(str, row["asset_details"]), AssetDetails)\n if row["asset_details"]\n else None\n )\n for row in rows\n }\n\n # returns a list of the corresponding asset_details to provided asset_keys\n return [\n asset_key_to_details.get(asset_key.to_string(), None) for asset_key in asset_keys\n ]\n\n def _add_assets_wipe_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n assets_details: Sequence[Optional[AssetDetails]],\n asset_keys: Sequence[AssetKey],\n ) -> SqlAlchemyQuery:\n check.invariant(\n len(assets_details) == len(asset_keys),\n "asset_details and asset_keys must be the same length",\n )\n for i in range(len(assets_details)):\n asset_key, asset_details = asset_keys[i], assets_details[i]\n if asset_details and asset_details.last_wipe_timestamp:\n asset_key_in_row = SqlEventLogStorageTable.c.asset_key == asset_key.to_string()\n # If asset key is in row, keep the row if the timestamp > wipe timestamp, else remove the row.\n # If asset key is not in row, keep the row.\n query = query.where(\n db.or_(\n db.and_(\n asset_key_in_row,\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp),\n ),\n db.not_(asset_key_in_row),\n )\n )\n\n return query\n\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n """Fetches asset event tags for the given 
asset key.\n\n If filter_tags is provided, searches for events containing all of the filter tags. Then,\n returns all tags for those events. This enables searching for multipartitioned asset\n partition tags with a fixed dimension value, e.g. all of the tags for events where\n "country" == "US".\n\n If filter_event_id is provided, fetches only tags applied to the given event.\n\n Returns a list of dicts, where each dict is a mapping of tag key to tag value for a\n single event.\n """\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n filter_tags = check.opt_mapping_param(\n filter_tags, "filter_tags", key_type=str, value_type=str\n )\n filter_event_id = check.opt_int_param(filter_event_id, "filter_event_id")\n\n if not self.has_table(AssetEventTagsTable.name):\n raise DagsterInvalidInvocationError(\n "In order to search for asset event tags, you must run "\n "`dagster instance migrate` to create the AssetEventTags table."\n )\n\n asset_details = self._get_assets_details([asset_key])[0]\n if not filter_tags:\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).where(AssetEventTagsTable.c.asset_key == asset_key.to_string())\n if asset_details and asset_details.last_wipe_timestamp:\n tags_query = tags_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n elif self.supports_intersect:\n\n def get_tag_filter_query(tag_key, tag_value):\n filter_query = db_select([AssetEventTagsTable.c.event_id]).where(\n db.and_(\n AssetEventTagsTable.c.asset_key == asset_key.to_string(),\n AssetEventTagsTable.c.key == tag_key,\n AssetEventTagsTable.c.value == tag_value,\n )\n )\n if asset_details and asset_details.last_wipe_timestamp:\n filter_query = filter_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n return filter_query\n\n intersections = [\n 
get_tag_filter_query(tag_key, tag_value)\n for tag_key, tag_value in filter_tags.items()\n ]\n\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).where(\n db.and_(\n AssetEventTagsTable.c.event_id.in_(db.intersect(*intersections)),\n )\n )\n else:\n table = self._apply_tags_table_joins(AssetEventTagsTable, filter_tags, asset_key)\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).select_from(table)\n\n if asset_details and asset_details.last_wipe_timestamp:\n tags_query = tags_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if filter_event_id is not None:\n tags_query = tags_query.where(AssetEventTagsTable.c.event_id == filter_event_id)\n\n with self.index_connection() as conn:\n results = conn.execute(tags_query).fetchall()\n\n tags_by_event_id: Dict[int, Dict[str, str]] = defaultdict(dict)\n for row in results:\n key, value, event_id = row\n tags_by_event_id[event_id][key] = value\n\n return list(tags_by_event_id.values())\n\n def _asset_materialization_from_json_column(\n self, json_str: str\n ) -> Optional[AssetMaterialization]:\n if not json_str:\n return None\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. 
For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n event_or_materialization = deserialize_value(json_str, NamedTuple)\n if isinstance(event_or_materialization, AssetMaterialization):\n return event_or_materialization\n\n if (\n not isinstance(event_or_materialization, EventLogEntry)\n or not event_or_materialization.is_dagster_event\n or not event_or_materialization.dagster_event.asset_key # type: ignore\n ):\n return None\n\n return event_or_materialization.dagster_event.step_materialization_data.materialization # type: ignore\n\n def _get_asset_key_values_on_wipe(self) -> Mapping[str, Any]:\n wipe_timestamp = pendulum.now("UTC").timestamp()\n values = {\n "asset_details": serialize_value(AssetDetails(last_wipe_timestamp=wipe_timestamp)),\n "last_run_id": None,\n }\n if self.has_asset_key_index_cols():\n values.update(\n dict(\n wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp),\n )\n )\n if self.can_cache_asset_status_data():\n values.update(dict(cached_status_data=None))\n return values\n\n def wipe_asset(self, asset_key: AssetKey) -> None:\n check.inst_param(asset_key, "asset_key", AssetKey)\n wiped_values = self._get_asset_key_values_on_wipe()\n\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .values(**wiped_values)\n .where(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.partition,\n db.func.max(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.partition)\n )\n\n 
assets_details = self._get_assets_details([asset_key])\n query = self._add_assets_wipe_filter_to_query(query, assets_details, [asset_key])\n\n if after_cursor:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n if before_cursor:\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return set([cast(str, row[0]) for row in results])\n\n def get_materialization_count_by_partition(\n self,\n asset_keys: Sequence[AssetKey],\n after_cursor: Optional[int] = None,\n before_cursor: Optional[int] = None,\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n check.sequence_param(asset_keys, "asset_keys", AssetKey)\n\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.partition,\n db.func.count(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.partition)\n )\n\n assets_details = self._get_assets_details(asset_keys)\n query = self._add_assets_wipe_filter_to_query(query, assets_details, asset_keys)\n\n if after_cursor:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n materialization_count_by_partition: Dict[AssetKey, Dict[str, int]] = {\n asset_key: {} for asset_key in asset_keys\n }\n for row in results:\n asset_key = AssetKey.from_db_string(cast(Optional[str], row[0]))\n if asset_key:\n materialization_count_by_partition[asset_key][cast(str, row[1])] = cast(int, row[2])\n\n return materialization_count_by_partition\n\n def _latest_event_ids_by_partition_subquery(\n 
self,\n asset_key: AssetKey,\n event_types: Sequence[DagsterEventType],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ):\n """Subquery for locating the latest event ids by partition for a given asset key and set\n of event types.\n """\n query = db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n SqlEventLogStorageTable.c.partition,\n db.func.max(SqlEventLogStorageTable.c.id).label("id"),\n ]\n ).where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [event_type.value for event_type in event_types]\n ),\n )\n )\n if asset_partitions is not None:\n query = query.where(SqlEventLogStorageTable.c.partition.in_(asset_partitions))\n if before_cursor is not None:\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor)\n if after_cursor is not None:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n\n latest_event_ids_subquery = query.group_by(\n SqlEventLogStorageTable.c.dagster_event_type, SqlEventLogStorageTable.c.partition\n )\n\n assets_details = self._get_assets_details([asset_key])\n return db_subquery(\n self._add_assets_wipe_filter_to_query(\n latest_event_ids_subquery, assets_details, [asset_key]\n ),\n "latest_event_ids_by_partition_subquery",\n )\n\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: DagsterEventType\n ) -> Mapping[str, int]:\n """Fetch the latest materialzation storage id for each partition for a given asset key.\n\n Returns a mapping of partition to storage id.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n latest_event_ids_by_partition_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key, [event_type]\n )\n latest_event_ids_by_partition = db_select(\n [\n latest_event_ids_by_partition_subquery.c.partition,\n 
latest_event_ids_by_partition_subquery.c.id,\n ]\n )\n\n with self.index_connection() as conn:\n rows = conn.execute(latest_event_ids_by_partition).fetchall()\n\n latest_materialization_storage_id_by_partition: Dict[str, int] = {}\n for row in rows:\n latest_materialization_storage_id_by_partition[cast(str, row[0])] = cast(int, row[1])\n return latest_materialization_storage_id_by_partition\n\n def get_latest_tags_by_partition(\n self,\n asset_key: AssetKey,\n event_type: DagsterEventType,\n tag_keys: Sequence[str],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Mapping[str, Mapping[str, str]]:\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.inst_param(event_type, "event_type", DagsterEventType)\n check.sequence_param(tag_keys, "tag_keys", of_type=str)\n check.opt_nullable_sequence_param(asset_partitions, "asset_partitions", of_type=str)\n check.opt_int_param(before_cursor, "before_cursor")\n check.opt_int_param(after_cursor, "after_cursor")\n\n latest_event_ids_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key=asset_key,\n event_types=[event_type],\n asset_partitions=asset_partitions,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n )\n\n latest_tags_by_partition_query = (\n db_select(\n [\n latest_event_ids_subquery.c.partition,\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n ]\n )\n .select_from(\n latest_event_ids_subquery.join(\n AssetEventTagsTable,\n AssetEventTagsTable.c.event_id == latest_event_ids_subquery.c.id,\n )\n )\n .where(AssetEventTagsTable.c.key.in_(tag_keys))\n )\n\n latest_tags_by_partition: Dict[str, Dict[str, str]] = defaultdict(dict)\n with self.index_connection() as conn:\n rows = conn.execute(latest_tags_by_partition_query).fetchall()\n\n for row in rows:\n latest_tags_by_partition[cast(str, row[0])][cast(str, row[1])] = cast(str, row[2])\n\n # convert defaultdict to dict\n return 
dict(latest_tags_by_partition)\n\n def get_latest_asset_partition_materialization_attempts_without_materializations(\n self, asset_key: AssetKey\n ) -> Mapping[str, Tuple[str, int]]:\n """Fetch the latest materialzation and materialization planned events for each partition of the given asset.\n Return the partitions that have a materialization planned event but no matching (same run) materialization event.\n These materializations could be in progress, or they could have failed. A separate query checking the run status\n is required to know.\n\n Returns a mapping of partition to [run id, event id].\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n latest_event_ids_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key,\n [\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n ],\n )\n\n latest_events_subquery = db_subquery(\n db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n SqlEventLogStorageTable.c.partition,\n SqlEventLogStorageTable.c.run_id,\n SqlEventLogStorageTable.c.id,\n ]\n ).select_from(\n latest_event_ids_subquery.join(\n SqlEventLogStorageTable,\n SqlEventLogStorageTable.c.id == latest_event_ids_subquery.c.id,\n ),\n ),\n "latest_events_subquery",\n )\n\n materialization_planned_events = db_select(\n [\n latest_events_subquery.c.dagster_event_type,\n latest_events_subquery.c.partition,\n latest_events_subquery.c.run_id,\n latest_events_subquery.c.id,\n ]\n ).where(\n latest_events_subquery.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value\n )\n\n materialization_events = db_select(\n [\n latest_events_subquery.c.dagster_event_type,\n latest_events_subquery.c.partition,\n latest_events_subquery.c.run_id,\n ]\n ).where(\n latest_events_subquery.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value\n )\n\n with self.index_connection() as conn:\n materialization_planned_rows = db_fetch_mappings(conn, 
materialization_planned_events)\n materialization_rows = db_fetch_mappings(conn, materialization_events)\n\n materialization_planned_rows_by_partition = {\n cast(str, row["partition"]): (cast(str, row["run_id"]), cast(int, row["id"]))\n for row in materialization_planned_rows\n }\n for row in materialization_rows:\n if (\n row["partition"] in materialization_planned_rows_by_partition\n and materialization_planned_rows_by_partition[cast(str, row["partition"])][0]\n == row["run_id"]\n ):\n materialization_planned_rows_by_partition.pop(cast(str, row["partition"]))\n\n return materialization_planned_rows_by_partition\n\n def _check_partitions_table(self) -> None:\n # Guards against cases where the user is not running the latest migration for\n # partitions storage. Should be updated when the partitions storage schema changes.\n if not self.has_table("dynamic_partitions"):\n raise DagsterInvalidInvocationError(\n "Using dynamic partitions definitions requires the dynamic partitions table, which"\n " currently does not exist. 
Add this table by running `dagster"\n " instance migrate`."\n )\n\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the list of partition keys for a partition definition."""\n self._check_partitions_table()\n columns = [\n DynamicPartitionsTable.c.partitions_def_name,\n DynamicPartitionsTable.c.partition,\n ]\n query = (\n db_select(columns)\n .where(DynamicPartitionsTable.c.partitions_def_name == partitions_def_name)\n .order_by(DynamicPartitionsTable.c.id)\n )\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n return [cast(str, row[1]) for row in rows]\n\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n self._check_partitions_table()\n query = (\n db_select([DynamicPartitionsTable.c.partition])\n .where(\n db.and_(\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n DynamicPartitionsTable.c.partition == partition_key,\n )\n )\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n self._check_partitions_table()\n with self.index_connection() as conn:\n existing_rows = conn.execute(\n db_select([DynamicPartitionsTable.c.partition]).where(\n db.and_(\n DynamicPartitionsTable.c.partition.in_(partition_keys),\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n )\n )\n ).fetchall()\n existing_keys = set([row[0] for row in existing_rows])\n new_keys = [\n partition_key\n for partition_key in partition_keys\n if partition_key not in existing_keys\n ]\n\n if new_keys:\n conn.execute(\n DynamicPartitionsTable.insert(),\n [\n dict(partitions_def_name=partitions_def_name, partition=partition_key)\n for partition_key in new_keys\n ],\n )\n\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n 
self._check_partitions_table()\n with self.index_connection() as conn:\n conn.execute(\n DynamicPartitionsTable.delete().where(\n db.and_(\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n DynamicPartitionsTable.c.partition == partition_key,\n )\n )\n )\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return self.has_table(ConcurrencySlotsTable.name)\n\n def set_concurrency_slots(self, concurrency_key: str, num: int) -> None:\n """Allocate a set of concurrency slots.\n\n Args:\n concurrency_key (str): The key to allocate the slots for.\n num (int): The number of slots to allocate.\n """\n if num > MAX_CONCURRENCY_SLOTS:\n raise DagsterInvalidInvocationError(\n f"Cannot have more than {MAX_CONCURRENCY_SLOTS} slots per concurrency key."\n )\n if num < 0:\n raise DagsterInvalidInvocationError("Cannot have a negative number of slots.")\n\n keys_to_assign = None\n with self.index_connection() as conn:\n count_row = conn.execute(\n db_select([db.func.count()])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n ).fetchone()\n existing = cast(int, count_row[0]) if count_row else 0\n\n if existing > num:\n # need to delete some slots, favoring ones where the slot is unallocated\n rows = conn.execute(\n db_select([ConcurrencySlotsTable.c.id])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n .order_by(\n db_case([(ConcurrencySlotsTable.c.run_id.is_(None), 1)], else_=0).desc(),\n ConcurrencySlotsTable.c.id.desc(),\n )\n .limit(existing - num)\n ).fetchall()\n\n if rows:\n # mark rows as deleted\n conn.execute(\n ConcurrencySlotsTable.update()\n .values(deleted=True)\n .where(ConcurrencySlotsTable.c.id.in_([row[0] for row in rows]))\n )\n\n # actually delete 
rows that are marked as deleted and are not claimed... the rest\n # will be deleted when the slots are released by the free_concurrency_slots\n conn.execute(\n ConcurrencySlotsTable.delete().where(\n db.and_(\n ConcurrencySlotsTable.c.deleted == True, # noqa: E712\n ConcurrencySlotsTable.c.run_id == None, # noqa: E711\n )\n )\n )\n elif num > existing:\n # need to add some slots\n rows = [\n {\n "concurrency_key": concurrency_key,\n "run_id": None,\n "step_key": None,\n "deleted": False,\n }\n for _ in range(existing, num)\n ]\n conn.execute(ConcurrencySlotsTable.insert().values(rows))\n keys_to_assign = [concurrency_key for _ in range(existing, num)]\n\n if keys_to_assign:\n # we've added some slots... if there are any pending steps, we can assign them now or\n # they will be unutilized until free_concurrency_slots is called\n self.assign_pending_steps(keys_to_assign)\n\n def has_unassigned_slots(self, concurrency_key: str) -> bool:\n with self.index_connection() as conn:\n pending_row = conn.execute(\n db_select([db.func.count()])\n .select_from(PendingStepsTable)\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == concurrency_key,\n PendingStepsTable.c.assigned_timestamp != None, # noqa: E711\n )\n )\n ).fetchone()\n slots = conn.execute(\n db_select([db.func.count()])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n ).fetchone()\n pending_count = cast(int, pending_row[0]) if pending_row else 0\n slots_count = cast(int, slots[0]) if slots else 0\n return slots_count > pending_count\n\n def check_concurrency_claim(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencyClaimStatus:\n with self.index_connection() as conn:\n pending_row = conn.execute(\n db_select(\n [\n PendingStepsTable.c.assigned_timestamp,\n PendingStepsTable.c.priority,\n PendingStepsTable.c.create_timestamp,\n ]\n ).where(\n 
db.and_(\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n PendingStepsTable.c.concurrency_key == concurrency_key,\n )\n )\n ).fetchone()\n\n if not pending_row:\n # no pending step pending_row exists, the slot is blocked and the enqueued timestamp is None\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=ConcurrencySlotStatus.BLOCKED,\n priority=None,\n assigned_timestamp=None,\n enqueued_timestamp=None,\n )\n\n priority = cast(int, pending_row[1]) if pending_row[1] else None\n assigned_timestamp = cast(datetime, pending_row[0]) if pending_row[0] else None\n create_timestamp = cast(datetime, pending_row[2]) if pending_row[2] else None\n if assigned_timestamp is None:\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=ConcurrencySlotStatus.BLOCKED,\n priority=priority,\n assigned_timestamp=None,\n enqueued_timestamp=create_timestamp,\n )\n\n # pending step is assigned, check to see if it's been claimed\n slot_row = conn.execute(\n db_select([db.func.count()]).where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.run_id == run_id,\n ConcurrencySlotsTable.c.step_key == step_key,\n )\n )\n ).fetchone()\n\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=(\n ConcurrencySlotStatus.CLAIMED\n if slot_row and slot_row[0]\n else ConcurrencySlotStatus.BLOCKED\n ),\n priority=priority,\n assigned_timestamp=assigned_timestamp,\n enqueued_timestamp=create_timestamp,\n )\n\n def can_claim_from_pending(self, concurrency_key: str, run_id: str, step_key: str):\n with self.index_connection() as conn:\n row = conn.execute(\n db_select([PendingStepsTable.c.assigned_timestamp]).where(\n db.and_(\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n PendingStepsTable.c.concurrency_key == concurrency_key,\n )\n )\n ).fetchone()\n return row and row[0] is not None\n\n def 
has_pending_step(self, concurrency_key: str, run_id: str, step_key: str):\n with self.index_connection() as conn:\n row = conn.execute(\n db_select([db.func.count()])\n .select_from(PendingStepsTable)\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == concurrency_key,\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n )\n )\n ).fetchone()\n return row and cast(int, row[0]) > 0\n\n def assign_pending_steps(self, concurrency_keys: Sequence[str]):\n if not concurrency_keys:\n return\n\n with self.index_connection() as conn:\n for key in concurrency_keys:\n row = conn.execute(\n db_select([PendingStepsTable.c.id])\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == key,\n PendingStepsTable.c.assigned_timestamp == None, # noqa: E711\n )\n )\n .order_by(\n PendingStepsTable.c.priority.desc(),\n PendingStepsTable.c.create_timestamp.asc(),\n )\n .limit(1)\n ).fetchone()\n if row:\n conn.execute(\n PendingStepsTable.update()\n .where(PendingStepsTable.c.id == row[0])\n .values(assigned_timestamp=db.func.now())\n )\n\n def add_pending_step(\n self,\n concurrency_key: str,\n run_id: str,\n step_key: str,\n priority: Optional[int] = None,\n should_assign: bool = False,\n ):\n with self.index_connection() as conn:\n try:\n conn.execute(\n PendingStepsTable.insert().values(\n [\n dict(\n run_id=run_id,\n step_key=step_key,\n concurrency_key=concurrency_key,\n priority=priority or 0,\n assigned_timestamp=db.func.now() if should_assign else None,\n )\n ]\n )\n )\n except db_exc.IntegrityError:\n # do nothing\n pass\n\n def _remove_pending_steps(self, run_id: str, step_key: Optional[str] = None):\n query = PendingStepsTable.delete().where(PendingStepsTable.c.run_id == run_id)\n if step_key:\n query = query.where(PendingStepsTable.c.step_key == step_key)\n with self.index_connection() as conn:\n conn.execute(query)\n\n def claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str, priority: Optional[int] 
= None\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slot for step.\n\n Args:\n concurrency_keys (str): The concurrency key to claim.\n run_id (str): The run id to claim for.\n step_key (str): The step key to claim for.\n """\n # first, register the step by adding to pending queue\n if not self.has_pending_step(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n ):\n has_unassigned_slots = self.has_unassigned_slots(concurrency_key)\n self.add_pending_step(\n concurrency_key=concurrency_key,\n run_id=run_id,\n step_key=step_key,\n priority=priority,\n should_assign=has_unassigned_slots,\n )\n\n # if the step is not assigned (i.e. has not been popped from queue), block the claim\n claim_status = self.check_concurrency_claim(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n )\n if claim_status.is_claimed or not claim_status.is_assigned:\n return claim_status\n\n # attempt to claim a concurrency slot... this should generally work because we only assign\n # based on the number of unclaimed slots, but this should act as a safeguard, using the slot\n # rows as a semaphore\n slot_status = self._claim_concurrency_slot(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n )\n return claim_status.with_slot_status(slot_status)\n\n def _claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencySlotStatus:\n """Claim a concurrency slot for the step. 
Helper method that is called for steps that are\n popped off the priority queue.\n\n Args:\n concurrency_key (str): The concurrency key to claim.\n run_id (str): The run id to claim a slot for.\n step_key (str): The step key to claim a slot for.\n """\n with self.index_connection() as conn:\n result = conn.execute(\n db_select([ConcurrencySlotsTable.c.id])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.step_key == None, # noqa: E711\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n .with_for_update(skip_locked=True)\n .limit(1)\n ).fetchone()\n if not result or not result[0]:\n return ConcurrencySlotStatus.BLOCKED\n if not conn.execute(\n ConcurrencySlotsTable.update()\n .values(run_id=run_id, step_key=step_key)\n .where(ConcurrencySlotsTable.c.id == result[0])\n ).rowcount:\n return ConcurrencySlotStatus.BLOCKED\n\n return ConcurrencySlotStatus.CLAIMED\n\n def get_concurrency_keys(self) -> Set[str]:\n """Get the set of concurrency limited keys."""\n with self.index_connection() as conn:\n rows = conn.execute(\n db_select([ConcurrencySlotsTable.c.concurrency_key])\n .select_from(ConcurrencySlotsTable)\n .where(ConcurrencySlotsTable.c.deleted == False) # noqa: E712\n .distinct()\n ).fetchall()\n return {cast(str, row[0]) for row in rows}\n\n def get_concurrency_info(self, concurrency_key: str) -> ConcurrencyKeyInfo:\n """Get the list of concurrency slots for a given concurrency key.\n\n Args:\n concurrency_key (str): The concurrency key to get the slots for.\n\n Returns:\n List[Tuple[str, int]]: A list of tuples of run_id and the number of slots it is\n occupying for the given concurrency key.\n """\n with self.index_connection() as conn:\n slot_query = (\n db_select(\n [\n ConcurrencySlotsTable.c.run_id,\n ConcurrencySlotsTable.c.deleted,\n db.func.count().label("count"),\n ]\n )\n .select_from(ConcurrencySlotsTable)\n 
.where(ConcurrencySlotsTable.c.concurrency_key == concurrency_key)\n .group_by(ConcurrencySlotsTable.c.run_id, ConcurrencySlotsTable.c.deleted)\n )\n slot_rows = db_fetch_mappings(conn, slot_query)\n pending_query = (\n db_select(\n [\n PendingStepsTable.c.run_id,\n db_case(\n [(PendingStepsTable.c.assigned_timestamp.is_(None), False)],\n else_=True,\n ).label("is_assigned"),\n db.func.count().label("count"),\n ]\n )\n .select_from(PendingStepsTable)\n .where(PendingStepsTable.c.concurrency_key == concurrency_key)\n .group_by(PendingStepsTable.c.run_id, "is_assigned")\n )\n pending_rows = db_fetch_mappings(conn, pending_query)\n\n return ConcurrencyKeyInfo(\n concurrency_key=concurrency_key,\n slot_count=sum(\n [\n cast(int, slot_row["count"])\n for slot_row in slot_rows\n if not slot_row["deleted"]\n ]\n ),\n active_slot_count=sum(\n [cast(int, slot_row["count"]) for slot_row in slot_rows if slot_row["run_id"]]\n ),\n active_run_ids={\n cast(str, slot_row["run_id"]) for slot_row in slot_rows if slot_row["run_id"]\n },\n pending_step_count=sum(\n [cast(int, row["count"]) for row in pending_rows if not row["is_assigned"]]\n ),\n pending_run_ids={\n cast(str, row["run_id"]) for row in pending_rows if not row["is_assigned"]\n },\n assigned_step_count=sum(\n [cast(int, row["count"]) for row in pending_rows if row["is_assigned"]]\n ),\n assigned_run_ids={\n cast(str, row["run_id"]) for row in pending_rows if row["is_assigned"]\n },\n )\n\n def get_concurrency_run_ids(self) -> Set[str]:\n with self.index_connection() as conn:\n rows = conn.execute(db_select([PendingStepsTable.c.run_id]).distinct()).fetchall()\n return set([cast(str, row[0]) for row in rows])\n\n def free_concurrency_slots_for_run(self, run_id: str) -> None:\n freed_concurrency_keys = self._free_concurrency_slots(run_id=run_id)\n self._remove_pending_steps(run_id=run_id)\n if freed_concurrency_keys:\n # assign any pending steps that can now claim a slot\n 
self.assign_pending_steps(freed_concurrency_keys)\n\n def free_concurrency_slot_for_step(self, run_id: str, step_key: str) -> None:\n freed_concurrency_keys = self._free_concurrency_slots(run_id=run_id, step_key=step_key)\n self._remove_pending_steps(run_id=run_id, step_key=step_key)\n if freed_concurrency_keys:\n # assign any pending steps that can now claim a slot\n self.assign_pending_steps(freed_concurrency_keys)\n\n def _free_concurrency_slots(self, run_id: str, step_key: Optional[str] = None) -> Sequence[str]:\n """Frees concurrency slots for a given run/step.\n\n Args:\n run_id (str): The run id to free the slots for.\n step_key (Optional[str]): The step key to free the slots for. If not provided, all the\n slots for all the steps of the run will be freed.\n """\n with self.index_connection() as conn:\n # first delete any rows that apply and are marked as deleted. This happens when the\n # configured number of slots has been reduced, and some of the pruned slots included\n # ones that were already allocated to the run/step\n delete_query = ConcurrencySlotsTable.delete().where(\n db.and_(\n ConcurrencySlotsTable.c.run_id == run_id,\n ConcurrencySlotsTable.c.deleted == True, # noqa: E712\n )\n )\n if step_key:\n delete_query = delete_query.where(ConcurrencySlotsTable.c.step_key == step_key)\n conn.execute(delete_query)\n\n # next, fetch the slots to free up, while grabbing the concurrency keys so that we can\n # allocate any pending steps from the queue for the freed slots, if necessary\n select_query = (\n db_select([ConcurrencySlotsTable.c.id, ConcurrencySlotsTable.c.concurrency_key])\n .select_from(ConcurrencySlotsTable)\n .where(ConcurrencySlotsTable.c.run_id == run_id)\n .with_for_update(skip_locked=True)\n )\n if step_key:\n select_query = select_query.where(ConcurrencySlotsTable.c.step_key == step_key)\n rows = conn.execute(select_query).fetchall()\n if not rows:\n return []\n\n # now, actually free the slots\n conn.execute(\n 
ConcurrencySlotsTable.update()\n .values(run_id=None, step_key=None)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.id.in_([row[0] for row in rows]),\n )\n )\n )\n\n # return the concurrency keys for the freed slots\n return [cast(str, row[1]) for row in rows]\n\n def store_asset_check_event(self, event: EventLogEntry, event_id: Optional[int]) -> None:\n check.inst_param(event, "event", EventLogEntry)\n check.opt_int_param(event_id, "event_id")\n\n check.invariant(\n self.supports_asset_checks,\n "Asset checks require a database schema migration. Run `dagster instance migrate`.",\n )\n\n if event.dagster_event_type == DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED:\n self._store_asset_check_evaluation_planned(event, event_id)\n if event.dagster_event_type == DagsterEventType.ASSET_CHECK_EVALUATION:\n if event.run_id == "" or event.run_id is None:\n self._store_runless_asset_check_evaluation(event, event_id)\n else:\n self._update_asset_check_evaluation(event, event_id)\n\n def _store_asset_check_evaluation_planned(\n self, event: EventLogEntry, event_id: Optional[int]\n ) -> None:\n planned = cast(\n AssetCheckEvaluationPlanned, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n conn.execute(\n AssetCheckExecutionsTable.insert().values(\n asset_key=planned.asset_key.to_string(),\n check_name=planned.check_name,\n run_id=event.run_id,\n execution_status=AssetCheckExecutionRecordStatus.PLANNED.value,\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n )\n )\n\n def _store_runless_asset_check_evaluation(\n self, event: EventLogEntry, event_id: Optional[int]\n ) -> None:\n evaluation = cast(\n AssetCheckEvaluation, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n conn.execute(\n AssetCheckExecutionsTable.insert().values(\n asset_key=evaluation.asset_key.to_string(),\n 
check_name=evaluation.check_name,\n run_id=event.run_id,\n execution_status=(\n AssetCheckExecutionRecordStatus.SUCCEEDED.value\n if evaluation.passed\n else AssetCheckExecutionRecordStatus.FAILED.value\n ),\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n evaluation_event_storage_id=event_id,\n materialization_event_storage_id=(\n evaluation.target_materialization_data.storage_id\n if evaluation.target_materialization_data\n else None\n ),\n )\n )\n\n def _update_asset_check_evaluation(self, event: EventLogEntry, event_id: Optional[int]) -> None:\n evaluation = cast(\n AssetCheckEvaluation, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n rows_updated = conn.execute(\n AssetCheckExecutionsTable.update()\n .where(\n # (asset_key, check_name, run_id) uniquely identifies the row created for the planned event\n db.and_(\n AssetCheckExecutionsTable.c.asset_key == evaluation.asset_key.to_string(),\n AssetCheckExecutionsTable.c.check_name == evaluation.check_name,\n AssetCheckExecutionsTable.c.run_id == event.run_id,\n )\n )\n .values(\n execution_status=(\n AssetCheckExecutionRecordStatus.SUCCEEDED.value\n if evaluation.passed\n else AssetCheckExecutionRecordStatus.FAILED.value\n ),\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n evaluation_event_storage_id=event_id,\n materialization_event_storage_id=(\n evaluation.target_materialization_data.storage_id\n if evaluation.target_materialization_data\n else None\n ),\n )\n ).rowcount\n if rows_updated != 1:\n raise DagsterInvariantViolationError(\n "Expected to update one row for asset check evaluation, but updated"\n f" {rows_updated}."\n )\n\n def get_asset_check_execution_history(\n self,\n check_key: AssetCheckKey,\n limit: int,\n cursor: Optional[int] = None,\n ) -> Sequence[AssetCheckExecutionRecord]:\n 
check.inst_param(check_key, "key", AssetCheckKey)\n check.int_param(limit, "limit")\n check.opt_int_param(cursor, "cursor")\n\n query = (\n db_select(\n [\n AssetCheckExecutionsTable.c.id,\n AssetCheckExecutionsTable.c.run_id,\n AssetCheckExecutionsTable.c.execution_status,\n AssetCheckExecutionsTable.c.evaluation_event,\n AssetCheckExecutionsTable.c.create_timestamp,\n ]\n )\n .where(\n db.and_(\n AssetCheckExecutionsTable.c.asset_key == check_key.asset_key.to_string(),\n AssetCheckExecutionsTable.c.check_name == check_key.name,\n )\n )\n .order_by(AssetCheckExecutionsTable.c.id.desc())\n ).limit(limit)\n\n if cursor:\n query = query.where(AssetCheckExecutionsTable.c.id < cursor)\n\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n return [AssetCheckExecutionRecord.from_db_row(row) for row in rows]\n\n def get_latest_asset_check_execution_by_key(\n self, check_keys: Sequence[AssetCheckKey]\n ) -> Mapping[AssetCheckKey, AssetCheckExecutionRecord]:\n if not check_keys:\n return {}\n\n latest_ids_subquery = db_subquery(\n db_select(\n [\n db.func.max(AssetCheckExecutionsTable.c.id).label("id"),\n ]\n )\n .where(\n db.and_(\n AssetCheckExecutionsTable.c.asset_key.in_(\n [key.asset_key.to_string() for key in check_keys]\n ),\n AssetCheckExecutionsTable.c.check_name.in_([key.name for key in check_keys]),\n )\n )\n .group_by(\n AssetCheckExecutionsTable.c.asset_key,\n AssetCheckExecutionsTable.c.check_name,\n )\n )\n\n query = db_select(\n [\n AssetCheckExecutionsTable.c.id,\n AssetCheckExecutionsTable.c.asset_key,\n AssetCheckExecutionsTable.c.check_name,\n AssetCheckExecutionsTable.c.run_id,\n AssetCheckExecutionsTable.c.execution_status,\n AssetCheckExecutionsTable.c.evaluation_event,\n AssetCheckExecutionsTable.c.create_timestamp,\n ]\n ).select_from(\n AssetCheckExecutionsTable.join(\n latest_ids_subquery,\n db.and_(\n AssetCheckExecutionsTable.c.id == latest_ids_subquery.c.id,\n ),\n )\n )\n\n with self.index_connection() as conn:\n 
rows = db_fetch_mappings(conn, query)\n\n return {\n AssetCheckKey(\n asset_key=check.not_none(AssetKey.from_db_string(cast(str, row["asset_key"]))),\n name=cast(str, row["check_name"]),\n ): AssetCheckExecutionRecord.from_db_row(row)\n for row in rows\n }\n\n @property\n def supports_asset_checks(self):\n return self.has_table(AssetCheckExecutionsTable.name)
\n\n\ndef _get_from_row(row: SqlAlchemyRow, column: str) -> object:\n """Utility function for extracting a column from a sqlalchemy row proxy, since '_asdict' is not\n supported in sqlalchemy 1.3.\n """\n if column not in row.keys():\n return None\n return row[column]\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sql_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sql_event_log"}, "sqlite": {"consolidated_sqlite_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sqlite.consolidated_sqlite_event_log

\nimport logging\nimport os\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import Any, Mapping, Optional\n\nimport sqlalchemy as db\nfrom sqlalchemy.pool import NullPool\nfrom typing_extensions import Self\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nfrom dagster._config import StringSource\nfrom dagster._core.storage.dagster_run import DagsterRunStatus\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata\nfrom ..sql_event_log import SqlDbConnection, SqlEventLogStorage\n\nSQLITE_EVENT_LOG_FILENAME = "event_log"\n\n\n
[docs]class ConsolidatedSqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed consolidated event log storage intended for test cases only.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\n the following to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster._core.storage.event_log\n class: ConsolidatedSqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the database.\n """\n\n def __init__(self, base_dir, inst_data: Optional[ConfigurableClassData] = None):\n self._base_dir = check.str_param(base_dir, "base_dir")\n self._conn_string = create_db_conn_string(base_dir, SQLITE_EVENT_LOG_FILENAME)\n self._secondary_index_cache = {}\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._watchers = defaultdict(dict)\n self._obs = None\n\n if not os.path.exists(self.get_db_path()):\n self._init_db()\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return ConsolidatedSqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def _init_db(self):\n mkdir_p(self._base_dir)\n engine = create_engine(self._conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = 
check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n if should_mark_indexes:\n # mark all secondary indexes\n self.reindex_events()\n self.reindex_assets()\n\n @contextmanager\n def _connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n def run_connection(self, run_id: Optional[str]) -> SqlDbConnection:\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n return bool(engine.dialect.has_table(engine.connect(), table_name))\n\n def get_db_path(self):\n return os.path.join(self._base_dir, f"{SQLITE_EVENT_LOG_FILENAME}.db")\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n ConsolidatedSqliteEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(ConsolidatedSqliteEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, cursor, callback):\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n self._obs.schedule(\n ConsolidatedSqliteEventLogStorageWatchdog(self), self._base_dir, True\n )\n\n self._watchers[run_id][callback] = cursor\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return False\n\n def on_modified(self):\n keys = [\n (run_id, callback)\n for 
run_id, callback_dict in self._watchers.items()\n for callback, _ in callback_dict.items()\n ]\n for run_id, callback in keys:\n cursor = self._watchers[run_id][callback]\n\n # fetch events\n connection = self.get_records_for_run(run_id, cursor)\n\n # update cursor\n if connection.cursor:\n self._watchers[run_id][callback] = connection.cursor\n\n for record in connection.records:\n status = None\n try:\n status = callback(\n record.event_log_entry,\n str(EventLogCursor.from_storage_id(record.storage_id)),\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", run_id)\n\n if (\n status == DagsterRunStatus.SUCCESS\n or status == DagsterRunStatus.FAILURE\n or status == DagsterRunStatus.CANCELED\n ):\n self.end_watch(run_id, callback)\n\n def end_watch(self, run_id, handler):\n if run_id in self._watchers and handler in self._watchers[run_id]:\n del self._watchers[run_id][handler]\n\n def dispose(self):\n if self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)
\n\n\nclass ConsolidatedSqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", ConsolidatedSqliteEventLogStorage\n )\n self._log_path = event_log_storage.get_db_path()\n super(ConsolidatedSqliteEventLogStorageWatchdog, self).__init__(\n patterns=[self._log_path], **kwargs\n )\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._event_log_storage.on_modified()\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sqlite/consolidated_sqlite_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sqlite.consolidated_sqlite_event_log"}, "sqlite_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sqlite.sqlite_event_log

\nimport contextlib\nimport glob\nimport logging\nimport os\nimport re\nimport sqlite3\nimport threading\nimport time\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Any, ContextManager, Iterable, Iterator, Optional, Sequence\n\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection, Engine\nfrom sqlalchemy.pool import NullPool\nfrom tqdm import tqdm\nfrom watchdog.events import FileSystemEvent, PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._config import StringSource\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS, EVENT_TYPE_TO_PIPELINE_RUN_STATUS\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.dagster_run import DagsterRunStatus, RunsFilter\nfrom dagster._core.storage.event_log.base import EventLogCursor, EventLogRecord, EventRecordsFilter\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlalchemy_compat import db_select\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import (\n    ConfigurableClass,\n    ConfigurableClassData,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata, SqlEventLogStorageTable\nfrom ..sql_event_log import RunShardedEventsCursor, SqlEventLogStorage\n\nif TYPE_CHECKING:\n    from 
dagster._core.storage.sqlite_storage import SqliteStorageConfig\nINDEX_SHARD_NAME = "index"\n\n\n
[docs]class SqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file insqliteve\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default event log storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for event log storage, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n event_log_storage:\n module: dagster._core.storage.event_log\n class: SqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the databases. To\n improve concurrent performance, event logs are stored in a separate SQLite database for each\n run.\n """\n\n def __init__(self, base_dir: str, inst_data: Optional[ConfigurableClassData] = None):\n """Note that idempotent initialization of the SQLite database is done on a per-run_id\n basis in the body of connect, since each run is stored in a separate database.\n """\n self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))\n mkdir_p(self._base_dir)\n\n self._obs = None\n\n self._watchers = defaultdict(dict)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n # Used to ensure that each run ID attempts to initialize its DB the first time it connects,\n # ensuring that the database will be created if it doesn't exist\n self._initialized_dbs = set()\n\n # Ensure that multiple threads (like the event log watcher) interact safely with each other\n self._db_lock = threading.Lock()\n\n if not os.path.exists(self.path_for_shard(INDEX_SHARD_NAME)):\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = 
create_engine(conn_string, poolclass=NullPool)\n self._initdb(engine)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def upgrade(self) -> None:\n all_run_ids = self.get_all_run_ids()\n print(f"Updating event log storage for {len(all_run_ids)} runs on disk...") # noqa: T201\n alembic_config = get_alembic_config(__file__)\n if all_run_ids:\n for run_id in tqdm(all_run_ids):\n with self.run_connection(run_id) as conn:\n run_alembic_upgrade(alembic_config, conn, run_id)\n\n print("Updating event log storage for index db on disk...") # noqa: T201\n with self.index_connection() as conn:\n run_alembic_upgrade(alembic_config, conn, "index")\n\n self._initialized_dbs = set()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: "SqliteStorageConfig"\n ) -> "SqliteEventLogStorage":\n return SqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def get_all_run_ids(self) -> Sequence[str]:\n all_filenames = glob.glob(os.path.join(self._base_dir, "*.db"))\n return [\n os.path.splitext(os.path.basename(filename))[0]\n for filename in all_filenames\n if os.path.splitext(os.path.basename(filename))[0] != INDEX_SHARD_NAME\n ]\n\n def has_table(self, table_name: str) -> bool:\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = create_engine(conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n return bool(engine.dialect.has_table(conn, table_name))\n\n def path_for_shard(self, run_id: str) -> str:\n return os.path.join(self._base_dir, f"{run_id}.db")\n\n def conn_string_for_shard(self, shard_name: str) -> str:\n check.str_param(shard_name, "shard_name")\n return create_db_conn_string(self._base_dir, shard_name)\n\n def _initdb(self, engine: Engine) -> None:\n 
alembic_config = get_alembic_config(__file__)\n\n retry_limit = 10\n\n while True:\n try:\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n\n break\n except (db_exc.DatabaseError, sqlite3.DatabaseError, sqlite3.OperationalError) as exc:\n # This is SQLite-specific handling for concurrency issues that can arise when\n # multiple processes (e.g. the dagster-webserver process and user code process) contend with\n # each other to init the db. When we hit the following errors, we know that another\n # process is on the case and we should retry.\n err_msg = str(exc)\n\n if not (\n re.search(r"table [A-Za-z_]* already exists", err_msg)\n or "database is locked" in err_msg\n or "UNIQUE constraint failed: alembic_version.version_num" in err_msg\n ):\n raise\n\n if retry_limit == 0:\n raise\n else:\n logging.info(\n "SqliteEventLogStorage._initdb: Encountered apparent concurrent init, "\n "retrying (%s retries left). 
Exception: %s",\n retry_limit,\n err_msg,\n )\n time.sleep(0.2)\n retry_limit -= 1\n\n @contextmanager\n def _connect(self, shard: str) -> Iterator[Connection]:\n with self._db_lock:\n check.str_param(shard, "shard")\n\n conn_string = self.conn_string_for_shard(shard)\n engine = create_engine(conn_string, poolclass=NullPool)\n\n if shard not in self._initialized_dbs:\n self._initdb(engine)\n self._initialized_dbs.add(shard)\n\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n engine.dispose()\n\n def run_connection(self, run_id: Optional[str] = None) -> Any:\n return self._connect(run_id) # type: ignore # bad sig\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect(INDEX_SHARD_NAME)\n\n def store_event(self, event: EventLogEntry) -> None:\n """Overridden method to replicate asset events in a central assets.db sqlite shard, enabling\n cross-run asset queries.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.run_connection(run_id) as conn:\n conn.execute(insert_event_statement)\n\n if event.is_dagster_event and event.dagster_event.asset_key: # type: ignore\n check.invariant(\n event.dagster_event_type in ASSET_EVENTS,\n "Can only store asset materializations, materialization_planned, and"\n " observations in index database",\n )\n\n event_id = None\n\n # mirror the event in the cross-run index database\n with self.index_connection() as conn:\n result = conn.execute(insert_event_statement)\n event_id = result.inserted_primary_key[0]\n\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, 
None)\n\n if event.is_dagster_event and event.dagster_event_type in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n # should mirror run status change events in the index shard\n with self.index_connection() as conn:\n result = conn.execute(insert_event_statement)\n\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n """Overridden method to enable cross-run event queries in sqlite.\n\n The record id in sqlite does not auto increment cross runs, so instead of fetching events\n after record id, we only fetch events whose runs updated after update_timestamp.\n """\n check.opt_inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n is_asset_query = event_records_filter and event_records_filter.event_type in ASSET_EVENTS\n if is_asset_query:\n # asset materializations, observations and materialization planned events get mirrored\n # into the index shard, so no custom run shard-aware cursor logic needed\n return super(SqliteEventLogStorage, self).get_event_records(\n event_records_filter=event_records_filter, limit=limit, ascending=ascending\n )\n\n query = db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n if event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if event_records_filter.after_cursor is not None and not isinstance(\n event_records_filter.after_cursor, RunShardedEventsCursor\n ):\n raise Exception("""\n Called `get_event_records` on a run-sharded event log storage with a cursor that\n is not run-aware. Add a RunShardedEventsCursor to your query filter\n or switch your instance configuration to use a non-run-sharded event log storage\n (e.g. 
PostgresEventLogStorage, ConsolidatedSqliteEventLogStorage)\n """)\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n apply_cursor_filters=False, # run-sharded cursor filters don't really make sense\n )\n if limit:\n query = query.limit(limit)\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())\n\n # workaround for the run-shard sqlite to enable cross-run queries: get a list of run_ids\n # whose events may qualify the query, and then open run_connection per run_id at a time.\n run_updated_after = (\n event_records_filter.after_cursor.run_updated_after\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else None\n )\n run_records = self._instance.get_run_records(\n filters=RunsFilter(updated_after=run_updated_after),\n order_by="update_timestamp",\n ascending=ascending,\n )\n\n event_records = []\n for run_record in run_records:\n run_id = run_record.dagster_run.run_id\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n for row_id, json_str in results:\n try:\n event_record = deserialize_value(json_str, EventLogEntry)\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n if limit and len(event_records) >= limit:\n break\n except DeserializationError:\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n if limit and len(event_records) >= limit:\n break\n\n return event_records[:limit]\n\n def supports_event_consumer_queries(self) -> bool:\n return False\n\n def delete_events(self, run_id: str) -> None:\n with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n\n # delete the mirrored event in the 
cross-run index database\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n\n def wipe(self) -> None:\n # should delete all the run-sharded db files and drop the contents of the index\n for filename in (\n glob.glob(os.path.join(self._base_dir, "*.db"))\n + glob.glob(os.path.join(self._base_dir, "*.db-wal"))\n + glob.glob(os.path.join(self._base_dir, "*.db-shm"))\n ):\n if (\n not filename.endswith(f"{INDEX_SHARD_NAME}.db")\n and not filename.endswith(f"{INDEX_SHARD_NAME}.db-wal")\n and not filename.endswith(f"{INDEX_SHARD_NAME}.db-shm")\n ):\n with contextlib.suppress(FileNotFoundError):\n os.unlink(filename)\n\n self._initialized_dbs = set()\n self._wipe_index()\n\n def _delete_mirrored_events_for_asset_key(self, asset_key: AssetKey) -> None:\n with self.index_connection() as conn:\n conn.execute(\n SqlEventLogStorageTable.delete().where(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def wipe_asset(self, asset_key: AssetKey) -> None:\n # default implementation will update the event_logs in the sharded dbs, and the asset_key\n # table in the asset shard, but will not remove the mirrored event_log events in the asset\n # shard\n super(SqliteEventLogStorage, self).wipe_asset(asset_key)\n self._delete_mirrored_events_for_asset_key(asset_key)\n\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n\n watchdog = SqliteEventLogStorageWatchdog(self, run_id, callback, cursor)\n self._watchers[run_id][callback] = (\n watchdog,\n self._obs.schedule(watchdog, self._base_dir, True),\n )\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n if handler in self._watchers[run_id]:\n event_handler, watch = self._watchers[run_id][handler]\n self._obs.remove_handler_for_watch(event_handler, watch) # type: ignore # (possible none)\n del self._watchers[run_id][handler]\n\n def dispose(self) -> None:\n if 
self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.index_connection() as conn:\n return check_alembic_revision(alembic_config, conn)\n\n @property\n def is_run_sharded(self) -> bool:\n return True\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return False
\n\n\nclass SqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(\n self,\n event_log_storage: SqliteEventLogStorage,\n run_id: str,\n callback: EventHandlerFn,\n cursor: Optional[str],\n **kwargs: Any,\n ):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", SqliteEventLogStorage\n )\n self._run_id = check.str_param(run_id, "run_id")\n self._cb = check.callable_param(callback, "callback")\n self._log_path = event_log_storage.path_for_shard(run_id)\n self._cursor = cursor\n super(SqliteEventLogStorageWatchdog, self).__init__(patterns=[self._log_path], **kwargs)\n\n def _process_log(self) -> None:\n connection = self._event_log_storage.get_records_for_run(self._run_id, self._cursor)\n if connection.cursor:\n self._cursor = connection.cursor\n for record in connection.records:\n status = None\n try:\n status = self._cb(\n record.event_log_entry, str(EventLogCursor.from_storage_id(record.storage_id))\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", self._run_id)\n\n if (\n status == DagsterRunStatus.SUCCESS\n or status == DagsterRunStatus.FAILURE\n or status == DagsterRunStatus.CANCELED\n ):\n self._event_log_storage.end_watch(self._run_id, self._cb)\n\n def on_modified(self, event: FileSystemEvent) -> None:\n check.invariant(event.src_path == self._log_path)\n self._process_log()\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sqlite/sqlite_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sqlite.sqlite_event_log"}}}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.file_manager

\nimport io\nimport os\nimport shutil\nimport uuid\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import BinaryIO, ContextManager, Iterator, Optional, TextIO, Union\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._config import Field, StringSource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource, resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._utils import mkdir_p\n\nfrom .temp_file_manager import TempfileManager\n\nIOStream: TypeAlias = Union[TextIO, BinaryIO]\n\n\n
[docs]class FileHandle(ABC):\n """A reference to a file as manipulated by a FileManager.\n\n Subclasses may handle files that are resident on the local file system, in an object store, or\n in any arbitrary place where a file can be stored.\n\n This exists to handle the very common case where you wish to write a computation that reads,\n transforms, and writes files, but where you also want the same code to work in local development\n as well as on a cluster where the files will be stored in a globally available object store\n such as S3.\n """\n\n @public\n @property\n @abstractmethod\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n raise NotImplementedError()
\n\n\n
[docs]class LocalFileHandle(FileHandle):\n """A reference to a file on a local filesystem."""\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @public\n @property\n def path(self) -> str:\n """The file's path."""\n return self._path\n\n @public\n @property\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n return self._path
\n\n\n
[docs]class FileManager(ABC):\n """Base class for all file managers in dagster.\n\n The file manager is an interface that can be implemented by resources to provide abstract\n access to a file system such as local disk, S3, or other cloud storage.\n\n For examples of usage, see the documentation of the concrete file manager implementations.\n """\n\n
[docs] @public\n @abstractmethod\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n """Copy a file represented by a file handle to a temp file.\n\n In an implementation built around an object store such as S3, this method would be expected\n to download the file from S3 to local filesystem in a location assigned by the standard\n library's :py:mod:`python:tempfile` module.\n\n Temp files returned by this method are *not* guaranteed to be reusable across solid\n boundaries. For files that must be available across solid boundaries, use the\n :py:meth:`~dagster._core.storage.file_manager.FileManager.read`,\n :py:meth:`~dagster._core.storage.file_manager.FileManager.read_data`,\n :py:meth:`~dagster._core.storage.file_manager.FileManager.write`, and\n :py:meth:`~dagster._core.storage.file_manager.FileManager.write_data` methods.\n\n Args:\n file_handle (FileHandle): The handle to the file to make available as a local temp file.\n\n Returns:\n str: Path to the local temp file.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def delete_local_temp(self) -> None:\n """Delete all local temporary files created by previous calls to\n :py:meth:`~dagster._core.storage.file_manager.FileManager.copy_handle_to_local_temp`.\n\n Should typically only be called by framework implementors.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def read(self, file_handle: FileHandle, mode: str = "rb") -> ContextManager[IOStream]:\n """Return a file-like stream for the file handle.\n\n This may incur an expensive network call for file managers backed by object stores\n such as S3.\n\n Args:\n file_handle (FileHandle): The file handle to make available as a stream.\n mode (str): The mode in which to open the file. Default: ``"rb"``.\n\n Returns:\n Union[TextIO, BinaryIO]: A file-like stream.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def read_data(self, file_handle: FileHandle) -> bytes:\n """Return the bytes for a given file handle. This may incur an expensive network\n call for file managers backed by object stores such as s3.\n\n Args:\n file_handle (FileHandle): The file handle for which to return bytes.\n\n Returns:\n bytes: Bytes for a given file handle.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def write(self, file_obj: IOStream, mode: str = "wb", ext: Optional[str] = None) -> FileHandle:\n """Write the bytes contained within the given file object into the file manager.\n\n Args:\n file_obj (Union[TextIO, StringIO]): A file-like object.\n mode (Optional[str]): The mode in which to write the file into the file manager.\n Default: ``"wb"``.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def write_data(self, data: bytes, ext: Optional[str] = None) -> FileHandle:\n """Write raw bytes into the file manager.\n\n Args:\n data (bytes): The bytes to write into the file manager.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef local_file_manager(init_context: InitResourceContext) -> "LocalFileManager":\n """FileManager that provides abstract access to a local filesystem.\n\n By default, files will be stored in `<local_artifact_storage>/storage/file_manager` where\n `<local_artifact_storage>` can be configured the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n\n Examples:\n .. code-block:: python\n\n import tempfile\n\n from dagster import job, local_file_manager, op\n\n\n @op(required_resource_keys={"file_manager"})\n def write_files(context):\n fh_1 = context.resources.file_manager.write_data(b"foo")\n\n with tempfile.NamedTemporaryFile("w+") as fd:\n fd.write("bar")\n fd.seek(0)\n fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n return (fh_1, fh_2)\n\n\n @op(required_resource_keys={"file_manager"})\n def read_files(context, file_handles):\n fh_1, fh_2 = file_handles\n assert context.resources.file_manager.read_data(fh_2) == b"bar"\n fd = context.resources.file_manager.read(fh_2, mode="r")\n assert fd.read() == "foo"\n fd.close()\n\n\n @job(resource_defs={"file_manager": local_file_manager})\n def files_pipeline():\n read_files(write_files())\n\n Or to specify the file directory:\n\n .. code-block:: python\n\n @job(\n resource_defs={\n "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n }\n )\n def files_pipeline():\n read_files(write_files())\n """\n return LocalFileManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "file_manager") # type: ignore # (possible none)\n )\n )
\n\n\ndef check_file_like_obj(obj: object) -> None:\n check.invariant(obj and hasattr(obj, "read") and hasattr(obj, "write"))\n\n\nclass LocalFileManager(FileManager):\n def __init__(self, base_dir: str):\n self.base_dir = base_dir\n self._base_dir_ensured = False\n self._temp_file_manager = TempfileManager()\n\n @staticmethod\n def for_instance(instance: DagsterInstance, run_id: str) -> "LocalFileManager":\n check.inst_param(instance, "instance", DagsterInstance)\n return LocalFileManager(instance.file_manager_directory(run_id))\n\n def ensure_base_dir_exists(self) -> None:\n if self._base_dir_ensured:\n return\n\n mkdir_p(self.base_dir)\n\n self._base_dir_ensured = True\n\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n check.inst_param(file_handle, "file_handle", FileHandle)\n with self.read(file_handle, "rb") as handle_obj: # type: ignore # (??)\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_file_obj.write(handle_obj.read())\n temp_name = temp_file_obj.name\n temp_file_obj.close()\n return temp_name\n\n @contextmanager\n def read(self, file_handle: LocalFileHandle, mode: str = "rb") -> Iterator[IOStream]:\n check.inst_param(file_handle, "file_handle", LocalFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n encoding = None if mode == "rb" else "utf8"\n with open(file_handle.path, mode, encoding=encoding) as file_obj:\n yield file_obj # type: ignore # (??)\n\n def read_data(self, file_handle: LocalFileHandle) -> bytes:\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read() # type: ignore # (??)\n\n def write_data(self, data: bytes, ext: Optional[str] = None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(\n self, file_obj: IOStream, mode: str = "wb", ext: Optional[str] = None\n ) -> LocalFileHandle:\n check_file_like_obj(file_obj)\n check.opt_str_param(ext, "ext")\n\n 
self.ensure_base_dir_exists()\n\n dest_file_path = os.path.join(\n self.base_dir, str(uuid.uuid4()) + (("." + ext) if ext is not None else "")\n )\n\n encoding = None if "b" in mode else "utf8"\n with open(dest_file_path, mode, encoding=encoding) as dest_file_obj:\n shutil.copyfileobj(file_obj, dest_file_obj) # type: ignore # (??)\n return LocalFileHandle(dest_file_path)\n\n def delete_local_temp(self) -> None:\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster/_core/storage/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.file_manager"}, "fs_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.fs_io_manager

\nimport os\nimport pickle\nfrom typing import TYPE_CHECKING, Any, Optional\n\nfrom pydantic import Field\n\nimport dagster._check as check\nfrom dagster import (\n    DagsterInvariantViolationError,\n    Field as DagsterField,\n)\nfrom dagster._annotations import experimental\nfrom dagster._config import StringSource\nfrom dagster._config.pythonic_config import ConfigurableIOManagerFactory\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL, mkdir_p\n\nif TYPE_CHECKING:\n    from typing_extensions import Literal\n    from upath import UPath\n\n\n
[docs]class FilesystemIOManager(ConfigurableIOManagerFactory["PickledObjectFilesystemIOManager"]):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n The base directory that the pickle files live inside is determined by:\n\n * The IO manager's "base_dir" configuration value, if specified. Otherwise...\n * A "storage/" directory underneath the value for "local_artifact_storage" in your dagster.yaml\n file, if specified. Otherwise...\n * A "storage/" directory underneath the directory that the DAGSTER_HOME environment variable\n points to, if that environment variable is specified. Otherwise...\n * A temporary directory.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n\n 1. Attach an IO manager to a set of assets using the reserved resource key ``"io_manager"``.\n\n .. code-block:: python\n\n from dagster import Definitions, asset, FilesystemIOManager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n },\n )\n\n\n 2. Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. 
code-block:: python\n\n from dagster import FilesystemIOManager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n }\n )\n def job():\n op_b(op_a())\n\n\n 3. Specify IO manager on :py:class:`Out`, which allows you to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import FilesystemIOManager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": FilesystemIOManager()})\n def job():\n op_b(op_a())\n\n """\n\n base_dir: Optional[str] = Field(default=None, description="Base directory for storing files.")\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_io_manager(self, context: InitResourceContext) -> "PickledObjectFilesystemIOManager":\n base_dir = self.base_dir or check.not_none(context.instance).storage_directory()\n return PickledObjectFilesystemIOManager(base_dir=base_dir)
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=FilesystemIOManager.to_config_schema(),\n description="Built-in filesystem IO manager that stores and retrieves values using pickling.",\n)\ndef fs_io_manager(init_context: InitResourceContext) -> "PickledObjectFilesystemIOManager":\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n The base directory that the pickle files live inside is determined by:\n\n * The IO manager's "base_dir" configuration value, if specified. Otherwise...\n * A "storage/" directory underneath the value for "local_artifact_storage" in your dagster.yaml\n file, if specified. Otherwise...\n * A "storage/" directory underneath the directory that the DAGSTER_HOME environment variable\n points to, if that environment variable is specified. Otherwise...\n * A temporary directory.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n\n 1. Attach an IO manager to a set of assets using the reserved resource key ``"io_manager"``.\n\n .. code-block:: python\n\n from dagster import Definitions, asset, fs_io_manager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n },\n )\n\n\n 2. 
Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n }\n )\n def job():\n op_b(op_a())\n\n\n 3. Specify IO manager on :py:class:`Out`, which allows you to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": fs_io_manager})\n def job():\n op_b(op_a())\n\n """\n return FilesystemIOManager.from_resource_context(init_context)
\n\n\nclass PickledObjectFilesystemIOManager(UPathIOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n Is compatible with local and remote filesystems via `universal-pathlib` and `fsspec`.\n Learn more about how to use remote filesystems here: https://github.com/fsspec/universal_pathlib.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n **kwargs: additional keyword arguments for `universal_pathlib.UPath`.\n """\n\n extension: str = "" # TODO: maybe change this to .pickle? Leaving blank for compatibility.\n\n def __init__(self, base_dir=None, **kwargs):\n from upath import UPath\n\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n\n super().__init__(base_path=UPath(base_dir, **kwargs))\n\n def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):\n try:\n with path.open("wb") as file:\n pickle.dump(obj, file, PICKLE_PROTOCOL)\n except (AttributeError, RecursionError, ImportError, pickle.PicklingError) as e:\n executor = context.step_context.job_def.executor_def\n\n if isinstance(e, RecursionError):\n # if obj can't be pickled because of RecursionError then __str__() will also\n # throw a RecursionError\n obj_repr = f"{obj.__class__} exceeds recursion limit and"\n else:\n obj_repr = obj.__str__()\n\n raise DagsterInvariantViolationError(\n f"Object {obj_repr} is not picklable. You are currently using the "\n f"fs_io_manager and the {executor.name}. You will need to use a different "\n "io manager to continue using this output. 
For example, you can use the "\n "mem_io_manager with the in_process_executor.\\n"\n "For more information on io managers, visit "\n "https://docs.dagster.io/concepts/io-management/io-managers \\n"\n "For more information on executors, vist "\n "https://docs.dagster.io/deployment/executors#overview"\n ) from e\n\n def load_from_path(self, context: InputContext, path: "UPath") -> Any:\n with path.open("rb") as file:\n return pickle.load(file)\n\n\nclass CustomPathPickledObjectFilesystemIOManager(IOManager):\n """Built-in filesystem IO managerthat stores and retrieves values using pickling and\n allow users to specify file path for outputs.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n """\n\n def __init__(self, base_dir: Optional[str] = None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode: Literal["wb"] = "wb"\n self.read_mode: Literal["rb"] = "rb"\n\n def _get_path(self, path: str) -> str:\n return os.path.join(self.base_dir, path) # type: ignore # (possible none)\n\n def handle_output(self, context: OutputContext, obj: object):\n """Pickle the data and store the object to a custom file path.\n\n This method emits an AssetMaterialization event so the assets will be tracked by the\n Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n metadata = context.metadata\n path = check.str_param(metadata.get("path"), "metadata.path") # type: ignore # (possible none)\n\n filepath = self._get_path(path)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n context.log.debug(f"Writing file at: {filepath}")\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n return AssetMaterialization(\n asset_key=AssetKey([context.job_name, context.step_key, context.name]),\n metadata={"path": MetadataValue.path(os.path.abspath(filepath))},\n )\n\n def load_input(self, context: 
InputContext) -> object:\n """Unpickle the file from a given file path and Load it to a data object."""\n check.inst_param(context, "context", InputContext)\n metadata = context.upstream_output.metadata # type: ignore # (possible none)\n path = check.str_param(metadata.get("path"), "metadata.path") # type: ignore # (possible none)\n filepath = self._get_path(path)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema={"base_dir": DagsterField(StringSource, is_required=True)})\n@experimental\ndef custom_path_fs_io_manager(\n init_context: InitResourceContext,\n) -> CustomPathPickledObjectFilesystemIOManager:\n """Built-in IO manager that allows users to custom output file path per output definition.\n\n It requires users to specify a base directory where all the step output will be stored in. It\n serializes and deserializes output values (assets) using pickling and stores the pickled object\n in the user-provided file paths.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import custom_path_fs_io_manager, job, op\n\n @op(out=Out(metadata={"path": "path/to/sample_output"}))\n def sample_data(df):\n return df[:5]\n\n my_custom_path_fs_io_manager = custom_path_fs_io_manager.configured(\n {"base_dir": "path/to/basedir"}\n )\n\n @job(resource_defs={"io_manager": my_custom_path_fs_io_manager})\n def my_job():\n sample_data()\n\n """\n return CustomPathPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get("base_dir")\n )\n
", "current_page_name": "_modules/dagster/_core/storage/fs_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.fs_io_manager"}, "input_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.input_manager

\nfrom abc import ABC, abstractmethod\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Callable, Optional, Union, cast, overload\n\nfrom typing_extensions import TypeAlias, TypeGuard\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import has_at_least_one_parameter\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition, ResourceFunction\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.input import InputContext\n\nInputLoadFn: TypeAlias = Union[\n    Callable[["InputContext"], object],\n    Callable[[], object],\n]\n\n\n
[docs]class InputManager(ABC):\n """Base interface for classes that are responsible for loading solid inputs."""\n\n @abstractmethod\n def load_input(self, context: "InputContext") -> object:\n """The user-defined read method that loads an input to a solid.\n\n Args:\n context (InputContext): The input context.\n\n Returns:\n Any: The data object.\n """
\n\n\nclass IInputManagerDefinition:\n @property\n @abstractmethod\n def input_config_schema(self) -> IDefinitionConfigSchema:\n """The schema for per-input configuration for inputs that are managed by this\n input manager.\n """\n\n\n
[docs]class InputManagerDefinition(ResourceDefinition, IInputManagerDefinition):\n """Definition of an input manager resource.\n\n Input managers load op inputs.\n\n An InputManagerDefinition is a :py:class:`ResourceDefinition` whose resource_fn returns an\n :py:class:`InputManager`.\n\n The easiest way to create an InputManagerDefinition is with the\n :py:func:`@input_manager <input_manager>` decorator.\n """\n\n def __init__(\n self,\n resource_fn: ResourceFunction,\n config_schema: Optional[CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n super(InputManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self) -> IDefinitionConfigSchema:\n return self._input_config_schema\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "InputManagerDefinition":\n return InputManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n )
\n\n\n@overload\ndef input_manager(\n config_schema: InputLoadFn,\n) -> InputManagerDefinition: ...\n\n\n@overload\ndef input_manager(\n config_schema: Optional[CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Callable[[InputLoadFn], InputManagerDefinition]: ...\n\n\n
[docs]def input_manager(\n config_schema: Union[InputLoadFn, Optional[CoercableToConfigSchema]] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[InputManagerDefinition, Callable[[InputLoadFn], InputManagerDefinition]]:\n """Define an input manager.\n\n Input managers load op inputs, either from upstream outputs or by providing default values.\n\n The decorated function should accept a :py:class:`InputContext` and resource config, and return\n a loaded object that will be passed into one of the inputs of an op.\n\n The decorator produces an :py:class:`InputManagerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource-level config. If not\n set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the resource.\n input_config_schema (Optional[ConfigSchema]): A schema for the input-level config. Each\n input that uses this input manager can be configured separately using this config.\n If not set, Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the input\n manager.\n version (Optional[str]): (Experimental) the version of the input manager definition.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import input_manager, op, job, In\n\n @input_manager\n def csv_loader(_):\n return read_csv("some/path")\n\n @op(ins={"input1": In(input_manager_key="csv_loader_key")})\n def my_op(_, input1):\n do_stuff(input1)\n\n @job(resource_defs={"csv_loader_key": csv_loader})\n def my_job():\n my_op()\n\n @input_manager(config_schema={"base_dir": str})\n def csv_loader(context):\n return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n @input_manager(input_config_schema={"path": str})\n def csv_loader(context):\n return read_csv(context.config["path"])\n """\n if _is_input_load_fn(config_schema):\n return _InputManagerDecoratorCallable()(config_schema)\n\n def _wrap(load_fn: InputLoadFn) -> InputManagerDefinition:\n return _InputManagerDecoratorCallable(\n config_schema=cast(CoercableToConfigSchema, config_schema),\n description=description,\n version=version,\n input_config_schema=input_config_schema,\n required_resource_keys=required_resource_keys,\n )(load_fn)\n\n return _wrap
\n\n\ndef _is_input_load_fn(obj: Union[InputLoadFn, CoercableToConfigSchema]) -> TypeGuard[InputLoadFn]:\n return callable(obj) and not is_callable_valid_config_arg(obj)\n\n\nclass InputManagerWrapper(InputManager):\n def __init__(self, load_fn: InputLoadFn):\n self._load_fn = load_fn\n\n def load_input(self, context: "InputContext") -> object:\n # the @input_manager decorated function (self._load_fn) may return a direct value that\n # should be used or an instance of an InputManager. So we call self._load_fn and see if the\n # result is an InputManager. If so we call it's load_input method\n intermediate = (\n # type-ignore because function being used as attribute\n self._load_fn(context)\n if has_at_least_one_parameter(self._load_fn)\n else self._load_fn() # type: ignore # (strict type guard)\n )\n\n if isinstance(intermediate, InputManager):\n return intermediate.load_input(context)\n return intermediate\n\n\nclass _InputManagerDecoratorCallable:\n def __init__(\n self,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n version: Optional[str] = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n ):\n self.config_schema = config_schema\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.input_config_schema = input_config_schema\n self.required_resource_keys = required_resource_keys\n\n def __call__(self, load_fn: InputLoadFn) -> InputManagerDefinition:\n check.callable_param(load_fn, "load_fn")\n\n def _resource_fn(_):\n return InputManagerWrapper(load_fn)\n\n input_manager_def = InputManagerDefinition(\n resource_fn=_resource_fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n input_config_schema=self.input_config_schema,\n required_resource_keys=self.required_resource_keys,\n )\n\n # `update_wrapper` typing cannot currently 
handle a Union of Callables correctly\n update_wrapper(input_manager_def, wrapped=load_fn) # type: ignore\n\n return input_manager_def\n
", "current_page_name": "_modules/dagster/_core/storage/input_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.input_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.io_manager

\nfrom abc import abstractmethod\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Optional, Set, Union, cast, overload\n\nfrom typing_extensions import TypeAlias, TypeGuard\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.storage.input_manager import IInputManagerDefinition, InputManager\nfrom dagster._core.storage.output_manager import IOutputManagerDefinition, OutputManager\n\nfrom ..decorator_utils import get_function_params\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.init import InitResourceContext\n    from dagster._core.execution.context.input import InputContext\n    from dagster._core.execution.context.output import OutputContext\n\nIOManagerFunctionWithContext = Callable[["InitResourceContext"], "IOManager"]\nIOManagerFunction: TypeAlias = Union[\n    IOManagerFunctionWithContext,\n    Callable[[], "IOManager"],\n]\n\n\ndef is_io_manager_context_provided(\n    fn: IOManagerFunction,\n) -> TypeGuard[IOManagerFunctionWithContext]:\n    return len(get_function_params(fn)) >= 1\n\n\n
[docs]class IOManagerDefinition(ResourceDefinition, IInputManagerDefinition, IOutputManagerDefinition):\n """Definition of an IO manager resource.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n An IOManagerDefinition is a :py:class:`ResourceDefinition` whose `resource_fn` returns an\n :py:class:`IOManager`.\n\n The easiest way to create an IOManagerDefnition is with the :py:func:`@io_manager <io_manager>`\n decorator.\n """\n\n def __init__(\n self,\n resource_fn: IOManagerFunction,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n input_config_schema: CoercableToConfigSchema = None,\n output_config_schema: CoercableToConfigSchema = None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n # Unlike other configurable objects, whose config schemas default to Any,\n # output_config_schema defaults to None. 
This the because IOManager input / output config\n # shares config namespace with dagster type loaders.\n self._output_config_schema = (\n convert_user_facing_definition_config_schema(output_config_schema)\n if output_config_schema is not None\n else None\n )\n super(IOManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self) -> IDefinitionConfigSchema:\n return self._input_config_schema\n\n @property\n def output_config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self._output_config_schema\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "IOManagerDefinition":\n io_def = IOManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n output_config_schema=self.output_config_schema,\n )\n\n io_def._dagster_maintained = self._is_dagster_maintained() # noqa: SLF001\n\n return io_def\n\n
[docs] @public\n @staticmethod\n def hardcoded_io_manager(\n value: "IOManager", description: Optional[str] = None\n ) -> "IOManagerDefinition":\n """A helper function that creates an ``IOManagerDefinition`` with a hardcoded IOManager.\n\n Args:\n value (IOManager): A hardcoded IO Manager which helps mock the definition.\n description ([Optional[str]]): The description of the IO Manager. Defaults to None.\n\n Returns:\n [IOManagerDefinition]: A hardcoded resource.\n """\n check.inst_param(value, "value", IOManager)\n return IOManagerDefinition(resource_fn=lambda _init_context: value, description=description)
\n\n\n
[docs]class IOManager(InputManager, OutputManager):\n """Base class for user-provided IO managers.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n Extend this class to handle how objects are loaded and stored. Users should implement\n ``handle_output`` to store an object and ``load_input`` to retrieve an object.\n """\n\n
[docs] @public\n @abstractmethod\n def load_input(self, context: "InputContext") -> Any:\n """User-defined method that loads an input to an op.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n\n Returns:\n Any: The data object.\n """
\n\n
[docs] @public\n @abstractmethod\n def handle_output(self, context: "OutputContext", obj: Any) -> None:\n """User-defined method that stores an output of an op.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n obj (Any): The object, returned by the op, to be stored.\n """
\n\n\n@overload\ndef io_manager(config_schema: IOManagerFunction) -> IOManagerDefinition: ...\n\n\n@overload\ndef io_manager(\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n) -> Callable[[IOManagerFunction], IOManagerDefinition]: ...\n\n\n
[docs]def io_manager(\n config_schema: Union[IOManagerFunction, CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n) -> Union[IOManagerDefinition, Callable[[IOManagerFunction], IOManagerDefinition],]:\n """Define an IO manager.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an\n :py:class:`IOManager`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource config. Configuration\n data available in `init_context.resource_config`. If not set, Dagster will accept any\n config provided.\n description(Optional[str]): A human-readable description of the resource.\n output_config_schema (Optional[ConfigSchema]): The schema for per-output config. If not set,\n no per-output configuration will be allowed.\n input_config_schema (Optional[ConfigSchema]): The schema for per-input config. If not set,\n Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the object\n manager.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n\n **Examples:**\n\n .. 
code-block:: python\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n write_csv("some/path")\n\n def load_input(self, context):\n return read_csv("some/path")\n\n @io_manager\n def my_io_manager(init_context):\n return MyIOManager()\n\n @op(out=Out(io_manager_key="my_io_manager_key"))\n def my_op(_):\n return do_stuff()\n\n @job(resource_defs={"my_io_manager_key": my_io_manager})\n def my_job():\n my_op()\n\n """\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n config_schema = cast(IOManagerFunction, config_schema)\n return _IOManagerDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: IOManagerFunction) -> IOManagerDefinition:\n return _IOManagerDecoratorCallable(\n config_schema=cast(Optional[UserConfigSchema], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n output_config_schema=output_config_schema,\n input_config_schema=input_config_schema,\n )(resource_fn)\n\n return _wrap
\n\n\ndef dagster_maintained_io_manager(io_manager_def: IOManagerDefinition) -> IOManagerDefinition:\n io_manager_def._dagster_maintained = True # noqa: SLF001\n return io_manager_def\n\n\nclass _IOManagerDecoratorCallable:\n def __init__(\n self,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n ):\n # type validation happens in IOManagerDefinition\n self.config_schema = config_schema\n self.description = description\n self.required_resource_keys = required_resource_keys\n self.version = version\n self.output_config_schema = output_config_schema\n self.input_config_schema = input_config_schema\n\n def __call__(self, fn: IOManagerFunction) -> IOManagerDefinition:\n check.callable_param(fn, "fn")\n\n io_manager_def = IOManagerDefinition(\n resource_fn=fn,\n config_schema=self.config_schema,\n description=self.description,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n output_config_schema=self.output_config_schema,\n input_config_schema=self.input_config_schema,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(io_manager_def, wrapped=fn) # type: ignore\n\n return io_manager_def\n
", "current_page_name": "_modules/dagster/_core/storage/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.io_manager"}, "local_compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.local_compute_log_manager

\nimport hashlib\nimport os\nimport shutil\nimport sys\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import IO, TYPE_CHECKING, Generator, Iterator, Mapping, Optional, Sequence, Tuple\n\nfrom typing_extensions import Final\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers.polling import PollingObserver\n\nfrom dagster import (\n    Field,\n    Float,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.execution.compute_logs import mirror_stream_to_file\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._seven import json\nfrom dagster._utils import ensure_dir, ensure_file, touch_file\n\nfrom .captured_log_manager import (\n    CapturedLogContext,\n    CapturedLogData,\n    CapturedLogManager,\n    CapturedLogMetadata,\n    CapturedLogSubscription,\n)\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n    ComputeLogSubscription,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.storage.cloud_storage_compute_log_manager import LogSubscription\n\nDEFAULT_WATCHDOG_POLLING_TIMEOUT: Final = 2.5\n\nIO_TYPE_EXTENSION: Final[Mapping[ComputeIOType, str]] = {\n    ComputeIOType.STDOUT: "out",\n    ComputeIOType.STDERR: "err",\n}\n\nMAX_FILENAME_LENGTH: Final = 255\n\n\n
[docs]class LocalComputeLogManager(CapturedLogManager, ComputeLogManager, ConfigurableClass):\n """Stores copies of stdout & stderr for each compute step locally on disk."""\n\n def __init__(\n self,\n base_dir: str,\n polling_timeout: Optional[float] = None,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._base_dir = base_dir\n self._polling_timeout = check.opt_float_param(\n polling_timeout, "polling_timeout", DEFAULT_WATCHDOG_POLLING_TIMEOUT\n )\n self._subscription_manager = LocalComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @property\n def polling_timeout(self) -> float:\n return self._polling_timeout\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {\n "base_dir": StringSource,\n "polling_timeout": Field(Float, is_required=False),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value\n ) -> "LocalComputeLogManager":\n return LocalComputeLogManager(inst_data=inst_data, **config_value)\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n outpath = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT])\n errpath = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR])\n with mirror_stream_to_file(sys.stdout, outpath), mirror_stream_to_file(sys.stderr, errpath):\n yield CapturedLogContext(log_key)\n\n # leave artifact on filesystem so that we know the capture is completed\n touch_file(self.complete_artifact_path(log_key))\n\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Iterator[Optional[IO]]:\n path = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n with open(path, "+a", encoding="utf-8") as 
f:\n yield f\n\n def is_capture_complete(self, log_key: Sequence[str]) -> bool:\n return os.path.exists(self.complete_artifact_path(log_key))\n\n def get_log_data(\n self, log_key: Sequence[str], cursor: Optional[str] = None, max_bytes: Optional[int] = None\n ) -> CapturedLogData:\n stdout_cursor, stderr_cursor = self.parse_cursor(cursor)\n stdout, stdout_offset = self._read_bytes(\n log_key, ComputeIOType.STDOUT, offset=stdout_cursor, max_bytes=max_bytes\n )\n stderr, stderr_offset = self._read_bytes(\n log_key, ComputeIOType.STDERR, offset=stderr_cursor, max_bytes=max_bytes\n )\n return CapturedLogData(\n log_key=log_key,\n stdout=stdout,\n stderr=stderr,\n cursor=self.build_cursor(stdout_offset, stderr_offset),\n )\n\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n return CapturedLogMetadata(\n stdout_location=self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]\n ),\n stderr_location=self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]\n ),\n stdout_download_url=self.get_captured_log_download_url(log_key, ComputeIOType.STDOUT),\n stderr_download_url=self.get_captured_log_download_url(log_key, ComputeIOType.STDERR),\n )\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n if log_key:\n paths = [\n self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]),\n self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]),\n self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT], partial=True\n ),\n self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR], partial=True\n ),\n self.get_captured_local_path(log_key, "complete"),\n ]\n for path in paths:\n if os.path.exists(path) and os.path.isfile(path):\n os.remove(path)\n elif prefix:\n dir_to_delete = os.path.join(self._base_dir, *prefix)\n if os.path.exists(dir_to_delete) and 
os.path.isdir(dir_to_delete):\n # recursively delete all files in dir\n shutil.rmtree(dir_to_delete)\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n def _read_bytes(\n self,\n log_key: Sequence[str],\n io_type: ComputeIOType,\n offset: Optional[int] = 0,\n max_bytes: Optional[int] = None,\n ):\n path = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n return self.read_path(path, offset or 0, max_bytes)\n\n def parse_cursor(self, cursor: Optional[str] = None) -> Tuple[int, int]:\n # Translates a string cursor into a set of byte offsets for stdout, stderr\n if not cursor:\n return 0, 0\n\n parts = cursor.split(":")\n if not parts or len(parts) != 2:\n return 0, 0\n\n stdout, stderr = [int(_) for _ in parts]\n return stdout, stderr\n\n def build_cursor(self, stdout_offset: int, stderr_offset: int) -> str:\n return f"{stdout_offset}:{stderr_offset}"\n\n def complete_artifact_path(self, log_key):\n return self.get_captured_local_path(log_key, "complete")\n\n def read_path(\n self,\n path: str,\n offset: int = 0,\n max_bytes: Optional[int] = None,\n ):\n if not os.path.exists(path) or not os.path.isfile(path):\n return None, offset\n\n with open(path, "rb") as f:\n f.seek(offset, os.SEEK_SET)\n if max_bytes is None:\n data = f.read()\n else:\n data = f.read(max_bytes)\n new_offset = f.tell()\n return data, new_offset\n\n def get_captured_log_download_url(self, log_key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n url = "/logs"\n for part in log_key:\n url = f"{url}/{part}"\n\n return f"{url}/{IO_TYPE_EXTENSION[io_type]}"\n\n def get_captured_local_path(self, log_key: Sequence[str], extension: str, partial=False):\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n if len(filename) > MAX_FILENAME_LENGTH:\n filename = "{}.{}".format(hashlib.md5(filebase.encode("utf-8")).hexdigest(), extension)\n return 
os.path.join(self._base_dir, *namespace, filename)\n\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n subscription = CapturedLogSubscription(self, log_key, cursor)\n self.on_subscribe(subscription)\n return subscription\n\n def unsubscribe(self, subscription):\n self.on_unsubscribe(subscription)\n\n ###############################################\n #\n # Methods for the ComputeLogManager interface\n #\n ###############################################\n @contextmanager\n def _watch_logs(\n self, dagster_run: DagsterRun, step_key: Optional[str] = None\n ) -> Iterator[None]:\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n\n log_key = self.build_log_key_for_run(dagster_run.run_id, step_key or dagster_run.job_name)\n with self.capture_logs(log_key):\n yield\n\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Legacy adapter from compute log manager to more generic captured log manager API."""\n check.inst_param(io_type, "io_type", ComputeIOType)\n log_key = self.build_log_key_for_run(run_id, key)\n return self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n\n def read_logs_file(\n self,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int = 0,\n max_bytes: int = MAX_BYTES_FILE_READ,\n ) -> ComputeLogFileData:\n path = self.get_local_path(run_id, key, io_type)\n\n if not os.path.exists(path) or not os.path.isfile(path):\n return ComputeLogFileData(path=path, data=None, cursor=0, size=0, download_url=None)\n\n # See: https://docs.python.org/2/library/stdtypes.html#file.tell for Windows behavior\n with open(path, "rb") as f:\n f.seek(cursor, os.SEEK_SET)\n data = f.read(max_bytes)\n cursor = f.tell()\n stats = os.fstat(f.fileno())\n\n # local download path\n download_url = self.download_url(run_id, key, io_type)\n return ComputeLogFileData(\n path=path,\n data=data.decode("utf-8"),\n 
cursor=cursor,\n size=stats.st_size,\n download_url=download_url,\n )\n\n def get_key(self, dagster_run: DagsterRun, step_key: Optional[str]):\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n return step_key or dagster_run.job_name\n\n def is_watch_completed(self, run_id: str, key: str) -> bool:\n log_key = self.build_log_key_for_run(run_id, key)\n return self.is_capture_complete(log_key)\n\n def on_watch_start(self, dagster_run: DagsterRun, step_key: Optional[str]):\n pass\n\n def on_watch_finish(self, dagster_run: DagsterRun, step_key: Optional[str] = None):\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n log_key = self.build_log_key_for_run(dagster_run.run_id, step_key or dagster_run.job_name)\n touchpath = self.complete_artifact_path(log_key)\n touch_file(touchpath)\n\n def download_url(self, run_id: str, key: str, io_type: ComputeIOType):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return f"/download/{run_id}/{key}/{io_type.value}"\n\n def on_subscribe(self, subscription: "LogSubscription") -> None:\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription: "LogSubscription") -> None:\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self) -> None:\n self._subscription_manager.dispose()
\n\n\nclass LocalComputeLogSubscriptionManager:\n def __init__(self, manager):\n self._manager = manager\n self._subscriptions = defaultdict(list)\n self._watchers = {}\n self._observer = None\n\n def add_subscription(self, subscription: "LogSubscription") -> None:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if self.is_complete(subscription):\n subscription.fetch()\n subscription.complete()\n else:\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n self._subscriptions[watch_key].append(subscription)\n self.watch(subscription)\n\n def is_complete(self, subscription: "LogSubscription") -> bool:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if isinstance(subscription, ComputeLogSubscription):\n return self._manager.is_watch_completed(subscription.run_id, subscription.key)\n return self._manager.is_capture_complete(subscription.log_key)\n\n def remove_subscription(self, subscription: "LogSubscription") -> None:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n if subscription in self._subscriptions[watch_key]:\n self._subscriptions[watch_key].remove(subscription)\n subscription.complete()\n\n def _log_key(self, subscription: "LogSubscription") -> Sequence[str]:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if isinstance(subscription, ComputeLogSubscription):\n return self._manager.build_log_key_for_run(subscription.run_id, subscription.key)\n return subscription.log_key\n\n def _watch_key(self, log_key: Sequence[str]) -> str:\n return json.dumps(log_key)\n\n def remove_all_subscriptions(self, log_key: Sequence[str]) -> None:\n watch_key = self._watch_key(log_key)\n for subscription in 
self._subscriptions.pop(watch_key, []):\n subscription.complete()\n\n def watch(self, subscription: "LogSubscription") -> None:\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n if watch_key in self._watchers:\n return\n\n update_paths = [\n self._manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]),\n self._manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]),\n self._manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT], partial=True\n ),\n self._manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR], partial=True\n ),\n ]\n complete_paths = [self._manager.complete_artifact_path(log_key)]\n directory = os.path.dirname(\n self._manager.get_captured_local_path(log_key, ComputeIOType.STDERR),\n )\n\n if not self._observer:\n self._observer = PollingObserver(self._manager.polling_timeout)\n self._observer.start()\n\n ensure_dir(directory)\n\n self._watchers[watch_key] = self._observer.schedule(\n LocalComputeLogFilesystemEventHandler(self, log_key, update_paths, complete_paths),\n str(directory),\n )\n\n def notify_subscriptions(self, log_key: Sequence[str]) -> None:\n watch_key = self._watch_key(log_key)\n for subscription in self._subscriptions[watch_key]:\n subscription.fetch()\n\n def unwatch(self, log_key: Sequence[str], handler) -> None:\n watch_key = self._watch_key(log_key)\n if watch_key in self._watchers:\n self._observer.remove_handler_for_watch(handler, self._watchers[watch_key]) # type: ignore\n del self._watchers[watch_key]\n\n def dispose(self) -> None:\n if self._observer:\n self._observer.stop()\n self._observer.join(15)\n\n\nclass LocalComputeLogFilesystemEventHandler(PatternMatchingEventHandler):\n def __init__(self, manager, log_key, update_paths, complete_paths):\n self.manager = manager\n self.log_key = log_key\n self.update_paths = update_paths\n self.complete_paths = complete_paths\n patterns = 
update_paths + complete_paths\n super(LocalComputeLogFilesystemEventHandler, self).__init__(patterns=patterns)\n\n def on_created(self, event):\n if event.src_path in self.complete_paths:\n self.manager.remove_all_subscriptions(self.log_key)\n self.manager.unwatch(self.log_key, self)\n\n def on_modified(self, event):\n if event.src_path in self.update_paths:\n self.manager.notify_subscriptions(self.log_key)\n
", "current_page_name": "_modules/dagster/_core/storage/local_compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.local_compute_log_manager"}, "mem_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.mem_io_manager

\nfrom typing import Dict, Tuple\n\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\n\n\n
[docs]class InMemoryIOManager(IOManager):\n """I/O manager that stores and retrieves values in memory. After execution is complete, the values will\n be garbage-collected. Note that this means that each run will not have access to values from previous runs.\n """\n\n def __init__(self):\n self.values: Dict[Tuple[object, ...], object] = {}\n\n def handle_output(self, context: OutputContext, obj: object):\n keys = tuple(context.get_identifier())\n self.values[keys] = obj\n\n def load_input(self, context: InputContext) -> object:\n keys = tuple(context.get_identifier())\n return self.values[keys]
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(description="Built-in IO manager that stores and retrieves values in memory.")\ndef mem_io_manager(_) -> InMemoryIOManager:\n """Built-in IO manager that stores and retrieves values in memory."""\n return InMemoryIOManager()
\n
", "current_page_name": "_modules/dagster/_core/storage/mem_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.mem_io_manager"}, "memoizable_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.memoizable_io_manager

\nimport os\nimport pickle\nfrom abc import abstractmethod\nfrom typing import Union\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._config import Field, StringSource\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\nfrom dagster._utils import PICKLE_PROTOCOL, mkdir_p\n\n\n
[docs]class MemoizableIOManager(IOManager):\n """Base class for IO manager enabled to work with memoized execution. Users should implement\n the ``load_input`` and ``handle_output`` methods described in the ``IOManager`` API, and the\n ``has_output`` method, which returns a boolean representing whether a data object can be found.\n """\n\n
[docs] @public\n @abstractmethod\n def has_output(self, context: OutputContext) -> bool:\n """The user-defined method that returns whether data exists given the metadata.\n\n Args:\n context (OutputContext): The context of the step performing this check.\n\n Returns:\n bool: True if there is data present that matches the provided context. False otherwise.\n """
\n\n\nclass VersionedPickledObjectFilesystemIOManager(MemoizableIOManager):\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n output_context: OutputContext\n\n if isinstance(context, OutputContext):\n output_context = context\n else:\n if context.upstream_output is None:\n raise DagsterInvariantViolationError(\n "Missing value of InputContext.upstream_output. Cannot compute the input path."\n )\n\n output_context = context.upstream_output\n\n # automatically construct filepath\n step_key = check.str_param(output_context.step_key, "context.step_key")\n output_name = check.str_param(output_context.name, "context.name")\n version = check.str_param(output_context.version, "context.version")\n\n return os.path.join(self.base_dir, step_key, output_name, version)\n\n def handle_output(self, context, obj):\n """Pickle the data with the associated version, and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n filepath = self._get_path(context)\n\n context.log.debug(f"Writing file at: {filepath}")\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n def load_input(self, context):\n """Unpickle the file and Load it to a data object."""\n filepath = self._get_path(context)\n\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n def has_output(self, context):\n """Returns true if data object exists with the associated version, False otherwise."""\n filepath = self._get_path(context)\n\n context.log.debug(f"Checking for file at: {filepath}")\n\n return os.path.exists(filepath) and not 
os.path.isdir(filepath)\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\n@experimental\ndef versioned_filesystem_io_manager(init_context):\n """Filesystem IO manager that utilizes versioning of stored objects.\n\n It requires users to specify a base directory where all the step outputs will be stored in. It\n serializes and deserializes output values (assets) using pickling and automatically constructs\n the filepaths for the assets using the provided directory, and the version for a provided step\n output.\n """\n return VersionedPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "versioned_outputs")\n )\n )\n
", "current_page_name": "_modules/dagster/_core/storage/memoizable_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.memoizable_io_manager"}, "noop_compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.noop_compute_log_manager

\nfrom contextlib import contextmanager\nfrom typing import IO, Any, Generator, Mapping, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.storage.captured_log_manager import (\n    CapturedLogContext,\n    CapturedLogData,\n    CapturedLogManager,\n    CapturedLogMetadata,\n    CapturedLogSubscription,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n)\n\n\n
[docs]class NoOpComputeLogManager(CapturedLogManager, ComputeLogManager, ConfigurableClass):\n """When enabled for a Dagster instance, stdout and stderr will not be available for any step."""\n\n def __init__(self, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return NoOpComputeLogManager(inst_data=inst_data, **config_value)\n\n def enabled(self, _dagster_run, _step_key):\n return False\n\n def _watch_logs(self, dagster_run, step_key=None):\n pass\n\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n raise NotImplementedError()\n\n def is_watch_completed(self, run_id, key):\n return True\n\n def on_watch_start(self, dagster_run, step_key):\n pass\n\n def on_watch_finish(self, dagster_run, step_key):\n pass\n\n def download_url(self, run_id, key, io_type):\n return None\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n return ComputeLogFileData(\n path=f"{key}.{io_type}", data=None, cursor=0, size=0, download_url=None\n )\n\n def on_subscribe(self, subscription):\n pass\n\n def on_unsubscribe(self, subscription):\n pass\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n yield CapturedLogContext(log_key=log_key)\n\n def is_capture_complete(self, log_key: Sequence[str]):\n return True\n\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Generator[Optional[IO], None, None]:\n yield None\n\n def get_log_data(\n self,\n log_key: Sequence[str],\n cursor: Optional[str] = None,\n max_bytes: Optional[int] = None,\n ) -> CapturedLogData:\n return 
CapturedLogData(log_key=log_key)\n\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n return CapturedLogMetadata()\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n pass\n\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n return CapturedLogSubscription(self, log_key, cursor)\n\n def unsubscribe(self, subscription: CapturedLogSubscription):\n pass
\n
", "current_page_name": "_modules/dagster/_core/storage/noop_compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.noop_compute_log_manager"}, "root": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.root

\nimport os\nfrom tempfile import TemporaryDirectory\nfrom typing import Optional\n\nfrom typing_extensions import TypedDict\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\n\nclass LocalArtifactStorageConfig(TypedDict):\n    base_dir: str\n\n\n
[docs]class LocalArtifactStorage(ConfigurableClass):\n def __init__(self, base_dir: str, inst_data: Optional[ConfigurableClassData] = None):\n self._base_dir = base_dir\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @property\n def base_dir(self) -> str:\n return self._base_dir\n\n def file_manager_dir(self, run_id: str) -> str:\n check.str_param(run_id, "run_id")\n return os.path.join(self.base_dir, "storage", run_id, "files")\n\n @property\n def storage_dir(self) -> str:\n return os.path.join(self.base_dir, "storage")\n\n @property\n def schedules_dir(self) -> str:\n return os.path.join(self.base_dir, "schedules")\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: LocalArtifactStorageConfig\n ) -> "LocalArtifactStorage":\n return LocalArtifactStorage(inst_data=inst_data, **config_value)\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n def dispose(self):\n pass
\n\n\nclass TemporaryLocalArtifactStorage(LocalArtifactStorage):\n """Used by ephemeral DagsterInstances, defers directory creation til\n access since many uses of ephemeral instance do not require artifact directory.\n """\n\n def __init__(self):\n self._temp_dir = None\n\n @property\n def base_dir(self):\n if self._temp_dir is None:\n self._temp_dir = TemporaryDirectory()\n return self._temp_dir.name\n\n def dispose(self):\n if self._temp_dir:\n self._temp_dir.cleanup()\n
", "current_page_name": "_modules/dagster/_core/storage/root", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.root"}, "runs": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.base

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Mapping, Optional, Sequence, Set, Tuple, Union\n\nfrom typing_extensions import TypedDict\n\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\nfrom dagster._core.storage.dagster_run import (\n    DagsterRun,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._utils import PrintFn\n\nfrom ..daemon_cursor import DaemonCursorStorage\n\nif TYPE_CHECKING:\n    from dagster._core.host_representation.origin import ExternalJobOrigin\n\n\nclass RunGroupInfo(TypedDict):\n    count: int\n    runs: Sequence[DagsterRun]\n\n\n
[docs]class RunStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance], DaemonCursorStorage):\n """Abstract base class for storing pipeline run history.\n\n Note that run storages using SQL databases as backing stores should implement\n :py:class:`~dagster._core.storage.runs.SqlRunStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n """Add a run to storage.\n\n If a run already exists with the same ID, raise DagsterRunAlreadyExists\n If the run's snapshot ID does not exist raise DagsterSnapshotDoesNotExist\n\n Args:\n dagster_run (DagsterRun): The run to add.\n """\n\n @abstractmethod\n def handle_run_event(self, run_id: str, event: DagsterEvent) -> None:\n """Update run storage in accordance to a pipeline run related DagsterEvent.\n\n Args:\n run_id (str)\n event (DagsterEvent)\n """\n\n @abstractmethod\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n """Return all the runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. 
Defaults to infinite.\n\n Returns:\n List[PipelineRun]\n """\n\n @abstractmethod\n def get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n """Return all the run IDs for runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n Sequence[str]\n """\n\n @abstractmethod\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n """Return the number of runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.PipelineRunFilter` by which to filter\n runs\n\n Returns:\n int: The number of runs that match the given filters.\n """\n\n @abstractmethod\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Sequence[DagsterRun]]]:\n """Get the run group to which a given run belongs.\n\n Args:\n run_id (str): If the corresponding run is the descendant of some root run (i.e., there\n is a root_run_id on the :py:class:`PipelineRun`), that root run and all of its\n descendants are returned; otherwise, the group will consist only of the given run\n (a run that does not descend from any root is its own root).\n\n Returns:\n Optional[Tuple[string, List[PipelineRun]]]: If there is a corresponding run group, tuple\n whose first element is the root_run_id and whose second element is a list of all the\n descendent runs. 
Otherwise `None`.\n """\n\n @abstractmethod\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n\n @abstractmethod\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n """Get a list of tag keys and the values that have been associated with them.\n\n Args:\n tag_keys (Optional[Sequence[str]]): tag keys to filter by.\n\n Returns:\n List[Tuple[str, Set[str]]]\n """\n\n @abstractmethod\n def get_run_tag_keys(self) -> Sequence[str]:\n """Get a list of tag keys.\n\n Returns:\n List[str]\n """\n\n @abstractmethod\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n """Add additional tags for a pipeline run.\n\n Args:\n run_id (str)\n new_tags (Dict[string, string])\n """\n\n @abstractmethod\n def has_run(self, run_id: str) -> bool:\n """Check if the storage contains a run.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n bool\n """\n\n def add_snapshot(\n self,\n snapshot: Union[JobSnapshot, ExecutionPlanSnapshot],\n snapshot_id: Optional[str] = None,\n ) -> None:\n """Add a snapshot to the storage.\n\n Args:\n snapshot (Union[PipelineSnapshot, 
ExecutionPlanSnapshot])\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n """\n if isinstance(snapshot, JobSnapshot):\n self.add_job_snapshot(snapshot, snapshot_id)\n else:\n self.add_execution_plan_snapshot(snapshot, snapshot_id)\n\n def has_snapshot(self, snapshot_id: str):\n return self.has_job_snapshot(snapshot_id) or self.has_execution_plan_snapshot(snapshot_id)\n\n @abstractmethod\n def has_job_snapshot(self, job_snapshot_id: str) -> bool:\n """Check to see if storage contains a pipeline snapshot.\n\n Args:\n pipeline_snapshot_id (str): The id of the run.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_job_snapshot(self, job_snapshot: JobSnapshot, snapshot_id: Optional[str] = None) -> str:\n """Add a pipeline snapshot to the run store.\n\n Pipeline snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n job_snapshot (PipelineSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The job_snapshot_id\n """\n\n @abstractmethod\n def get_job_snapshot(self, job_snapshot_id: str) -> JobSnapshot:\n """Fetch a snapshot by ID.\n\n Args:\n job_snapshot_id (str)\n\n Returns:\n PipelineSnapshot\n """\n\n @abstractmethod\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n """Check to see if storage contains an execution plan snapshot.\n\n Args:\n execution_plan_snapshot_id (str): The id of the execution plan.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n """Add an execution plan snapshot to the run store.\n\n Execution plan snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n execution_plan_snapshot (ExecutionPlanSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The execution_plan_snapshot_id\n """\n\n @abstractmethod\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n """Fetch a snapshot by ID.\n\n Args:\n execution_plan_snapshot_id (str)\n\n Returns:\n ExecutionPlanSnapshot\n """\n\n @abstractmethod\n def wipe(self) -> None:\n """Clears the run storage."""\n\n @abstractmethod\n def delete_run(self, run_id: str) -> None:\n """Remove a run from storage."""\n\n @property\n def supports_bucket_queries(self) -> bool:\n return False\n\n @abstractmethod\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any required data migrations."""\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any optional data migrations for optimized reads."""\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n # Daemon Heartbeat Storage\n #\n # Holds heartbeats from the Dagster Daemon so that other system components can alert when it's not\n # alive.\n # This is temporarily placed along with run storage to avoid adding a new instance concept. 
It\n # should be split out once all metadata storages are configured together.\n\n @abstractmethod\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n """Called on a regular interval by the daemon."""\n\n @abstractmethod\n def get_daemon_heartbeats(self) -> Mapping[str, DaemonHeartbeat]:\n """Latest heartbeats of all daemon types."""\n\n @abstractmethod\n def wipe_daemon_heartbeats(self) -> None:\n """Wipe all daemon heartbeats."""\n\n # Backfill storage\n @abstractmethod\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[PartitionBackfill]:\n """Get a list of partition backfills."""\n\n @abstractmethod\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n """Get the partition backfill of the given backfill id."""\n\n @abstractmethod\n def add_backfill(self, partition_backfill: PartitionBackfill):\n """Add partition backfill to run storage."""\n\n @abstractmethod\n def update_backfill(self, partition_backfill: PartitionBackfill):\n """Update a partition backfill in run storage."""\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n @abstractmethod\n def replace_job_origin(self, run: "DagsterRun", job_origin: "ExternalJobOrigin") -> None: ...
\n
", "current_page_name": "_modules/dagster/_core/storage/runs/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.base"}, "sql_run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.sql_run_storage

\nimport logging\nimport uuid\nimport zlib\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Callable,\n    ContextManager,\n    Dict,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\n\nimport dagster._check as check\nfrom dagster._core.errors import (\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunNotFoundError,\n    DagsterSnapshotDoesNotExist,\n)\nfrom dagster._core.events import EVENT_TYPE_TO_PIPELINE_RUN_STATUS, DagsterEvent, DagsterEventType\nfrom dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster._core.host_representation.origin import ExternalJobOrigin\nfrom dagster._core.snap import (\n    ExecutionPlanSnapshot,\n    JobSnapshot,\n    create_execution_plan_snapshot_id,\n    create_job_snapshot_id,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery\nfrom dagster._core.storage.sqlalchemy_compat import (\n    db_fetch_mappings,\n    db_scalar_subquery,\n    db_select,\n    db_subquery,\n)\nfrom dagster._core.storage.tags import (\n    PARTITION_NAME_TAG,\n    PARTITION_SET_TAG,\n    REPOSITORY_LABEL_TAG,\n    ROOT_RUN_ID_TAG,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import (\n    deserialize_value,\n    serialize_value,\n)\nfrom dagster._seven import JSONDecodeError\nfrom dagster._utils import PrintFn, utc_datetime_from_timestamp\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..dagster_run import (\n    DagsterRun,\n    DagsterRunStatus,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom .base import RunStorage\nfrom .migration import (\n    OPTIONAL_DATA_MIGRATIONS,\n    
REQUIRED_DATA_MIGRATIONS,\n    RUN_PARTITIONS,\n    MigrationFn,\n)\nfrom .schema import (\n    BulkActionsTable,\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    KeyValueStoreTable,\n    RunsTable,\n    RunTagsTable,\n    SecondaryIndexMigrationTable,\n    SnapshotsTable,\n)\n\n\nclass SnapshotType(Enum):\n    PIPELINE = "PIPELINE"\n    EXECUTION_PLAN = "EXECUTION_PLAN"\n\n\n
[docs]class SqlRunStorage(RunStorage):\n """Base class for SQL based run storages."""\n\n @abstractmethod\n def connect(self) -> ContextManager[Connection]:\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n @abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema or data migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def fetchall(self, query: SqlAlchemyQuery) -> Sequence[Any]:\n with self.connect() as conn:\n return db_fetch_mappings(conn, query)\n\n def fetchone(self, query: SqlAlchemyQuery) -> Optional[Any]:\n with self.connect() as conn:\n if db.__version__.startswith("2."):\n return conn.execute(query).mappings().first()\n else:\n return conn.execute(query).fetchone()\n\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n if dagster_run.job_snapshot_id and not self.has_job_snapshot(dagster_run.job_snapshot_id):\n raise DagsterSnapshotDoesNotExist(\n f"Snapshot {dagster_run.job_snapshot_id} does not exist in run storage"\n )\n\n has_tags = dagster_run.tags and len(dagster_run.tags) > 0\n partition = dagster_run.tags.get(PARTITION_NAME_TAG) if has_tags else None\n partition_set = dagster_run.tags.get(PARTITION_SET_TAG) if has_tags else None\n\n runs_insert = RunsTable.insert().values(\n run_id=dagster_run.run_id,\n pipeline_name=dagster_run.job_name,\n status=dagster_run.status.value,\n run_body=serialize_value(dagster_run),\n snapshot_id=dagster_run.job_snapshot_id,\n partition=partition,\n partition_set=partition_set,\n )\n with self.connect() as conn:\n try:\n conn.execute(runs_insert)\n except db_exc.IntegrityError as exc:\n raise DagsterRunAlreadyExists from exc\n\n tags_to_insert = dagster_run.tags_for_storage()\n if tags_to_insert:\n conn.execute(\n RunTagsTable.insert(),\n [\n dict(run_id=dagster_run.run_id, key=k, value=v)\n for k, v in tags_to_insert.items()\n ],\n )\n\n return 
dagster_run\n\n def handle_run_event(self, run_id: str, event: DagsterEvent) -> None:\n check.str_param(run_id, "run_id")\n check.inst_param(event, "event", DagsterEvent)\n\n if event.event_type not in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n return\n\n run = self._get_run_by_id(run_id)\n if not run:\n # TODO log?\n return\n\n new_job_status = EVENT_TYPE_TO_PIPELINE_RUN_STATUS[event.event_type]\n\n run_stats_cols_in_index = self.has_run_stats_index_cols()\n\n kwargs = {}\n\n # consider changing the `handle_run_event` signature to get timestamp off of the\n # EventLogEntry instead of the DagsterEvent, for consistency\n now = pendulum.now("UTC")\n\n if run_stats_cols_in_index and event.event_type == DagsterEventType.PIPELINE_START:\n kwargs["start_time"] = now.timestamp()\n\n if run_stats_cols_in_index and event.event_type in {\n DagsterEventType.PIPELINE_CANCELED,\n DagsterEventType.PIPELINE_FAILURE,\n DagsterEventType.PIPELINE_SUCCESS,\n }:\n kwargs["end_time"] = now.timestamp()\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_value(run.with_status(new_job_status)),\n status=new_job_status.value,\n update_timestamp=now,\n **kwargs,\n )\n )\n\n def _row_to_run(self, row: Dict) -> DagsterRun:\n run = deserialize_value(row["run_body"], DagsterRun)\n status = DagsterRunStatus(row["status"])\n # NOTE: the status column is more trustworthy than the status in the run body, since concurrent\n # writes (e.g. 
handle_run_event and add_tags) can cause the status in the body to be out of\n # overriden with an old value.\n return run.with_status(status)\n\n def _rows_to_runs(self, rows: Iterable[Dict]) -> Sequence[DagsterRun]:\n return list(map(self._row_to_run, rows))\n\n def _add_cursor_limit_to_query(\n self,\n query: SqlAlchemyQuery,\n cursor: Optional[str],\n limit: Optional[int],\n order_by: Optional[str],\n ascending: Optional[bool],\n ) -> SqlAlchemyQuery:\n """Helper function to deal with cursor/limit pagination args."""\n if cursor:\n cursor_query = db_select([RunsTable.c.id]).where(RunsTable.c.run_id == cursor)\n query = query.where(RunsTable.c.id < db_scalar_subquery(cursor_query))\n\n if limit:\n query = query.limit(limit)\n\n sorting_column = getattr(RunsTable.c, order_by) if order_by else RunsTable.c.id\n direction = db.asc if ascending else db.desc\n query = query.order_by(direction(sorting_column))\n\n return query\n\n @property\n def supports_intersect(self) -> bool:\n return True\n\n def _add_filters_to_query(self, query: SqlAlchemyQuery, filters: RunsFilter) -> SqlAlchemyQuery:\n check.inst_param(filters, "filters", RunsFilter)\n\n if filters.run_ids:\n query = query.where(RunsTable.c.run_id.in_(filters.run_ids))\n\n if filters.job_name:\n query = query.where(RunsTable.c.pipeline_name == filters.job_name)\n\n if filters.statuses:\n query = query.where(\n RunsTable.c.status.in_([status.value for status in filters.statuses])\n )\n\n if filters.snapshot_id:\n query = query.where(RunsTable.c.snapshot_id == filters.snapshot_id)\n\n if filters.updated_after:\n query = query.where(RunsTable.c.update_timestamp > filters.updated_after)\n\n if filters.updated_before:\n query = query.where(RunsTable.c.update_timestamp < filters.updated_before)\n\n if filters.created_after:\n query = query.where(RunsTable.c.create_timestamp > filters.created_after)\n\n if filters.created_before:\n query = query.where(RunsTable.c.create_timestamp < filters.created_before)\n\n return 
query\n\n def _runs_query(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n columns: Optional[Sequence[str]] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> SqlAlchemyQuery:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_str_param(cursor, "cursor")\n check.opt_int_param(limit, "limit")\n check.opt_sequence_param(columns, "columns")\n check.opt_str_param(order_by, "order_by")\n check.opt_bool_param(ascending, "ascending")\n\n if columns is None:\n columns = ["run_body", "status"]\n\n if filters.tags:\n table = self._apply_tags_table_joins(RunsTable, filters.tags)\n else:\n table = RunsTable\n\n base_query = db_select([getattr(RunsTable.c, column) for column in columns]).select_from(\n table\n )\n base_query = self._add_filters_to_query(base_query, filters)\n return self._add_cursor_limit_to_query(base_query, cursor, limit, order_by, ascending)\n\n def _apply_tags_table_joins(\n self,\n table: db.Table,\n tags: Mapping[str, Union[str, Sequence[str]]],\n ) -> db.Table:\n multi_join = len(tags) > 1\n i = 0\n for key, value in tags.items():\n i += 1\n tags_table = (\n db_subquery(db_select([RunTagsTable]), f"run_tags_subquery_{i}")\n if multi_join\n else RunTagsTable\n )\n table = table.join(\n tags_table,\n db.and_(\n RunsTable.c.run_id == tags_table.c.run_id,\n tags_table.c.key == key,\n (\n tags_table.c.value == value\n if isinstance(value, str)\n else tags_table.c.value.in_(value)\n ),\n ),\n )\n return table\n\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n query = self._runs_query(filters, cursor, limit, bucket_by=bucket_by)\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n def 
get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n query = self._runs_query(filters=filters, cursor=cursor, limit=limit, columns=["run_id"])\n rows = self.fetchall(query)\n return [row["run_id"] for row in rows]\n\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n subquery = db_subquery(self._runs_query(filters=filters))\n query = db_select([db.func.count().label("count")]).select_from(subquery)\n row = self.fetchone(query)\n count = row["count"] if row else 0\n return count\n\n def _get_run_by_id(self, run_id: str) -> Optional[DagsterRun]:\n check.str_param(run_id, "run_id")\n\n query = db_select([RunsTable.c.run_body, RunsTable.c.status]).where(\n RunsTable.c.run_id == run_id\n )\n rows = self.fetchall(query)\n return self._row_to_run(rows[0]) if rows else None\n\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_int_param(limit, "limit")\n\n columns = ["id", "run_body", "status", "create_timestamp", "update_timestamp"]\n\n if self.has_run_stats_index_cols():\n columns += ["start_time", "end_time"]\n # only fetch columns we use to build RunRecord\n query = self._runs_query(\n filters=filters,\n limit=limit,\n columns=columns,\n order_by=order_by,\n ascending=ascending,\n cursor=cursor,\n bucket_by=bucket_by,\n )\n\n rows = self.fetchall(query)\n return [\n RunRecord(\n storage_id=check.int_param(row["id"], "id"),\n dagster_run=self._row_to_run(row),\n create_timestamp=check.inst(row["create_timestamp"], datetime),\n update_timestamp=check.inst(row["update_timestamp"], datetime),\n start_time=(\n 
check.opt_inst(row["start_time"], float) if "start_time" in row else None\n ),\n end_time=check.opt_inst(row["end_time"], float) if "end_time" in row else None,\n )\n for row in rows\n ]\n\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n result = defaultdict(set)\n query = (\n db_select([RunTagsTable.c.key, RunTagsTable.c.value])\n .distinct()\n .order_by(RunTagsTable.c.key, RunTagsTable.c.value)\n )\n if tag_keys:\n query = query.where(RunTagsTable.c.key.in_(tag_keys))\n if value_prefix:\n query = query.where(RunTagsTable.c.value.startswith(value_prefix))\n if limit:\n query = query.limit(limit)\n rows = self.fetchall(query)\n for r in rows:\n result[r["key"]].add(r["value"])\n return sorted(list([(k, v) for k, v in result.items()]), key=lambda x: x[0])\n\n def get_run_tag_keys(self) -> Sequence[str]:\n query = db_select([RunTagsTable.c.key]).distinct().order_by(RunTagsTable.c.key)\n rows = self.fetchall(query)\n return sorted([r["key"] for r in rows])\n\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n check.str_param(run_id, "run_id")\n check.mapping_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n run = self._get_run_by_id(run_id)\n if not run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n current_tags = run.tags if run.tags else {}\n\n all_tags = merge_dicts(current_tags, new_tags)\n partition = all_tags.get(PARTITION_NAME_TAG)\n partition_set = all_tags.get(PARTITION_SET_TAG)\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_value(run.with_tags(merge_dicts(current_tags, new_tags))),\n partition=partition,\n partition_set=partition_set,\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n current_tags_set = set(current_tags.keys())\n 
new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n RunTagsTable.update()\n .where(db.and_(RunTagsTable.c.run_id == run_id, RunTagsTable.c.key == tag))\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n RunTagsTable.insert(),\n [dict(run_id=run_id, key=tag, value=new_tags[tag]) for tag in added_tags],\n )\n\n def get_run_group(self, run_id: str) -> Tuple[str, Sequence[DagsterRun]]:\n check.str_param(run_id, "run_id")\n dagster_run = self._get_run_by_id(run_id)\n if not dagster_run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n\n # find root_run\n root_run_id = dagster_run.root_run_id if dagster_run.root_run_id else dagster_run.run_id\n root_run = self._get_run_by_id(root_run_id)\n if not root_run:\n raise DagsterRunNotFoundError(\n f"Run id {root_run_id} set as root run id for run {run_id} was not found in"\n " instance.",\n invalid_run_id=root_run_id,\n )\n\n # root_run_id to run_id 1:1 mapping\n # https://github.com/dagster-io/dagster/issues/2495\n # Note: we currently use tags to persist the run group info\n root_to_run = db_subquery(\n db_select(\n [RunTagsTable.c.value.label("root_run_id"), RunTagsTable.c.run_id.label("run_id")]\n ).where(\n db.and_(RunTagsTable.c.key == ROOT_RUN_ID_TAG, RunTagsTable.c.value == root_run_id)\n ),\n "root_to_run",\n )\n # get run group\n run_group_query = db_select([RunsTable.c.run_body, RunsTable.c.status]).select_from(\n root_to_run.join(\n RunsTable,\n root_to_run.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n\n res = self.fetchall(run_group_query)\n run_group = self._rows_to_runs(res)\n\n return (root_run_id, [root_run, *run_group])\n\n def has_run(self, run_id: str) -> bool:\n check.str_param(run_id, "run_id")\n return bool(self._get_run_by_id(run_id))\n\n def delete_run(self, run_id: str) -> 
None:\n check.str_param(run_id, "run_id")\n query = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(query)\n\n def has_job_snapshot(self, job_snapshot_id: str) -> bool:\n check.str_param(job_snapshot_id, "job_snapshot_id")\n return self._has_snapshot_id(job_snapshot_id)\n\n def add_job_snapshot(self, job_snapshot: JobSnapshot, snapshot_id: Optional[str] = None) -> str:\n check.inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_job_snapshot_id(job_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=job_snapshot,\n snapshot_type=SnapshotType.PIPELINE,\n )\n\n def get_job_snapshot(self, job_snapshot_id: str) -> JobSnapshot:\n check.str_param(job_snapshot_id, "job_snapshot_id")\n return self._get_snapshot(job_snapshot_id) # type: ignore # (allowed to return None?)\n\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return bool(self.get_execution_plan_snapshot(execution_plan_snapshot_id))\n\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=execution_plan_snapshot,\n snapshot_type=SnapshotType.EXECUTION_PLAN,\n )\n\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return self._get_snapshot(execution_plan_snapshot_id) # type: ignore # (allowed to return 
None?)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n check.str_param(snapshot_id, "snapshot_id")\n check.not_none_param(snapshot_obj, "snapshot_obj")\n check.inst_param(snapshot_type, "snapshot_type", SnapshotType)\n\n with self.connect() as conn:\n snapshot_insert = SnapshotsTable.insert().values(\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(serialize_value(snapshot_obj).encode("utf-8")),\n snapshot_type=snapshot_type.value,\n )\n try:\n conn.execute(snapshot_insert)\n except db_exc.IntegrityError:\n # on_conflict_do_nothing equivalent\n pass\n\n return snapshot_id\n\n def get_run_storage_id(self) -> str:\n query = db_select([InstanceInfo.c.run_storage_id])\n row = self.fetchone(query)\n if not row:\n run_storage_id = str(uuid.uuid4())\n with self.connect() as conn:\n conn.execute(InstanceInfo.insert().values(run_storage_id=run_storage_id))\n return run_storage_id\n else:\n return row["run_storage_id"]\n\n def _has_snapshot_id(self, snapshot_id: str) -> bool:\n query = db_select([SnapshotsTable.c.snapshot_id]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return bool(row)\n\n def _get_snapshot(self, snapshot_id: str) -> Optional[JobSnapshot]:\n query = db_select([SnapshotsTable.c.snapshot_body]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return defensively_unpack_execution_plan_snapshot_query(logging, [row["snapshot_body"]]) if row else None # type: ignore\n\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n if self.has_built_index(RUN_PARTITIONS) and self.has_run_stats_index_cols():\n query = self._runs_query(\n filters=runs_filter,\n columns=["run_id", "status", "start_time", "end_time", "partition"],\n )\n rows = self.fetchall(query)\n\n # dedup by partition\n _partition_data_by_partition = {}\n for row in rows:\n if not row["partition"] or row["partition"] in 
_partition_data_by_partition:\n continue\n\n _partition_data_by_partition[row["partition"]] = RunPartitionData(\n run_id=row["run_id"],\n partition=row["partition"],\n status=DagsterRunStatus[row["status"]],\n start_time=row["start_time"],\n end_time=row["end_time"],\n )\n\n return list(_partition_data_by_partition.values())\n else:\n query = self._runs_query(filters=runs_filter)\n rows = self.fetchall(query)\n _partition_data_by_partition = {}\n for row in rows:\n run = self._row_to_run(row)\n partition = run.tags.get(PARTITION_NAME_TAG)\n if not partition or partition in _partition_data_by_partition:\n continue\n\n _partition_data_by_partition[partition] = RunPartitionData(\n run_id=run.run_id,\n partition=partition,\n status=run.status,\n start_time=None,\n end_time=None,\n )\n\n return list(_partition_data_by_partition.values())\n\n def _get_partition_runs(\n self, partition_set_name: str, partition_name: str\n ) -> Sequence[DagsterRun]:\n # utility method to help test reads off of the partition column\n if not self.has_built_index(RUN_PARTITIONS):\n # query by tags\n return self.get_runs(\n filters=RunsFilter(\n tags={\n PARTITION_SET_TAG: partition_set_name,\n PARTITION_NAME_TAG: partition_name,\n }\n )\n )\n else:\n query = (\n self._runs_query()\n .where(RunsTable.c.partition == partition_name)\n .where(RunsTable.c.partition_set == partition_set_name)\n )\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n # Tracking data migrations over secondary indexes\n\n def _execute_data_migrations(\n self,\n migrations: Mapping[str, Callable[[], MigrationFn]],\n print_fn: Optional[PrintFn] = None,\n force_rebuild_all: bool = False,\n ) -> None:\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n 
migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(REQUIRED_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(OPTIONAL_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def has_built_index(self, migration_name: str) -> bool:\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n results = self.fetchall(query)\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str) -> None:\n query = SecondaryIndexMigrationTable.insert().values(\n name=migration_name,\n migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n # Checking for migrations\n\n def has_run_stats_index_cols(self) -> bool:\n with self.connect() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(RunsTable.name)]\n return "start_time" in column_names and "end_time" in column_names\n\n def has_bulk_actions_selector_cols(self) -> bool:\n with self.connect() as conn:\n column_names = [\n x.get("name") for x in db.inspect(conn).get_columns(BulkActionsTable.name)\n ]\n return "selector_id" in column_names\n\n # Daemon heartbeats\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n # insert, or update if already present\n try:\n conn.execute(\n 
DaemonHeartbeatsTable.insert().values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n except db_exc.IntegrityError:\n conn.execute(\n DaemonHeartbeatsTable.update()\n .where(DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n\n def get_daemon_heartbeats(self) -> Mapping[str, DaemonHeartbeat]:\n rows = self.fetchall(db_select([DaemonHeartbeatsTable.c.body]))\n heartbeats = []\n for row in rows:\n heartbeats.append(deserialize_value(row["body"], DaemonHeartbeat))\n return {heartbeat.daemon_type: heartbeat for heartbeat in heartbeats}\n\n def wipe(self) -> None:\n """Clears the run storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(RunsTable.delete())\n conn.execute(RunTagsTable.delete())\n conn.execute(SnapshotsTable.delete())\n conn.execute(DaemonHeartbeatsTable.delete())\n conn.execute(BulkActionsTable.delete())\n\n def wipe_daemon_heartbeats(self) -> None:\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(DaemonHeartbeatsTable.delete())\n\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[PartitionBackfill]:\n check.opt_inst_param(status, "status", BulkActionStatus)\n query = db_select([BulkActionsTable.c.body])\n if status:\n query = query.where(BulkActionsTable.c.status == status.value)\n if cursor:\n cursor_query = db_select([BulkActionsTable.c.id]).where(\n BulkActionsTable.c.key == cursor\n )\n query = query.where(BulkActionsTable.c.id < cursor_query)\n if limit:\n query = query.limit(limit)\n query = 
query.order_by(BulkActionsTable.c.id.desc())\n rows = self.fetchall(query)\n return [deserialize_value(row["body"], PartitionBackfill) for row in rows]\n\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n check.str_param(backfill_id, "backfill_id")\n query = db_select([BulkActionsTable.c.body]).where(BulkActionsTable.c.key == backfill_id)\n row = self.fetchone(query)\n return deserialize_value(row["body"], PartitionBackfill) if row else None\n\n def add_backfill(self, partition_backfill: PartitionBackfill) -> None:\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n values: Dict[str, Any] = dict(\n key=partition_backfill.backfill_id,\n status=partition_backfill.status.value,\n timestamp=utc_datetime_from_timestamp(partition_backfill.backfill_timestamp),\n body=serialize_value(cast(NamedTuple, partition_backfill)),\n )\n\n if self.has_bulk_actions_selector_cols():\n values["selector_id"] = partition_backfill.selector_id\n values["action_type"] = partition_backfill.bulk_action_type.value\n\n with self.connect() as conn:\n conn.execute(BulkActionsTable.insert().values(**values))\n\n def update_backfill(self, partition_backfill: PartitionBackfill) -> None:\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n backfill_id = partition_backfill.backfill_id\n if not self.get_backfill(backfill_id):\n raise DagsterInvariantViolationError(\n f"Backfill {backfill_id} is not present in storage"\n )\n with self.connect() as conn:\n conn.execute(\n BulkActionsTable.update()\n .where(BulkActionsTable.c.key == backfill_id)\n .values(\n status=partition_backfill.status.value,\n body=serialize_value(partition_backfill),\n )\n )\n\n def get_cursor_values(self, keys: Set[str]) -> Mapping[str, str]:\n check.set_param(keys, "keys", of_type=str)\n\n rows = self.fetchall(\n db_select([KeyValueStoreTable.c.key, KeyValueStoreTable.c.value]).where(\n KeyValueStoreTable.c.key.in_(keys)\n ),\n )\n return 
{row["key"]: row["value"] for row in rows}\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n db_values = [{"key": k, "value": v} for k, v in pairs.items()]\n\n with self.connect() as conn:\n try:\n conn.execute(KeyValueStoreTable.insert().values(db_values))\n except db_exc.IntegrityError:\n conn.execute(\n KeyValueStoreTable.update()\n .where(KeyValueStoreTable.c.key.in_(pairs.keys()))\n .values(value=db.sql.case(pairs, value=KeyValueStoreTable.c.key))\n )\n\n # Migrating run history\n def replace_job_origin(self, run: DagsterRun, job_origin: ExternalJobOrigin) -> None:\n new_label = job_origin.external_repository_origin.get_label()\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run.run_id)\n .values(\n run_body=serialize_value(run.with_job_origin(job_origin)),\n )\n )\n conn.execute(\n RunTagsTable.update()\n .where(RunTagsTable.c.run_id == run.run_id)\n .where(RunTagsTable.c.key == REPOSITORY_LABEL_TAG)\n .values(value=new_label)\n )
\n\n\nGET_PIPELINE_SNAPSHOT_QUERY_ID = "get-pipeline-snapshot"\n\n\ndef defensively_unpack_execution_plan_snapshot_query(\n logger: logging.Logger, row: Sequence[Any]\n) -> Optional[Union[ExecutionPlanSnapshot, JobSnapshot]]:\n # minimal checking here because sqlalchemy returns a different type based on what version of\n # SqlAlchemy you are using\n\n def _warn(msg: str) -> None:\n logger.warning(f"get-pipeline-snapshot: {msg}")\n\n if not isinstance(row[0], bytes):\n _warn("First entry in row is not a binary type.")\n return None\n\n try:\n uncompressed_bytes = zlib.decompress(row[0])\n except zlib.error:\n _warn("Could not decompress bytes stored in snapshot table.")\n return None\n\n try:\n decoded_str = uncompressed_bytes.decode("utf-8")\n except UnicodeDecodeError:\n _warn("Could not unicode decode decompressed bytes stored in snapshot table.")\n return None\n\n try:\n return deserialize_value(decoded_str, (ExecutionPlanSnapshot, JobSnapshot))\n except JSONDecodeError:\n _warn("Could not parse json in snapshot table.")\n return None\n
", "current_page_name": "_modules/dagster/_core/storage/runs/sql_run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.sql_run_storage"}, "sqlite": {"sqlite_run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.sqlite.sqlite_run_storage

\nimport os\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Iterator, Optional\nfrom urllib.parse import urljoin, urlparse\n\nimport sqlalchemy as db\nfrom sqlalchemy.engine import Connection\nfrom sqlalchemy.pool import NullPool\nfrom typing_extensions import Self\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_downgrade,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import InstanceInfo, RunsTable, RunStorageSqlMetadata, RunTagsTable\nfrom ..sql_run_storage import SqlRunStorage\n\nif TYPE_CHECKING:\n    from dagster._core.storage.sqlite_storage import SqliteStorageConfig\nMINIMUM_SQLITE_BUCKET_VERSION = [3, 25, 0]\n\n\n
[docs]class SqliteRunStorage(SqlRunStorage, ConfigurableClass):\n """SQLite-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default run storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster._core.storage.runs\n class: SqliteRunStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the run storage where on disk to store the database.\n """\n\n def __init__(self, conn_string: str, inst_data: Optional[ConfigurableClassData] = None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: "SqliteStorageConfig"\n ) -> "SqliteRunStorage":\n return SqliteRunStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(cls, base_dir: str, inst_data: Optional[ConfigurableClassData] = None) -> Self:\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "runs")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = 
check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n RunStorageSqlMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n table_names = db.inspect(engine).get_table_names()\n if "instance_info" not in table_names:\n InstanceInfo.create(engine)\n\n run_storage = cls(conn_string, inst_data)\n\n if should_mark_indexes:\n run_storage.migrate()\n run_storage.optimize()\n\n return run_storage\n\n @contextmanager\n def connect(self) -> Iterator[Connection]:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n def _alembic_upgrade(self, rev: str = "head") -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn, rev=rev)\n\n def _alembic_downgrade(self, rev: str = "head") -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_downgrade(alembic_config, conn, rev=rev)\n\n def upgrade(self) -> None:\n self._check_for_version_066_migration_and_perform()\n self._alembic_upgrade()\n\n # In version 0.6.6, we changed the layout of the of the sqllite dbs on disk\n # to move from the root of DAGSTER_HOME/runs.db to DAGSTER_HOME/history/runs.bd\n # This function checks for that condition and does the move\n def _check_for_version_066_migration_and_perform(self) -> None:\n old_conn_string = "sqlite://" + urljoin(urlparse(self._conn_string).path, "../runs.db")\n path_to_old_db = urlparse(old_conn_string).path\n # sqlite URLs look like `sqlite:///foo/bar/baz on Unix/Mac` but on Windows they look like\n # `sqlite:///D:/foo/bar/baz` (or `sqlite:///D:\\foo\\bar\\baz`)\n if os.name == "nt":\n path_to_old_db = path_to_old_db.lstrip("/")\n if os.path.exists(path_to_old_db):\n old_storage = SqliteRunStorage(old_conn_string)\n old_runs = 
old_storage.get_runs()\n for run in old_runs:\n self.add_run(run)\n os.unlink(path_to_old_db)\n\n def delete_run(self, run_id: str) -> None:\n """Override the default sql delete run implementation until we can get full\n support on cascading deletes.\n """\n check.str_param(run_id, "run_id")\n remove_tags = db.delete(RunTagsTable).where(RunTagsTable.c.run_id == run_id)\n remove_run = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(remove_tags)\n conn.execute(remove_run)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster/_core/storage/runs/sqlite/sqlite_run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.sqlite.sqlite_run_storage"}}}, "schedules": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.base

\nimport abc\nfrom typing import Mapping, Optional, Sequence, Set\n\nfrom dagster import AssetKey\nfrom dagster._core.definitions.auto_materialize_rule import AutoMaterializeAssetEvaluation\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.scheduler.instigation import (\n    AutoMaterializeAssetEvaluationRecord,\n    InstigatorState,\n    InstigatorStatus,\n    InstigatorTick,\n    TickData,\n    TickStatus,\n)\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._utils import PrintFn\n\n\n
[docs]class ScheduleStorage(abc.ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract class for managing persistance of scheduler artifacts."""\n\n @abc.abstractmethod\n def wipe(self) -> None:\n """Delete all schedules from storage."""\n\n @abc.abstractmethod\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n instigator_statuses: Optional[Set[InstigatorStatus]] = None,\n ) -> Sequence[InstigatorState]:\n """Return all InstigationStates present in storage.\n\n Args:\n repository_origin_id (Optional[str]): The ExternalRepository target id to scope results to\n repository_selector_id (Optional[str]): The repository selector id to scope results to\n instigator_type (Optional[InstigatorType]): The InstigatorType to scope results to\n instigator_statuses (Optional[Set[InstigatorStatus]]): The InstigatorStatuses to scope results to\n """\n\n @abc.abstractmethod\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n """Return the instigator state for the given id.\n\n Args:\n origin_id (str): The unique instigator identifier\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def add_instigator_state(self, state: InstigatorState) -> InstigatorState:\n """Add an instigator state to storage.\n\n Args:\n state (InstigatorState): The state to add\n """\n\n @abc.abstractmethod\n def update_instigator_state(self, state: InstigatorState) -> InstigatorState:\n """Update an instigator state in storage.\n\n Args:\n state (InstigatorState): The state to update\n """\n\n @abc.abstractmethod\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n """Delete a state in storage.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n """\n\n @property\n def 
supports_batch_queries(self) -> bool:\n return False\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Sequence[InstigatorTick]]:\n raise NotImplementedError()\n\n @abc.abstractmethod\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Sequence[InstigatorTick]:\n """Get the ticks for a given instigator.\n\n Args:\n origin_id (str): The id of the instigator target\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def create_tick(self, tick_data: TickData) -> InstigatorTick:\n """Add a tick to storage.\n\n Args:\n tick_data (TickData): The tick to add\n """\n\n @abc.abstractmethod\n def update_tick(self, tick: InstigatorTick) -> InstigatorTick:\n """Update a tick already in storage.\n\n Args:\n tick (InstigatorTick): The tick to update\n """\n\n @abc.abstractmethod\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence[TickStatus]] = None,\n ) -> None:\n """Wipe ticks for an instigator for a certain status and timestamp.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n before (datetime): All ticks before this datetime will get purged\n tick_statuses (Optional[List[TickStatus]]): The tick statuses to wipe\n """\n\n @property\n def supports_auto_materialize_asset_evaluations(self) -> bool:\n return True\n\n @abc.abstractmethod\n def add_auto_materialize_asset_evaluations(\n self,\n evaluation_id: int,\n asset_evaluations: Sequence[AutoMaterializeAssetEvaluation],\n ) -> None:\n """Add asset policy evaluations to storage."""\n\n @abc.abstractmethod\n def get_auto_materialize_asset_evaluations(\n self, asset_key: 
AssetKey, limit: int, cursor: Optional[int] = None\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n """Get the policy evaluations for a given asset.\n\n Args:\n asset_key (AssetKey): The asset key to query\n limit (Optional[int]): The maximum number of evaluations to return\n cursor (Optional[int]): The cursor to paginate from\n """\n\n @abc.abstractmethod\n def get_auto_materialize_evaluations_for_evaluation_id(\n self, evaluation_id: int\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n """Get all policy evaluations for a given evaluation ID.\n\n Args:\n evaluation_id (int): The evaluation ID to query.\n """\n\n @abc.abstractmethod\n def purge_asset_evaluations(self, before: float) -> None:\n """Wipe evaluations before a certain timestamp.\n\n Args:\n before (datetime): All evaluations before this datetime will get purged\n """\n\n @abc.abstractmethod\n def upgrade(self) -> None:\n """Perform any needed migrations."""\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any required data migrations."""\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any optional data migrations for optimized reads."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.base"}, "sql_schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.sql_schedule_storage

\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import (\n    Any,\n    Callable,\n    ContextManager,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Type,\n    TypeVar,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\n\nimport dagster._check as check\nfrom dagster._core.definitions.auto_materialize_rule import AutoMaterializeAssetEvaluation\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.scheduler.instigation import (\n    AutoMaterializeAssetEvaluationRecord,\n    InstigatorState,\n    InstigatorStatus,\n    InstigatorTick,\n    TickData,\n    TickStatus,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery, SqlAlchemyRow\nfrom dagster._core.storage.sqlalchemy_compat import db_fetch_mappings, db_select, db_subquery\nfrom dagster._serdes import serialize_value\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._utils import PrintFn, utc_datetime_from_timestamp\n\nfrom .base import ScheduleStorage\nfrom .migration import (\n    OPTIONAL_SCHEDULE_DATA_MIGRATIONS,\n    REQUIRED_SCHEDULE_DATA_MIGRATIONS,\n    SCHEDULE_JOBS_SELECTOR_ID,\n    SCHEDULE_TICKS_SELECTOR_ID,\n)\nfrom .schema import (\n    AssetDaemonAssetEvaluationsTable,\n    InstigatorsTable,\n    JobTable,\n    JobTickTable,\n    SecondaryIndexMigrationTable,\n)\n\nT_NamedTuple = TypeVar("T_NamedTuple", bound=NamedTuple)\n\n\n
[docs]class SqlScheduleStorage(ScheduleStorage):\n """Base class for SQL backed schedule storage."""\n\n @abstractmethod\n def connect(self) -> ContextManager[Connection]:\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n def execute(self, query: SqlAlchemyQuery) -> Sequence[SqlAlchemyRow]:\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def _deserialize_rows(\n self, rows: Sequence[SqlAlchemyRow], as_type: Type[T_NamedTuple]\n ) -> Sequence[T_NamedTuple]:\n return list(map(lambda r: deserialize_value(r[0], as_type), rows))\n\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n instigator_statuses: Optional[Set[InstigatorStatus]] = None,\n ) -> Sequence[InstigatorState]:\n check.opt_inst_param(instigator_type, "instigator_type", InstigatorType)\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = db_select([InstigatorsTable.c.instigator_body]).select_from(InstigatorsTable)\n if repository_selector_id:\n query = query.where(\n InstigatorsTable.c.repository_selector_id == repository_selector_id\n )\n if instigator_type:\n query = query.where(InstigatorsTable.c.instigator_type == instigator_type.value)\n if instigator_statuses:\n query = query.where(\n InstigatorsTable.c.status.in_([status.value for status in instigator_statuses])\n )\n\n else:\n query = db_select([JobTable.c.job_body]).select_from(JobTable)\n if repository_origin_id:\n query = query.where(JobTable.c.repository_origin_id == repository_origin_id)\n if instigator_type:\n query = query.where(JobTable.c.job_type == instigator_type.value)\n if instigator_statuses:\n query = query.where(\n JobTable.c.status.in_([status.value for status in instigator_statuses])\n )\n\n rows = self.execute(query)\n return 
self._deserialize_rows(rows, InstigatorState)\n\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = (\n db_select([InstigatorsTable.c.instigator_body])\n .select_from(InstigatorsTable)\n .where(InstigatorsTable.c.selector_id == selector_id)\n )\n else:\n query = (\n db_select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.job_origin_id == origin_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1], InstigatorState)[0] if len(rows) else None\n\n def _has_instigator_state_by_selector(self, selector_id: str) -> bool:\n check.str_param(selector_id, "selector_id")\n\n query = (\n db_select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None # type: ignore\n\n def _add_or_update_instigators_table(self, conn: Connection, state: InstigatorState) -> None:\n selector_id = state.selector_id\n try:\n conn.execute(\n InstigatorsTable.insert().values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n )\n except db_exc.IntegrityError:\n conn.execute(\n InstigatorsTable.update()\n .where(InstigatorsTable.c.selector_id == selector_id)\n .values(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def add_instigator_state(self, state: InstigatorState) -> InstigatorState:\n check.inst_param(state, "state", InstigatorState)\n with self.connect() as conn:\n try:\n conn.execute(\n 
JobTable.insert().values(\n job_origin_id=state.instigator_origin_id,\n repository_origin_id=state.repository_origin_id,\n status=state.status.value,\n job_type=state.instigator_type.value,\n job_body=serialize_value(state),\n )\n )\n except db_exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is already present in storage"\n ) from exc\n\n # try writing to the instigators table\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def update_instigator_state(self, state: InstigatorState) -> InstigatorState:\n check.inst_param(state, "state", InstigatorState)\n if not self.get_instigator_state(state.instigator_origin_id, state.selector_id):\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is not present in storage"\n )\n\n values = {\n "status": state.status.value,\n "job_body": serialize_value(state),\n "update_timestamp": pendulum.now("UTC"),\n }\n if self.has_instigators_table():\n values["selector_id"] = state.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTable.update()\n .where(JobTable.c.job_origin_id == state.instigator_origin_id)\n .values(**values)\n )\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if not self.get_instigator_state(origin_id, selector_id):\n raise DagsterInvariantViolationError(\n f"InstigatorState {origin_id} is not present in storage"\n )\n\n with self.connect() as conn:\n conn.execute(JobTable.delete().where(JobTable.c.job_origin_id == origin_id))\n\n if self._has_instigators_table(conn):\n if not self._jobs_has_selector_state(conn, selector_id):\n conn.execute(\n InstigatorsTable.delete().where(\n 
InstigatorsTable.c.selector_id == selector_id\n )\n )\n\n def _jobs_has_selector_state(self, conn: Connection, selector_id: str) -> bool:\n query = (\n db_select([db.func.count()])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n result = conn.execute(query)\n row = result.fetchone()\n result.close()\n return row[0] > 0 # type: ignore # (possible none)\n\n def _add_filter_limit(\n self,\n query: SqlAlchemyQuery,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses=None,\n ) -> SqlAlchemyQuery:\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n if before:\n query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))\n if after:\n query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))\n if limit:\n query = query.limit(limit)\n if statuses:\n query = query.where(JobTickTable.c.status.in_([status.value for status in statuses]))\n return query\n\n @property\n def supports_batch_queries(self) -> bool:\n return self.has_instigators_table() and self.has_built_index(SCHEDULE_TICKS_SELECTOR_ID)\n\n def has_instigators_table(self) -> bool:\n with self.connect() as conn:\n return self._has_instigators_table(conn)\n\n def _has_instigators_table(self, conn: Connection) -> bool:\n table_names = db.inspect(conn).get_table_names()\n return "instigators" in table_names\n\n def _has_asset_daemon_asset_evaluations_table(self, conn: Connection) -> bool:\n table_names = db.inspect(conn).get_table_names()\n return "asset_daemon_asset_evaluations" in table_names\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Sequence[InstigatorTick]]:\n check.sequence_param(selector_ids, "selector_ids", of_type=str)\n 
check.opt_int_param(limit, "limit")\n check.opt_sequence_param(statuses, "statuses", of_type=TickStatus)\n\n bucket_rank_column = (\n db.func.rank()\n .over(\n order_by=db.desc(JobTickTable.c.timestamp),\n partition_by=JobTickTable.c.selector_id,\n )\n .label("rank")\n )\n subquery = db_subquery(\n db_select(\n [\n JobTickTable.c.id,\n JobTickTable.c.selector_id,\n JobTickTable.c.tick_body,\n bucket_rank_column,\n ]\n )\n .select_from(JobTickTable)\n .where(JobTickTable.c.selector_id.in_(selector_ids))\n )\n if statuses:\n subquery = subquery.where(\n JobTickTable.c.status.in_([status.value for status in statuses])\n )\n\n query = (\n db_select([subquery.c.id, subquery.c.selector_id, subquery.c.tick_body])\n .order_by(subquery.c.rank.asc())\n .where(subquery.c.rank <= limit)\n )\n\n rows = self.execute(query)\n results = defaultdict(list)\n for row in rows:\n tick_id = row[0]\n selector_id = row[1]\n tick_data = deserialize_value(row[2], TickData)\n results[selector_id].append(InstigatorTick(tick_id, tick_data))\n return results\n\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Sequence[InstigatorTick]:\n check.str_param(origin_id, "origin_id")\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n base_query = (\n db_select([JobTickTable.c.id, JobTickTable.c.tick_body])\n .select_from(JobTickTable)\n .order_by(JobTickTable.c.timestamp.desc())\n )\n if self.has_instigators_table():\n query = base_query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id.is_(None),\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = base_query.where(JobTickTable.c.job_origin_id == origin_id)\n\n query = 
self._add_filter_limit(\n query, before=before, after=after, limit=limit, statuses=statuses\n )\n\n rows = self.execute(query)\n return list(map(lambda r: InstigatorTick(r[0], deserialize_value(r[1], TickData)), rows))\n\n def create_tick(self, tick_data: TickData) -> InstigatorTick:\n check.inst_param(tick_data, "tick_data", TickData)\n\n values = {\n "job_origin_id": tick_data.instigator_origin_id,\n "status": tick_data.status.value,\n "type": tick_data.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick_data.timestamp),\n "tick_body": serialize_value(tick_data),\n }\n if self.has_instigators_table() and tick_data.selector_id:\n values["selector_id"] = tick_data.selector_id\n\n with self.connect() as conn:\n try:\n tick_insert = JobTickTable.insert().values(**values)\n result = conn.execute(tick_insert)\n tick_id = result.inserted_primary_key[0]\n return InstigatorTick(tick_id, tick_data)\n except db_exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"Unable to insert InstigatorTick for job {tick_data.instigator_name} in"\n " storage"\n ) from exc\n\n def update_tick(self, tick: InstigatorTick) -> InstigatorTick:\n check.inst_param(tick, "tick", InstigatorTick)\n\n values = {\n "status": tick.status.value,\n "type": tick.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick.timestamp),\n "tick_body": serialize_value(tick.tick_data),\n }\n if self.has_instigators_table() and tick.selector_id:\n values["selector_id"] = tick.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTickTable.update().where(JobTickTable.c.id == tick.tick_id).values(**values)\n )\n\n return tick\n\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence[TickStatus]] = None,\n ) -> None:\n check.str_param(origin_id, "origin_id")\n check.float_param(before, "before")\n check.opt_list_param(tick_statuses, "tick_statuses", of_type=TickStatus)\n\n utc_before = 
utc_datetime_from_timestamp(before)\n\n query = JobTickTable.delete().where(JobTickTable.c.timestamp < utc_before)\n if tick_statuses:\n query = query.where(\n JobTickTable.c.status.in_([tick_status.value for tick_status in tick_statuses])\n )\n\n if self.has_instigators_table():\n query = query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id.is_(None),\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = query.where(JobTickTable.c.job_origin_id == origin_id)\n\n with self.connect() as conn:\n conn.execute(query)\n\n @property\n def supports_auto_materialize_asset_evaluations(self) -> bool:\n with self.connect() as conn:\n return self._has_asset_daemon_asset_evaluations_table(conn)\n\n def add_auto_materialize_asset_evaluations(\n self,\n evaluation_id: int,\n asset_evaluations: Sequence[AutoMaterializeAssetEvaluation],\n ):\n if not asset_evaluations:\n return\n\n with self.connect() as conn:\n bulk_insert = AssetDaemonAssetEvaluationsTable.insert().values(\n [\n {\n "evaluation_id": evaluation_id,\n "asset_key": evaluation.asset_key.to_string(),\n "asset_evaluation_body": serialize_value(evaluation),\n "num_requested": evaluation.num_requested,\n "num_skipped": evaluation.num_skipped,\n "num_discarded": evaluation.num_discarded,\n }\n for evaluation in asset_evaluations\n ]\n )\n conn.execute(bulk_insert)\n\n def get_auto_materialize_asset_evaluations(\n self, asset_key: AssetKey, limit: int, cursor: Optional[int] = None\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n with self.connect() as conn:\n query = (\n db_select(\n [\n AssetDaemonAssetEvaluationsTable.c.id,\n AssetDaemonAssetEvaluationsTable.c.asset_evaluation_body,\n AssetDaemonAssetEvaluationsTable.c.evaluation_id,\n AssetDaemonAssetEvaluationsTable.c.create_timestamp,\n AssetDaemonAssetEvaluationsTable.c.asset_key,\n ]\n )\n .where(AssetDaemonAssetEvaluationsTable.c.asset_key == asset_key.to_string())\n 
.order_by(AssetDaemonAssetEvaluationsTable.c.evaluation_id.desc())\n ).limit(limit)\n\n if cursor:\n query = query.where(AssetDaemonAssetEvaluationsTable.c.evaluation_id < cursor)\n\n rows = db_fetch_mappings(conn, query)\n return [AutoMaterializeAssetEvaluationRecord.from_db_row(row) for row in rows]\n\n def get_auto_materialize_evaluations_for_evaluation_id(\n self, evaluation_id: int\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n with self.connect() as conn:\n query = db_select(\n [\n AssetDaemonAssetEvaluationsTable.c.id,\n AssetDaemonAssetEvaluationsTable.c.asset_evaluation_body,\n AssetDaemonAssetEvaluationsTable.c.evaluation_id,\n AssetDaemonAssetEvaluationsTable.c.create_timestamp,\n AssetDaemonAssetEvaluationsTable.c.asset_key,\n ]\n ).where(AssetDaemonAssetEvaluationsTable.c.evaluation_id == evaluation_id)\n\n rows = db_fetch_mappings(conn, query)\n return [AutoMaterializeAssetEvaluationRecord.from_db_row(row) for row in rows]\n\n def purge_asset_evaluations(self, before: float):\n check.float_param(before, "before")\n\n utc_before = utc_datetime_from_timestamp(before)\n query = AssetDaemonAssetEvaluationsTable.delete().where(\n AssetDaemonAssetEvaluationsTable.c.create_timestamp < utc_before\n )\n\n with self.connect() as conn:\n conn.execute(query)\n\n def wipe(self) -> None:\n """Clears the schedule storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(JobTable.delete())\n conn.execute(JobTickTable.delete())\n if self._has_instigators_table(conn):\n conn.execute(InstigatorsTable.delete())\n if self._has_asset_daemon_asset_evaluations_table(conn):\n conn.execute(AssetDaemonAssetEvaluationsTable.delete())\n\n # MIGRATIONS\n\n def has_secondary_index_table(self) -> bool:\n with self.connect() as conn:\n return "secondary_indexes" in db.inspect(conn).get_table_names()\n\n def has_built_index(self, migration_name: str) -> bool:\n if not self.has_secondary_index_table():\n return False\n\n query = 
(\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str) -> None:\n query = SecondaryIndexMigrationTable.insert().values(\n name=migration_name,\n migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n def _execute_data_migrations(\n self,\n migrations: Mapping[str, Callable[..., Any]],\n print_fn: Optional[Callable] = None,\n force_rebuild_all: bool = False,\n ) -> None:\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(\n REQUIRED_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(\n OPTIONAL_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/sql_schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.sql_schedule_storage"}, "sqlite": {"sqlite_schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.sqlite.sqlite_schedule_storage

\nfrom contextlib import contextmanager\nfrom typing import Iterator, Optional\n\nimport sqlalchemy as db\nfrom packaging.version import parse\nfrom sqlalchemy.engine import Connection\nfrom sqlalchemy.pool import NullPool\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string, get_sqlite_version\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import ScheduleStorageSqlMetadata\nfrom ..sql_schedule_storage import SqlScheduleStorage\n\nMINIMUM_SQLITE_BATCH_VERSION = "3.25.0"\n\n\n
[docs]class SqliteScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Local SQLite backed schedule storage."""\n\n def __init__(self, conn_string: str, inst_data: Optional[ConfigurableClassData] = None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value\n ) -> "SqliteScheduleStorage":\n return SqliteScheduleStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(\n cls, base_dir: str, inst_data: Optional[ConfigurableClassData] = None\n ) -> "SqliteScheduleStorage":\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "schedules")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_migrate_data = False\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n ScheduleStorageSqlMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_migrate_data = True\n\n schedule_storage = cls(conn_string, inst_data)\n if should_migrate_data:\n schedule_storage.migrate()\n schedule_storage.optimize()\n\n return schedule_storage\n\n @contextmanager\n def connect(self) -> Iterator[Connection]:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n @property\n def supports_batch_queries(self) -> bool:\n if not 
super().supports_batch_queries:\n return False\n\n return super().supports_batch_queries and parse(get_sqlite_version()) >= parse(\n MINIMUM_SQLITE_BATCH_VERSION\n )\n\n def upgrade(self) -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/sqlite/sqlite_schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.sqlite.sqlite_schedule_storage"}}}, "upath_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.upath_io_manager

\nimport asyncio\nimport inspect\nfrom abc import abstractmethod\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union\n\nfrom fsspec import AbstractFileSystem\nfrom fsspec.implementations.local import LocalFileSystem\n\nfrom dagster import (\n    InputContext,\n    MetadataValue,\n    MultiPartitionKey,\n    OutputContext,\n    _check as check,\n)\nfrom dagster._core.storage.memoizable_io_manager import MemoizableIOManager\n\nif TYPE_CHECKING:\n    from upath import UPath\n\n\n
[docs]class UPathIOManager(MemoizableIOManager):\n """Abstract IOManager base class compatible with local and cloud storage via `universal-pathlib` and `fsspec`.\n\n Features:\n - handles partitioned assets\n - handles loading a single upstream partition\n - handles loading multiple upstream partitions (with respect to :py:class:`PartitionMapping`)\n - supports loading multiple partitions concurrently with async `load_from_path` method\n - the `get_metadata` method can be customized to add additional metadata to the output\n - the `allow_missing_partitions` metadata value can be set to `True` to skip missing partitions\n (the default behavior is to raise an error)\n\n """\n\n extension: Optional[str] = None # override in child class\n\n def __init__(\n self,\n base_path: Optional["UPath"] = None,\n ):\n from upath import UPath\n\n assert not self.extension or "." in self.extension\n self._base_path = base_path or UPath(".")\n\n @abstractmethod\n def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):\n """Child classes should override this method to write the object to the filesystem."""\n\n @abstractmethod\n def load_from_path(self, context: InputContext, path: "UPath") -> Any:\n """Child classes should override this method to load the object from the filesystem."""\n\n @property\n def fs(self) -> AbstractFileSystem:\n """Utility function to get the IOManager filesystem.\n\n Returns:\n AbstractFileSystem: fsspec filesystem.\n\n """\n from upath import UPath\n\n if isinstance(self._base_path, UPath):\n return self._base_path.fs\n elif isinstance(self._base_path, Path):\n return LocalFileSystem()\n else:\n raise ValueError(f"Unsupported base_path type: {type(self._base_path)}")\n\n @property\n def storage_options(self) -> Dict[str, Any]:\n """Utility function to get the fsspec storage_options which are often consumed by various I/O functions.\n\n Returns:\n Dict[str, Any]: fsspec storage_options.\n """\n from upath import UPath\n\n if 
isinstance(self._base_path, UPath):\n return self._base_path._kwargs.copy() # noqa\n elif isinstance(self._base_path, Path):\n return {}\n else:\n raise ValueError(f"Unsupported base_path type: {type(self._base_path)}")\n\n def get_metadata(\n self,\n context: OutputContext,\n obj: Any,\n ) -> Dict[str, MetadataValue]:\n """Child classes should override this method to add custom metadata to the outputs."""\n return {}\n\n # Read/write operations on paths can generally be handled by methods on the\n # UPath class, but when the backend requires credentials, this isn't\n # always possible. Override these path_* methods to provide custom\n # implementations for targeting backends that require authentication.\n\n def unlink(self, path: "UPath") -> None:\n """Remove the file or object at the provided path."""\n path.unlink()\n\n def path_exists(self, path: "UPath") -> bool:\n """Check if a file or object exists at the provided path."""\n return path.exists()\n\n def make_directory(self, path: "UPath"):\n """Create a directory at the provided path.\n\n Override as a no-op if the target backend doesn't use directories.\n """\n path.mkdir(parents=True, exist_ok=True)\n\n def has_output(self, context: OutputContext) -> bool:\n return self.path_exists(self._get_path(context))\n\n def _with_extension(self, path: "UPath") -> "UPath":\n return path.with_suffix(path.suffix + self.extension) if self.extension else path\n\n def _get_path_without_extension(self, context: Union[InputContext, OutputContext]) -> "UPath":\n if context.has_asset_key:\n context_path = self.get_asset_relative_path(context)\n else:\n # we are dealing with an op output\n context_path = self.get_op_output_relative_path(context)\n\n return self._base_path.joinpath(context_path)\n\n def get_asset_relative_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n from upath import UPath\n\n # we are not using context.get_asset_identifier() because it already includes the partition_key\n return 
UPath(*context.asset_key.path)\n\n def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n from upath import UPath\n\n return UPath(*context.get_identifier())\n\n def get_loading_input_log_message(self, path: "UPath") -> str:\n return f"Loading file from: {path} using {self.__class__.__name__}..."\n\n def get_writing_output_log_message(self, path: "UPath") -> str:\n return f"Writing file at: {path} using {self.__class__.__name__}..."\n\n def get_loading_input_partition_log_message(self, path: "UPath", partition_key: str) -> str:\n return f"Loading partition {partition_key} from {path} using {self.__class__.__name__}..."\n\n def get_missing_partition_log_message(self, partition_key: str) -> str:\n return (\n f"Couldn't load partition {partition_key} and skipped it "\n "because the input metadata includes allow_missing_partitions=True"\n )\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n """Returns the I/O path for a given context.\n Should not be used with partitions (use `_get_paths_for_partitions` instead).\n """\n path = self._get_path_without_extension(context)\n return self._with_extension(path)\n\n def get_path_for_partition(\n self, context: Union[InputContext, OutputContext], path: "UPath", partition: str\n ) -> "UPath":\n """Override this method if you want to use a different partitioning scheme\n (for example, if the saving function handles partitioning instead).\n The extension will be added later.\n\n Args:\n context (Union[InputContext, OutputContext]): The context for the I/O operation.\n path (UPath): The path to the file or object.\n partition (str): Formatted partition/multipartition key\n\n Returns:\n UPath: The path to the file with the partition key appended.\n """\n return path / partition\n\n def _get_paths_for_partitions(\n self, context: Union[InputContext, OutputContext]\n ) -> Dict[str, "UPath"]:\n """Returns a dict of partition_keys into I/O paths for a given 
context."""\n if not context.has_asset_partitions:\n raise TypeError(\n f"Detected {context.dagster_type.typing_type} input type "\n "but the asset is not partitioned"\n )\n\n def _formatted_multipartitioned_path(partition_key: MultiPartitionKey) -> str:\n ordered_dimension_keys = [\n key[1]\n for key in sorted(partition_key.keys_by_dimension.items(), key=lambda x: x[0])\n ]\n return "/".join(ordered_dimension_keys)\n\n formatted_partition_keys = {\n partition_key: (\n _formatted_multipartitioned_path(partition_key)\n if isinstance(partition_key, MultiPartitionKey)\n else partition_key\n )\n for partition_key in context.asset_partition_keys\n }\n\n asset_path = self._get_path_without_extension(context)\n return {\n partition_key: self._with_extension(\n self.get_path_for_partition(context, asset_path, partition)\n )\n for partition_key, partition in formatted_partition_keys.items()\n }\n\n def _get_multipartition_backcompat_paths(\n self, context: Union[InputContext, OutputContext]\n ) -> Mapping[str, "UPath"]:\n if not context.has_asset_partitions:\n raise TypeError(\n f"Detected {context.dagster_type.typing_type} input type "\n "but the asset is not partitioned"\n )\n\n partition_keys = context.asset_partition_keys\n\n asset_path = self._get_path_without_extension(context)\n return {\n partition_key: self._with_extension(asset_path / partition_key)\n for partition_key in partition_keys\n if isinstance(partition_key, MultiPartitionKey)\n }\n\n def _load_single_input(\n self, path: "UPath", context: InputContext, backcompat_path: Optional["UPath"] = None\n ) -> Any:\n context.log.debug(self.get_loading_input_log_message(path))\n try:\n obj = self.load_from_path(context=context, path=path)\n if asyncio.iscoroutine(obj):\n obj = asyncio.run(obj)\n except FileNotFoundError as e:\n if backcompat_path is not None:\n try:\n obj = self.load_from_path(context=context, path=backcompat_path)\n if asyncio.iscoroutine(obj):\n obj = asyncio.run(obj)\n\n context.log.debug(\n 
f"File not found at {path}. Loaded instead from backcompat path:"\n f" {backcompat_path}"\n )\n except FileNotFoundError:\n raise e\n else:\n raise e\n\n context.add_input_metadata({"path": MetadataValue.path(str(path))})\n return obj\n\n def _load_partition_from_path(\n self,\n context: InputContext,\n partition_key: str,\n path: "UPath",\n backcompat_path: Optional["UPath"] = None,\n ) -> Any:\n """1. Try to load the partition from the normal path.\n 2. If it was not found, try to load it from the backcompat path.\n 3. If allow_missing_partitions metadata is True, skip the partition if it was not found in any of the paths.\n Otherwise, raise an error.\n\n Args:\n context (InputContext): IOManager Input context\n partition_key (str): the partition key corresponding to the partition being loaded\n path (UPath): The path to the partition.\n backcompat_path (Optional[UPath]): The path to the partition in the backcompat location.\n\n Returns:\n Any: The object loaded from the partition.\n """\n allow_missing_partitions = (\n context.metadata.get("allow_missing_partitions", False)\n if context.metadata is not None\n else False\n )\n\n try:\n context.log.debug(self.get_loading_input_partition_log_message(path, partition_key))\n obj = self.load_from_path(context=context, path=path)\n return obj\n except FileNotFoundError as e:\n if backcompat_path is not None:\n try:\n obj = self.load_from_path(context=context, path=path)\n context.log.debug(\n f"File not found at {path}. 
Loaded instead from backcompat path:"\n f" {backcompat_path}"\n )\n return obj\n except FileNotFoundError as e:\n if allow_missing_partitions:\n context.log.warning(self.get_missing_partition_log_message(partition_key))\n return None\n else:\n raise e\n if allow_missing_partitions:\n context.log.warning(self.get_missing_partition_log_message(partition_key))\n return None\n else:\n raise e\n\n def _load_multiple_inputs(self, context: InputContext) -> Dict[str, Any]:\n # load multiple partitions\n paths = self._get_paths_for_partitions(context) # paths for normal partitions\n backcompat_paths = self._get_multipartition_backcompat_paths(\n context\n ) # paths for multipartitions\n\n context.log.debug(f"Loading {len(paths)} partitions...")\n\n objs = {}\n\n if not inspect.iscoroutinefunction(self.load_from_path):\n for partition_key in context.asset_partition_keys:\n obj = self._load_partition_from_path(\n context,\n partition_key,\n paths[partition_key],\n backcompat_paths.get(partition_key),\n )\n if obj is not None: # in case some partitions were skipped\n objs[partition_key] = obj\n return objs\n else:\n # load_from_path returns a coroutine, so we need to await the results\n\n async def collect():\n loop = asyncio.get_running_loop()\n\n tasks = []\n\n for partition_key in context.asset_partition_keys:\n tasks.append(\n loop.create_task(\n self._load_partition_from_path(\n context,\n partition_key,\n paths[partition_key],\n backcompat_paths.get(partition_key),\n )\n )\n )\n\n results = await asyncio.gather(*tasks, return_exceptions=True)\n\n # need to handle missing partitions here because exceptions don't get propagated from async calls\n allow_missing_partitions = (\n context.metadata.get("allow_missing_partitions", False)\n if context.metadata is not None\n else False\n )\n\n results_without_errors = []\n found_errors = False\n for partition_key, result in zip(context.asset_partition_keys, results):\n if isinstance(result, FileNotFoundError):\n if 
allow_missing_partitions:\n context.log.warning(\n self.get_missing_partition_log_message(partition_key)\n )\n else:\n context.log.error(str(result))\n found_errors = True\n elif isinstance(result, Exception):\n context.log.error(str(result))\n found_errors = True\n else:\n results_without_errors.append(result)\n\n if found_errors:\n raise RuntimeError(\n f"{len(paths) - len(results_without_errors)} partitions could not be loaded"\n )\n\n return results_without_errors\n\n awaited_objects = asyncio.get_event_loop().run_until_complete(collect())\n\n return {\n partition_key: awaited_object\n for partition_key, awaited_object in zip(\n context.asset_partition_keys, awaited_objects\n )\n if awaited_object is not None\n }\n\n def load_input(self, context: InputContext) -> Union[Any, Dict[str, Any]]:\n # If no asset key, we are dealing with an op output which is always non-partitioned\n if not context.has_asset_key or not context.has_asset_partitions:\n path = self._get_path(context)\n return self._load_single_input(path, context)\n else:\n asset_partition_keys = context.asset_partition_keys\n if len(asset_partition_keys) == 0:\n return None\n elif len(asset_partition_keys) == 1:\n paths = self._get_paths_for_partitions(context)\n check.invariant(len(paths) == 1, f"Expected 1 path, but got {len(paths)}")\n path = next(iter(paths.values()))\n backcompat_paths = self._get_multipartition_backcompat_paths(context)\n backcompat_path = (\n None if not backcompat_paths else next(iter(backcompat_paths.values()))\n )\n\n return self._load_single_input(path, context, backcompat_path)\n else: # we are dealing with multiple partitions of an asset\n type_annotation = context.dagster_type.typing_type\n if type_annotation != Any and not is_dict_type(type_annotation):\n check.failed(\n "Loading an input that corresponds to multiple partitions, but the"\n " type annotation on the op input is not a dict, Dict, Mapping, or"\n f" Any: is '{type_annotation}'."\n )\n\n return 
self._load_multiple_inputs(context)\n\n def handle_output(self, context: OutputContext, obj: Any):\n if context.dagster_type.typing_type == type(None):\n check.invariant(\n obj is None,\n "Output had Nothing type or 'None' annotation, but handle_output received"\n f" value that was not None and was of type {type(obj)}.",\n )\n return None\n\n if context.has_asset_partitions:\n paths = self._get_paths_for_partitions(context)\n\n check.invariant(\n len(paths) == 1,\n f"The current IO manager {type(self)} does not support persisting an output"\n " associated with multiple partitions. This error is likely occurring because a"\n " backfill was launched using the 'single run' option. Instead, launch the"\n " backfill with the 'multiple runs' option.",\n )\n\n path = next(iter(paths.values()))\n else:\n path = self._get_path(context)\n self.make_directory(path.parent)\n context.log.debug(self.get_writing_output_log_message(path))\n self.dump_to_path(context=context, obj=obj, path=path)\n\n metadata = {"path": MetadataValue.path(str(path))}\n custom_metadata = self.get_metadata(context=context, obj=obj)\n metadata.update(custom_metadata) # type: ignore\n\n context.add_output_metadata(metadata)
\n\n\ndef is_dict_type(type_obj) -> bool:\n if type_obj == dict:\n return True\n\n if hasattr(type_obj, "__origin__") and type_obj.__origin__ in (dict, Dict, Mapping):\n return True\n\n return False\n
", "current_page_name": "_modules/dagster/_core/storage/upath_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.upath_io_manager"}}, "types": {"config_schema": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.config_schema

\nimport hashlib\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Iterator, Optional, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param\nfrom dagster._config import ConfigType\nfrom dagster._core.decorator_utils import get_function_params, validate_expected_params\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..definitions.resource_requirement import (\n    ResourceRequirement,\n    TypeLoaderResourceRequirement,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.system import (\n        DagsterTypeLoaderContext,\n    )\n\n\n
[docs]class DagsterTypeLoader(ABC):\n """Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\n to.\n\n The recommended way to define a type loader is with the\n :py:func:`@dagster_type_loader <dagster_type_loader>` decorator.\n """\n\n @property\n @abstractmethod\n def schema_type(self) -> ConfigType:\n pass\n\n @property\n def loader_version(self) -> Optional[str]:\n return None\n\n def compute_loaded_input_version(self, _config_value: object) -> Optional[str]:\n return None\n\n def construct_from_config_value(\n self, _context: "DagsterTypeLoaderContext", config_value: object\n ) -> object:\n """How to create a runtime value from config data."""\n return config_value\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset()\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n type_display_name = cast(str, outer_context)\n for resource_key in sorted(list(self.required_resource_keys())):\n yield TypeLoaderResourceRequirement(\n key=resource_key, type_display_name=type_display_name\n )
\n\n\n@experimental_param(param="loader_version")\n@experimental_param(param="external_version_fn")\nclass DagsterTypeLoaderFromDecorator(DagsterTypeLoader):\n def __init__(\n self,\n config_type,\n func,\n required_resource_keys,\n loader_version=None,\n external_version_fn=None,\n ):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._loader_version = check.opt_str_param(loader_version, "loader_version")\n self._external_version_fn = check.opt_callable_param(\n external_version_fn, "external_version_fn"\n )\n\n @property\n def schema_type(self) -> ConfigType:\n return self._config_type\n\n @property\n def loader_version(self) -> Optional[str]:\n return self._loader_version\n\n def compute_loaded_input_version(self, config_value: object) -> Optional[str]:\n """Compute the type-loaded input from a given config_value.\n\n Args:\n config_value (object): Config value to be ingested by the external version\n loading function.\n\n Returns:\n Optional[str]: Hash of concatenated loader version and external input version if both\n are provided, else None.\n """\n version = ""\n if self.loader_version:\n version += str(self.loader_version)\n if self._external_version_fn:\n ext_version = self._external_version_fn(config_value)\n version += str(ext_version)\n\n if version == "":\n return None # Sentinel value for no version provided.\n else:\n return hashlib.sha1(version.encode("utf-8")).hexdigest()\n\n def construct_from_config_value(\n self, context: "DagsterTypeLoaderContext", config_value: object\n ):\n return self._func(context, config_value)\n\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n\ndef _create_type_loader_for_decorator(\n config_type: ConfigType,\n func,\n required_resource_keys: Optional[AbstractSet[str]],\n 
loader_version: Optional[str] = None,\n external_version_fn: Optional[Callable[[object], str]] = None,\n):\n return DagsterTypeLoaderFromDecorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n\nDagsterTypeLoaderFn: TypeAlias = Callable[["DagsterTypeLoaderContext", Any], Any]\n\n\n
[docs]def dagster_type_loader(\n config_schema: object,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n loader_version: Optional[str] = None,\n external_version_fn: Optional[Callable[[object], str]] = None,\n) -> Callable[[DagsterTypeLoaderFn], DagsterTypeLoaderFromDecorator]:\n """Create an dagster type loader that maps config data to a runtime value.\n\n The decorated function should take the execution context and parsed config value and return the\n appropriate runtime value.\n\n Args:\n config_schema (ConfigSchema): The schema for the config that's passed to the decorated\n function.\n loader_version (str): (Experimental) The version of the decorated compute function. Two\n loading functions should have the same version if and only if they deterministically\n produce the same outputs when provided the same inputs.\n external_version_fn (Callable): (Experimental) A function that takes in the same parameters as the loader\n function (config_value) and returns a representation of the version of the external\n asset (str). Two external assets with identical versions are treated as identical to one\n another.\n\n Examples:\n .. code-block:: python\n\n @dagster_type_loader(Permissive())\n def load_dict(_context, value):\n return value\n """\n from dagster._config import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n assert isinstance(\n config_type, ConfigType\n ), f"{config_schema} could not be resolved to config type"\n EXPECTED_POSITIONALS = ["context", "*"]\n\n def wrapper(func: DagsterTypeLoaderFn) -> DagsterTypeLoaderFromDecorator:\n params = get_function_params(func)\n missing_positional = validate_expected_params(params, EXPECTED_POSITIONALS)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@dagster_type_loader '{func.__name__}' decorated function does not have required"\n f" positional parameter '{missing_positional}'. 
@dagster_type_loader decorated"\n " functions should only have keyword arguments that match input names and a first"\n " positional parameter named 'context'."\n )\n\n return _create_type_loader_for_decorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/_core/types/config_schema", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.config_schema"}, "dagster_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.dagster_type

\nimport typing as t\nfrom abc import abstractmethod\nfrom enum import Enum as PythonEnum\nfrom functools import partial\nfrom typing import (\n    AbstractSet as TypingAbstractSet,\n    AnyStr,\n    Iterator as TypingIterator,\n    Mapping,\n    Optional as TypingOptional,\n    Sequence,\n    Type as TypingType,\n    cast,\n)\n\nfrom typing_extensions import get_args, get_origin\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import (\n    Array,\n    ConfigType,\n    Noneable as ConfigNoneable,\n)\nfrom dagster._core.definitions.events import DynamicOutput, Output, TypeCheck\nfrom dagster._core.definitions.metadata import (\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._seven import is_subclass\n\nfrom ..definitions.resource_requirement import (\n    RequiresResources,\n    ResourceRequirement,\n    TypeResourceRequirement,\n)\nfrom .builtin_config_schemas import BuiltinSchemas\nfrom .config_schema import DagsterTypeLoader\n\nif t.TYPE_CHECKING:\n    from dagster._core.definitions.node_definition import NodeDefinition\n    from dagster._core.execution.context.system import DagsterTypeLoaderContext, TypeCheckContext\n\nTypeCheckFn = t.Callable[["TypeCheckContext", AnyStr], t.Union[TypeCheck, bool]]\n\n\n@whitelist_for_serdes\nclass DagsterTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    LIST = "LIST"\n    NOTHING = "NOTHING"\n    NULLABLE = "NULLABLE"\n    REGULAR = "REGULAR"\n\n\n
[docs]class DagsterType(RequiresResources):\n """Define a type in dagster. These can be used in the inputs and outputs of ops.\n\n Args:\n type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]):\n The function that defines the type check. It takes the value flowing\n through the input or output of the op. If it passes, return either\n ``True`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``True``. If it fails,\n return either ``False`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``False``.\n The first argument must be named ``context`` (or, if unused, ``_``, ``_context``, or ``context_``).\n Use ``required_resource_keys`` for access to resources.\n key (Optional[str]): The unique key to identify types programmatically.\n The key property always has a value. If you omit key to the argument\n to the init function, it instead receives the value of ``name``. If\n neither ``key`` nor ``name`` is provided, a ``CheckError`` is thrown.\n\n In the case of a generic type such as ``List`` or ``Optional``, this is\n generated programmatically based on the type parameters.\n\n For most use cases, name should be set and the key argument should\n not be specified.\n name (Optional[str]): A unique name given by a user. If ``key`` is ``None``, ``key``\n becomes this value. Name is not given in a case where the user does\n not specify a unique name for this type, such as a generic class.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. 
As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n required_resource_keys (Optional[Set[str]]): Resource keys required by the ``type_check_fn``.\n is_builtin (bool): Defaults to False. This is used by tools to display or\n filter built-in types (such as :py:class:`~dagster.String`, :py:class:`~dagster.Int`) to visually distinguish\n them from user-defined types. Meant for internal use.\n kind (DagsterTypeKind): Defaults to None. This is used to determine the kind of runtime type\n for InputDefinition and OutputDefinition type checking.\n typing_type: Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\n value contained within the DagsterType. Meant for internal use.\n """\n\n def __init__(\n self,\n type_check_fn: TypeCheckFn,\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n loader: t.Optional[DagsterTypeLoader] = None,\n required_resource_keys: t.Optional[t.Set[str]] = None,\n kind: DagsterTypeKind = DagsterTypeKind.REGULAR,\n typing_type: t.Any = t.Any,\n metadata: t.Optional[t.Mapping[str, RawMetadataValue]] = None,\n ):\n check.opt_str_param(key, "key")\n check.opt_str_param(name, "name")\n\n check.invariant(not (name is None and key is None), "Must set key or name")\n if name is None:\n key = check.not_none(\n key,\n "If name is not provided, must provide key.",\n )\n self.key, self._name = key, None\n elif key is None:\n name = check.not_none(\n name,\n "If key is not provided, must provide name.",\n )\n self.key, self._name = name, name\n else:\n check.invariant(key and name)\n self.key, self._name = key, name\n\n self._description = check.opt_str_param(description, "description")\n self._loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)\n\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys,\n "required_resource_keys",\n 
)\n\n self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")\n _validate_type_check_fn(self._type_check_fn, self._name)\n\n self.is_builtin = check.bool_param(is_builtin, "is_builtin")\n check.invariant(\n self.display_name is not None,\n f"All types must have a valid display name, got None for key {key}",\n )\n\n self.kind = check.inst_param(kind, "kind", DagsterTypeKind)\n\n self._typing_type = typing_type\n\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n
[docs] @public\n def type_check(self, context: "TypeCheckContext", value: object) -> TypeCheck:\n """Type check the value against the type.\n\n Args:\n context (TypeCheckContext): The context of the type check.\n value (Any): The value to check.\n\n Returns:\n TypeCheck: The result of the type check.\n """\n retval = self._type_check_fn(context, value)\n\n if not isinstance(retval, (bool, TypeCheck)):\n raise DagsterInvariantViolationError(\n f"You have returned {retval!r} of type {type(retval)} from the type "\n f'check function of type "{self.key}". Return value must be instance '\n "of TypeCheck or a bool."\n )\n\n return TypeCheck(success=retval) if isinstance(retval, bool) else retval
\n\n def __eq__(self, other):\n return isinstance(other, DagsterType) and self.key == other.key\n\n def __ne__(self, other):\n return not self.__eq__(other)\n\n def __hash__(self):\n return hash(self.key)\n\n @staticmethod\n def from_builtin_enum(builtin_enum) -> "DagsterType":\n check.invariant(BuiltinEnum.contains(builtin_enum), "must be member of BuiltinEnum")\n return _RUNTIME_MAP[builtin_enum]\n\n @property\n def metadata(self) -> t.Mapping[str, MetadataValue]:\n return self._metadata\n\n @public\n @property\n def required_resource_keys(self) -> TypingAbstractSet[str]:\n """AbstractSet[str]: Set of resource keys required by the type check function."""\n return self._required_resource_keys\n\n @public\n @property\n def display_name(self) -> str:\n """Either the name or key (if name is `None`) of the type, overridden in many subclasses."""\n return cast(str, self._name or self.key)\n\n @public\n @property\n def unique_name(self) -> t.Optional[str]:\n """The unique name of this type. Can be None if the type is not unique, such as container types."""\n # TODO: docstring and body inconsistent-- can this be None or not?\n check.invariant(\n self._name is not None,\n f"unique_name requested but is None for type {self.display_name}",\n )\n return self._name\n\n @public\n @property\n def has_unique_name(self) -> bool:\n """bool: Whether the type has a unique name."""\n return self._name is not None\n\n @public\n @property\n def typing_type(self) -> t.Any:\n """Any: The python typing type for this type."""\n return self._typing_type\n\n @public\n @property\n def loader(self) -> t.Optional[DagsterTypeLoader]:\n """Optional[DagsterTypeLoader]: Loader for this type, if any."""\n return self._loader\n\n @public\n @property\n def description(self) -> t.Optional[str]:\n """Optional[str]: Description of the type, or None if not provided."""\n return self._description\n\n @property\n def inner_types(self) -> t.Sequence["DagsterType"]:\n return []\n\n @property\n def 
loader_schema_key(self) -> t.Optional[str]:\n return self.loader.schema_type.key if self.loader else None\n\n @property\n def type_param_keys(self) -> t.Sequence[str]:\n return []\n\n @property\n def is_nothing(self) -> bool:\n return self.kind == DagsterTypeKind.NOTHING\n\n @property\n def supports_fan_in(self) -> bool:\n return False\n\n def get_inner_type_for_fan_in(self) -> "DagsterType":\n check.failed(\n "DagsterType {name} does not support fan-in, should have checked supports_fan_in before"\n " calling getter.".format(name=self.display_name)\n )\n\n def get_resource_requirements(\n self, _outer_context: TypingOptional[object] = None\n ) -> TypingIterator[ResourceRequirement]:\n for resource_key in sorted(list(self.required_resource_keys)):\n yield TypeResourceRequirement(key=resource_key, type_display_name=self.display_name)\n if self.loader:\n yield from self.loader.get_resource_requirements(outer_context=self.display_name)
\n\n\ndef _validate_type_check_fn(fn: t.Callable, name: t.Optional[str]) -> bool:\n from dagster._seven import get_arg_names\n\n args = get_arg_names(fn)\n\n # py2 doesn't filter out self\n if len(args) >= 1 and args[0] == "self":\n args = args[1:]\n\n if len(args) == 2:\n possible_names = {\n "_",\n "context",\n "_context",\n "context_",\n }\n if args[0] not in possible_names:\n DagsterInvalidDefinitionError(\n f'type_check function on type "{name}" must have first '\n 'argument named "context" (or _, _context, context_).'\n )\n return True\n\n raise DagsterInvalidDefinitionError(\n f'type_check_fn argument on type "{name}" must take 2 arguments, received {len(args)}.'\n )\n\n\nclass BuiltinScalarDagsterType(DagsterType):\n def __init__(self, name: str, type_check_fn: TypeCheckFn, typing_type: t.Type, **kwargs):\n super(BuiltinScalarDagsterType, self).__init__(\n key=name,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=type_check_fn,\n is_builtin=True,\n typing_type=typing_type,\n **kwargs,\n )\n\n # This is passed to the constructor of subclasses as the argument `type_check_fn`-- that's why\n # it exists together with the `type_check_fn` arg.\n def type_check_fn(self, _context, value) -> TypeCheck:\n return self.type_check_scalar_value(value)\n\n @abstractmethod\n def type_check_scalar_value(self, _value) -> TypeCheck:\n raise NotImplementedError()\n\n\ndef _typemismatch_error_str(value: object, expected_type_desc: str) -> str:\n return 'Value "{value}" of python type "{python_type}" must be a {type_desc}.'.format(\n value=value, python_type=type(value).__name__, type_desc=expected_type_desc\n )\n\n\ndef _fail_if_not_of_type(\n value: object, value_type: t.Type[t.Any], value_type_desc: str\n) -> TypeCheck:\n if not isinstance(value, value_type):\n return TypeCheck(success=False, description=_typemismatch_error_str(value, value_type_desc))\n\n return TypeCheck(success=True)\n\n\nclass _Int(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Int, 
self).__init__(\n name="Int",\n loader=BuiltinSchemas.INT_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=int,\n )\n\n def type_check_scalar_value(self, value) -> TypeCheck:\n return _fail_if_not_of_type(value, int, "int")\n\n\nclass _String(BuiltinScalarDagsterType):\n def __init__(self):\n super(_String, self).__init__(\n name="String",\n loader=BuiltinSchemas.STRING_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=str,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\nclass _Float(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Float, self).__init__(\n name="Float",\n loader=BuiltinSchemas.FLOAT_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=float,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, float, "float")\n\n\nclass _Bool(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Bool, self).__init__(\n name="Bool",\n loader=BuiltinSchemas.BOOL_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=bool,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, bool, "bool")\n\n\nclass Anyish(DagsterType):\n def __init__(\n self,\n key: t.Optional[str],\n name: t.Optional[str],\n loader: t.Optional[DagsterTypeLoader] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n ):\n super(Anyish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.ANY,\n loader=loader,\n is_builtin=is_builtin,\n type_check_fn=self.type_check_method,\n description=description,\n typing_type=t.Any,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", _value: object) -> TypeCheck:\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n # Anyish all the way down\n return self\n\n\nclass _Any(Anyish):\n def 
__init__(self):\n super(_Any, self).__init__(\n key="Any",\n name="Any",\n loader=BuiltinSchemas.ANY_INPUT,\n is_builtin=True,\n )\n\n\ndef create_any_type(\n name: str,\n loader: t.Optional[DagsterTypeLoader] = None,\n description: t.Optional[str] = None,\n) -> Anyish:\n return Anyish(\n key=name,\n name=name,\n description=description,\n loader=loader,\n )\n\n\nclass _Nothing(DagsterType):\n def __init__(self):\n super(_Nothing, self).__init__(\n key="Nothing",\n name="Nothing",\n kind=DagsterTypeKind.NOTHING,\n loader=None,\n type_check_fn=self.type_check_method,\n is_builtin=True,\n typing_type=type(None),\n )\n\n def type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n if value is not None:\n return TypeCheck(\n success=False,\n description=f"Value must be None, got a {type(value)}",\n )\n\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n return self\n\n\ndef isinstance_type_check_fn(\n expected_python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n dagster_type_name: str,\n expected_python_type_str: str,\n) -> TypeCheckFn:\n def type_check(_context: "TypeCheckContext", value: object) -> TypeCheck:\n if not isinstance(value, expected_python_type):\n return TypeCheck(\n success=False,\n description=(\n f"Value of type {type(value)} failed type check for Dagster type"\n f" {dagster_type_name}, expected value to be of Python type"\n f" {expected_python_type_str}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check\n\n\n
[docs]class PythonObjectDagsterType(DagsterType):\n """Define a type in dagster whose typecheck is an isinstance check.\n\n Specifically, the type can either be a single python type (e.g. int),\n or a tuple of types (e.g. (int, float)) which is treated as a union.\n\n Examples:\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=int)\n assert ntype.name == 'int'\n assert_success(ntype, 1)\n assert_failure(ntype, 'a')\n\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=(int, float))\n assert ntype.name == 'Union[int, float]'\n assert_success(ntype, 1)\n assert_success(ntype, 1.5)\n assert_failure(ntype, 'a')\n\n\n Args:\n python_type (Union[Type, Tuple[Type, ...]): The dagster typecheck function calls instanceof on\n this type.\n name (Optional[str]): Name the type. Defaults to the name of ``python_type``.\n key (Optional[str]): Key of the type. Defaults to name.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. 
As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n """\n\n def __init__(\n self,\n python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n **kwargs,\n ):\n if isinstance(python_type, tuple):\n self.python_type = check.tuple_param(\n python_type, "python_type", of_shape=tuple(type for item in python_type)\n )\n self.type_str = "Union[{}]".format(\n ", ".join(python_type.__name__ for python_type in python_type)\n )\n typing_type = t.Union[python_type] # type: ignore\n\n else:\n self.python_type = check.class_param(python_type, "python_type")\n self.type_str = cast(str, python_type.__name__)\n typing_type = self.python_type\n name = check.opt_str_param(name, "name", self.type_str)\n key = check.opt_str_param(key, "key", name)\n super(PythonObjectDagsterType, self).__init__(\n key=key,\n name=name,\n type_check_fn=isinstance_type_check_fn(python_type, name, self.type_str),\n typing_type=typing_type,\n **kwargs,\n )
\n\n\nclass NoneableInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type: DagsterType):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n self._inner_loader = check.not_none_param(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = ConfigNoneable(self._inner_loader.schema_type)\n\n @property\n def schema_type(self) -> ConfigType:\n return self._schema_type\n\n def construct_from_config_value(\n self, context: "DagsterTypeLoaderContext", config_value: object\n ) -> object:\n if config_value is None:\n return None\n return self._inner_loader.construct_from_config_value(context, config_value)\n\n\ndef _create_nullable_input_schema(inner_type: DagsterType) -> t.Optional[DagsterTypeLoader]:\n if not inner_type.loader:\n return None\n\n return NoneableInputSchema(inner_type)\n\n\nclass OptionalType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n inner_type = resolve_dagster_type(inner_type)\n\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError(\n "Type Nothing can not be wrapped in List or Optional"\n )\n\n key = "Optional." 
+ cast(str, inner_type.key)\n self.inner_type = inner_type\n super(OptionalType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.NULLABLE,\n type_check_fn=self.type_check_method,\n loader=_create_nullable_input_schema(inner_type),\n # This throws a type error with Py\n typing_type=t.Optional[inner_type.typing_type],\n )\n\n @property\n def display_name(self) -> str:\n return self.inner_type.display_name + "?"\n\n def type_check_method(self, context, value):\n return (\n TypeCheck(success=True) if value is None else self.inner_type.type_check(context, value)\n )\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return self.inner_type.supports_fan_in\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type.get_inner_type_for_fan_in()\n\n\nclass ListInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = Array(inner_dagster_type.loader.schema_type)\n\n @property\n def schema_type(self):\n return self._schema_type\n\n def construct_from_config_value(self, context, config_value):\n convert_item = partial(self._inner_dagster_type.loader.construct_from_config_value, context)\n return list(map(convert_item, config_value))\n\n\ndef _create_list_input_schema(inner_type):\n if not inner_type.loader:\n return None\n\n return ListInputSchema(inner_type)\n\n\nclass ListType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n key = "List." 
+ inner_type.key\n self.inner_type = inner_type\n super(ListType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.LIST,\n type_check_fn=self.type_check_method,\n loader=_create_list_input_schema(inner_type),\n typing_type=t.List[inner_type.typing_type],\n )\n\n @property\n def display_name(self):\n return "[" + self.inner_type.display_name + "]"\n\n def type_check_method(self, context, value):\n value_check = _fail_if_not_of_type(value, list, "list")\n if not value_check.success:\n return value_check\n\n for item in value:\n item_check = self.inner_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type\n\n\nclass DagsterListApi:\n def __getitem__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(resolve_dagster_type(inner_type))\n\n def __call__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(inner_type)\n\n\nList: DagsterListApi = DagsterListApi()\n\n\ndef _List(inner_type):\n check.inst_param(inner_type, "inner_type", DagsterType)\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError("Type Nothing can not be wrapped in List or Optional")\n return ListType(inner_type)\n\n\nclass Stringish(DagsterType):\n def __init__(self, key: t.Optional[str] = None, name: t.Optional[str] = None, **kwargs):\n name = check.opt_str_param(name, "name", type(self).__name__)\n key = check.opt_str_param(key, "key", name)\n super(Stringish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=self.type_check_method,\n loader=BuiltinSchemas.STRING_INPUT,\n typing_type=str,\n **kwargs,\n )\n\n def 
type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\ndef create_string_type(name, description=None):\n return Stringish(name=name, key=name, description=description)\n\n\nAny = _Any()\nBool = _Bool()\nFloat = _Float()\nInt = _Int()\nString = _String()\nNothing = _Nothing()\n\n_RUNTIME_MAP = {\n BuiltinEnum.ANY: Any,\n BuiltinEnum.BOOL: Bool,\n BuiltinEnum.FLOAT: Float,\n BuiltinEnum.INT: Int,\n BuiltinEnum.STRING: String,\n BuiltinEnum.NOTHING: Nothing,\n}\n\n_PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY: t.Dict[type, DagsterType] = {}\n"""Python types corresponding to user-defined RunTime types created using @map_to_dagster_type or\nas_dagster_type are registered here so that we can remap the Python types to runtime types."""\n\n\n
[docs]def make_python_type_usable_as_dagster_type(\n python_type: TypingType[t.Any], dagster_type: DagsterType\n) -> None:\n """Take any existing python type and map it to a dagster type (generally created with\n :py:class:`DagsterType <dagster.DagsterType>`) This can only be called once\n on a given python type.\n """\n check.inst_param(python_type, "python_type", type)\n check.inst_param(dagster_type, "dagster_type", DagsterType)\n registered_dagster_type = _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY.get(python_type)\n\n if registered_dagster_type is None:\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n elif registered_dagster_type is not dagster_type:\n # This would be just a great place to insert a short URL pointing to the type system\n # documentation into the error message\n # https://github.com/dagster-io/dagster/issues/1831\n if isinstance(registered_dagster_type, TypeHintInferredDagsterType):\n raise DagsterInvalidDefinitionError(\n "A Dagster type has already been registered for the Python type "\n f'{python_type}. The Dagster type was "auto-registered" - i.e. a solid definition '\n "used the Python type as an annotation for one of its arguments or for its return "\n "value before make_python_type_usable_as_dagster_type was called, and we "\n "generated a Dagster type to correspond to it. To override the auto-generated "\n "Dagster type, call make_python_type_usable_as_dagster_type before any solid "\n "definitions refer to the Python type."\n )\n else:\n raise DagsterInvalidDefinitionError(\n "A Dagster type has already been registered for the Python type "\n f"{python_type}. make_python_type_usable_as_dagster_type can only "\n "be called once on a python type as it is registering a 1:1 mapping "\n "between that python type and a dagster type."\n )
\n\n\nDAGSTER_INVALID_TYPE_ERROR_MESSAGE = (\n "Invalid type: dagster_type must be an instance of DagsterType or a Python type: "\n "got {dagster_type}{additional_msg}"\n)\n\n\nclass TypeHintInferredDagsterType(DagsterType):\n def __init__(self, python_type: t.Type):\n qualified_name = f"{python_type.__module__}.{python_type.__name__}"\n self.python_type = python_type\n super(TypeHintInferredDagsterType, self).__init__(\n key=f"_TypeHintInferred[{qualified_name}]",\n description=(\n f"DagsterType created from a type hint for the Python type {qualified_name}"\n ),\n type_check_fn=isinstance_type_check_fn(\n python_type, python_type.__name__, qualified_name\n ),\n typing_type=python_type,\n )\n\n @property\n def display_name(self) -> str:\n return self.python_type.__name__\n\n\ndef resolve_dagster_type(dagster_type: object) -> DagsterType:\n # circular dep\n from dagster._utils.typing_api import is_typing_type\n\n from ..definitions.result import MaterializeResult\n from .primitive_mapping import (\n is_supported_runtime_python_builtin,\n remap_python_builtin_for_runtime,\n )\n from .python_dict import (\n Dict as DDict,\n PythonDict,\n )\n from .python_set import DagsterSetApi, PythonSet\n from .python_tuple import DagsterTupleApi, PythonTuple\n from .transform_typing import transform_typing_type\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, DagsterType)),\n f"Do not pass runtime type classes. 
Got {dagster_type}",\n )\n\n # First, check to see if we're using Dagster's generic output type to do the type catching.\n if is_generic_output_annotation(dagster_type):\n type_args = get_args(dagster_type)\n # If no inner type was provided, forward Any type.\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n elif is_dynamic_output_annotation(dagster_type):\n dynamic_out_annotation = get_args(dagster_type)[0]\n type_args = get_args(dynamic_out_annotation)\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n elif dagster_type == MaterializeResult:\n # convert MaterializeResult type annotation to Nothing until returning\n # scalar values via MaterializeResult is supported\n # https://github.com/dagster-io/dagster/issues/16887\n dagster_type = Nothing\n\n # Then, check to see if it is part of python's typing library\n if is_typing_type(dagster_type):\n dagster_type = transform_typing_type(dagster_type)\n if isinstance(dagster_type, DagsterType):\n return dagster_type\n\n # Test for unhashable objects -- this is if, for instance, someone has passed us an instance of\n # a dict where they meant to pass dict or Dict, etc.\n try:\n hash(dagster_type)\n except TypeError as e:\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n additional_msg=(\n ", which isn't hashable. 
Did you pass an instance of a type instead of "\n "the type?"\n ),\n dagster_type=str(dagster_type),\n )\n ) from e\n\n if BuiltinEnum.contains(dagster_type):\n return DagsterType.from_builtin_enum(dagster_type)\n\n if is_supported_runtime_python_builtin(dagster_type):\n return remap_python_builtin_for_runtime(dagster_type)\n\n if dagster_type is None:\n return Any\n\n if dagster_type is DDict:\n return PythonDict\n if isinstance(dagster_type, DagsterTupleApi):\n return PythonTuple\n if isinstance(dagster_type, DagsterSetApi):\n return PythonSet\n if isinstance(dagster_type, DagsterListApi):\n return List(Any)\n\n if isinstance(dagster_type, type):\n return resolve_python_type_to_dagster_type(dagster_type)\n\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n dagster_type=str(dagster_type), additional_msg="."\n )\n )\n\n\ndef is_dynamic_output_annotation(dagster_type: object) -> bool:\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n f"Do not pass runtime type classes. Got {dagster_type}",\n )\n\n if dagster_type == DynamicOutput or get_origin(dagster_type) == DynamicOutput:\n raise DagsterInvariantViolationError(\n "Op annotated with return type DynamicOutput. DynamicOutputs can only be returned in"\n " the context of a List. 
If only one output is needed, use the Output API."\n )\n\n if get_origin(dagster_type) == list and len(get_args(dagster_type)) == 1:\n list_inner_type = get_args(dagster_type)[0]\n return list_inner_type == DynamicOutput or get_origin(list_inner_type) == DynamicOutput\n return False\n\n\ndef is_generic_output_annotation(dagster_type: object) -> bool:\n return dagster_type == Output or get_origin(dagster_type) == Output\n\n\ndef resolve_python_type_to_dagster_type(python_type: t.Type) -> DagsterType:\n """Resolves a Python type to a Dagster type."""\n check.inst_param(python_type, "python_type", type)\n\n if python_type in _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY:\n return _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type]\n else:\n dagster_type = TypeHintInferredDagsterType(python_type)\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n return dagster_type\n\n\nALL_RUNTIME_BUILTINS = list(_RUNTIME_MAP.values())\n\n\ndef construct_dagster_type_dictionary(\n node_defs: Sequence["NodeDefinition"],\n) -> Mapping[str, DagsterType]:\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n type_dict_by_name = {t.unique_name: t for t in ALL_RUNTIME_BUILTINS}\n type_dict_by_key = {t.key: t for t in ALL_RUNTIME_BUILTINS}\n\n def process_node_def(node_def: "NodeDefinition"):\n input_output_types = list(node_def.all_input_output_types())\n for dagster_type in input_output_types:\n # We don't do uniqueness check on key because with classes\n # like Array, Noneable, etc, those are ephemeral objects\n # and it is perfectly fine to have many of them.\n type_dict_by_key[dagster_type.key] = dagster_type\n\n if not dagster_type.has_unique_name:\n continue\n\n if dagster_type.unique_name not in type_dict_by_name:\n type_dict_by_name[dagster_type.unique_name] = dagster_type\n continue\n\n if type_dict_by_name[dagster_type.unique_name] is not dagster_type:\n raise DagsterInvalidDefinitionError(\n (\n 'You have created two 
dagster types with the same name "{type_name}". '\n "Dagster types have must have unique names."\n ).format(type_name=dagster_type.display_name)\n )\n\n if isinstance(node_def, GraphDefinition):\n for child_node_def in node_def.node_defs:\n process_node_def(child_node_def)\n\n for node_def in node_defs:\n process_node_def(node_def)\n\n return type_dict_by_key\n\n\nclass DagsterOptionalApi:\n def __getitem__(self, inner_type: t.Union[t.Type, DagsterType]) -> OptionalType:\n inner_type = resolve_dagster_type(check.not_none_param(inner_type, "inner_type"))\n return OptionalType(inner_type)\n\n\nOptional: DagsterOptionalApi = DagsterOptionalApi()\n
", "current_page_name": "_modules/dagster/_core/types/dagster_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.dagster_type"}, "decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.decorator

\nfrom typing import TYPE_CHECKING, Callable, Optional, Type, TypeVar, Union, overload\n\nimport dagster._check as check\n\nfrom .dagster_type import PythonObjectDagsterType, make_python_type_usable_as_dagster_type\n\nif TYPE_CHECKING:\n    from dagster._core.types.config_schema import DagsterTypeLoader\n\nT_Type = TypeVar("T_Type", bound=Type[object])\n\n\n@overload\ndef usable_as_dagster_type(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    loader: Optional["DagsterTypeLoader"] = ...,\n) -> Callable[[T_Type], T_Type]: ...\n\n\n@overload\ndef usable_as_dagster_type(\n    name: T_Type,\n) -> T_Type: ...\n\n\n
[docs]def usable_as_dagster_type(\n name: Optional[Union[str, T_Type]] = None,\n description: Optional[str] = None,\n loader: Optional["DagsterTypeLoader"] = None,\n) -> Union[T_Type, Callable[[T_Type], T_Type]]:\n """Decorate a Python class to make it usable as a Dagster Type.\n\n This is intended to make it straightforward to annotate existing business logic classes to\n make them dagster types whose typecheck is an isinstance check against that python class.\n\n Args:\n python_type (cls): The python type to make usable as python type.\n name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of\n the ``python_type`` will be used.\n description (Optional[str]): A user-readable description of the type.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n\n Examples:\n .. 
code-block:: python\n\n # dagster_aws.s3.file_manager.S3FileHandle\n @usable_as_dagster_type\n class S3FileHandle(FileHandle):\n def __init__(self, s3_bucket, s3_key):\n self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n self._s3_key = check.str_param(s3_key, 's3_key')\n\n @property\n def s3_bucket(self):\n return self._s3_bucket\n\n @property\n def s3_key(self):\n return self._s3_key\n\n @property\n def path_desc(self):\n return self.s3_path\n\n @property\n def s3_path(self):\n return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n """\n # check for no args, no parens case\n if isinstance(name, type):\n bare_cls = name # with no parens, name is actually the decorated class\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(python_type=bare_cls, name=bare_cls.__name__, description=None),\n )\n return bare_cls\n\n def _with_args(bare_cls: T_Type) -> T_Type:\n check.class_param(bare_cls, "bare_cls")\n new_name = check.opt_str_param(name, "name") if name else bare_cls.__name__\n\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(\n name=new_name,\n description=description,\n python_type=bare_cls,\n loader=loader,\n ),\n )\n return bare_cls\n\n return _with_args
\n
", "current_page_name": "_modules/dagster/_core/types/decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.decorator"}}}, "_serdes": {"config_class": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._serdes.config_class

\nimport importlib\nfrom abc import ABC, abstractmethod\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._utils import convert_dagster_submodule_name\nfrom dagster._utils.yaml_utils import load_run_config_yaml\n\nfrom .serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\n\nif TYPE_CHECKING:\n    from dagster._config.config_schema import UserConfigSchema\n\nT_ConfigurableClass = TypeVar("T_ConfigurableClass")\n\n\nclass ConfigurableClassDataSerializer(NamedTupleSerializer["ConfigurableClassData"]):\n    def after_pack(self, **packed: Any) -> Dict[str, Any]:\n        packed["module_name"] = convert_dagster_submodule_name(packed["module_name"], "public")\n        return packed\n\n\n
[docs]@whitelist_for_serdes(serializer=ConfigurableClassDataSerializer)\nclass ConfigurableClassData(\n NamedTuple(\n "_ConfigurableClassData",\n [\n ("module_name", str),\n ("class_name", str),\n ("config_yaml", str),\n ],\n )\n):\n """Serializable tuple describing where to find a class and the config fragment that should\n be used to instantiate it.\n\n Users should not instantiate this class directly.\n\n Classes intended to be serialized in this way should implement the\n :py:class:`dagster.serdes.ConfigurableClass` mixin.\n """\n\n def __new__(cls, module_name: str, class_name: str, config_yaml: str):\n return super(ConfigurableClassData, cls).__new__(\n cls,\n convert_dagster_submodule_name(check.str_param(module_name, "module_name"), "private"),\n check.str_param(class_name, "class_name"),\n check.str_param(config_yaml, "config_yaml"),\n )\n\n @property\n def config_dict(self) -> Mapping[str, Any]:\n return check.is_dict(load_run_config_yaml(self.config_yaml), key_type=str)\n\n def info_dict(self) -> Mapping[str, Any]:\n return {\n "module": self.module_name,\n "class": self.class_name,\n "config": self.config_dict,\n }\n\n @overload\n def rehydrate(self, as_type: Type[T_ConfigurableClass]) -> T_ConfigurableClass: ...\n\n @overload\n def rehydrate(self, as_type: None = ...) 
-> "ConfigurableClass": ...\n\n def rehydrate(\n self, as_type: Optional[Type[T_ConfigurableClass]] = None\n ) -> Union["ConfigurableClass", T_ConfigurableClass]:\n from dagster._config import process_config, resolve_to_config_type\n from dagster._core.errors import DagsterInvalidConfigError\n\n try:\n module = importlib.import_module(self.module_name)\n except ModuleNotFoundError:\n check.failed(\n f"Couldn't import module {self.module_name} when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n try:\n klass = getattr(module, self.class_name)\n except AttributeError:\n check.failed(\n f"Couldn't find class {self.class_name} in module when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n\n if not issubclass(klass, as_type or ConfigurableClass):\n raise check.CheckError(\n klass,\n f"class {self.class_name} in module {self.module_name}",\n ConfigurableClass,\n )\n\n config_dict = self.config_dict\n result = process_config(resolve_to_config_type(klass.config_type()), config_dict)\n if not result.success:\n raise DagsterInvalidConfigError(\n f"Errors whilst loading configuration for {klass.config_type()}.",\n result.errors,\n config_dict,\n )\n return klass.from_config_value(self, check.not_none(result.value))
\n\n\n
[docs]class ConfigurableClass(ABC):\n """Abstract mixin for classes that can be loaded from config.\n\n This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\n of conditional imports / optional extras_requires in dagster core and b) a magic directory or\n file in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\n run storage, pluggable with a config chunk like:\n\n .. code-block:: yaml\n\n run_storage:\n module: very_cool_package.run_storage\n class: SplendidRunStorage\n config:\n magic_word: "quux"\n\n This same pattern should eventually be viable for other system components, e.g. engines.\n\n The ``ConfigurableClass`` mixin provides the necessary hooks for classes to be instantiated from\n an instance of ``ConfigurableClassData``.\n\n Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\n type such as:\n\n .. code-block:: python\n\n {'module': str, 'class': str, 'config': Field(Permissive())}\n\n """\n\n @property\n @abstractmethod\n def inst_data(self) -> Optional[ConfigurableClassData]:\n """Subclass must be able to return the inst_data as a property if it has been constructed\n through the from_config_value code path.\n """\n\n @classmethod\n @abstractmethod\n def config_type(cls) -> "UserConfigSchema":\n """Get the config type against which to validate a config yaml fragment.\n\n The only place config values matching this type are used is inside `from_config_value`. This\n is an alternative constructor for a class. 
It is a common pattern for the config type to\n match constructor arguments, so `from_config_value`\n\n The config type against which to validate a config yaml fragment\n serialized in an instance of ``ConfigurableClassData``.\n """\n ...\n # We need to raise `NotImplementedError` here because nothing prevents abstract class\n # methods from being called.\n raise NotImplementedError(f"{cls.__name__} must implement the config_type classmethod")\n\n @classmethod\n @abstractmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n """Create an instance of the ConfigurableClass from a validated config value.\n\n The config value used here should be derived from the accompanying `inst_data` argument.\n `inst_data` contains the yaml-serialized config-- this must be parsed and\n validated/normalized, then passed to this method for object instantiation. This is done in\n ConfigurableClassData.rehydrate.\n\n Args:\n config_value (dict): The validated config value to use. Typically this should be the\n ``value`` attribute of a\n :py:class:`~dagster._core.types.evaluator.evaluation.EvaluateValueResult`.\n\n\n A common pattern is for the implementation to align the config_value with the signature\n of the ConfigurableClass's constructor:\n\n .. code-block:: python\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n\n """
\n\n\ndef class_from_code_pointer(module_name: str, class_name: str) -> Type[object]:\n try:\n module = importlib.import_module(module_name)\n except ModuleNotFoundError:\n check.failed(\n "Couldn't import module {module_name} when attempting to load the class {klass}".format(\n module_name=module_name,\n klass=module_name + "." + class_name,\n )\n )\n try:\n return getattr(module, class_name)\n except AttributeError:\n check.failed(\n "Couldn't find class {class_name} in module when attempting to load the "\n "class {klass}".format(\n class_name=class_name,\n klass=module_name + "." + class_name,\n )\n )\n
", "current_page_name": "_modules/dagster/_serdes/config_class", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._serdes.config_class"}}, "_utils": {"alabaster_version": "0.7.13", "alert": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.alert

\nimport datetime\nimport smtplib\nimport ssl\nfrom typing import TYPE_CHECKING, Callable, Optional, Sequence, Union\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions.sensor_definition import DefaultSensorStatus, SensorDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.graph_definition import GraphDefinition\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.run_status_sensor_definition import RunFailureSensorContext\n    from dagster._core.definitions.selector import JobSelector, RepositorySelector\n    from dagster._core.definitions.unresolved_asset_job_definition import (\n        UnresolvedAssetJobDefinition,\n    )\n\n\ndef _default_failure_email_body(context: "RunFailureSensorContext") -> str:\n    from dagster._core.host_representation.external_data import DEFAULT_MODE_NAME\n\n    return "<br>".join(\n        [\n            f"Pipeline {context.dagster_run.job_name} failed!",\n            f"Run ID: {context.dagster_run.run_id}",\n            f"Mode: {DEFAULT_MODE_NAME}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\ndef _default_failure_email_subject(context) -> str:\n    return f"Dagster Run Failed: {context.pipeline_run.job_name}"\n\n\nEMAIL_MESSAGE = """From: {email_from}\nTo: {email_to}\nMIME-Version: 1.0\nContent-type: text/html\nSubject: {email_subject}\n\n{email_body}\n\n<!-- this ensures Gmail doesn't trim the email -->\n<span style="opacity: 0"> {randomness} </span>\n"""\n\n\ndef send_email_via_ssl(\n    email_from: str,\n    email_password: str,\n    email_to: Sequence[str],\n    message: str,\n    smtp_host: str,\n    smtp_port: int,\n):\n    context = ssl.create_default_context()\n    with smtplib.SMTP_SSL(smtp_host, smtp_port, context=context) as server:\n        server.login(email_from, email_password)\n        server.sendmail(email_from, 
email_to, message)\n\n\ndef send_email_via_starttls(\n    email_from: str,\n    email_password: str,\n    email_to: Sequence[str],\n    message: str,\n    smtp_host: str,\n    smtp_port: int,\n):\n    context = ssl.create_default_context()\n    with smtplib.SMTP(smtp_host, smtp_port) as server:\n        server.starttls(context=context)\n        server.login(email_from, email_password)\n        server.sendmail(email_from, email_to, message)\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef make_email_on_run_failure_sensor(\n email_from: str,\n email_password: str,\n email_to: Sequence[str],\n email_body_fn: Callable[["RunFailureSensorContext"], str] = _default_failure_email_body,\n email_subject_fn: Callable[["RunFailureSensorContext"], str] = _default_failure_email_subject,\n smtp_host: str = "smtp.gmail.com",\n smtp_type: str = "SSL",\n smtp_port: Optional[int] = None,\n name: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n "JobDefinition",\n "GraphDefinition",\n "UnresolvedAssetJobDefinition",\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n "JobDefinition",\n "GraphDefinition",\n "UnresolvedAssetJobDefinition",\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> SensorDefinition:\n """Create a job failure sensor that sends email via the SMTP protocol.\n\n Args:\n email_from (str): The sender email address to send the message from.\n email_password (str): The password of the sender.\n email_to (List[str]): The receipt email addresses to send the message to.\n email_body_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` outputs the email body you want to send.\n Defaults to the plain text that contains error message, job name, and run ID.\n email_subject_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` outputs the email subject you want to send.\n Defaults to "Dagster Run Failed: <job_name>".\n smtp_host (str): The hostname of the SMTP server. 
Defaults to "smtp.gmail.com".\n smtp_type (str): The protocol; either "SSL" or "STARTTLS". Defaults to SSL.\n smtp_port (Optional[int]): The SMTP port. Defaults to 465 for SSL, 587 for STARTTLS.\n name: (Optional[str]): The name of the sensor. Defaults to "email_on_job_failure".\n webserver_base_url: (Optional[str]): The base url of your dagster-webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]):\n The jobs that will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails. To monitor jobs in external repositories,\n use RepositorySelector and JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]):\n (deprecated in favor of monitored_jobs) The jobs that will be monitored by this failure\n sensor. Defaults to None, which means the alert will be sent when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n\n Examples:\n .. code-block:: python\n\n email_on_run_failure = make_email_on_run_failure_sensor(\n email_from="no-reply@example.com",\n email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n email_to=["xxx@example.com"],\n )\n\n @repository\n def my_repo():\n return [my_job + email_on_run_failure]\n\n .. 
code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.pipeline_run.job_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n email_on_run_failure = make_email_on_run_failure_sensor(\n email_from="no-reply@example.com",\n email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n email_to=["xxx@example.com"],\n email_body_fn=my_message_fn,\n email_subject_fn=lambda _: "Dagster Alert",\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n from dagster._core.definitions.run_status_sensor_definition import (\n RunFailureSensorContext,\n run_failure_sensor,\n )\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_failure_sensor(\n name=name,\n monitored_jobs=jobs,\n default_status=default_status,\n monitor_all_repositories=monitor_all_repositories,\n )\n def email_on_run_failure(context: RunFailureSensorContext):\n email_body = email_body_fn(context)\n if webserver_base_url:\n email_body += (\n f'<p><a href="{webserver_base_url}/runs/{context.dagster_run.run_id}">View in'\n " the Dagster UI</a></p>"\n )\n\n message = EMAIL_MESSAGE.format(\n email_to=",".join(email_to),\n email_from=email_from,\n email_subject=email_subject_fn(context),\n email_body=email_body,\n randomness=datetime.datetime.now(),\n )\n\n if smtp_type == "SSL":\n send_email_via_ssl(\n email_from, email_password, email_to, message, smtp_host, smtp_port=smtp_port or 465\n )\n elif smtp_type == "STARTTLS":\n send_email_via_starttls(\n email_from, email_password, email_to, message, smtp_host, smtp_port=smtp_port or 587\n )\n else:\n raise DagsterInvalidDefinitionError(f'smtp_type "{smtp_type}" is not supported.')\n\n return email_on_run_failure
\n
", "current_page_name": "_modules/dagster/_utils/alert", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.alert"}, "body": "

Source code for dagster._utils

\nimport _thread as thread\nimport contextlib\nimport contextvars\nimport datetime\nimport errno\nimport functools\nimport inspect\nimport multiprocessing\nimport os\nimport re\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport tempfile\nimport threading\nimport time\nfrom collections import OrderedDict\nfrom datetime import timezone\nfrom enum import Enum\nfrom signal import Signals\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    ContextManager,\n    Dict,\n    Generator,\n    Generic,\n    Hashable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n    overload,\n)\n\nimport packaging.version\nfrom typing_extensions import Literal, TypeAlias, TypeGuard\n\nimport dagster._check as check\nimport dagster._seven as seven\n\nfrom .internal_init import IHasInternalInit as IHasInternalInit\n\nif sys.version_info > (3,):\n    from pathlib import Path\nelse:\n    from pathlib2 import Path\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition.repository_definition import (\n        RepositoryDefinition,\n    )\n    from dagster._core.events import DagsterEvent\n\nK = TypeVar("K")\nT = TypeVar("T")\nU = TypeVar("U")\nV = TypeVar("V")\n\nEPOCH = datetime.datetime.utcfromtimestamp(0)\n\nPICKLE_PROTOCOL = 4\n\n\nDEFAULT_WORKSPACE_YAML_FILENAME = "workspace.yaml"\n\nPrintFn: TypeAlias = Callable[[Any], None]\n\nSingleInstigatorDebugCrashFlags: TypeAlias = Mapping[str, int]\nDebugCrashFlags: TypeAlias = Mapping[str, SingleInstigatorDebugCrashFlags]\n\n\n# Use this to get the "library version" (pre-1.0 version) from the "core version" (post 1.0\n# version). 
16 is from the 0.16.0 that library versions stayed on when core went to 1.0.0.\ndef library_version_from_core_version(core_version: str) -> str:\n    parsed_version = parse_package_version(core_version)\n\n    release = parsed_version.release\n    if release[0] >= 1:\n        library_version = ".".join(["0", str(16 + release[1]), str(release[2])])\n\n        if parsed_version.is_prerelease:\n            library_version = library_version + "".join(\n                [str(pre) for pre in check.not_none(parsed_version.pre)]\n            )\n\n        if parsed_version.is_postrelease:\n            library_version = library_version + "post" + str(parsed_version.post)\n\n        return library_version\n    else:\n        return core_version\n\n\ndef parse_package_version(version_str: str) -> packaging.version.Version:\n    parsed_version = packaging.version.parse(version_str)\n    assert isinstance(parsed_version, packaging.version.Version)\n    return parsed_version\n\n\ndef convert_dagster_submodule_name(name: str, mode: Literal["private", "public"]) -> str:\n    """This function was introduced when all Dagster submodules were marked private by\n    underscore-prefixing the root submodules (e.g. `dagster._core`). The function provides\n    backcompatibility by converting modules between the old and new (i.e. public and private) forms.\n    This is needed when reading older data or communicating with older versions of Dagster.\n    """\n    if mode == "private":\n        return re.sub(r"^dagster\\.([^_])", r"dagster._\\1", name)\n    elif mode == "public":\n        return re.sub(r"^dagster._", "dagster.", name)\n    else:\n        check.failed("`mode` must be 'private' or 'public'")\n\n\n
[docs]def file_relative_path(dunderfile: str, relative_path: str) -> str:\n """Get a path relative to the currently executing Python file.\n\n This function is useful when one needs to load a file that is relative to the position of\n the current file. (Such as when you encode a configuration file path in source file and want\n in runnable in any current working directory)\n\n Args:\n dunderfile (str): Should always be ``__file__``.\n relative_path (str): Path to get relative to the currently executing file.\n\n **Examples**:\n\n .. code-block:: python\n\n file_relative_path(__file__, 'path/relative/to/file')\n\n """\n check.str_param(dunderfile, "dunderfile")\n check.str_param(relative_path, "relative_path")\n\n return os.path.join(os.path.dirname(dunderfile), relative_path)
\n\n\ndef script_relative_path(file_path: str) -> str:\n """Useful for testing with local files. Use a path relative to where the\n test resides and this function will return the absolute path\n of that file. Otherwise it will be relative to script that\n ran the test.\n\n Note: this is function is very, very expensive (on the order of 1\n millisecond per invocation) so this should only be used in performance\n insensitive contexts. Prefer file_relative_path for anything with\n performance constraints.\n\n """\n # from http://bit.ly/2snyC6s\n\n check.str_param(file_path, "file_path")\n scriptdir = inspect.stack()[1][1]\n return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(scriptdir)), file_path))\n\n\n# Adapted from https://github.com/okunishinishi/python-stringcase/blob/master/stringcase.py\ndef camelcase(string: str) -> str:\n check.str_param(string, "string")\n\n string = re.sub(r"^[\\-_\\.]", "", str(string))\n if not string:\n return string\n return str(string[0]).upper() + re.sub(\n r"[\\-_\\.\\s]([a-z])", lambda matched: str(matched.group(1)).upper(), string[1:]\n )\n\n\ndef ensure_single_item(ddict: Mapping[T, U]) -> Tuple[T, U]:\n check.mapping_param(ddict, "ddict")\n check.param_invariant(len(ddict) == 1, "ddict", "Expected dict with single item")\n return next(iter(ddict.items()))\n\n\n@contextlib.contextmanager\ndef pushd(path: str) -> Iterator[str]:\n old_cwd = os.getcwd()\n os.chdir(path)\n try:\n yield path\n finally:\n os.chdir(old_cwd)\n\n\ndef safe_isfile(path: str) -> bool:\n """Backport of Python 3.8 os.path.isfile behavior.\n\n This is intended to backport https://docs.python.org/dev/whatsnew/3.8.html#os-path. I'm not\n sure that there are other ways to provoke this behavior on Unix other than the null byte,\n but there are certainly other ways to do it on Windows. 
Afaict, we won't mask other\n ValueErrors, and the behavior in the status quo ante is rough because we risk throwing an\n unexpected, uncaught ValueError from very deep in our logic.\n """\n try:\n return os.path.isfile(path)\n except ValueError:\n return False\n\n\ndef mkdir_p(path: str) -> str:\n try:\n os.makedirs(path)\n return path\n except OSError as exc: # Python >2.5\n if exc.errno == errno.EEXIST and os.path.isdir(path):\n return path\n else:\n raise\n\n\ndef hash_collection(\n collection: Union[\n Mapping[Hashable, Any], Sequence[Any], AbstractSet[Any], Tuple[Any, ...], NamedTuple\n ]\n) -> int:\n """Hash a mutable collection or immutable collection containing mutable elements.\n\n This is useful for hashing Dagster-specific NamedTuples that contain mutable lists or dicts.\n The default NamedTuple __hash__ function assumes the contents of the NamedTuple are themselves\n hashable, and will throw an error if they are not. This can occur when trying to e.g. compute a\n cache key for the tuple for use with `lru_cache`.\n\n This alternative implementation will recursively process collection elements to convert basic\n lists and dicts to tuples prior to hashing. It is recommended to cache the result:\n\n Example:\n .. 
code-block:: python\n\n def __hash__(self):\n if not hasattr(self, '_hash'):\n self._hash = hash_named_tuple(self)\n return self._hash\n """\n assert isinstance(\n collection, (list, dict, set, tuple)\n ), f"Cannot hash collection of type {type(collection)}"\n return hash(make_hashable(collection))\n\n\n@overload\ndef make_hashable(value: Union[List[Any], Set[Any]]) -> Tuple[Any, ...]: ...\n\n\n@overload\ndef make_hashable(value: Dict[Any, Any]) -> Tuple[Tuple[Any, Any]]: ...\n\n\n@overload\ndef make_hashable(value: Any) -> Any: ...\n\n\ndef make_hashable(value: Any) -> Any:\n if isinstance(value, dict):\n return tuple(sorted((key, make_hashable(value)) for key, value in value.items()))\n elif isinstance(value, (list, tuple, set)):\n return tuple([make_hashable(x) for x in value])\n else:\n return value\n\n\ndef get_prop_or_key(elem, key):\n if isinstance(elem, Mapping):\n return elem.get(key)\n else:\n return getattr(elem, key)\n\n\ndef list_pull(alist, key):\n return list(map(lambda elem: get_prop_or_key(elem, key), alist))\n\n\ndef all_none(kwargs):\n for value in kwargs.values():\n if value is not None:\n return False\n return True\n\n\ndef check_script(path, return_code=0):\n try:\n subprocess.check_output([sys.executable, path])\n except subprocess.CalledProcessError as exc:\n if return_code != 0:\n if exc.returncode == return_code:\n return\n raise\n\n\ndef check_cli_execute_file_job(path, pipeline_fn_name, env_file=None):\n from dagster._core.test_utils import instance_for_test\n\n with instance_for_test():\n cli_cmd = [\n sys.executable,\n "-m",\n "dagster",\n "pipeline",\n "execute",\n "-f",\n path,\n "-a",\n pipeline_fn_name,\n ]\n\n if env_file:\n cli_cmd.append("-c")\n cli_cmd.append(env_file)\n\n try:\n subprocess.check_output(cli_cmd)\n except subprocess.CalledProcessError as cpe:\n print(cpe) # noqa: T201\n raise cpe\n\n\ndef safe_tempfile_path_unmanaged() -> str:\n # This gets a valid temporary file path in the safest possible way, although there 
is still no\n # guarantee that another process will not create a file at this path. The NamedTemporaryFile is\n # deleted when the context manager exits and the file object is closed.\n #\n # This is preferable to using NamedTemporaryFile as a context manager and passing the name\n # attribute of the file object around because NamedTemporaryFiles cannot be opened a second time\n # if already open on Windows NT or later:\n # https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile\n # https://github.com/dagster-io/dagster/issues/1582\n with tempfile.NamedTemporaryFile() as fd:\n path = fd.name\n return Path(path).as_posix()\n\n\n@contextlib.contextmanager\ndef safe_tempfile_path() -> Iterator[str]:\n path = None\n try:\n path = safe_tempfile_path_unmanaged()\n yield path\n finally:\n if path is not None and os.path.exists(path):\n os.unlink(path)\n\n\n@overload\ndef ensure_gen(thing_or_gen: Generator[T, Any, Any]) -> Generator[T, Any, Any]:\n pass\n\n\n@overload\ndef ensure_gen(thing_or_gen: T) -> Generator[T, Any, Any]:\n pass\n\n\ndef ensure_gen(\n thing_or_gen: Union[T, Iterator[T], Generator[T, Any, Any]]\n) -> Generator[T, Any, Any]:\n if not inspect.isgenerator(thing_or_gen):\n thing_or_gen = cast(T, thing_or_gen)\n\n def _gen_thing():\n yield thing_or_gen\n\n return _gen_thing()\n\n return thing_or_gen\n\n\ndef ensure_dir(file_path: str) -> str:\n try:\n os.makedirs(file_path)\n except OSError as e:\n if e.errno != errno.EEXIST:\n raise\n return file_path\n\n\ndef ensure_file(path: str) -> str:\n ensure_dir(os.path.dirname(path))\n if not os.path.exists(path):\n touch_file(path)\n return path\n\n\ndef touch_file(path):\n ensure_dir(os.path.dirname(path))\n with open(path, "a", encoding="utf8"):\n os.utime(path, None)\n\n\ndef _kill_on_event(termination_event):\n termination_event.wait()\n send_interrupt()\n\n\ndef send_interrupt():\n if seven.IS_WINDOWS:\n # This will raise a KeyboardInterrupt in python land - meaning this wont be able 
to\n # interrupt things like sleep()\n thread.interrupt_main()\n else:\n # If on unix send an os level signal to interrupt any situation we may be stuck in\n os.kill(os.getpid(), signal.SIGINT)\n\n\n# Function to be invoked by daemon thread in processes which seek to be cancellable.\n# The motivation for this approach is to be able to exit cleanly on Windows. An alternative\n# path is to change how the processes are opened and send CTRL_BREAK signals, which at\n# the time of authoring seemed a more costly approach.\n#\n# Reading for the curious:\n# * https://stackoverflow.com/questions/35772001/how-to-handle-the-signal-in-python-on-windows-machine\n# * https://stefan.sofa-rockers.org/2013/08/15/handling-sub-process-hierarchies-python-linux-os-x/\ndef start_termination_thread(termination_event):\n check.inst_param(termination_event, "termination_event", ttype=type(multiprocessing.Event()))\n\n int_thread = threading.Thread(\n target=_kill_on_event, args=(termination_event,), name="kill-on-event"\n )\n int_thread.daemon = True\n int_thread.start()\n\n\n# Executes the next() function within an instance of the supplied context manager class\n# (leaving the context before yielding each result)\ndef iterate_with_context(\n context_fn: Callable[[], ContextManager[Any]], iterator: Iterator[T]\n) -> Iterator[T]:\n while True:\n # Allow interrupts during user code so that we can terminate slow/hanging steps\n with context_fn():\n try:\n next_output = next(iterator)\n except StopIteration:\n return\n\n yield next_output\n\n\ndef datetime_as_float(dt: datetime.datetime) -> float:\n check.inst_param(dt, "dt", datetime.datetime)\n return float((dt - EPOCH).total_seconds())\n\n\nT_GeneratedContext = TypeVar("T_GeneratedContext")\n\n\nclass EventGenerationManager(Generic[T_GeneratedContext]):\n """Utility class that wraps an event generator function, that also yields a single instance of\n a typed object. 
All events yielded before the typed object are yielded through the method\n `generate_setup_events` and all events yielded after the typed object are yielded through the\n method `generate_teardown_events`.\n\n This is used to help replace the context managers used in pipeline initialization with\n generators so that we can begin emitting initialization events AND construct a pipeline context\n object, while managing explicit setup/teardown.\n\n This does require calling `generate_setup_events` AND `generate_teardown_events` in order to\n get the typed object.\n """\n\n def __init__(\n self,\n generator: Iterator[Union["DagsterEvent", T_GeneratedContext]],\n object_cls: Type[T_GeneratedContext],\n require_object: Optional[bool] = True,\n ):\n self.generator = check.generator(generator)\n self.object_cls: Type[T_GeneratedContext] = check.class_param(object_cls, "object_cls")\n self.require_object = check.bool_param(require_object, "require_object")\n self.object: Optional[T_GeneratedContext] = None\n self.did_setup = False\n self.did_teardown = False\n\n def generate_setup_events(self) -> Iterator["DagsterEvent"]:\n self.did_setup = True\n try:\n while self.object is None:\n obj = next(self.generator)\n if isinstance(obj, self.object_cls):\n self.object = obj\n else:\n yield obj\n except StopIteration:\n if self.require_object:\n check.inst_param(\n self.object,\n "self.object",\n self.object_cls,\n f"generator never yielded object of type {self.object_cls.__name__}",\n )\n\n def get_object(self) -> T_GeneratedContext:\n if not self.did_setup:\n check.failed("Called `get_object` before `generate_setup_events`")\n return cast(T_GeneratedContext, self.object)\n\n def generate_teardown_events(self) -> Iterator["DagsterEvent"]:\n self.did_teardown = True\n if self.object:\n yield from self.generator\n\n\ndef utc_datetime_from_timestamp(timestamp: float) -> datetime.datetime:\n tz = timezone.utc\n return datetime.datetime.fromtimestamp(timestamp, tz=tz)\n\n\ndef 
utc_datetime_from_naive(dt: datetime.datetime) -> datetime.datetime:\n tz = timezone.utc\n return dt.replace(tzinfo=tz)\n\n\ndef is_enum_value(value: object) -> bool:\n return False if value is None else issubclass(value.__class__, Enum)\n\n\ndef git_repository_root() -> str:\n return subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()\n\n\ndef segfault() -> None:\n """Reliable cross-Python version segfault.\n\n https://bugs.python.org/issue1215#msg143236\n """\n import ctypes\n\n ctypes.string_at(0)\n\n\ndef find_free_port() -> int:\n with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:\n s.bind(("", 0))\n s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n return s.getsockname()[1]\n\n\ndef is_port_in_use(host, port) -> bool:\n # Similar to the socket options that uvicorn uses to bind ports:\n # https://github.com/encode/uvicorn/blob/62f19c1c39929c84968712c371c9b7b96a041dec/uvicorn/config.py#L565-L566\n sock = socket.socket(family=socket.AF_INET)\n sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n try:\n sock.bind((host, port))\n return False\n except socket.error as e:\n return e.errno == errno.EADDRINUSE\n finally:\n sock.close()\n\n\n@contextlib.contextmanager\ndef alter_sys_path(to_add: Sequence[str], to_remove: Sequence[str]) -> Iterator[None]:\n to_restore = [path for path in sys.path]\n\n # remove paths\n for path in to_remove:\n if path in sys.path:\n sys.path.remove(path)\n\n # add paths\n for path in to_add:\n sys.path.insert(0, path)\n\n try:\n yield\n finally:\n sys.path = to_restore\n\n\n@contextlib.contextmanager\ndef restore_sys_modules() -> Iterator[None]:\n sys_modules = {k: v for k, v in sys.modules.items()}\n try:\n yield\n finally:\n to_delete = set(sys.modules) - set(sys_modules)\n for key in to_delete:\n del sys.modules[key]\n\n\ndef process_is_alive(pid: int) -> bool:\n if seven.IS_WINDOWS:\n import psutil\n\n return psutil.pid_exists(pid=pid)\n else:\n try:\n 
subprocess.check_output(["ps", str(pid)])\n except subprocess.CalledProcessError as exc:\n assert exc.returncode == 1\n return False\n return True\n\n\ndef compose(*args):\n """Compose python functions args such that compose(f, g)(x) is equivalent to f(g(x)).""" # noqa: D402\n # reduce using functional composition over all the arguments, with the identity function as\n # initializer\n return functools.reduce(lambda f, g: lambda x: f(g(x)), args, lambda x: x)\n\n\ndef dict_without_keys(ddict, *keys):\n return {key: value for key, value in ddict.items() if key not in set(keys)}\n\n\nclass Counter:\n def __init__(self):\n self._lock = threading.Lock()\n self._counts = OrderedDict()\n super(Counter, self).__init__()\n\n def increment(self, key: str):\n with self._lock:\n self._counts[key] = self._counts.get(key, 0) + 1\n\n def counts(self) -> Mapping[str, int]:\n with self._lock:\n copy = {k: v for k, v in self._counts.items()}\n return copy\n\n\ntraced_counter = contextvars.ContextVar("traced_counts", default=Counter())\n\nT_Callable = TypeVar("T_Callable", bound=Callable)\n\n\ndef traced(func: T_Callable) -> T_Callable:\n """A decorator that keeps track of how many times a function is called."""\n\n @functools.wraps(func)\n def inner(*args, **kwargs):\n counter = traced_counter.get()\n if counter and isinstance(counter, Counter):\n counter.increment(func.__qualname__)\n\n return func(*args, **kwargs)\n\n return cast(T_Callable, inner)\n\n\ndef get_terminate_signal():\n if sys.platform == "win32":\n return signal.SIGTERM\n return signal.SIGKILL\n\n\ndef get_run_crash_explanation(prefix: str, exit_code: int):\n # As per https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess.returncode\n # negative exit code means a posix signal\n if exit_code < 0 and -exit_code in [signal.value for signal in Signals]:\n posix_signal = -exit_code\n signal_str = Signals(posix_signal).name\n exit_clause = f"was terminated by signal {posix_signal} ({signal_str})."\n 
if posix_signal == get_terminate_signal():\n exit_clause = (\n exit_clause\n + " This usually indicates that the process was"\n " killed by the operating system due to running out of"\n " memory. Possible solutions include increasing the"\n " amount of memory available to the run, reducing"\n " the amount of memory used by the ops in the run, or"\n " configuring the executor to run fewer ops concurrently."\n )\n else:\n exit_clause = f"unexpectedly exited with code {exit_code}."\n\n return prefix + " " + exit_clause\n\n\ndef last_file_comp(path: str) -> str:\n return os.path.basename(os.path.normpath(path))\n\n\ndef is_named_tuple_instance(obj: object) -> TypeGuard[NamedTuple]:\n return isinstance(obj, tuple) and hasattr(obj, "_fields")\n\n\ndef is_named_tuple_subclass(klass: Type[object]) -> TypeGuard[Type[NamedTuple]]:\n return isinstance(klass, type) and issubclass(klass, tuple) and hasattr(klass, "_fields")\n\n\n@overload\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = ...,\n repository: Optional["RepositoryDefinition"] = ...,\n error_on_none: Literal[True] = ...,\n) -> "RepositoryDefinition": ...\n\n\n@overload\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = ...,\n repository: Optional["RepositoryDefinition"] = ...,\n error_on_none: Literal[False] = ...,\n) -> Optional["RepositoryDefinition"]: ...\n\n\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = None,\n repository: Optional["RepositoryDefinition"] = None,\n error_on_none: bool = True,\n) -> Optional["RepositoryDefinition"]:\n """Normalizes the arguments that take a RepositoryDefinition or Definitions object to a\n RepositoryDefinition.\n\n This is intended to handle both the case where a single argument takes a\n `Union[RepositoryDefinition, Definitions]` or separate keyword arguments accept\n 
`RepositoryDefinition` or `Definitions`.\n """\n from dagster._core.definitions.definitions_class import Definitions\n\n if (definitions_or_repository and repository) or (\n error_on_none and not (definitions_or_repository or repository)\n ):\n check.failed("Exactly one of `definitions` or `repository_def` must be provided.")\n elif isinstance(definitions_or_repository, Definitions):\n return definitions_or_repository.get_repository_def()\n elif definitions_or_repository:\n return definitions_or_repository\n elif repository:\n return repository\n else:\n return None\n\n\ndef xor(a, b):\n return bool(a) != bool(b)\n\n\ndef tail_file(path_or_fd: Union[str, int], should_stop: Callable[[], bool]) -> Iterator[str]:\n with open(path_or_fd, "r") as output_stream:\n while True:\n line = output_stream.readline()\n if line:\n yield line\n elif should_stop():\n break\n else:\n time.sleep(0.01)\n
", "current_page_name": "_modules/dagster/_utils", "customsidebar": null, "dagster_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.dagster_type

\nfrom typing import Any\n\nfrom dagster._core.definitions.events import Failure, TypeCheck\nfrom dagster._core.definitions.graph_definition import GraphDefinition\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.api import create_execution_plan\nfrom dagster._core.execution.context_creation_job import scoped_job_context\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.types.dagster_type import resolve_dagster_type\n\nfrom .typing_api import is_typing_type\n\n\n
[docs]def check_dagster_type(dagster_type: Any, value: Any) -> TypeCheck:\n """Test a custom Dagster type.\n\n Args:\n dagster_type (Any): The Dagster type to test. Should be one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or\n :py:func:`PythonObjectDagsterType`, or a Python type.\n value (Any): The runtime value to test.\n\n Returns:\n TypeCheck: The result of the type check.\n\n\n Examples:\n .. code-block:: python\n\n assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n """\n if is_typing_type(dagster_type):\n raise DagsterInvariantViolationError(\n f"Must pass in a type from dagster module. You passed {dagster_type} "\n "which is part of python's typing module."\n )\n\n dagster_type = resolve_dagster_type(dagster_type)\n\n job = InMemoryJob(GraphDefinition(node_defs=[], name="empty").to_job())\n job_def = job.get_definition()\n\n instance = DagsterInstance.ephemeral()\n execution_plan = create_execution_plan(job)\n dagster_run = instance.create_run_for_job(job_def)\n with scoped_job_context(execution_plan, job, {}, dagster_run, instance) as context:\n type_check_context = context.for_type(dagster_type)\n try:\n type_check = dagster_type.type_check(type_check_context, value)\n except Failure as failure:\n return TypeCheck(success=False, description=failure.description)\n\n if not isinstance(type_check, TypeCheck):\n raise DagsterInvariantViolationError(\n "Type checks can only return TypeCheck. Type {type_name} returned {value}.".format(\n type_name=dagster_type.display_name, value=repr(type_check)\n )\n )\n return type_check
\n
", "current_page_name": "_modules/dagster/_utils/dagster_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.dagster_type"}, "favicon_url": null, "forked_pdb": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.forked_pdb

\nimport pdb\nimport sys\n\n\n# From https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess\n
[docs]class ForkedPdb(pdb.Pdb):\n """A pdb subclass that may be used from a forked multiprocessing child.\n\n **Examples**:\n\n .. code-block:: python\n\n from dagster._utils.forked_pdb import ForkedPdb\n\n @solid\n def complex_solid(_):\n # some complicated stuff\n\n ForkedPdb().set_trace()\n\n # some other complicated stuff\n\n You can initiate pipeline execution via the webserver and use the pdb debugger to examine/step through\n execution at the breakpoint.\n """\n\n def interaction(self, frame, traceback):\n _stdin = sys.stdin\n try:\n sys.stdin = open("/dev/stdin", encoding="utf8")\n pdb.Pdb.interaction(self, frame, traceback)\n finally:\n sys.stdin = _stdin
\n
", "current_page_name": "_modules/dagster/_utils/forked_pdb", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.forked_pdb"}, "log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.log

\nimport copy\nimport logging\nimport sys\nimport traceback\nfrom typing import Mapping, NamedTuple, Optional\n\nimport coloredlogs\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import deprecated\nfrom dagster._config import Enum, EnumValue\nfrom dagster._core.definitions.logger_definition import logger\nfrom dagster._core.utils import PYTHON_LOGGING_LEVELS_MAPPING, coerce_valid_log_level\n\nLogLevelEnum = Enum("log_level", list(map(EnumValue, PYTHON_LOGGING_LEVELS_MAPPING.keys())))\n\n\nclass JsonFileHandler(logging.Handler):\n    def __init__(self, json_path: str):\n        super(JsonFileHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            log_dict = copy.copy(record.__dict__)\n\n            # This horrific monstrosity is to maintain backwards compatability\n            # with the old behavior of the JsonFileHandler, which the clarify\n            # project has a dependency on. 
It relied on the dagster-defined\n            # properties smashing all the properties of the LogRecord object\n            # and uploads all of those properties to a redshift table for\n            # in order to do analytics on the log\n\n            if "dagster_meta" in log_dict:\n                dagster_meta_dict = log_dict["dagster_meta"]\n                del log_dict["dagster_meta"]\n            else:\n                dagster_meta_dict = {}\n\n            log_dict.update(dagster_meta_dict)\n\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(log_dict)\n                ff.write(text_line + "\\n")\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerMessage(\n    NamedTuple(\n        "_StructuredLoggerMessage",\n        [\n            ("name", str),\n            ("message", str),\n            ("level", int),\n            ("meta", Mapping[object, object]),\n            ("record", logging.LogRecord),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        message: str,\n        level: int,\n        meta: Mapping[object, object],\n        record: logging.LogRecord,\n    ):\n        return super(StructuredLoggerMessage, cls).__new__(\n            cls,\n            check.str_param(name, "name"),\n            check.str_param(message, "message"),\n            coerce_valid_log_level(level),\n            check.mapping_param(meta, "meta"),\n            check.inst_param(record, "record", logging.LogRecord),\n        )\n\n\nclass JsonEventLoggerHandler(logging.Handler):\n    def __init__(self, json_path: str, construct_event_record):\n        super(JsonEventLoggerHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n        self.construct_event_record = 
construct_event_record\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            event_record = self.construct_event_record(record)\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(event_record.to_dict())\n                ff.write(text_line + "\\n")\n\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerHandler(logging.Handler):\n    def __init__(self, callback):\n        super(StructuredLoggerHandler, self).__init__()\n        self.callback = check.is_callable(callback, "callback")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            self.callback(\n                StructuredLoggerMessage(\n                    name=record.name,\n                    message=record.msg,\n                    level=record.levelno,\n                    meta=record.dagster_meta,  # type: ignore\n                    record=record,\n                )\n            )\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\ndef construct_single_handler_logger(name, level, handler):\n    check.str_param(name, "name")\n    check.inst_param(handler, "handler", logging.Handler)\n\n    level = coerce_valid_log_level(level)\n\n    @logger\n    def single_handler_logger(_init_context):\n        klass = logging.getLoggerClass()\n        logger_ = klass(name, level=level)\n        logger_.addHandler(handler)\n        handler.setLevel(level)\n        return logger_\n\n    return single_handler_logger\n\n\n# Base python logger whose messages will be captured as structured Dagster log messages.\nBASE_DAGSTER_LOGGER = 
logging.getLogger(name="dagster")\n\n\n
[docs]def get_dagster_logger(name: Optional[str] = None) -> logging.Logger:\n """Creates a python logger whose output messages will be captured and converted into Dagster log\n messages. This means they will have structured information such as the step_key, run_id, etc.\n embedded into them, and will show up in the Dagster event log.\n\n This can be used as a more convenient alternative to `context.log` in most cases. If log level\n is not set explicitly, defaults to DEBUG.\n\n Args:\n name (Optional[str]): If supplied, will create a logger with the name "dagster.builtin.{name}",\n with properties inherited from the base Dagster logger. If omitted, the returned logger\n will be named "dagster.builtin".\n\n Returns:\n :class:`logging.Logger`: A logger whose output will be captured by Dagster.\n\n Example:\n .. code-block:: python\n\n from dagster import get_dagster_logger, op\n\n @op\n def hello_op():\n log = get_dagster_logger()\n for i in range(5):\n # do something\n log.info(f"Did {i+1} things!")\n\n """\n # enforce that the parent logger will always have a DEBUG log level\n BASE_DAGSTER_LOGGER.setLevel(logging.DEBUG)\n base_builtin = BASE_DAGSTER_LOGGER.getChild("builtin")\n if name:\n return base_builtin.getChild(name)\n return base_builtin
\n\n\ndef define_structured_logger(name, callback, level):\n check.str_param(name, "name")\n check.callable_param(callback, "callback")\n level = coerce_valid_log_level(level)\n\n return construct_single_handler_logger(name, level, StructuredLoggerHandler(callback))\n\n\ndef define_json_file_logger(name, json_path, level):\n check.str_param(name, "name")\n check.str_param(json_path, "json_path")\n level = coerce_valid_log_level(level)\n\n stream_handler = JsonFileHandler(json_path)\n stream_handler.setFormatter(define_default_formatter())\n return construct_single_handler_logger(name, level, stream_handler)\n\n\ndef get_stack_trace_array(exception):\n check.inst_param(exception, "exception", Exception)\n if hasattr(exception, "__traceback__"):\n tb = exception.__traceback__\n else:\n _exc_type, _exc_value, tb = sys.exc_info()\n return traceback.format_tb(tb)\n\n\ndef default_format_string():\n return "%(asctime)s - %(name)s - %(levelname)s - %(message)s"\n\n\ndef default_date_format_string():\n return "%Y-%m-%d %H:%M:%S %z"\n\n\ndef define_default_formatter():\n return logging.Formatter(default_format_string(), default_date_format_string())\n\n\n@deprecated(\n breaking_version="2.0",\n subject="loggers.dagit",\n emit_runtime_warning=False,\n)\ndef configure_loggers(handler="default", log_level="INFO"):\n LOGGING_CONFIG = {\n "version": 1,\n "disable_existing_loggers": False,\n "formatters": {\n "colored": {\n "()": coloredlogs.ColoredFormatter,\n "fmt": default_format_string(),\n "datefmt": default_date_format_string(),\n "field_styles": {"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n "level_styles": {"debug": {}, "error": {"color": "red"}},\n },\n },\n "handlers": {\n "default": {\n "formatter": "colored",\n "class": "logging.StreamHandler",\n "stream": sys.stdout,\n "level": log_level,\n },\n "null": {\n "class": "logging.NullHandler",\n },\n },\n "loggers": {\n "dagster": {\n "handlers": [handler],\n "level": log_level,\n },\n # Only one of 
dagster or dagster-webserver will be used at a time. We configure them\n # both here to avoid a dependency on the dagster-webserver package.\n "dagit": {\n "handlers": [handler],\n "level": log_level,\n },\n "dagster-webserver": {\n "handlers": [handler],\n "level": log_level,\n },\n },\n }\n\n logging.config.dictConfig(LOGGING_CONFIG)\n\n\ndef create_console_logger(name, level):\n klass = logging.getLoggerClass()\n handler = klass(name, level=level)\n coloredlogs.install(\n logger=handler,\n level=level,\n fmt=default_format_string(),\n datefmt=default_date_format_string(),\n field_styles={"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n level_styles={"debug": {}, "error": {"color": "red"}},\n )\n return handler\n
", "current_page_name": "_modules/dagster/_utils/log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.log"}, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils", "warnings": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.warnings

\nimport warnings\nfrom contextlib import contextmanager\nfrom typing import Callable, Iterator, Optional, TypeVar\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import (\n    Decoratable,\n    apply_context_manager_decorator,\n)\n\nT = TypeVar("T")\n\n# ########################\n# ##### DEPRECATED\n# ########################\n\n\ndef normalize_renamed_param(\n    new_val: T,\n    new_arg: str,\n    old_val: T,\n    old_arg: str,\n    coerce_old_to_new: Optional[Callable[[T], T]] = None,\n) -> T:\n    """Utility for managing backwards compatibility of a renamed parameter.\n\n    .. code-block::\n\n       # The name of param `old_flag` is being updated to `new_flag`, but we are temporarily\n       # accepting either param.\n       def is_new(old_flag=None, new_flag=None):\n           return canonicalize_backcompat_args(\n               new_val=new_flag,\n               new_arg='new_flag',\n               old_val=old_flag,\n               old_arg='old_flag',\n               breaking_version='0.9.0',\n               coerce_old_to_new=lambda val: not val,\n           )\n\n    In the above example, if the caller sets both new_flag and old_flag, it will fail by throwing\n    a CheckError. If the caller sets the new_flag, it's returned unaltered. 
If the caller sets\n    old_flag, it will return the old_flag run through the coercion function.\n    """\n    check.str_param(new_arg, "new_arg")\n    check.str_param(old_arg, "old_arg")\n    check.opt_callable_param(coerce_old_to_new, "coerce_old_to_new")\n    if new_val is not None and old_val is not None:\n        check.failed(f'Do not use deprecated "{old_arg}" now that you are using "{new_arg}".')\n    elif old_val is not None:\n        return coerce_old_to_new(old_val) if coerce_old_to_new else old_val\n    else:\n        return new_val\n\n\ndef deprecation_warning(\n    subject: str,\n    breaking_version: str,\n    additional_warn_text: Optional[str] = None,\n    stacklevel: int = 3,\n):\n    warnings.warn(\n        f"{subject} is deprecated and will be removed in {breaking_version}."\n        + ((" " + additional_warn_text) if additional_warn_text else ""),\n        category=DeprecationWarning,\n        stacklevel=stacklevel,\n    )\n\n\n# ########################\n# ##### EXPERIMENTAL\n# ########################\n\nEXPERIMENTAL_WARNING_HELP = (\n    "To mute warnings for experimental functionality, invoke"\n    ' warnings.filterwarnings("ignore", category=dagster.ExperimentalWarning) or use'\n    " one of the other methods described at"\n    " https://docs.python.org/3/library/warnings.html#describing-warning-filters."\n)\n\n\n
[docs]class ExperimentalWarning(Warning):\n pass
\n\n\ndef experimental_warning(\n subject: str, additional_warn_text: Optional[str] = None, stacklevel: int = 3\n) -> None:\n extra_text = f" {additional_warn_text}" if additional_warn_text else ""\n warnings.warn(\n f"{subject} is experimental. It may break in future versions, even between dot"\n f" releases.{extra_text} {EXPERIMENTAL_WARNING_HELP}",\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\n# ########################\n# ##### DISABLE DAGSTER WARNINGS\n# ########################\n\n\n@contextmanager\ndef disable_dagster_warnings() -> Iterator[None]:\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=DeprecationWarning)\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n yield\n\n\nT_Decoratable = TypeVar("T_Decoratable", bound=Decoratable)\n\n\ndef suppress_dagster_warnings(__obj: T_Decoratable) -> T_Decoratable:\n """Mark a method/function as ignoring Dagster-generated warnings. This suppresses any\n `ExperimentalWarnings` or `DeprecationWarnings` when the function is called.\n\n Usage:\n\n .. code-block:: python\n\n @suppress_dagster_warnings\n def invokes_some_experimental_stuff(my_arg):\n my_experimental_function(my_arg)\n """\n return apply_context_manager_decorator(__obj, disable_dagster_warnings)\n
", "current_page_name": "_modules/dagster/_utils/warnings", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.warnings"}}}, "dagster_airbyte": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.asset_defs

\nimport hashlib\nimport inspect\nimport os\nimport re\nfrom abc import abstractmethod\nfrom functools import partial\nfrom itertools import chain\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport yaml\nfrom dagster import (\n    AssetKey,\n    AssetOut,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    Nothing,\n    Output,\n    ResourceDefinition,\n    SourceAsset,\n    _check as check,\n)\nfrom dagster._core.definitions import AssetsDefinition, multi_asset\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.metadata import MetadataValue, TableSchemaMetadataValue\nfrom dagster._core.definitions.metadata.table import TableSchema\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster._core.execution.context.init import build_init_resource_context\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_airbyte.resources import AirbyteCloudResource, AirbyteResource, BaseAirbyteResource\nfrom dagster_airbyte.types import AirbyteTableMetadata\nfrom dagster_airbyte.utils import (\n    generate_materializations,\n    generate_table_schema,\n    is_basic_normalization_operation,\n)\n\n\ndef _table_to_output_name_fn(table: str) -> str:\n    return table.replace("-", "_")\n\n\ndef _build_airbyte_asset_defn_metadata(\n    connection_id: str,\n    destination_tables: Sequence[str],\n    table_to_asset_key_fn: Callable[[str], AssetKey],\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    normalization_tables: Optional[Mapping[str, Set[str]]] = None,\n    upstream_assets: Optional[Iterable[AssetKey]] = None,\n    group_name: Optional[str] = None,\n    
io_manager_key: Optional[str] = None,\n    schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,\n    freshness_policy: Optional[FreshnessPolicy] = None,\n    auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n) -> AssetsDefinitionCacheableData:\n    asset_key_prefix = (\n        check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str) or []\n    )\n\n    # Generate a list of outputs, the set of destination tables plus any affiliated\n    # normalization tables\n    tables = list(\n        chain.from_iterable(\n            chain(\n                [destination_tables], normalization_tables.values() if normalization_tables else []\n            )\n        )\n    )\n\n    outputs = {\n        _table_to_output_name_fn(table): AssetKey(\n            [*asset_key_prefix, *table_to_asset_key_fn(table).path]\n        )\n        for table in tables\n    }\n\n    internal_deps: Dict[str, Set[AssetKey]] = {}\n\n    metadata_encodable_normalization_tables = (\n        {k: list(v) for k, v in normalization_tables.items()} if normalization_tables else {}\n    )\n\n    # If normalization tables are specified, we need to add a dependency from the destination table\n    # to the affilitated normalization table\n    if len(metadata_encodable_normalization_tables) > 0:\n        for base_table, derived_tables in metadata_encodable_normalization_tables.items():\n            for derived_table in derived_tables:\n                internal_deps[derived_table] = {\n                    AssetKey([*asset_key_prefix, *table_to_asset_key_fn(base_table).path])\n                }\n\n    # All non-normalization tables depend on any user-provided upstream assets\n    for table in destination_tables:\n        internal_deps[table] = set(upstream_assets or [])\n\n    return AssetsDefinitionCacheableData(\n        keys_by_input_name=(\n            {asset_key.path[-1]: asset_key for asset_key in upstream_assets}\n            if upstream_assets\n            else 
{}\n        ),\n        keys_by_output_name=outputs,\n        internal_asset_deps=internal_deps,\n        group_name=group_name,\n        key_prefix=asset_key_prefix,\n        can_subset=False,\n        metadata_by_output_name=(\n            {\n                table: {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}\n                for table in tables\n            }\n            if schema_by_table_name\n            else None\n        ),\n        freshness_policies_by_output_name=(\n            {output: freshness_policy for output in outputs} if freshness_policy else None\n        ),\n        auto_materialize_policies_by_output_name=(\n            {output: auto_materialize_policy for output in outputs}\n            if auto_materialize_policy\n            else None\n        ),\n        extra_metadata={\n            "connection_id": connection_id,\n            "group_name": group_name,\n            "destination_tables": destination_tables,\n            "normalization_tables": metadata_encodable_normalization_tables,\n            "io_manager_key": io_manager_key,\n        },\n    )\n\n\ndef _build_airbyte_assets_from_metadata(\n    assets_defn_meta: AssetsDefinitionCacheableData,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]],\n) -> AssetsDefinition:\n    metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)\n    connection_id = cast(str, metadata["connection_id"])\n    group_name = cast(Optional[str], metadata["group_name"])\n    destination_tables = cast(List[str], metadata["destination_tables"])\n    normalization_tables = cast(Mapping[str, List[str]], metadata["normalization_tables"])\n    io_manager_key = cast(Optional[str], metadata["io_manager_key"])\n\n    @multi_asset(\n        name=f"airbyte_sync_{connection_id[:5]}",\n        deps=list((assets_defn_meta.keys_by_input_name or {}).values()),\n        outs={\n            k: AssetOut(\n                key=v,\n                metadata=(\n                    
{\n                        k: cast(TableSchemaMetadataValue, v)\n                        for k, v in assets_defn_meta.metadata_by_output_name.get(k, {}).items()\n                    }\n                    if assets_defn_meta.metadata_by_output_name\n                    else None\n                ),\n                io_manager_key=io_manager_key,\n                freshness_policy=(\n                    assets_defn_meta.freshness_policies_by_output_name.get(k)\n                    if assets_defn_meta.freshness_policies_by_output_name\n                    else None\n                ),\n                dagster_type=Nothing,\n            )\n            for k, v in (assets_defn_meta.keys_by_output_name or {}).items()\n        },\n        internal_asset_deps={\n            k: set(v) for k, v in (assets_defn_meta.internal_asset_deps or {}).items()\n        },\n        compute_kind="airbyte",\n        group_name=group_name,\n        resource_defs=resource_defs,\n    )\n    def _assets(context, airbyte: AirbyteResource):\n        ab_output = airbyte.sync_and_poll(connection_id=connection_id)\n        for materialization in generate_materializations(\n            ab_output, assets_defn_meta.key_prefix or []\n        ):\n            table_name = materialization.asset_key.path[-1]\n            if table_name in destination_tables:\n                yield Output(\n                    value=None,\n                    output_name=_table_to_output_name_fn(table_name),\n                    metadata=materialization.metadata,\n                )\n                # Also materialize any normalization tables affiliated with this destination\n                # e.g. 
nested objects, lists etc\n                if normalization_tables:\n                    for dependent_table in normalization_tables.get(table_name, set()):\n                        yield Output(\n                            value=None,\n                            output_name=_table_to_output_name_fn(dependent_table),\n                        )\n            else:\n                yield materialization\n\n    return _assets\n\n\n
[docs]def build_airbyte_assets(\n connection_id: str,\n destination_tables: Sequence[str],\n asset_key_prefix: Optional[Sequence[str]] = None,\n group_name: Optional[str] = None,\n normalization_tables: Optional[Mapping[str, Set[str]]] = None,\n deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,\n upstream_assets: Optional[Set[AssetKey]] = None,\n schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n stream_to_asset_map: Optional[Mapping[str, str]] = None,\n) -> Sequence[AssetsDefinition]:\n """Builds a set of assets representing the tables created by an Airbyte sync operation.\n\n Args:\n connection_id (str): The Airbyte Connection ID that this op will sync. You can retrieve this\n value from the "Connections" tab of a given connector in the Airbyte UI.\n destination_tables (List[str]): The names of the tables that you want to be represented\n in the Dagster asset graph for this sync. This will generally map to the name of the\n stream in Airbyte, unless a stream prefix has been specified in Airbyte.\n normalization_tables (Optional[Mapping[str, List[str]]]): If you are using Airbyte's\n normalization feature, you may specify a mapping of destination table to a list of\n derived tables that will be created by the normalization process.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([table_name])`.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, str, AssetKey]]]):\n A list of assets to add as sources.\n upstream_assets (Optional[Set[AssetKey]]): Deprecated, use deps instead. 
A list of assets to add as sources.\n freshness_policy (Optional[FreshnessPolicy]): A freshness policy to apply to the assets\n stream_to_asset_map (Optional[Mapping[str, str]]): A mapping of an Airbyte stream name to a Dagster asset.\n This allows the use of the "prefix" setting in Airbyte with special characters that aren't valid asset names.\n """\n if upstream_assets is not None and deps is not None:\n raise DagsterInvalidDefinitionError(\n "Cannot specify both deps and upstream_assets to build_airbyte_assets. Use only deps"\n " instead."\n )\n\n asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n # Generate a list of outputs, the set of destination tables plus any affiliated\n # normalization tables\n tables = chain.from_iterable(\n chain([destination_tables], normalization_tables.values() if normalization_tables else [])\n )\n outputs = {\n table: AssetOut(\n key=AssetKey([*asset_key_prefix, table]),\n metadata=(\n {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}\n if schema_by_table_name\n else None\n ),\n freshness_policy=freshness_policy,\n )\n for table in tables\n }\n\n internal_deps = {}\n\n # If normalization tables are specified, we need to add a dependency from the destination table\n # to the affilitated normalization table\n if normalization_tables:\n for base_table, derived_tables in normalization_tables.items():\n for derived_table in derived_tables:\n internal_deps[derived_table] = {AssetKey([*asset_key_prefix, base_table])}\n\n upstream_deps = deps\n if upstream_assets is not None:\n upstream_deps = list(upstream_assets)\n\n # All non-normalization tables depend on any user-provided upstream assets\n for table in destination_tables:\n internal_deps[table] = set(upstream_deps) if upstream_deps else set()\n\n @multi_asset(\n name=f"airbyte_sync_{connection_id[:5]}",\n deps=upstream_deps,\n outs=outputs,\n internal_asset_deps=internal_deps,\n compute_kind="airbyte",\n 
group_name=group_name,\n )\n def _assets(context, airbyte: BaseAirbyteResource):\n ab_output = airbyte.sync_and_poll(connection_id=connection_id)\n\n # No connection details (e.g. using Airbyte Cloud) means we just assume\n # that the outputs were produced\n if len(ab_output.connection_details) == 0:\n for table_name in destination_tables:\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(table_name),\n )\n if normalization_tables:\n for dependent_table in normalization_tables.get(table_name, set()):\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(dependent_table),\n )\n else:\n for materialization in generate_materializations(\n ab_output, asset_key_prefix, stream_to_asset_map\n ):\n table_name = materialization.asset_key.path[-1]\n if table_name in destination_tables:\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(table_name),\n metadata=materialization.metadata,\n )\n # Also materialize any normalization tables affiliated with this destination\n # e.g. nested objects, lists etc\n if normalization_tables:\n for dependent_table in normalization_tables.get(table_name, set()):\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(dependent_table),\n )\n else:\n yield materialization\n\n return [_assets]
\n\n\ndef _get_schema_types(schema: Mapping[str, Any]) -> Sequence[str]:\n """Given a schema definition, return a list of data types that are valid for this schema."""\n types = schema.get("types") or schema.get("type")\n if not types:\n return []\n if isinstance(types, str):\n return [types]\n return types\n\n\ndef _get_sub_schemas(schema: Mapping[str, Any]) -> Sequence[Mapping[str, Any]]:\n """Returns a list of sub-schema definitions for a given schema. This is used to handle union types."""\n return schema.get("anyOf") or schema.get("oneOf") or [schema]\n\n\ndef _get_normalization_tables_for_schema(\n key: str, schema: Mapping[str, Any], prefix: str = ""\n) -> Mapping[str, AirbyteTableMetadata]:\n """Recursively traverses a schema, returning metadata for the tables that will be created by the Airbyte\n normalization process.\n\n For example, a table `cars` with a nested object field `limited_editions` will produce the tables\n `cars` and `cars_limited_editions`.\n\n For more information on Airbyte's normalization process, see:\n https://docs.airbyte.com/understanding-airbyte/basic-normalization/#nesting\n """\n out: Dict[str, AirbyteTableMetadata] = {}\n # Object types are broken into a new table, as long as they have children\n\n sub_schemas = _get_sub_schemas(schema)\n\n for sub_schema in sub_schemas:\n schema_types = _get_schema_types(sub_schema)\n if not schema_types:\n continue\n\n if "object" in schema_types and len(sub_schema.get("properties", {})) > 0:\n out[prefix + key] = AirbyteTableMetadata(\n schema=generate_table_schema(sub_schema.get("properties", {}))\n )\n for k, v in sub_schema["properties"].items():\n out = merge_dicts(\n out, _get_normalization_tables_for_schema(k, v, f"{prefix}{key}_")\n )\n # Array types are also broken into a new table\n elif "array" in schema_types:\n out[prefix + key] = AirbyteTableMetadata(\n schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {}))\n )\n if sub_schema.get("items", 
{}).get("properties"):\n for k, v in sub_schema["items"]["properties"].items():\n out = merge_dicts(\n out, _get_normalization_tables_for_schema(k, v, f"{prefix}{key}_")\n )\n\n return out\n\n\ndef _clean_name(name: str) -> str:\n """Cleans an input to be a valid Dagster asset name."""\n return re.sub(r"[^a-z0-9]+", "_", name.lower())\n\n\nclass AirbyteConnectionMetadata(\n NamedTuple(\n "_AirbyteConnectionMetadata",\n [\n ("name", str),\n ("stream_prefix", str),\n ("has_basic_normalization", bool),\n ("stream_data", List[Mapping[str, Any]]),\n ],\n )\n):\n """Contains information about an Airbyte connection.\n\n Attributes:\n name (str): The name of the connection.\n stream_prefix (str): A prefix to add to all stream names.\n has_basic_normalization (bool): Whether or not the connection has basic normalization enabled.\n stream_data (List[Mapping[str, Any]]): Unparsed list of dicts with information about each stream.\n """\n\n @classmethod\n def from_api_json(\n cls, contents: Mapping[str, Any], operations: Mapping[str, Any]\n ) -> "AirbyteConnectionMetadata":\n return cls(\n name=contents["name"],\n stream_prefix=contents.get("prefix", ""),\n has_basic_normalization=any(\n is_basic_normalization_operation(op.get("operatorConfiguration", {}))\n for op in operations.get("operations", [])\n ),\n stream_data=contents.get("syncCatalog", {}).get("streams", []),\n )\n\n @classmethod\n def from_config(cls, contents: Mapping[str, Any]) -> "AirbyteConnectionMetadata":\n config_contents = cast(Mapping[str, Any], contents.get("configuration"))\n check.invariant(\n config_contents is not None, "Airbyte connection config is missing 'configuration' key"\n )\n\n return cls(\n name=contents["resource_name"],\n stream_prefix=config_contents.get("prefix", ""),\n has_basic_normalization=any(\n is_basic_normalization_operation(op.get("operator_configuration", {}))\n for op in config_contents.get("operations", [])\n ),\n stream_data=config_contents.get("sync_catalog", 
{}).get("streams", []),\n )\n\n def parse_stream_tables(\n self, return_normalization_tables: bool = False\n ) -> Mapping[str, AirbyteTableMetadata]:\n """Parses the stream data and returns a mapping, with keys representing destination\n tables associated with each enabled stream and values representing any affiliated\n tables created by Airbyte's normalization process, if enabled.\n """\n tables: Dict[str, AirbyteTableMetadata] = {}\n\n enabled_streams = [\n stream for stream in self.stream_data if stream.get("config", {}).get("selected", False)\n ]\n\n for stream in enabled_streams:\n name = cast(str, stream.get("stream", {}).get("name"))\n prefixed_name = f"{self.stream_prefix}{name}"\n\n schema = (\n stream["stream"]["json_schema"]\n if "json_schema" in stream["stream"]\n else stream["stream"]["jsonSchema"]\n )\n normalization_tables: Dict[str, AirbyteTableMetadata] = {}\n schema_props = schema.get("properties", schema.get("items", {}).get("properties", {}))\n if self.has_basic_normalization and return_normalization_tables:\n for k, v in schema_props.items():\n for normalization_table_name, meta in _get_normalization_tables_for_schema(\n k, v, f"{name}_"\n ).items():\n prefixed_norm_table_name = f"{self.stream_prefix}{normalization_table_name}"\n normalization_tables[prefixed_norm_table_name] = meta\n tables[prefixed_name] = AirbyteTableMetadata(\n schema=generate_table_schema(schema_props),\n normalization_tables=normalization_tables,\n )\n\n return tables\n\n\ndef _get_schema_by_table_name(\n stream_table_metadata: Mapping[str, AirbyteTableMetadata]\n) -> Mapping[str, TableSchema]:\n schema_by_base_table_name = [(k, v.schema) for k, v in stream_table_metadata.items()]\n schema_by_normalization_table_name = list(\n chain.from_iterable(\n [\n [\n (k, v.schema)\n for k, v in cast(\n Dict[str, AirbyteTableMetadata], meta.normalization_tables\n ).items()\n ]\n for meta in stream_table_metadata.values()\n ]\n )\n )\n\n return dict(schema_by_normalization_table_name 
+ schema_by_base_table_name)\n\n\nclass AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):\n def __init__(\n self,\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n self._key_prefix = key_prefix\n self._create_assets_for_normalization_tables = create_assets_for_normalization_tables\n self._connection_to_group_fn = connection_to_group_fn\n self._connection_to_io_manager_key_fn = connection_to_io_manager_key_fn\n self._connection_filter = connection_filter\n self._connection_to_asset_key_fn: Callable[[AirbyteConnectionMetadata, str], AssetKey] = (\n connection_to_asset_key_fn or (lambda _, table: AssetKey(path=[table]))\n )\n self._connection_to_freshness_policy_fn = connection_to_freshness_policy_fn or (\n lambda _: None\n )\n self._connection_to_auto_materialize_policy_fn = (\n connection_to_auto_materialize_policy_fn or (lambda _: None)\n )\n\n contents = hashlib.sha1() # so that hexdigest is 40, not 64 bytes\n contents.update(",".join(key_prefix).encode("utf-8"))\n contents.update(str(create_assets_for_normalization_tables).encode("utf-8"))\n if connection_filter:\n contents.update(inspect.getsource(connection_filter).encode("utf-8"))\n\n super().__init__(unique_id=f"airbyte-{contents.hexdigest()}")\n\n @abstractmethod\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n pass\n\n def compute_cacheable_data(self) -> 
Sequence[AssetsDefinitionCacheableData]:\n asset_defn_data: List[AssetsDefinitionCacheableData] = []\n for connection_id, connection in self._get_connections():\n stream_table_metadata = connection.parse_stream_tables(\n self._create_assets_for_normalization_tables\n )\n schema_by_table_name = _get_schema_by_table_name(stream_table_metadata)\n\n table_to_asset_key = partial(self._connection_to_asset_key_fn, connection)\n asset_data_for_conn = _build_airbyte_asset_defn_metadata(\n connection_id=connection_id,\n destination_tables=list(stream_table_metadata.keys()),\n normalization_tables={\n table: set(metadata.normalization_tables.keys())\n for table, metadata in stream_table_metadata.items()\n },\n asset_key_prefix=self._key_prefix,\n group_name=(\n self._connection_to_group_fn(connection.name)\n if self._connection_to_group_fn\n else None\n ),\n io_manager_key=(\n self._connection_to_io_manager_key_fn(connection.name)\n if self._connection_to_io_manager_key_fn\n else None\n ),\n schema_by_table_name=schema_by_table_name,\n table_to_asset_key_fn=table_to_asset_key,\n freshness_policy=self._connection_to_freshness_policy_fn(connection),\n auto_materialize_policy=self._connection_to_auto_materialize_policy_fn(connection),\n )\n\n asset_defn_data.append(asset_data_for_conn)\n\n return asset_defn_data\n\n def _build_definitions_with_resources(\n self,\n data: Sequence[AssetsDefinitionCacheableData],\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n ) -> Sequence[AssetsDefinition]:\n return [_build_airbyte_assets_from_metadata(meta, resource_defs) for meta in data]\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return self._build_definitions_with_resources(data)\n\n\nclass AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition):\n def __init__(\n self,\n airbyte_resource_def: Union[ResourceDefinition, AirbyteResource],\n workspace_id: Optional[str],\n 
key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n super().__init__(\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )\n self._workspace_id = workspace_id\n self._airbyte_instance: AirbyteResource = (\n airbyte_resource_def.process_config_and_initialize()\n if isinstance(airbyte_resource_def, AirbyteResource)\n else airbyte_resource_def(build_init_resource_context())\n )\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n workspace_id = self._workspace_id\n if not workspace_id:\n workspaces = cast(\n List[Dict[str, Any]],\n check.not_none(\n self._airbyte_instance.make_request(endpoint="/workspaces/list", data={})\n ).get("workspaces", []),\n )\n\n check.invariant(len(workspaces) <= 1, "Airbyte instance has more than one workspace")\n check.invariant(len(workspaces) > 0, "Airbyte instance has no workspaces")\n\n workspace_id = workspaces[0].get("workspaceId")\n\n connections = cast(\n List[Dict[str, Any]],\n 
check.not_none(\n self._airbyte_instance.make_request(\n endpoint="/connections/list", data={"workspaceId": workspace_id}\n )\n ).get("connections", []),\n )\n\n output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []\n for connection_json in connections:\n connection_id = cast(str, connection_json.get("connectionId"))\n\n operations_json = cast(\n Dict[str, Any],\n check.not_none(\n self._airbyte_instance.make_request(\n endpoint="/operations/list",\n data={"connectionId": connection_id},\n )\n ),\n )\n connection = AirbyteConnectionMetadata.from_api_json(connection_json, operations_json)\n\n # Filter out connections that don't match the filter function\n if self._connection_filter and not self._connection_filter(connection):\n continue\n\n output_connections.append((connection_id, connection))\n return output_connections\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return super()._build_definitions_with_resources(\n data, {"airbyte": self._airbyte_instance.get_resource_definition()}\n )\n\n\nclass AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition):\n def __init__(\n self,\n project_dir: str,\n workspace_id: Optional[str],\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_directories: Optional[Sequence[str]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n super().__init__(\n 
key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )\n self._workspace_id = workspace_id\n self._project_dir = project_dir\n self._connection_directories = connection_directories\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n connections_dir = os.path.join(self._project_dir, "connections")\n\n output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []\n\n connection_directories = self._connection_directories or os.listdir(connections_dir)\n for connection_name in connection_directories:\n connection_dir = os.path.join(connections_dir, connection_name)\n with open(os.path.join(connection_dir, "configuration.yaml"), encoding="utf-8") as f:\n connection = AirbyteConnectionMetadata.from_config(yaml.safe_load(f.read()))\n\n # Filter out connections that don't match the filter function\n if self._connection_filter and not self._connection_filter(connection):\n continue\n\n if self._workspace_id:\n state_file = f"state_{self._workspace_id}.yaml"\n check.invariant(\n state_file in os.listdir(connection_dir),\n f"Workspace state file {state_file} not found",\n )\n else:\n state_files = [\n filename\n for filename in os.listdir(connection_dir)\n if filename.startswith("state_")\n ]\n check.invariant(\n len(state_files) > 0,\n f"No state files found for connection {connection_name} in {connection_dir}",\n )\n check.invariant(\n len(state_files) <= 1,\n "More than one state file found for connection {} in {}, specify a workspace_id"\n " to disambiguate".format(connection_name, connection_dir),\n )\n state_file = 
state_files[0]\n\n with open(os.path.join(connection_dir, cast(str, state_file)), encoding="utf-8") as f:\n state = yaml.safe_load(f.read())\n connection_id = state.get("resource_id")\n\n output_connections.append((connection_id, connection))\n return output_connections\n\n\n
[docs]def load_assets_from_airbyte_instance(\n airbyte: Union[AirbyteResource, ResourceDefinition],\n workspace_id: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads Airbyte connection assets from a configured AirbyteResource instance. This fetches information\n about defined connections at initialization time, and will error on workspace load if the Airbyte\n instance is not reachable.\n\n Args:\n airbyte (ResourceDefinition): An AirbyteResource configured with the appropriate connection\n details.\n workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only\n required if multiple workspaces exist in your instance.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. If None, no groups will be created. 
Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which takes\n in connection metadata and returns False if the connection should be excluded from the output assets.\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]): Optional function\n which takes in connection metadata and returns a freshness policy for the connection's assets. If None, no freshness policies\n will be applied to the assets.\n connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]): Optional\n function which takes in connection metadata and returns an auto materialization policy for the connection's assets. If None, no\n auto materialization policies will be applied to the assets.\n\n **Examples:**\n\n Loading all Airbyte connections as assets:\n\n .. 
code-block:: python\n\n from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\n airbyte_instance = airbyte_resource.configured(\n {\n "host": "localhost",\n "port": "8000",\n }\n )\n airbyte_assets = load_assets_from_airbyte_instance(airbyte_instance)\n\n Filtering the set of loaded connections:\n\n .. code-block:: python\n\n from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\n airbyte_instance = airbyte_resource.configured(\n {\n "host": "localhost",\n "port": "8000",\n }\n )\n airbyte_assets = load_assets_from_airbyte_instance(\n airbyte_instance,\n connection_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(airbyte, AirbyteCloudResource):\n raise DagsterInvalidInvocationError(\n "load_assets_from_airbyte_instance is not yet supported for AirbyteCloudResource"\n )\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteInstanceCacheableAssetsDefinition(\n airbyte_resource_def=airbyte,\n workspace_id=workspace_id,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )
\n\n\n
[docs]def load_assets_from_airbyte_project(\n project_dir: str,\n workspace_id: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,\n connection_directories: Optional[Sequence[str]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads an Airbyte project into a set of Dagster assets.\n\n Point to the root folder of an Airbyte project synced using the Octavia CLI. For\n more information, see https://github.com/airbytehq/airbyte/tree/master/octavia-cli#octavia-import-all.\n\n Args:\n project_dir (str): The path to the root of your Airbyte project, containing sources, destinations,\n and connections folders.\n workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only\n required if multiple workspace state YAMLfiles exist in the project.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. 
If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which\n takes in connection metadata and returns False if the connection should be excluded from the output assets.\n connection_directories (Optional[List[str]]): Optional list of connection directories to load assets from.\n If omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter\n if the project has many connections or if the connection yaml files are large.\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. 
Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]):\n Optional function which takes in connection metadata and returns a freshness policy for the connection's assets.\n If None, no freshness policies will be applied to the assets.\n connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]):\n Optional function which takes in connection metadata and returns an auto materialization policy for the connection's assets.\n If None, no auto materialization policies will be applied to the assets.\n\n **Examples:**\n\n Loading all Airbyte connections as assets:\n\n .. code-block:: python\n\n from dagster_airbyte import load_assets_from_airbyte_project\n\n airbyte_assets = load_assets_from_airbyte_project(\n project_dir="path/to/airbyte/project",\n )\n\n Filtering the set of loaded connections:\n\n .. code-block:: python\n\n from dagster_airbyte import load_assets_from_airbyte_project\n\n airbyte_assets = load_assets_from_airbyte_project(\n project_dir="path/to/airbyte/project",\n connection_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteYAMLCacheableAssetsDefinition(\n project_dir=project_dir,\n workspace_id=workspace_id,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n 
connection_filter=connection_filter,\n connection_directories=connection_directories,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )
\n
", "current_page_name": "_modules/dagster_airbyte/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.asset_defs"}, "managed": {"generated": {"destinations": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.generated.destinations

\n# ruff: noqa: A001, A002\nfrom typing import Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom dagster_airbyte.managed.types import GeneratedAirbyteDestination\n\n\n
[docs]class DynamodbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n dynamodb_table_name_prefix: str,\n dynamodb_region: str,\n access_key_id: str,\n secret_access_key: str,\n dynamodb_endpoint: Optional[str] = None,\n ):\n """Airbyte Destination for Dynamodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/dynamodb\n\n Args:\n name (str): The name of the destination.\n dynamodb_endpoint (Optional[str]): This is your DynamoDB endpoint url.(if you are working with AWS DynamoDB, just leave empty).\n dynamodb_table_name_prefix (str): The prefix to use when naming DynamoDB tables.\n dynamodb_region (str): The region of the DynamoDB.\n access_key_id (str): The access key id to access the DynamoDB. Airbyte requires Read and Write permissions to the DynamoDB.\n secret_access_key (str): The corresponding secret to the access key id.\n """\n self.dynamodb_endpoint = check.opt_str_param(dynamodb_endpoint, "dynamodb_endpoint")\n self.dynamodb_table_name_prefix = check.str_param(\n dynamodb_table_name_prefix, "dynamodb_table_name_prefix"\n )\n self.dynamodb_region = check.str_param(dynamodb_region, "dynamodb_region")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n super().__init__("Dynamodb", name)
\n\n\n
[docs]class BigqueryDestination(GeneratedAirbyteDestination):\n
[docs] class StandardInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = "HMAC_KEY"\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class GCSStaging:\n
[docs] @public\n def __init__(\n self,\n credential: "BigqueryDestination.HMACKey",\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n keep_files_in_gcs_bucket: Optional[str] = None,\n ):\n self.method = "GCS Staging"\n self.credential = check.inst_param(\n credential, "credential", BigqueryDestination.HMACKey\n )\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.keep_files_in_gcs_bucket = check.opt_str_param(\n keep_files_in_gcs_bucket, "keep_files_in_gcs_bucket"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n project_id: str,\n dataset_location: str,\n dataset_id: str,\n loading_method: Union[\n "BigqueryDestination.StandardInserts", "BigqueryDestination.GCSStaging"\n ],\n credentials_json: Optional[str] = None,\n transformation_priority: Optional[str] = None,\n big_query_client_buffer_size_mb: Optional[int] = None,\n ):\n """Airbyte Destination for Bigquery.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset. Read more here.\n dataset_location (str): The location of the dataset. Warning: Changes made after creation will not be applied. Read more here.\n dataset_id (str): The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.\n loading_method (Union[BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging]): Loading method used to send select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n transformation_priority (Optional[str]): Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Read more about interactive run type here. 
Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don`t count towards your concurrent rate limit. Read more about batch queries here. The default "interactive" value is used if not set explicitly.\n big_query_client_buffer_size_mb (Optional[int]): Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_location = check.str_param(dataset_location, "dataset_location")\n self.dataset_id = check.str_param(dataset_id, "dataset_id")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging),\n )\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n self.transformation_priority = check.opt_str_param(\n transformation_priority, "transformation_priority"\n )\n self.big_query_client_buffer_size_mb = check.opt_int_param(\n big_query_client_buffer_size_mb, "big_query_client_buffer_size_mb"\n )\n super().__init__("Bigquery", name)
\n\n\n
[docs]class RabbitmqDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n routing_key: str,\n ssl: Optional[bool] = None,\n port: Optional[int] = None,\n virtual_host: Optional[str] = None,\n username: Optional[str] = None,\n password: Optional[str] = None,\n exchange: Optional[str] = None,\n ):\n """Airbyte Destination for Rabbitmq.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/rabbitmq\n\n Args:\n name (str): The name of the destination.\n ssl (Optional[bool]): SSL enabled.\n host (str): The RabbitMQ host name.\n port (Optional[int]): The RabbitMQ port.\n virtual_host (Optional[str]): The RabbitMQ virtual host name.\n username (Optional[str]): The username to connect.\n password (Optional[str]): The password to connect.\n exchange (Optional[str]): The exchange name.\n routing_key (str): The routing key.\n """\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.host = check.str_param(host, "host")\n self.port = check.opt_int_param(port, "port")\n self.virtual_host = check.opt_str_param(virtual_host, "virtual_host")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.exchange = check.opt_str_param(exchange, "exchange")\n self.routing_key = check.str_param(routing_key, "routing_key")\n super().__init__("Rabbitmq", name)
\n\n\n
[docs]class KvdbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, bucket_id: str, secret_key: str):\n """Airbyte Destination for Kvdb.\n\n Documentation can be found at https://kvdb.io/docs/api/\n\n Args:\n name (str): The name of the destination.\n bucket_id (str): The ID of your KVdb bucket.\n secret_key (str): Your bucket Secret Key.\n """\n self.bucket_id = check.str_param(bucket_id, "bucket_id")\n self.secret_key = check.str_param(secret_key, "secret_key")\n super().__init__("Kvdb", name)
\n\n\n
[docs]class ClickhouseDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Destination for Clickhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): HTTP port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Clickhouse", name)
\n\n\n
[docs]class AmazonSqsDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n queue_url: str,\n region: str,\n message_delay: Optional[int] = None,\n access_key: Optional[str] = None,\n secret_key: Optional[str] = None,\n message_body_key: Optional[str] = None,\n message_group_id: Optional[str] = None,\n ):\n """Airbyte Destination for Amazon Sqs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/amazon-sqs\n\n Args:\n name (str): The name of the destination.\n queue_url (str): URL of the SQS Queue\n region (str): AWS Region of the SQS Queue\n message_delay (Optional[int]): Modify the Message Delay of the individual message from the Queue's default (seconds).\n access_key (Optional[str]): The Access Key ID of the AWS IAM Role to use for sending messages\n secret_key (Optional[str]): The Secret Key of the AWS IAM Role to use for sending messages\n message_body_key (Optional[str]): Use this property to extract the contents of the named key in the input record to use as the SQS message body. If not set, the entire content of the input record data is used as the message body.\n message_group_id (Optional[str]): The tag that specifies that a message belongs to a specific message group. This parameter applies only to, and is REQUIRED by, FIFO queues.\n """\n self.queue_url = check.str_param(queue_url, "queue_url")\n self.region = check.str_param(region, "region")\n self.message_delay = check.opt_int_param(message_delay, "message_delay")\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.secret_key = check.opt_str_param(secret_key, "secret_key")\n self.message_body_key = check.opt_str_param(message_body_key, "message_body_key")\n self.message_group_id = check.opt_str_param(message_group_id, "message_group_id")\n super().__init__("Amazon Sqs", name)
\n\n\n
[docs]class MariadbColumnstoreDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mariadb Columnstore.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mariadb-columnstore\n\n Args:\n name (str): The name of the destination.\n host (str): The Hostname of the database.\n port (int): The Port of the database.\n database (str): Name of the database.\n username (str): The Username which is used to access the database.\n password (Optional[str]): The Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Mariadb Columnstore", name)
\n\n\n
[docs]class KinesisDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n region: str,\n shardCount: int,\n accessKey: str,\n privateKey: str,\n bufferSize: int,\n ):\n """Airbyte Destination for Kinesis.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/kinesis\n\n Args:\n name (str): The name of the destination.\n endpoint (str): AWS Kinesis endpoint.\n region (str): AWS region. Your account determines the Regions that are available to you.\n shardCount (int): Number of shards to which the data should be streamed.\n accessKey (str): Generate the AWS Access Key for current user.\n privateKey (str): The AWS Private Key - a string of numbers and letters that are unique for each account, also known as a "recovery phrase".\n bufferSize (int): Buffer size for storing kinesis records before being batch streamed.\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.region = check.str_param(region, "region")\n self.shardCount = check.int_param(shardCount, "shardCount")\n self.accessKey = check.str_param(accessKey, "accessKey")\n self.privateKey = check.str_param(privateKey, "privateKey")\n self.bufferSize = check.int_param(bufferSize, "bufferSize")\n super().__init__("Kinesis", name)
\n\n\n
[docs]class AzureBlobStorageDestination(GeneratedAirbyteDestination):\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(self, flattening: str):\n self.format_type = "CSV"\n self.flattening = check.str_param(flattening, "flattening")
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n ):\n self.format_type = "JSONL"
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_account_key: str,\n format: Union[\n "AzureBlobStorageDestination.CSVCommaSeparatedValues",\n "AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON",\n ],\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n azure_blob_storage_container_name: Optional[str] = None,\n azure_blob_storage_output_buffer_size: Optional[int] = None,\n ):\n """Airbyte Destination for Azure Blob Storage.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/azureblobstorage\n\n Args:\n name (str): The name of the destination.\n azure_blob_storage_endpoint_domain_name (Optional[str]): This is Azure Blob Storage endpoint domain name. Leave default value (or leave it empty if run container from command line) to use Microsoft native from example.\n azure_blob_storage_container_name (Optional[str]): The name of the Azure blob storage container. If not exists - will be created automatically. May be empty, then will be created automatically airbytecontainer+timestamp\n azure_blob_storage_account_name (str): The account's name of the Azure Blob Storage.\n azure_blob_storage_account_key (str): The Azure blob storage account key.\n azure_blob_storage_output_buffer_size (Optional[int]): The amount of megabytes to buffer for the output stream to Azure. 
This will impact memory footprint on workers, but may need adjustment for performance and appropriate block size in Azure.\n format (Union[AzureBlobStorageDestination.CSVCommaSeparatedValues, AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON]): Output data format\n """\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_container_name = check.opt_str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_account_key = check.str_param(\n azure_blob_storage_account_key, "azure_blob_storage_account_key"\n )\n self.azure_blob_storage_output_buffer_size = check.opt_int_param(\n azure_blob_storage_output_buffer_size, "azure_blob_storage_output_buffer_size"\n )\n self.format = check.inst_param(\n format,\n "format",\n (\n AzureBlobStorageDestination.CSVCommaSeparatedValues,\n AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON,\n ),\n )\n super().__init__("Azure Blob Storage", name)
\n\n\n
[docs]class KafkaDestination(GeneratedAirbyteDestination):\n
[docs] class PLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")
\n\n
[docs] class SASLPLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] class SASLSSL:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n bootstrap_servers: str,\n topic_pattern: str,\n protocol: Union[\n "KafkaDestination.PLAINTEXT",\n "KafkaDestination.SASLPLAINTEXT",\n "KafkaDestination.SASLSSL",\n ],\n acks: str,\n enable_idempotence: bool,\n compression_type: str,\n batch_size: int,\n linger_ms: str,\n max_in_flight_requests_per_connection: int,\n client_dns_lookup: str,\n buffer_memory: str,\n max_request_size: int,\n retries: int,\n socket_connection_setup_timeout_ms: str,\n socket_connection_setup_timeout_max_ms: str,\n max_block_ms: str,\n request_timeout_ms: int,\n delivery_timeout_ms: int,\n send_buffer_bytes: int,\n receive_buffer_bytes: int,\n test_topic: Optional[str] = None,\n sync_producer: Optional[bool] = None,\n client_id: Optional[str] = None,\n ):\n """Airbyte Destination for Kafka.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/kafka\n\n Args:\n name (str): The name of the destination.\n bootstrap_servers (str): A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping&mdash;this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,.... Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. 
Notice that the topic name will be transformed to a standard naming convention.\n test_topic (Optional[str]): Topic to test if Airbyte can produce messages.\n sync_producer (Optional[bool]): Wait synchronously until the record has been sent to Kafka.\n protocol (Union[KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL]): Protocol used to communicate with brokers.\n client_id (Optional[str]): An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.\n acks (str): The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the durability of records that are sent.\n enable_idempotence (bool): When set to 'true', the producer will ensure that exactly one copy of each message is written in the stream. If 'false', producer retries due to broker failures, etc., may write duplicates of the retried message in the stream.\n compression_type (str): The compression type for all data generated by the producer.\n batch_size (int): The producer will attempt to batch records together into fewer requests whenever multiple records are being sent to the same partition.\n linger_ms (str): The producer groups together any records that arrive in between request transmissions into a single batched request.\n max_in_flight_requests_per_connection (int): The maximum number of unacknowledged requests the client will send on a single connection before blocking. Can be greater than 1, and the maximum value supported with idempotency is 5.\n client_dns_lookup (str): Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. After a disconnection, the next IP is used. 
Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.\n buffer_memory (str): The total bytes of memory the producer can use to buffer records waiting to be sent to the server.\n max_request_size (int): The maximum size of a request in bytes.\n retries (int): Setting a value greater than zero will cause the client to resend any record whose send fails with a potentially transient error.\n socket_connection_setup_timeout_ms (str): The amount of time the client will wait for the socket connection to be established.\n socket_connection_setup_timeout_max_ms (str): The maximum amount of time the client will wait for the socket connection to be established. The connection setup timeout will increase exponentially for each consecutive connection failure up to this maximum.\n max_block_ms (str): The configuration controls how long the KafkaProducer's send(), partitionsFor(), initTransactions(), sendOffsetsToTransaction(), commitTransaction() and abortTransaction() methods will block.\n request_timeout_ms (int): The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.\n delivery_timeout_ms (int): An upper bound on the time to report success or failure after a call to 'send()' returns.\n send_buffer_bytes (int): The size of the TCP send buffer (SO_SNDBUF) to use when sending data. 
If the value is -1, the OS default will be used.\n receive_buffer_bytes (int): The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. If the value is -1, the OS default will be used.\n """\n self.bootstrap_servers = check.str_param(bootstrap_servers, "bootstrap_servers")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.test_topic = check.opt_str_param(test_topic, "test_topic")\n self.sync_producer = check.opt_bool_param(sync_producer, "sync_producer")\n self.protocol = check.inst_param(\n protocol,\n "protocol",\n (KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL),\n )\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.acks = check.str_param(acks, "acks")\n self.enable_idempotence = check.bool_param(enable_idempotence, "enable_idempotence")\n self.compression_type = check.str_param(compression_type, "compression_type")\n self.batch_size = check.int_param(batch_size, "batch_size")\n self.linger_ms = check.str_param(linger_ms, "linger_ms")\n self.max_in_flight_requests_per_connection = check.int_param(\n max_in_flight_requests_per_connection, "max_in_flight_requests_per_connection"\n )\n self.client_dns_lookup = check.str_param(client_dns_lookup, "client_dns_lookup")\n self.buffer_memory = check.str_param(buffer_memory, "buffer_memory")\n self.max_request_size = check.int_param(max_request_size, "max_request_size")\n self.retries = check.int_param(retries, "retries")\n self.socket_connection_setup_timeout_ms = check.str_param(\n socket_connection_setup_timeout_ms, "socket_connection_setup_timeout_ms"\n )\n self.socket_connection_setup_timeout_max_ms = check.str_param(\n socket_connection_setup_timeout_max_ms, "socket_connection_setup_timeout_max_ms"\n )\n self.max_block_ms = check.str_param(max_block_ms, "max_block_ms")\n self.request_timeout_ms = check.int_param(request_timeout_ms, "request_timeout_ms")\n self.delivery_timeout_ms = 
check.int_param(delivery_timeout_ms, "delivery_timeout_ms")\n self.send_buffer_bytes = check.int_param(send_buffer_bytes, "send_buffer_bytes")\n self.receive_buffer_bytes = check.int_param(receive_buffer_bytes, "receive_buffer_bytes")\n super().__init__("Kafka", name)
\n\n\n
[docs]class ElasticsearchDestination(GeneratedAirbyteDestination):\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "none"
\n\n
[docs] class ApiKeySecret:\n
[docs] @public\n def __init__(self, apiKeyId: str, apiKeySecret: str):\n self.method = "secret"\n self.apiKeyId = check.str_param(apiKeyId, "apiKeyId")\n self.apiKeySecret = check.str_param(apiKeySecret, "apiKeySecret")
\n\n
[docs] class UsernamePassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.method = "basic"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n authenticationMethod: Union[\n "ElasticsearchDestination.None_",\n "ElasticsearchDestination.ApiKeySecret",\n "ElasticsearchDestination.UsernamePassword",\n ],\n upsert: Optional[bool] = None,\n ):\n r"""Airbyte Destination for Elasticsearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/elasticsearch\n\n Args:\n name (str): The name of the destination.\n endpoint (str): The full url of the Elasticsearch server\n upsert (Optional[bool]): If a primary key identifier is defined in the source, an upsert will be performed using the primary key value as the elasticsearch doc id. Does not support composite primary keys.\n authenticationMethod (Union[ElasticsearchDestination.None\\\\_, ElasticsearchDestination.ApiKeySecret, ElasticsearchDestination.UsernamePassword]): The type of authentication to be used\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.upsert = check.opt_bool_param(upsert, "upsert")\n self.authenticationMethod = check.inst_param(\n authenticationMethod,\n "authenticationMethod",\n (\n ElasticsearchDestination.None_,\n ElasticsearchDestination.ApiKeySecret,\n ElasticsearchDestination.UsernamePassword,\n ),\n )\n super().__init__("Elasticsearch", name)
\n\n\n
[docs]class MysqlDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mysql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mysql\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Mysql", name)
\n\n\n
[docs]class SftpJsonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n username: str,\n password: str,\n destination_path: str,\n port: Optional[int] = None,\n ):\n """Airbyte Destination for Sftp Json.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/sftp-json\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the SFTP server.\n port (Optional[int]): Port of the SFTP server.\n username (str): Username to use to access the SFTP server.\n password (str): Password associated with the username.\n destination_path (str): Path to the directory where json files will be written.\n """\n self.host = check.str_param(host, "host")\n self.port = check.opt_int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Sftp Json", name)
\n\n\n
[docs]class GcsDestination(GeneratedAirbyteDestination):\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, credential_type: str, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = check.str_param(credential_type, "credential_type")\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: Optional[int] = None):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: Optional[int] = None):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self,\n codec: str,\n compression_level: Optional[int] = None,\n include_checksum: Optional[bool] = None,\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "GcsDestination.NoCompression",\n "GcsDestination.Deflate",\n "GcsDestination.Bzip2",\n "GcsDestination.Xz",\n "GcsDestination.Zstandard",\n "GcsDestination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n GcsDestination.NoCompression,\n GcsDestination.Deflate,\n GcsDestination.Bzip2,\n GcsDestination.Xz,\n GcsDestination.Zstandard,\n GcsDestination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["GcsDestination.NoCompression", "GcsDestination.GZIP"],\n flattening: Optional[str] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.opt_str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (GcsDestination.NoCompression, GcsDestination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["GcsDestination.NoCompression", "GcsDestination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (GcsDestination.NoCompression, GcsDestination.GZIP)\n )
\n\n
[docs] class ParquetColumnarStorage:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Optional[str] = None,\n block_size_mb: Optional[int] = None,\n max_padding_size_mb: Optional[int] = None,\n page_size_kb: Optional[int] = None,\n dictionary_page_size_kb: Optional[int] = None,\n dictionary_encoding: Optional[bool] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.opt_str_param(compression_codec, "compression_codec")\n self.block_size_mb = check.opt_int_param(block_size_mb, "block_size_mb")\n self.max_padding_size_mb = check.opt_int_param(\n max_padding_size_mb, "max_padding_size_mb"\n )\n self.page_size_kb = check.opt_int_param(page_size_kb, "page_size_kb")\n self.dictionary_page_size_kb = check.opt_int_param(\n dictionary_page_size_kb, "dictionary_page_size_kb"\n )\n self.dictionary_encoding = check.opt_bool_param(\n dictionary_encoding, "dictionary_encoding"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n credential: "GcsDestination.HMACKey",\n format: Union[\n "GcsDestination.AvroApacheAvro",\n "GcsDestination.CSVCommaSeparatedValues",\n "GcsDestination.JSONLinesNewlineDelimitedJSON",\n "GcsDestination.ParquetColumnarStorage",\n ],\n gcs_bucket_region: Optional[str] = None,\n ):\n """Airbyte Destination for Gcs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/gcs\n\n Args:\n name (str): The name of the destination.\n gcs_bucket_name (str): You can find the bucket name in the App Engine Admin console Application Settings page, under the label Google Cloud Storage Bucket. Read more here.\n gcs_bucket_path (str): GCS Bucket Path string Subdirectory under the above bucket to sync the data into.\n gcs_bucket_region (Optional[str]): Select a Region of the GCS Bucket. Read more here.\n credential (GcsDestination.HMACKey): An HMAC key is a type of credential and can be associated with a service account or a user account in Cloud Storage. Read more here.\n format (Union[GcsDestination.AvroApacheAvro, GcsDestination.CSVCommaSeparatedValues, GcsDestination.JSONLinesNewlineDelimitedJSON, GcsDestination.ParquetColumnarStorage]): Output data format. One of the following formats must be selected - AVRO format, PARQUET format, CSV format, or JSONL format.\n """\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.gcs_bucket_region = check.opt_str_param(gcs_bucket_region, "gcs_bucket_region")\n self.credential = check.inst_param(credential, "credential", GcsDestination.HMACKey)\n self.format = check.inst_param(\n format,\n "format",\n (\n GcsDestination.AvroApacheAvro,\n GcsDestination.CSVCommaSeparatedValues,\n GcsDestination.JSONLinesNewlineDelimitedJSON,\n GcsDestination.ParquetColumnarStorage,\n ),\n )\n super().__init__("Gcs", name)
\n\n\n
[docs]class CassandraDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n keyspace: str,\n username: str,\n password: str,\n address: str,\n port: int,\n datacenter: Optional[str] = None,\n replication: Optional[int] = None,\n ):\n """Airbyte Destination for Cassandra.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/cassandra\n\n Args:\n name (str): The name of the destination.\n keyspace (str): Default Cassandra keyspace to create data in.\n username (str): Username to use to access Cassandra.\n password (str): Password associated with Cassandra.\n address (str): Address to connect to.\n port (int): Port of Cassandra.\n datacenter (Optional[str]): Datacenter of the cassandra cluster.\n replication (Optional[int]): Indicates to how many nodes the data should be replicated to.\n """\n self.keyspace = check.str_param(keyspace, "keyspace")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.address = check.str_param(address, "address")\n self.port = check.int_param(port, "port")\n self.datacenter = check.opt_str_param(datacenter, "datacenter")\n self.replication = check.opt_int_param(replication, "replication")\n super().__init__("Cassandra", name)
\n\n\n
[docs]class FireboltDestination(GeneratedAirbyteDestination):\n
[docs] class SQLInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "SQL"
\n\n
[docs] class ExternalTableViaS3:\n
[docs] @public\n def __init__(self, s3_bucket: str, s3_region: str, aws_key_id: str, aws_key_secret: str):\n self.method = "S3"\n self.s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self.s3_region = check.str_param(s3_region, "s3_region")\n self.aws_key_id = check.str_param(aws_key_id, "aws_key_id")\n self.aws_key_secret = check.str_param(aws_key_secret, "aws_key_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n password: str,\n database: str,\n loading_method: Union[\n "FireboltDestination.SQLInserts", "FireboltDestination.ExternalTableViaS3"\n ],\n account: Optional[str] = None,\n host: Optional[str] = None,\n engine: Optional[str] = None,\n ):\n """Airbyte Destination for Firebolt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/firebolt\n\n Args:\n name (str): The name of the destination.\n username (str): Firebolt email address you use to login.\n password (str): Firebolt password.\n account (Optional[str]): Firebolt account to login.\n host (Optional[str]): The host name of your Firebolt database.\n database (str): The database to connect to.\n engine (Optional[str]): Engine name or url to connect to.\n loading_method (Union[FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3]): Loading method used to select the way data will be uploaded to Firebolt\n """\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.account = check.opt_str_param(account, "account")\n self.host = check.opt_str_param(host, "host")\n self.database = check.str_param(database, "database")\n self.engine = check.opt_str_param(engine, "engine")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3),\n )\n super().__init__("Firebolt", name)
\n\n\n
[docs]class GoogleSheetsDestination(GeneratedAirbyteDestination):\n
[docs] class AuthenticationViaGoogleOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n spreadsheet_id: str,\n credentials: "GoogleSheetsDestination.AuthenticationViaGoogleOAuth",\n ):\n """Airbyte Destination for Google Sheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/google-sheets\n\n Args:\n name (str): The name of the destination.\n spreadsheet_id (str): The link to your spreadsheet. See this guide for more details.\n credentials (GoogleSheetsDestination.AuthenticationViaGoogleOAuth): Google API Credentials for connecting to Google Sheets and Google Drive APIs\n """\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.credentials = check.inst_param(\n credentials, "credentials", GoogleSheetsDestination.AuthenticationViaGoogleOAuth\n )\n super().__init__("Google Sheets", name)
\n\n\n
[docs]class DatabricksDestination(GeneratedAirbyteDestination):\n
[docs] class AmazonS3:\n
[docs] @public\n def __init__(\n self,\n data_source_type: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n s3_bucket_region: str,\n s3_access_key_id: str,\n s3_secret_access_key: str,\n file_name_pattern: Optional[str] = None,\n ):\n self.data_source_type = check.str_param(data_source_type, "data_source_type")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.s3_access_key_id = check.str_param(s3_access_key_id, "s3_access_key_id")\n self.s3_secret_access_key = check.str_param(\n s3_secret_access_key, "s3_secret_access_key"\n )\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")
\n\n
[docs] class AzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n data_source_type: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_container_name: str,\n azure_blob_storage_sas_token: str,\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n ):\n self.data_source_type = check.str_param(data_source_type, "data_source_type")\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_container_name = check.str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_sas_token = check.str_param(\n azure_blob_storage_sas_token, "azure_blob_storage_sas_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n accept_terms: bool,\n databricks_server_hostname: str,\n databricks_http_path: str,\n databricks_personal_access_token: str,\n data_source: Union[\n "DatabricksDestination.AmazonS3", "DatabricksDestination.AzureBlobStorage"\n ],\n databricks_port: Optional[str] = None,\n database_schema: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n ):\n """Airbyte Destination for Databricks.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/databricks\n\n Args:\n name (str): The name of the destination.\n accept_terms (bool): You must agree to the Databricks JDBC Driver Terms & Conditions to use this connector.\n databricks_server_hostname (str): Databricks Cluster Server Hostname.\n databricks_http_path (str): Databricks Cluster HTTP Path.\n databricks_port (Optional[str]): Databricks Cluster Port.\n databricks_personal_access_token (str): Databricks Personal Access Token for making authenticated requests.\n database_schema (Optional[str]): The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is "public".\n data_source (Union[DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage]): Storage on which the delta lake is built.\n purge_staging_data (Optional[bool]): Default to 'true'. 
Switch it to 'false' for debugging purpose.\n """\n self.accept_terms = check.bool_param(accept_terms, "accept_terms")\n self.databricks_server_hostname = check.str_param(\n databricks_server_hostname, "databricks_server_hostname"\n )\n self.databricks_http_path = check.str_param(databricks_http_path, "databricks_http_path")\n self.databricks_port = check.opt_str_param(databricks_port, "databricks_port")\n self.databricks_personal_access_token = check.str_param(\n databricks_personal_access_token, "databricks_personal_access_token"\n )\n self.database_schema = check.opt_str_param(database_schema, "database_schema")\n self.data_source = check.inst_param(\n data_source,\n "data_source",\n (DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage),\n )\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n super().__init__("Databricks", name)
\n\n\n
[docs]class BigqueryDenormalizedDestination(GeneratedAirbyteDestination):\n
[docs] class StandardInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = "HMAC_KEY"\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class GCSStaging:\n
[docs] @public\n def __init__(\n self,\n credential: "BigqueryDenormalizedDestination.HMACKey",\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n keep_files_in_gcs_bucket: Optional[str] = None,\n ):\n self.method = "GCS Staging"\n self.credential = check.inst_param(\n credential, "credential", BigqueryDenormalizedDestination.HMACKey\n )\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.keep_files_in_gcs_bucket = check.opt_str_param(\n keep_files_in_gcs_bucket, "keep_files_in_gcs_bucket"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n project_id: str,\n dataset_id: str,\n loading_method: Union[\n "BigqueryDenormalizedDestination.StandardInserts",\n "BigqueryDenormalizedDestination.GCSStaging",\n ],\n credentials_json: Optional[str] = None,\n dataset_location: Optional[str] = None,\n big_query_client_buffer_size_mb: Optional[int] = None,\n ):\n """Airbyte Destination for Bigquery Denormalized.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset. Read more here.\n dataset_id (str): The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.\n loading_method (Union[BigqueryDenormalizedDestination.StandardInserts, BigqueryDenormalizedDestination.GCSStaging]): Loading method used to send select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n dataset_location (Optional[str]): The location of the dataset. Warning: Changes made after creation will not be applied. The default "US" value is used if not set explicitly. Read more here.\n big_query_client_buffer_size_mb (Optional[int]): Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. 
Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_id = check.str_param(dataset_id, "dataset_id")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (\n BigqueryDenormalizedDestination.StandardInserts,\n BigqueryDenormalizedDestination.GCSStaging,\n ),\n )\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n self.dataset_location = check.opt_str_param(dataset_location, "dataset_location")\n self.big_query_client_buffer_size_mb = check.opt_int_param(\n big_query_client_buffer_size_mb, "big_query_client_buffer_size_mb"\n )\n super().__init__("Bigquery Denormalized", name)
\n\n\n
[docs]class SqliteDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Sqlite.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/sqlite\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the sqlite.db file. The file will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Sqlite", name)
\n\n\n
[docs]class MongodbDestination(GeneratedAirbyteDestination):\n
[docs] class StandaloneMongoDbInstance:\n
[docs] @public\n def __init__(self, instance: str, host: str, port: int, tls: Optional[bool] = None):\n self.instance = check.str_param(instance, "instance")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.tls = check.opt_bool_param(tls, "tls")
\n\n
[docs] class ReplicaSet:\n
[docs] @public\n def __init__(self, instance: str, server_addresses: str, replica_set: Optional[str] = None):\n self.instance = check.str_param(instance, "instance")\n self.server_addresses = check.str_param(server_addresses, "server_addresses")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")
\n\n
[docs] class MongoDBAtlas:\n
[docs] @public\n def __init__(self, instance: str, cluster_url: str):\n self.instance = check.str_param(instance, "instance")\n self.cluster_url = check.str_param(cluster_url, "cluster_url")
\n\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.authorization = "none"
\n\n
[docs] class LoginPassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.authorization = "login/password"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_type: Union[\n "MongodbDestination.StandaloneMongoDbInstance",\n "MongodbDestination.ReplicaSet",\n "MongodbDestination.MongoDBAtlas",\n ],\n database: str,\n auth_type: Union["MongodbDestination.None_", "MongodbDestination.LoginPassword"],\n ):\n r"""Airbyte Destination for Mongodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mongodb\n\n Args:\n name (str): The name of the destination.\n instance_type (Union[MongodbDestination.StandaloneMongoDbInstance, MongodbDestination.ReplicaSet, MongodbDestination.MongoDBAtlas]): MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.\n database (str): Name of the database.\n auth_type (Union[MongodbDestination.None\\\\_, MongodbDestination.LoginPassword]): Authorization type.\n """\n self.instance_type = check.inst_param(\n instance_type,\n "instance_type",\n (\n MongodbDestination.StandaloneMongoDbInstance,\n MongodbDestination.ReplicaSet,\n MongodbDestination.MongoDBAtlas,\n ),\n )\n self.database = check.str_param(database, "database")\n self.auth_type = check.inst_param(\n auth_type, "auth_type", (MongodbDestination.None_, MongodbDestination.LoginPassword)\n )\n super().__init__("Mongodb", name)
\n\n\n
[docs]class RocksetDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, api_key: str, workspace: str, api_server: Optional[str] = None):\n """Airbyte Destination for Rockset.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/rockset\n\n Args:\n name (str): The name of the destination.\n api_key (str): Rockset api key\n workspace (str): The Rockset workspace in which collections will be created + written to.\n api_server (Optional[str]): Rockset api URL\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.workspace = check.str_param(workspace, "workspace")\n self.api_server = check.opt_str_param(api_server, "api_server")\n super().__init__("Rockset", name)
\n\n\n
[docs]class OracleDestination(GeneratedAirbyteDestination):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class NativeNetworkEncryptionNNE:\n
[docs] @public\n def __init__(self, encryption_algorithm: Optional[str] = None):\n self.encryption_method = "client_nne"\n self.encryption_algorithm = check.opt_str_param(\n encryption_algorithm, "encryption_algorithm"\n )
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n sid: str,\n username: str,\n encryption: Union[\n "OracleDestination.Unencrypted",\n "OracleDestination.NativeNetworkEncryptionNNE",\n "OracleDestination.TLSEncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n schema: Optional[str] = None,\n ):\n """Airbyte Destination for Oracle.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/oracle\n\n Args:\n name (str): The name of the destination.\n host (str): The hostname of the database.\n port (int): The port of the database.\n sid (str): The System Identifier uniquely distinguishes the instance from any other instance on the same computer.\n username (str): The username to access the database. This user must have CREATE USER privileges in the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n schema (Optional[str]): The default schema is used as the target schema for all statements issued from the connection that do not explicitly specify a schema name. The usual value for this field is "airbyte". 
In Oracle, schemas and users are the same thing, so the "user" parameter is used as the login credentials and this is used for the default Airbyte message schema.\n encryption (Union[OracleDestination.Unencrypted, OracleDestination.NativeNetworkEncryptionNNE, OracleDestination.TLSEncryptedVerifyCertificate]): The encryption method which is used when communicating with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.sid = check.str_param(sid, "sid")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.schema = check.opt_str_param(schema, "schema")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (\n OracleDestination.Unencrypted,\n OracleDestination.NativeNetworkEncryptionNNE,\n OracleDestination.TLSEncryptedVerifyCertificate,\n ),\n )\n super().__init__("Oracle", name)
\n\n\n
[docs]class CsvDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Csv.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-csv\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the directory where csv files will be written. The destination uses the local mount "/local" and any data files will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Csv", name)
\n\n\n
[docs]class S3Destination(GeneratedAirbyteDestination):\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self, codec: str, compression_level: int, include_checksum: Optional[bool] = None\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "S3Destination.NoCompression",\n "S3Destination.Deflate",\n "S3Destination.Bzip2",\n "S3Destination.Xz",\n "S3Destination.Zstandard",\n "S3Destination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n S3Destination.NoCompression,\n S3Destination.Deflate,\n S3Destination.Bzip2,\n S3Destination.Xz,\n S3Destination.Zstandard,\n S3Destination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n flattening: str,\n compression: Union["S3Destination.NoCompression", "S3Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (S3Destination.NoCompression, S3Destination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["S3Destination.NoCompression", "S3Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (S3Destination.NoCompression, S3Destination.GZIP)\n )
\n\n
[docs] class ParquetColumnarStorage:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Optional[str] = None,\n block_size_mb: Optional[int] = None,\n max_padding_size_mb: Optional[int] = None,\n page_size_kb: Optional[int] = None,\n dictionary_page_size_kb: Optional[int] = None,\n dictionary_encoding: Optional[bool] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.opt_str_param(compression_codec, "compression_codec")\n self.block_size_mb = check.opt_int_param(block_size_mb, "block_size_mb")\n self.max_padding_size_mb = check.opt_int_param(\n max_padding_size_mb, "max_padding_size_mb"\n )\n self.page_size_kb = check.opt_int_param(page_size_kb, "page_size_kb")\n self.dictionary_page_size_kb = check.opt_int_param(\n dictionary_page_size_kb, "dictionary_page_size_kb"\n )\n self.dictionary_encoding = check.opt_bool_param(\n dictionary_encoding, "dictionary_encoding"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n s3_bucket_region: str,\n format: Union[\n "S3Destination.AvroApacheAvro",\n "S3Destination.CSVCommaSeparatedValues",\n "S3Destination.JSONLinesNewlineDelimitedJSON",\n "S3Destination.ParquetColumnarStorage",\n ],\n access_key_id: Optional[str] = None,\n secret_access_key: Optional[str] = None,\n s3_endpoint: Optional[str] = None,\n s3_path_format: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n ):\n """Airbyte Destination for S3.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/s3\n\n Args:\n name (str): The name of the destination.\n access_key_id (Optional[str]): The access key ID to access the S3 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.\n secret_access_key (Optional[str]): The corresponding secret to the access key ID. Read more here\n s3_bucket_name (str): The name of the S3 bucket. Read more here.\n s3_bucket_path (str): Directory under the S3 bucket where data will be written. Read more here\n s3_bucket_region (str): The region of the S3 bucket. See here for all region codes.\n format (Union[S3Destination.AvroApacheAvro, S3Destination.CSVCommaSeparatedValues, S3Destination.JSONLinesNewlineDelimitedJSON, S3Destination.ParquetColumnarStorage]): Format of the data output. See here for more details\n s3_endpoint (Optional[str]): Your S3 endpoint url. Read more here\n s3_path_format (Optional[str]): Format string on how data will be organized inside the S3 bucket directory. 
Read more here\n file_name_pattern (Optional[str]): The pattern allows you to set the file-name format for the S3 staging file(s)\n """\n self.access_key_id = check.opt_str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.opt_str_param(secret_access_key, "secret_access_key")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.format = check.inst_param(\n format,\n "format",\n (\n S3Destination.AvroApacheAvro,\n S3Destination.CSVCommaSeparatedValues,\n S3Destination.JSONLinesNewlineDelimitedJSON,\n S3Destination.ParquetColumnarStorage,\n ),\n )\n self.s3_endpoint = check.opt_str_param(s3_endpoint, "s3_endpoint")\n self.s3_path_format = check.opt_str_param(s3_path_format, "s3_path_format")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n super().__init__("S3", name)
\n\n\n
[docs]class AwsDatalakeDestination(GeneratedAirbyteDestination):\n
[docs] class IAMRole:\n
[docs] @public\n def __init__(self, role_arn: str):\n self.credentials_title = "IAM Role"\n self.role_arn = check.str_param(role_arn, "role_arn")
\n\n
[docs] class IAMUser:\n
[docs] @public\n def __init__(self, aws_access_key_id: str, aws_secret_access_key: str):\n self.credentials_title = "IAM User"\n self.aws_access_key_id = check.str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n region: str,\n credentials: Union["AwsDatalakeDestination.IAMRole", "AwsDatalakeDestination.IAMUser"],\n bucket_name: str,\n bucket_prefix: str,\n aws_account_id: Optional[str] = None,\n lakeformation_database_name: Optional[str] = None,\n ):\n """Airbyte Destination for Aws Datalake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/aws-datalake\n\n Args:\n name (str): The name of the destination.\n aws_account_id (Optional[str]): target aws account id\n region (str): Region name\n credentials (Union[AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser]): Choose How to Authenticate to AWS.\n bucket_name (str): Name of the bucket\n bucket_prefix (str): S3 prefix\n lakeformation_database_name (Optional[str]): Which database to use\n """\n self.aws_account_id = check.opt_str_param(aws_account_id, "aws_account_id")\n self.region = check.str_param(region, "region")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser),\n )\n self.bucket_name = check.str_param(bucket_name, "bucket_name")\n self.bucket_prefix = check.str_param(bucket_prefix, "bucket_prefix")\n self.lakeformation_database_name = check.opt_str_param(\n lakeformation_database_name, "lakeformation_database_name"\n )\n super().__init__("Aws Datalake", name)
\n\n\n
[docs]class MssqlDestination(GeneratedAirbyteDestination):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "unencrypted"
\n\n
[docs] class EncryptedTrustServerCertificate:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "encrypted_trust_server_certificate"
\n\n
[docs] class EncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, hostNameInCertificate: Optional[str] = None):\n self.ssl_method = "encrypted_verify_certificate"\n self.hostNameInCertificate = check.opt_str_param(\n hostNameInCertificate, "hostNameInCertificate"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n schema: str,\n username: str,\n ssl_method: Union[\n "MssqlDestination.Unencrypted",\n "MssqlDestination.EncryptedTrustServerCertificate",\n "MssqlDestination.EncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mssql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql\n\n Args:\n name (str): The name of the destination.\n host (str): The host name of the MSSQL database.\n port (int): The port of the MSSQL database.\n database (str): The name of the MSSQL database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. The usual value for this field is "public".\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. 
(example: key1=value1&key2=value2&key3=value3).\n ssl_method (Union[MssqlDestination.Unencrypted, MssqlDestination.EncryptedTrustServerCertificate, MssqlDestination.EncryptedVerifyCertificate]): The encryption method which is used to communicate with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl_method = check.inst_param(\n ssl_method,\n "ssl_method",\n (\n MssqlDestination.Unencrypted,\n MssqlDestination.EncryptedTrustServerCertificate,\n MssqlDestination.EncryptedVerifyCertificate,\n ),\n )\n super().__init__("Mssql", name)
\n\n\n
[docs]class PubsubDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, project_id: str, topic_id: str, credentials_json: str):\n """Airbyte Destination for Pubsub.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/pubsub\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target PubSub.\n topic_id (str): The PubSub topic ID in the given GCP project ID.\n credentials_json (str): The contents of the JSON service account key. Check out the docs if you need help generating this key.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.topic_id = check.str_param(topic_id, "topic_id")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n super().__init__("Pubsub", name)
\n\n\n
[docs]class R2Destination(GeneratedAirbyteDestination):\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self, codec: str, compression_level: int, include_checksum: Optional[bool] = None\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "R2Destination.NoCompression",\n "R2Destination.Deflate",\n "R2Destination.Bzip2",\n "R2Destination.Xz",\n "R2Destination.Zstandard",\n "R2Destination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n R2Destination.NoCompression,\n R2Destination.Deflate,\n R2Destination.Bzip2,\n R2Destination.Xz,\n R2Destination.Zstandard,\n R2Destination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n flattening: str,\n compression: Union["R2Destination.NoCompression", "R2Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (R2Destination.NoCompression, R2Destination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["R2Destination.NoCompression", "R2Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (R2Destination.NoCompression, R2Destination.GZIP)\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n access_key_id: str,\n secret_access_key: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n format: Union[\n "R2Destination.AvroApacheAvro",\n "R2Destination.CSVCommaSeparatedValues",\n "R2Destination.JSONLinesNewlineDelimitedJSON",\n ],\n s3_path_format: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n ):\n """Airbyte Destination for R2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/r2\n\n Args:\n name (str): The name of the destination.\n account_id (str): Cloudflare account ID\n access_key_id (str): The access key ID to access the R2 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.\n secret_access_key (str): The corresponding secret to the access key ID. Read more here\n s3_bucket_name (str): The name of the R2 bucket. Read more here.\n s3_bucket_path (str): Directory under the R2 bucket where data will be written.\n format (Union[R2Destination.AvroApacheAvro, R2Destination.CSVCommaSeparatedValues, R2Destination.JSONLinesNewlineDelimitedJSON]): Format of the data output. See here for more details\n s3_path_format (Optional[str]): Format string on how data will be organized inside the R2 bucket directory. 
Read more here\n file_name_pattern (Optional[str]): The pattern allows you to set the file-name format for the R2 staging file(s)\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.format = check.inst_param(\n format,\n "format",\n (\n R2Destination.AvroApacheAvro,\n R2Destination.CSVCommaSeparatedValues,\n R2Destination.JSONLinesNewlineDelimitedJSON,\n ),\n )\n self.s3_path_format = check.opt_str_param(s3_path_format, "s3_path_format")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n super().__init__("R2", name)
\n\n\n
[docs]class JdbcDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n jdbc_url: str,\n password: Optional[str] = None,\n schema: Optional[str] = None,\n ):\n """Airbyte Destination for Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres\n\n Args:\n name (str): The name of the destination.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url (str): JDBC formatted url. See the standard here.\n schema (Optional[str]): If you leave the schema unspecified, JDBC defaults to a schema named "public".\n """\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url = check.str_param(jdbc_url, "jdbc_url")\n self.schema = check.opt_str_param(schema, "schema")\n super().__init__("Jdbc", name)
\n\n\n
[docs]class KeenDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self, name: str, project_id: str, api_key: str, infer_timestamp: Optional[bool] = None\n ):\n """Airbyte Destination for Keen.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/keen\n\n Args:\n name (str): The name of the destination.\n project_id (str): To get Keen Project ID, navigate to the Access tab from the left-hand, side panel and check the Project Details section.\n api_key (str): To get Keen Master API Key, navigate to the Access tab from the left-hand, side panel and check the Project Details section.\n infer_timestamp (Optional[bool]): Allow connector to guess keen.timestamp value based on the streamed data.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.api_key = check.str_param(api_key, "api_key")\n self.infer_timestamp = check.opt_bool_param(infer_timestamp, "infer_timestamp")\n super().__init__("Keen", name)
\n\n\n
[docs]class TidbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Tidb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/tidb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Tidb", name)
\n\n\n
[docs]class FirestoreDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, project_id: str, credentials_json: Optional[str] = None):\n """Airbyte Destination for Firestore.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/firestore\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n super().__init__("Firestore", name)
\n\n\n
[docs]class ScyllaDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n keyspace: str,\n username: str,\n password: str,\n address: str,\n port: int,\n replication: Optional[int] = None,\n ):\n """Airbyte Destination for Scylla.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/scylla\n\n Args:\n name (str): The name of the destination.\n keyspace (str): Default Scylla keyspace to create data in.\n username (str): Username to use to access Scylla.\n password (str): Password associated with Scylla.\n address (str): Address to connect to.\n port (int): Port of Scylla.\n replication (Optional[int]): Indicates to how many nodes the data should be replicated to.\n """\n self.keyspace = check.str_param(keyspace, "keyspace")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.address = check.str_param(address, "address")\n self.port = check.int_param(port, "port")\n self.replication = check.opt_int_param(replication, "replication")\n super().__init__("Scylla", name)
\n\n\n
[docs]class RedisDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self, name: str, host: str, port: int, username: str, password: str, cache_type: str\n ):\n """Airbyte Destination for Redis.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redis\n\n Args:\n name (str): The name of the destination.\n host (str): Redis host to connect to.\n port (int): Port of Redis.\n username (str): Username associated with Redis.\n password (str): Password associated with Redis.\n cache_type (str): Redis cache type to store data in.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.cache_type = check.str_param(cache_type, "cache_type")\n super().__init__("Redis", name)
\n\n\n
[docs]class MqttDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n broker_host: str,\n broker_port: int,\n use_tls: bool,\n topic_pattern: str,\n publisher_sync: bool,\n connect_timeout: int,\n automatic_reconnect: bool,\n clean_session: bool,\n message_retained: bool,\n message_qos: str,\n username: Optional[str] = None,\n password: Optional[str] = None,\n topic_test: Optional[str] = None,\n client: Optional[str] = None,\n ):\n """Airbyte Destination for Mqtt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mqtt\n\n Args:\n name (str): The name of the destination.\n broker_host (str): Host of the broker to connect to.\n broker_port (int): Port of the broker.\n use_tls (bool): Whether to use TLS encryption on the connection.\n username (Optional[str]): User name to use for the connection.\n password (Optional[str]): Password to use for the connection.\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. 
Notice that the topic name will be transformed to a standard naming convention.\n topic_test (Optional[str]): Topic to test if Airbyte can produce messages.\n client (Optional[str]): A client identifier that is unique on the server being connected to.\n publisher_sync (bool): Wait synchronously until the record has been sent to the broker.\n connect_timeout (int): Maximum time interval (in seconds) the client will wait for the network connection to the MQTT server to be established.\n automatic_reconnect (bool): Whether the client will automatically attempt to reconnect to the server if the connection is lost.\n clean_session (bool): Whether the client and server should remember state across restarts and reconnects.\n message_retained (bool): Whether or not the publish message should be retained by the messaging engine.\n message_qos (str): Quality of service used for each message to be delivered.\n """\n self.broker_host = check.str_param(broker_host, "broker_host")\n self.broker_port = check.int_param(broker_port, "broker_port")\n self.use_tls = check.bool_param(use_tls, "use_tls")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.topic_test = check.opt_str_param(topic_test, "topic_test")\n self.client = check.opt_str_param(client, "client")\n self.publisher_sync = check.bool_param(publisher_sync, "publisher_sync")\n self.connect_timeout = check.int_param(connect_timeout, "connect_timeout")\n self.automatic_reconnect = check.bool_param(automatic_reconnect, "automatic_reconnect")\n self.clean_session = check.bool_param(clean_session, "clean_session")\n self.message_retained = check.bool_param(message_retained, "message_retained")\n self.message_qos = check.str_param(message_qos, "message_qos")\n super().__init__("Mqtt", name)
\n\n\n
[docs]class RedshiftDestination(GeneratedAirbyteDestination):\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class NoEncryption:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_type = "none"
\n\n
[docs] class AESCBCEnvelopeEncryption:\n
[docs] @public\n def __init__(self, key_encrypting_key: Optional[str] = None):\n self.encryption_type = "aes_cbc_envelope"\n self.key_encrypting_key = check.opt_str_param(key_encrypting_key, "key_encrypting_key")
\n\n
[docs] class S3Staging:\n
[docs] @public\n def __init__(\n self,\n s3_bucket_name: str,\n s3_bucket_region: str,\n access_key_id: str,\n secret_access_key: str,\n encryption: Union[\n "RedshiftDestination.NoEncryption", "RedshiftDestination.AESCBCEnvelopeEncryption"\n ],\n s3_bucket_path: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n ):\n self.method = "S3 Staging"\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.opt_str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (RedshiftDestination.NoEncryption, RedshiftDestination.AESCBCEnvelopeEncryption),\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n username: str,\n password: str,\n database: str,\n schema: str,\n uploading_method: Union["RedshiftDestination.Standard", "RedshiftDestination.S3Staging"],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Redshift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift\n\n Args:\n name (str): The name of the destination.\n host (str): Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com)\n port (int): Port of the database.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n database (str): Name of the database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is "public".\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n uploading_method (Union[RedshiftDestination.Standard, RedshiftDestination.S3Staging]): The method how the data will be uploaded to the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.uploading_method = check.inst_param(\n uploading_method,\n "uploading_method",\n (RedshiftDestination.Standard, RedshiftDestination.S3Staging),\n )\n super().__init__("Redshift", name)
\n\n\n
[docs]class PulsarDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n brokers: str,\n use_tls: bool,\n topic_type: str,\n topic_tenant: str,\n topic_namespace: str,\n topic_pattern: str,\n compression_type: str,\n send_timeout_ms: int,\n max_pending_messages: int,\n max_pending_messages_across_partitions: int,\n batching_enabled: bool,\n batching_max_messages: int,\n batching_max_publish_delay: int,\n block_if_queue_full: bool,\n topic_test: Optional[str] = None,\n producer_name: Optional[str] = None,\n producer_sync: Optional[bool] = None,\n ):\n """Airbyte Destination for Pulsar.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/pulsar\n\n Args:\n name (str): The name of the destination.\n brokers (str): A list of host/port pairs to use for establishing the initial connection to the Pulsar cluster.\n use_tls (bool): Whether to use TLS encryption on the connection.\n topic_type (str): It identifies type of topic. Pulsar supports two kind of topics: persistent and non-persistent. In persistent topic, all messages are durably persisted on disk (that means on multiple disks unless the broker is standalone), whereas non-persistent topic does not persist message into storage disk.\n topic_tenant (str): The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters.\n topic_namespace (str): The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the namespace level. Each tenant has one or multiple namespaces.\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.\n topic_test (Optional[str]): Topic to test if Airbyte can produce messages.\n producer_name (Optional[str]): Name for the producer. 
If not filled, the system will generate a globally unique name which can be accessed with.\n producer_sync (Optional[bool]): Wait synchronously until the record has been sent to Pulsar.\n compression_type (str): Compression type for the producer.\n send_timeout_ms (int): If a message is not acknowledged by a server before the send-timeout expires, an error occurs (in ms).\n max_pending_messages (int): The maximum size of a queue holding pending messages.\n max_pending_messages_across_partitions (int): The maximum number of pending messages across partitions.\n batching_enabled (bool): Control whether automatic batching of messages is enabled for the producer.\n batching_max_messages (int): Maximum number of messages permitted in a batch.\n batching_max_publish_delay (int): Time period in milliseconds within which the messages sent will be batched.\n block_if_queue_full (bool): If the send operation should block when the outgoing message queue is full.\n """\n self.brokers = check.str_param(brokers, "brokers")\n self.use_tls = check.bool_param(use_tls, "use_tls")\n self.topic_type = check.str_param(topic_type, "topic_type")\n self.topic_tenant = check.str_param(topic_tenant, "topic_tenant")\n self.topic_namespace = check.str_param(topic_namespace, "topic_namespace")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.topic_test = check.opt_str_param(topic_test, "topic_test")\n self.producer_name = check.opt_str_param(producer_name, "producer_name")\n self.producer_sync = check.opt_bool_param(producer_sync, "producer_sync")\n self.compression_type = check.str_param(compression_type, "compression_type")\n self.send_timeout_ms = check.int_param(send_timeout_ms, "send_timeout_ms")\n self.max_pending_messages = check.int_param(max_pending_messages, "max_pending_messages")\n self.max_pending_messages_across_partitions = check.int_param(\n max_pending_messages_across_partitions, "max_pending_messages_across_partitions"\n )\n self.batching_enabled 
= check.bool_param(batching_enabled, "batching_enabled")\n self.batching_max_messages = check.int_param(batching_max_messages, "batching_max_messages")\n self.batching_max_publish_delay = check.int_param(\n batching_max_publish_delay, "batching_max_publish_delay"\n )\n self.block_if_queue_full = check.bool_param(block_if_queue_full, "block_if_queue_full")\n super().__init__("Pulsar", name)
\n\n\n
[docs]class SnowflakeDestination(GeneratedAirbyteDestination):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class KeyPairAuthentication:\n
[docs] @public\n def __init__(\n self,\n private_key: str,\n auth_type: Optional[str] = None,\n private_key_password: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.private_key = check.str_param(private_key, "private_key")\n self.private_key_password = check.opt_str_param(\n private_key_password, "private_key_password"\n )
\n\n
[docs] class UsernameAndPassword:\n
[docs] @public\n def __init__(self, password: str):\n self.password = check.str_param(password, "password")
\n\n
[docs] class SelectAnotherOption:\n
[docs] @public\n def __init__(self, method: str):\n self.method = check.str_param(method, "method")
\n\n
[docs] class RecommendedInternalStaging:\n
[docs] @public\n def __init__(self, method: str):\n self.method = check.str_param(method, "method")
\n\n
[docs] class NoEncryption:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_type = "none"
\n\n
[docs] class AESCBCEnvelopeEncryption:\n
[docs] @public\n def __init__(self, key_encrypting_key: Optional[str] = None):\n self.encryption_type = "aes_cbc_envelope"\n self.key_encrypting_key = check.opt_str_param(key_encrypting_key, "key_encrypting_key")
\n\n
[docs] class AWSS3Staging:\n
[docs] @public\n def __init__(\n self,\n method: str,\n s3_bucket_name: str,\n access_key_id: str,\n secret_access_key: str,\n encryption: Union[\n "SnowflakeDestination.NoEncryption", "SnowflakeDestination.AESCBCEnvelopeEncryption"\n ],\n s3_bucket_region: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n file_name_pattern: Optional[str] = None,\n ):\n self.method = check.str_param(method, "method")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_region = check.opt_str_param(s3_bucket_region, "s3_bucket_region")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (SnowflakeDestination.NoEncryption, SnowflakeDestination.AESCBCEnvelopeEncryption),\n )\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")
\n\n
[docs] class GoogleCloudStorageStaging:\n
[docs] @public\n def __init__(self, method: str, project_id: str, bucket_name: str, credentials_json: str):\n self.method = check.str_param(method, "method")\n self.project_id = check.str_param(project_id, "project_id")\n self.bucket_name = check.str_param(bucket_name, "bucket_name")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] class AzureBlobStorageStaging:\n
[docs] @public\n def __init__(\n self,\n method: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_container_name: str,\n azure_blob_storage_sas_token: str,\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n ):\n self.method = check.str_param(method, "method")\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_container_name = check.str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_sas_token = check.str_param(\n azure_blob_storage_sas_token, "azure_blob_storage_sas_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n role: str,\n warehouse: str,\n database: str,\n schema: str,\n username: str,\n credentials: Union[\n "SnowflakeDestination.OAuth20",\n "SnowflakeDestination.KeyPairAuthentication",\n "SnowflakeDestination.UsernameAndPassword",\n ],\n loading_method: Union[\n "SnowflakeDestination.SelectAnotherOption",\n "SnowflakeDestination.RecommendedInternalStaging",\n "SnowflakeDestination.AWSS3Staging",\n "SnowflakeDestination.GoogleCloudStorageStaging",\n "SnowflakeDestination.AzureBlobStorageStaging",\n ],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Snowflake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/snowflake\n\n Args:\n name (str): The name of the destination.\n host (str): Enter your Snowflake account's locator (in the format ...snowflakecomputing.com)\n role (str): Enter the role that you want to use to access Snowflake\n warehouse (str): Enter the name of the warehouse that you want to sync data into\n database (str): Enter the name of the database you want to sync data into\n schema (str): Enter the name of the default schema\n username (str): Enter the name of the user you want to use to access the database\n jdbc_url_params (Optional[str]): Enter the additional properties to pass to the JDBC URL string when connecting to the database (formatted as key=value pairs separated by the symbol &). 
Example: key1=value1&key2=value2&key3=value3\n loading_method (Union[SnowflakeDestination.SelectAnotherOption, SnowflakeDestination.RecommendedInternalStaging, SnowflakeDestination.AWSS3Staging, SnowflakeDestination.GoogleCloudStorageStaging, SnowflakeDestination.AzureBlobStorageStaging]): Select a data staging method\n """\n self.host = check.str_param(host, "host")\n self.role = check.str_param(role, "role")\n self.warehouse = check.str_param(warehouse, "warehouse")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n SnowflakeDestination.OAuth20,\n SnowflakeDestination.KeyPairAuthentication,\n SnowflakeDestination.UsernameAndPassword,\n ),\n )\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (\n SnowflakeDestination.SelectAnotherOption,\n SnowflakeDestination.RecommendedInternalStaging,\n SnowflakeDestination.AWSS3Staging,\n SnowflakeDestination.GoogleCloudStorageStaging,\n SnowflakeDestination.AzureBlobStorageStaging,\n ),\n )\n super().__init__("Snowflake", name)
\n\n\n
[docs]class PostgresDestination(GeneratedAirbyteDestination):\n
[docs] class Disable:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "disable"
\n\n
[docs] class Allow:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "allow"
\n\n
[docs] class Prefer:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "prefer"
\n\n
[docs] class Require:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "require"
\n\n
[docs] class VerifyCa:\n
[docs] @public\n def __init__(self, ca_certificate: str, client_key_password: Optional[str] = None):\n self.mode = "verify-ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyFull:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: str,\n client_key: str,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-full"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.str_param(client_certificate, "client_certificate")\n self.client_key = check.str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n schema: str,\n username: str,\n ssl_mode: Union[\n "PostgresDestination.Disable",\n "PostgresDestination.Allow",\n "PostgresDestination.Prefer",\n "PostgresDestination.Require",\n "PostgresDestination.VerifyCa",\n "PostgresDestination.VerifyFull",\n ],\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Postgres.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. The usual value for this field is "public".\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL. When activating SSL, please select one of the connection modes.\n ssl_mode (Union[PostgresDestination.Disable, PostgresDestination.Allow, PostgresDestination.Prefer, PostgresDestination.Require, PostgresDestination.VerifyCa, PostgresDestination.VerifyFull]): SSL connection modes. disable - Chose this mode to disable encryption of communication between Airbyte and destination database allow - Chose this mode to enable encryption only when required by the source database prefer - Chose this mode to allow unencrypted connection only if the source database does not support encryption require - Chose this mode to always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Chose this mode to always require encryption and to verify that the source database server has a valid SSL certificate verify-full - This is the most secure mode. 
Chose this mode to always require encryption and to verify the identity of the source database server See more information - in the docs.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n PostgresDestination.Disable,\n PostgresDestination.Allow,\n PostgresDestination.Prefer,\n PostgresDestination.Require,\n PostgresDestination.VerifyCa,\n PostgresDestination.VerifyFull,\n ),\n )\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Postgres", name)
\n\n\n
[docs]class ScaffoldDestinationPythonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, TODO: Optional[str] = None):\n """Airbyte Destination for Scaffold Destination Python.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/scaffold-destination-python\n\n Args:\n name (str): The name of the destination.\n TODO (Optional[str]): FIX ME\n """\n self.TODO = check.opt_str_param(TODO, "TODO")\n super().__init__("Scaffold Destination Python", name)
\n\n\n
[docs]class LocalJsonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Local Json.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-json\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the directory where json files will be written. The files will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Local Json", name)
\n\n\n
[docs]class MeilisearchDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, host: str, api_key: Optional[str] = None):\n """Airbyte Destination for Meilisearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/meilisearch\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the MeiliSearch instance.\n api_key (Optional[str]): MeiliSearch API Key. See the docs for more information on how to obtain this key.\n """\n self.host = check.str_param(host, "host")\n self.api_key = check.opt_str_param(api_key, "api_key")\n super().__init__("Meilisearch", name)
\n
", "current_page_name": "_modules/dagster_airbyte/managed/generated/destinations", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.generated.destinations"}, "sources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.generated.sources

\n# ruff: noqa: A001, A002\nfrom typing import List, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom dagster_airbyte.managed.types import GeneratedAirbyteSource\n\n\n
[docs]class StravaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n athlete_id: int,\n start_date: str,\n auth_type: Optional[str] = None,\n ):\n """Airbyte Source for Strava.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/strava\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Strava developer application.\n client_secret (str): The Client Secret of your Strava developer application.\n refresh_token (str): The Refresh Token with the activity: read_all permissions.\n athlete_id (int): The Athlete ID of your Strava developer application.\n start_date (str): UTC date and time. Any data before this date will not be replicated.\n """\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.athlete_id = check.int_param(athlete_id, "athlete_id")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Strava", name)
\n\n\n
[docs]class AppsflyerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n app_id: str,\n api_token: str,\n start_date: str,\n timezone: Optional[str] = None,\n ):\n """Airbyte Source for Appsflyer.\n\n Args:\n name (str): The name of the destination.\n app_id (str): App identifier as found in AppsFlyer.\n api_token (str): Pull API token for authentication. If you change the account admin, the token changes, and you must update scripts with the new token. Get the API token in the Dashboard.\n start_date (str): The default value to use if no bookmark exists for an endpoint. Raw Reports historical lookback is limited to 90 days.\n timezone (Optional[str]): Time zone in which date times are stored. The project timezone may be found in the App settings in the AppsFlyer console.\n """\n self.app_id = check.str_param(app_id, "app_id")\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.timezone = check.opt_str_param(timezone, "timezone")\n super().__init__("Appsflyer", name)
\n\n\n
[docs]class GoogleWorkspaceAdminReportsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, credentials_json: str, email: str, lookback: Optional[int] = None\n ):\n """Airbyte Source for Google Workspace Admin Reports.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-workspace-admin-reports\n\n Args:\n name (str): The name of the destination.\n credentials_json (str): The contents of the JSON service account key. See the docs for more information on how to generate this key.\n email (str): The email of the user, who has permissions to access the Google Workspace Admin APIs.\n lookback (Optional[int]): Sets the range of time shown in the report. The maximum value allowed by the Google API is 180 days.\n """\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n self.email = check.str_param(email, "email")\n self.lookback = check.opt_int_param(lookback, "lookback")\n super().__init__("Google Workspace Admin Reports", name)
\n\n\n
[docs]class CartSource(GeneratedAirbyteSource):\n
[docs] class CentralAPIRouter:\n
[docs] @public\n def __init__(self, user_name: str, user_secret: str, site_id: str):\n self.auth_type = "CENTRAL_API_ROUTER"\n self.user_name = check.str_param(user_name, "user_name")\n self.user_secret = check.str_param(user_secret, "user_secret")\n self.site_id = check.str_param(site_id, "site_id")
\n\n
[docs] class SingleStoreAccessToken:\n
[docs] @public\n def __init__(self, access_token: str, store_name: str):\n self.auth_type = "SINGLE_STORE_ACCESS_TOKEN"\n self.access_token = check.str_param(access_token, "access_token")\n self.store_name = check.str_param(store_name, "store_name")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["CartSource.CentralAPIRouter", "CartSource.SingleStoreAccessToken"],\n start_date: str,\n ):\n """Airbyte Source for Cart.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/cart\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate the data\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (CartSource.CentralAPIRouter, CartSource.SingleStoreAccessToken),\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Cart", name)
\n\n\n
[docs]class LinkedinAdsSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_method: Optional[str] = None,\n ):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, auth_method: Optional[str] = None):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["LinkedinAdsSource.OAuth20", "LinkedinAdsSource.AccessToken"],\n start_date: str,\n account_ids: Optional[List[int]] = None,\n ):\n """Airbyte Source for Linkedin Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-ads\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date in the format 2020-09-17. Any data before this date will not be replicated.\n account_ids (Optional[List[int]]): Specify the account IDs separated by a space, to pull the data from. Leave empty, if you want to pull the data from all associated accounts. See the LinkedIn Ads docs for more info.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (LinkedinAdsSource.OAuth20, LinkedinAdsSource.AccessToken)\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.account_ids = check.opt_nullable_list_param(account_ids, "account_ids", int)\n super().__init__("Linkedin Ads", name)
\n\n\n
[docs]class MongodbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n user: str,\n password: str,\n auth_source: str,\n replica_set: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Mongodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb\n\n Args:\n name (str): The name of the destination.\n host (str): Host of a Mongo database to be replicated.\n port (int): Port of a Mongo database to be replicated.\n database (str): Database to be replicated.\n user (str): User\n password (str): Password\n auth_source (str): Authentication source where user information is stored. See the Mongo docs for more info.\n replica_set (Optional[str]): The name of the set to filter servers by, when connecting to a replica set (Under this condition, the 'TLS connection' value automatically becomes 'true'). See the Mongo docs for more info.\n ssl (Optional[bool]): If this switch is enabled, TLS connections will be used to connect to MongoDB.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.user = check.str_param(user, "user")\n self.password = check.str_param(password, "password")\n self.auth_source = check.str_param(auth_source, "auth_source")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Mongodb", name)
\n\n\n
[docs]class TimelySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, account_id: str, start_date: str, bearer_token: str):\n """Airbyte Source for Timely.\n\n Args:\n name (str): The name of the destination.\n account_id (str): Timely account id\n start_date (str): start date\n bearer_token (str): Timely bearer token\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.bearer_token = check.str_param(bearer_token, "bearer_token")\n super().__init__("Timely", name)
\n\n\n
[docs]class StockTickerApiTutorialSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, stock_ticker: str, api_key: str):\n """Airbyte Source for Stock Ticker Api Tutorial.\n\n Documentation can be found at https://polygon.io/docs/stocks/get_v2_aggs_grouped_locale_us_market_stocks__date\n\n Args:\n name (str): The name of the destination.\n stock_ticker (str): The stock ticker to track\n api_key (str): The Polygon.io Stocks API key to use to hit the API.\n """\n self.stock_ticker = check.str_param(stock_ticker, "stock_ticker")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Stock Ticker Api Tutorial", name)
\n\n\n
[docs]class WrikeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, access_token: str, wrike_instance: str, start_date: Optional[str] = None\n ):\n """Airbyte Source for Wrike.\n\n Args:\n name (str): The name of the destination.\n access_token (str): Permanent access token. You can find documentation on how to acquire a permanent access token here\n wrike_instance (str): Wrike's instance such as `app-us2.wrike.com`\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Only comments after this date will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.wrike_instance = check.str_param(wrike_instance, "wrike_instance")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Wrike", name)
\n\n\n
[docs]class CommercetoolsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n region: str,\n host: str,\n start_date: str,\n project_key: str,\n client_id: str,\n client_secret: str,\n ):\n """Airbyte Source for Commercetools.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/commercetools\n\n Args:\n name (str): The name of the destination.\n region (str): The region of the platform.\n host (str): The cloud provider your shop is hosted. See: https://docs.commercetools.com/api/authorization\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n project_key (str): The project key\n client_id (str): Id of API Client.\n client_secret (str): The password of secret of API Client.\n """\n self.region = check.str_param(region, "region")\n self.host = check.str_param(host, "host")\n self.start_date = check.str_param(start_date, "start_date")\n self.project_key = check.str_param(project_key, "project_key")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Commercetools", name)
\n\n\n
[docs]class GutendexSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n author_year_start: Optional[str] = None,\n author_year_end: Optional[str] = None,\n copyright: Optional[str] = None,\n languages: Optional[str] = None,\n search: Optional[str] = None,\n sort: Optional[str] = None,\n topic: Optional[str] = None,\n ):\n """Airbyte Source for Gutendex.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gutendex\n\n Args:\n name (str): The name of the destination.\n author_year_start (Optional[str]): (Optional) Defines the minimum birth year of the authors. Books by authors born prior to the start year will not be returned. Supports both positive (CE) or negative (BCE) integer values\n author_year_end (Optional[str]): (Optional) Defines the maximum birth year of the authors. Books by authors born after the end year will not be returned. Supports both positive (CE) or negative (BCE) integer values\n copyright (Optional[str]): (Optional) Use this to find books with a certain copyright status - true for books with existing copyrights, false for books in the public domain in the USA, or null for books with no available copyright information.\n languages (Optional[str]): (Optional) Use this to find books in any of a list of languages. They must be comma-separated, two-character language codes.\n search (Optional[str]): (Optional) Use this to search author names and book titles with given words. They must be separated by a space (i.e. 
%20 in URL-encoded format) and are case-insensitive.\n sort (Optional[str]): (Optional) Use this to sort books - ascending for Project Gutenberg ID numbers from lowest to highest, descending for IDs highest to lowest, or popular (the default) for most popular to least popular by number of downloads.\n topic (Optional[str]): (Optional) Use this to search for a case-insensitive key-phrase in books' bookshelves or subjects.\n """\n self.author_year_start = check.opt_str_param(author_year_start, "author_year_start")\n self.author_year_end = check.opt_str_param(author_year_end, "author_year_end")\n self.copyright = check.opt_str_param(copyright, "copyright")\n self.languages = check.opt_str_param(languages, "languages")\n self.search = check.opt_str_param(search, "search")\n self.sort = check.opt_str_param(sort, "sort")\n self.topic = check.opt_str_param(topic, "topic")\n super().__init__("Gutendex", name)
\n\n\n
[docs]class IterableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Iterable.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/iterable\n\n Args:\n name (str): The name of the destination.\n api_key (str): Iterable API Key. See the docs for more information on how to obtain this key.\n start_date (str): The date from which you'd like to replicate data for Iterable, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Iterable", name)
\n\n\n
[docs]class QuickbooksSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n realm_id: str,\n user_agent: str,\n start_date: str,\n sandbox: bool,\n ):\n """Airbyte Source for Quickbooks Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/quickbooks\n\n Args:\n name (str): The name of the destination.\n client_id (str): Identifies which app is making the request. Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.\n client_secret (str): Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.\n refresh_token (str): A token used when refreshing the access token.\n realm_id (str): Labeled Company ID. The Make API Calls panel is populated with the realm id and the current access token.\n user_agent (str): Process and email for API logging purposes. Example: tap-quickbooks .\n start_date (str): The default value to use if no bookmark exists for an endpoint (rfc3339 date string). E.g, 2021-03-20T00:00:00Z. Any data before this date will not be replicated.\n sandbox (bool): Determines whether to use the sandbox or production environment.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.realm_id = check.str_param(realm_id, "realm_id")\n self.user_agent = check.str_param(user_agent, "user_agent")\n self.start_date = check.str_param(start_date, "start_date")\n self.sandbox = check.bool_param(sandbox, "sandbox")\n super().__init__("Quickbooks Singer", name)
\n\n\n
[docs]class BigcommerceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, store_hash: str, access_token: str):\n """Airbyte Source for Bigcommerce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bigcommerce\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n store_hash (str): The hash code of the store. For https://api.bigcommerce.com/stores/HASH_CODE/v3/, The store's hash code is 'HASH_CODE'.\n access_token (str): Access Token for making authenticated requests.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.store_hash = check.str_param(store_hash, "store_hash")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Bigcommerce", name)
\n\n\n
[docs]class ShopifySource(GeneratedAirbyteSource):\n
[docs] class APIPassword:\n
[docs] @public\n def __init__(self, api_password: str):\n self.auth_method = "api_password"\n self.api_password = check.str_param(api_password, "api_password")
\n\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_method = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n shop: str,\n credentials: Union["ShopifySource.APIPassword", "ShopifySource.OAuth20"],\n start_date: str,\n ):\n """Airbyte Source for Shopify.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/shopify\n\n Args:\n name (str): The name of the destination.\n shop (str): The name of your Shopify store found in the URL. For example, if your URL was https://NAME.myshopify.com, then the name would be 'NAME'.\n credentials (Union[ShopifySource.APIPassword, ShopifySource.OAuth20]): The authorization method to use to retrieve data from Shopify\n start_date (str): The date you would like to replicate data from. Format: YYYY-MM-DD. Any data before this date will not be replicated.\n """\n self.shop = check.str_param(shop, "shop")\n self.credentials = check.inst_param(\n credentials, "credentials", (ShopifySource.APIPassword, ShopifySource.OAuth20)\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Shopify", name)
\n\n\n
[docs]class AppstoreSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, key_id: str, private_key: str, issuer_id: str, vendor: str, start_date: str\n ):\n """Airbyte Source for Appstore Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/appstore\n\n Args:\n name (str): The name of the destination.\n key_id (str): Appstore Key ID. See the docs for more information on how to obtain this key.\n private_key (str): Appstore Private Key. See the docs for more information on how to obtain this key.\n issuer_id (str): Appstore Issuer ID. See the docs for more information on how to obtain this ID.\n vendor (str): Appstore Vendor ID. See the docs for more information on how to obtain this ID.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.key_id = check.str_param(key_id, "key_id")\n self.private_key = check.str_param(private_key, "private_key")\n self.issuer_id = check.str_param(issuer_id, "issuer_id")\n self.vendor = check.str_param(vendor, "vendor")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Appstore Singer", name)
\n\n\n
[docs]class GreenhouseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Greenhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/greenhouse\n\n Args:\n name (str): The name of the destination.\n api_key (str): Greenhouse API Key. See the docs for more information on how to generate this key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Greenhouse", name)
\n\n\n
[docs]class ZoomSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, jwt: str):\n """Airbyte Source for Zoom Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zoom\n\n Args:\n name (str): The name of the destination.\n jwt (str): Zoom JWT Token. See the docs for more information on how to obtain this key.\n """\n self.jwt = check.str_param(jwt, "jwt")\n super().__init__("Zoom Singer", name)
\n\n\n
[docs]class TiktokMarketingSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self, app_id: str, secret: str, access_token: str, auth_type: Optional[str] = None\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.app_id = check.str_param(app_id, "app_id")\n self.secret = check.str_param(secret, "secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class SandboxAccessToken:\n
[docs] @public\n def __init__(self, advertiser_id: str, access_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.advertiser_id = check.str_param(advertiser_id, "advertiser_id")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "TiktokMarketingSource.OAuth20", "TiktokMarketingSource.SandboxAccessToken"\n ],\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n report_granularity: Optional[str] = None,\n ):\n """Airbyte Source for Tiktok Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tiktok-marketing\n\n Args:\n name (str): The name of the destination.\n credentials (Union[TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken]): Authentication method\n start_date (Optional[str]): The Start Date in format: YYYY-MM-DD. Any data before this date will not be replicated. If this parameter is not set, all data will be replicated.\n end_date (Optional[str]): The date until which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DD. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the data till the current date.\n report_granularity (Optional[str]): The granularity used for aggregating performance data in reports. See the docs.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken),\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.report_granularity = check.opt_str_param(report_granularity, "report_granularity")\n super().__init__("Tiktok Marketing", name)
\n\n\n
[docs]class ZendeskChatSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n access_token: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n self.credentials = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str):\n self.credentials = "access_token"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["ZendeskChatSource.OAuth20", "ZendeskChatSource.AccessToken"],\n subdomain: Optional[str] = None,\n ):\n """Airbyte Source for Zendesk Chat.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-chat\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Zendesk Chat API, in the format YYYY-MM-DDT00:00:00Z.\n subdomain (Optional[str]): Required if you access Zendesk Chat from a Zendesk Support subdomain.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.subdomain = check.opt_str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials, "credentials", (ZendeskChatSource.OAuth20, ZendeskChatSource.AccessToken)\n )\n super().__init__("Zendesk Chat", name)
\n\n\n
[docs]class AwsCloudtrailSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, aws_key_id: str, aws_secret_key: str, aws_region_name: str, start_date: str\n ):\n """Airbyte Source for Aws Cloudtrail.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/aws-cloudtrail\n\n Args:\n name (str): The name of the destination.\n aws_key_id (str): AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.\n aws_secret_key (str): AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.\n aws_region_name (str): The default AWS Region to use, for example, us-west-1 or us-west-2. When specifying a Region inline during client initialization, this property is named region_name.\n start_date (str): The date you would like to replicate data. Data in AWS CloudTrail is available for last 90 days only. Format: YYYY-MM-DD.\n """\n self.aws_key_id = check.str_param(aws_key_id, "aws_key_id")\n self.aws_secret_key = check.str_param(aws_secret_key, "aws_secret_key")\n self.aws_region_name = check.str_param(aws_region_name, "aws_region_name")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Aws Cloudtrail", name)
\n\n\n
[docs]class OktaSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "oauth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "api_token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["OktaSource.OAuth20", "OktaSource.APIToken"],\n domain: Optional[str] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Okta.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/okta\n\n Args:\n name (str): The name of the destination.\n domain (Optional[str]): The Okta domain. See the docs for instructions on how to find it.\n start_date (Optional[str]): UTC date and time in the format YYYY-MM-DDTHH:MM:SSZ. Any data before this date will not be replicated.\n """\n self.domain = check.opt_str_param(domain, "domain")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (OktaSource.OAuth20, OktaSource.APIToken)\n )\n super().__init__("Okta", name)
\n\n\n
[docs]class InsightlySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, token: Optional[str] = None, start_date: Optional[str] = None):\n """Airbyte Source for Insightly.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/insightly\n\n Args:\n name (str): The name of the destination.\n token (Optional[str]): Your Insightly API token.\n start_date (Optional[str]): The date from which you'd like to replicate data for Insightly in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only for incremental streams.\n """\n self.token = check.opt_str_param(token, "token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Insightly", name)
\n\n\n
[docs]class LinkedinPagesSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_method: Optional[str] = None,\n ):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, auth_method: Optional[str] = None):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n org_id: int,\n credentials: Union["LinkedinPagesSource.OAuth20", "LinkedinPagesSource.AccessToken"],\n ):\n """Airbyte Source for Linkedin Pages.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-pages/\n\n Args:\n name (str): The name of the destination.\n org_id (int): Specify the Organization ID\n """\n self.org_id = check.int_param(org_id, "org_id")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (LinkedinPagesSource.OAuth20, LinkedinPagesSource.AccessToken),\n )\n super().__init__("Linkedin Pages", name)
\n\n\n
[docs]class PersistiqSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Persistiq.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/persistiq\n\n Args:\n name (str): The name of the destination.\n api_key (str): PersistIq API Key. See the docs for more information on where to find that key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Persistiq", name)
\n\n\n
[docs]class FreshcallerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n api_key: str,\n start_date: str,\n requests_per_minute: Optional[int] = None,\n sync_lag_minutes: Optional[int] = None,\n ):\n """Airbyte Source for Freshcaller.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshcaller\n\n Args:\n name (str): The name of the destination.\n domain (str): Used to construct Base URL for the Freshcaller APIs\n api_key (str): Freshcaller API Key. See the docs for more information on how to obtain this key.\n requests_per_minute (Optional[int]): The number of requests per minute that this source allowed to use. There is a rate limit of 50 requests per minute per app per account.\n start_date (str): UTC date and time. Any data created after this date will be replicated.\n sync_lag_minutes (Optional[int]): Lag in minutes for each sync, i.e., at time T, data for the time range [prev_sync_time, T-30] will be fetched\n """\n self.domain = check.str_param(domain, "domain")\n self.api_key = check.str_param(api_key, "api_key")\n self.requests_per_minute = check.opt_int_param(requests_per_minute, "requests_per_minute")\n self.start_date = check.str_param(start_date, "start_date")\n self.sync_lag_minutes = check.opt_int_param(sync_lag_minutes, "sync_lag_minutes")\n super().__init__("Freshcaller", name)
\n\n\n
[docs]class AppfollowSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, ext_id: str, cid: str, api_secret: str, country: str):\n """Airbyte Source for Appfollow.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/appfollow\n\n Args:\n name (str): The name of the destination.\n ext_id (str): for App Store \u2014 this is 9-10 digits identification number; for Google Play \u2014 this is bundle name;\n cid (str): client id provided by Appfollow\n api_secret (str): api secret provided by Appfollow\n country (str): getting data by Country\n """\n self.ext_id = check.str_param(ext_id, "ext_id")\n self.cid = check.str_param(cid, "cid")\n self.api_secret = check.str_param(api_secret, "api_secret")\n self.country = check.str_param(country, "country")\n super().__init__("Appfollow", name)
\n\n\n
[docs]class FacebookPagesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, access_token: str, page_id: str):\n """Airbyte Source for Facebook Pages.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-pages\n\n Args:\n name (str): The name of the destination.\n access_token (str): Facebook Page Access Token\n page_id (str): Page ID\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.page_id = check.str_param(page_id, "page_id")\n super().__init__("Facebook Pages", name)
\n\n\n
[docs]class JiraSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n domain: str,\n email: str,\n projects: Optional[List[str]] = None,\n start_date: Optional[str] = None,\n additional_fields: Optional[List[str]] = None,\n expand_issue_changelog: Optional[bool] = None,\n render_fields: Optional[bool] = None,\n enable_experimental_streams: Optional[bool] = None,\n ):\n """Airbyte Source for Jira.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/jira\n\n Args:\n name (str): The name of the destination.\n api_token (str): Jira API Token. See the docs for more information on how to generate this key.\n domain (str): The Domain for your Jira account, e.g. airbyteio.atlassian.net\n email (str): The user email for your Jira account.\n projects (Optional[List[str]]): List of Jira project keys to replicate data for.\n start_date (Optional[str]): The date from which you'd like to replicate data for Jira in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only in the following incremental streams: issues.\n additional_fields (Optional[List[str]]): List of additional fields to include in replicating issues.\n expand_issue_changelog (Optional[bool]): Expand the changelog when replicating issues.\n render_fields (Optional[bool]): Render issue fields in HTML format in addition to Jira JSON-like format.\n enable_experimental_streams (Optional[bool]): Allow the use of experimental streams which rely on undocumented Jira API endpoints. 
See https://docs.airbyte.com/integrations/sources/jira#experimental-tables for more info.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.domain = check.str_param(domain, "domain")\n self.email = check.str_param(email, "email")\n self.projects = check.opt_nullable_list_param(projects, "projects", str)\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.additional_fields = check.opt_nullable_list_param(\n additional_fields, "additional_fields", str\n )\n self.expand_issue_changelog = check.opt_bool_param(\n expand_issue_changelog, "expand_issue_changelog"\n )\n self.render_fields = check.opt_bool_param(render_fields, "render_fields")\n self.enable_experimental_streams = check.opt_bool_param(\n enable_experimental_streams, "enable_experimental_streams"\n )\n super().__init__("Jira", name)
\n\n\n
[docs]class GoogleSheetsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, service_account_info: str):\n self.auth_type = "Service"\n self.service_account_info = check.str_param(\n service_account_info, "service_account_info"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n spreadsheet_id: str,\n credentials: Union[\n "GoogleSheetsSource.AuthenticateViaGoogleOAuth",\n "GoogleSheetsSource.ServiceAccountKeyAuthentication",\n ],\n row_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for Google Sheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-sheets\n\n Args:\n name (str): The name of the destination.\n spreadsheet_id (str): Enter the link to the Google spreadsheet you want to sync\n row_batch_size (Optional[int]): Number of rows fetched when making a Google Sheet API call. Defaults to 200.\n credentials (Union[GoogleSheetsSource.AuthenticateViaGoogleOAuth, GoogleSheetsSource.ServiceAccountKeyAuthentication]): Credentials for connecting to the Google Sheets API\n """\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.row_batch_size = check.opt_int_param(row_batch_size, "row_batch_size")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleSheetsSource.AuthenticateViaGoogleOAuth,\n GoogleSheetsSource.ServiceAccountKeyAuthentication,\n ),\n )\n super().__init__("Google Sheets", name)
\n\n\n
[docs]class DockerhubSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, docker_username: str):\n """Airbyte Source for Dockerhub.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/dockerhub\n\n Args:\n name (str): The name of the destination.\n docker_username (str): Username of DockerHub person or organization (for https://hub.docker.com/v2/repositories/USERNAME/ API call)\n """\n self.docker_username = check.str_param(docker_username, "docker_username")\n super().__init__("Dockerhub", name)
\n\n\n
[docs]class UsCensusSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, query_path: str, api_key: str, query_params: Optional[str] = None\n ):\n """Airbyte Source for Us Census.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/us-census\n\n Args:\n name (str): The name of the destination.\n query_params (Optional[str]): The query parameters portion of the GET request, without the api key\n query_path (str): The path portion of the GET request\n api_key (str): Your API Key. Get your key here.\n """\n self.query_params = check.opt_str_param(query_params, "query_params")\n self.query_path = check.str_param(query_path, "query_path")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Us Census", name)
\n\n\n
[docs]class KustomerSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, start_date: str):\n """Airbyte Source for Kustomer Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/kustomer\n\n Args:\n name (str): The name of the destination.\n api_token (str): Kustomer API Token. See the docs on how to obtain this\n start_date (str): The date from which you'd like to replicate the data\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Kustomer Singer", name)
\n\n\n
[docs]class AzureTableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n storage_account_name: str,\n storage_access_key: str,\n storage_endpoint_suffix: Optional[str] = None,\n ):\n """Airbyte Source for Azure Table.\n\n Args:\n name (str): The name of the destination.\n storage_account_name (str): The name of your storage account.\n storage_access_key (str): Azure Table Storage Access Key. See the docs for more information on how to obtain this key.\n storage_endpoint_suffix (Optional[str]): Azure Table Storage service account URL suffix. See the docs for more information on how to obtain endpoint suffix\n """\n self.storage_account_name = check.str_param(storage_account_name, "storage_account_name")\n self.storage_access_key = check.str_param(storage_access_key, "storage_access_key")\n self.storage_endpoint_suffix = check.opt_str_param(\n storage_endpoint_suffix, "storage_endpoint_suffix"\n )\n super().__init__("Azure Table", name)
\n\n\n
[docs]class ScaffoldJavaJdbcSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n replication_method: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Scaffold Java Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/scaffold_java_jdbc\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)\n replication_method (str): Replication method to use for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses the Binlog to detect inserts, updates, and deletes. This needs to be configured on the source database itself.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.replication_method = check.str_param(replication_method, "replication_method")\n super().__init__("Scaffold Java Jdbc", name)
\n\n\n
[docs]class TidbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Tidb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tidb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Tidb", name)
\n\n\n
[docs]class QualarooSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n token: str,\n key: str,\n start_date: str,\n survey_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Qualaroo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/qualaroo\n\n Args:\n name (str): The name of the destination.\n token (str): A Qualaroo token. See the docs for instructions on how to generate it.\n key (str): A Qualaroo token. See the docs for instructions on how to generate it.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n survey_ids (Optional[List[str]]): IDs of the surveys from which you'd like to replicate data. If left empty, data from all surveys to which you have access will be replicated.\n """\n self.token = check.str_param(token, "token")\n self.key = check.str_param(key, "key")\n self.start_date = check.str_param(start_date, "start_date")\n self.survey_ids = check.opt_nullable_list_param(survey_ids, "survey_ids", str)\n super().__init__("Qualaroo", name)
\n\n\n
[docs]class YahooFinancePriceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, tickers: str, interval: Optional[str] = None, range: Optional[str] = None\n ):\n """Airbyte Source for Yahoo Finance Price.\n\n Args:\n name (str): The name of the destination.\n tickers (str): Comma-separated identifiers for the stocks to be queried. Whitespaces are allowed.\n interval (Optional[str]): The interval of between prices queried.\n range (Optional[str]): The range of prices to be queried.\n """\n self.tickers = check.str_param(tickers, "tickers")\n self.interval = check.opt_str_param(interval, "interval")\n self.range = check.opt_str_param(range, "range")\n super().__init__("Yahoo Finance Price", name)
\n\n\n
[docs]class GoogleAnalyticsV4Source(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOauth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, credentials_json: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth",\n "GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication",\n ],\n start_date: str,\n view_id: str,\n custom_reports: Optional[str] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Analytics V4.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-universal-analytics\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth, GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication]): Credentials for the service\n start_date (str): The date in the format YYYY-MM-DD. Any data before this date will not be replicated.\n view_id (str): The ID for the Google Analytics View you want to fetch data from. This can be found from the Google Analytics Account Explorer.\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.\n window_in_days (Optional[int]): The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. 
The minimum allowed value for this field is 1, and the maximum is 364.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth,\n GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication,\n ),\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.view_id = check.str_param(view_id, "view_id")\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Google Analytics V4", name)
\n\n\n
[docs]class JdbcSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n jdbc_url: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres\n\n Args:\n name (str): The name of the destination.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url (str): JDBC formatted URL. See the standard here.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url = check.str_param(jdbc_url, "jdbc_url")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Jdbc", name)
\n\n\n
[docs]class FakerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n count: int,\n seed: Optional[int] = None,\n records_per_sync: Optional[int] = None,\n records_per_slice: Optional[int] = None,\n ):\n """Airbyte Source for Faker.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/faker\n\n Args:\n name (str): The name of the destination.\n count (int): How many users should be generated in total. This setting does not apply to the purchases or products stream.\n seed (Optional[int]): Manually control the faker random seed to return the same values on subsequent runs (leave -1 for random)\n records_per_sync (Optional[int]): How many fake records will be returned for each sync, for each stream? By default, it will take 2 syncs to create the requested 1000 records.\n records_per_slice (Optional[int]): How many fake records will be in each page (stream slice), before a state message is emitted?\n """\n self.count = check.int_param(count, "count")\n self.seed = check.opt_int_param(seed, "seed")\n self.records_per_sync = check.opt_int_param(records_per_sync, "records_per_sync")\n self.records_per_slice = check.opt_int_param(records_per_slice, "records_per_slice")\n super().__init__("Faker", name)
\n\n\n
[docs]class TplcentralSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n url_base: str,\n client_id: str,\n client_secret: str,\n user_login_id: Optional[int] = None,\n user_login: Optional[str] = None,\n tpl_key: Optional[str] = None,\n customer_id: Optional[int] = None,\n facility_id: Optional[int] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Tplcentral.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tplcentral\n\n Args:\n name (str): The name of the destination.\n user_login_id (Optional[int]): User login ID and/or name is required\n user_login (Optional[str]): User login ID and/or name is required\n start_date (Optional[str]): Date and time together in RFC 3339 format, for example, 2018-11-13T20:20:39+00:00.\n """\n self.url_base = check.str_param(url_base, "url_base")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.user_login_id = check.opt_int_param(user_login_id, "user_login_id")\n self.user_login = check.opt_str_param(user_login, "user_login")\n self.tpl_key = check.opt_str_param(tpl_key, "tpl_key")\n self.customer_id = check.opt_int_param(customer_id, "customer_id")\n self.facility_id = check.opt_int_param(facility_id, "facility_id")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Tplcentral", name)
\n\n\n
[docs]class ClickhouseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Clickhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse\n\n Args:\n name (str): The name of the destination.\n host (str): The host endpoint of the Clickhouse cluster.\n port (int): The port of the database.\n database (str): The name of the database.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Clickhouse", name)
\n\n\n
[docs]class FreshserviceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_name: str, api_key: str, start_date: str):\n """Airbyte Source for Freshservice.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshservice\n\n Args:\n name (str): The name of the destination.\n domain_name (str): The name of your Freshservice domain\n api_key (str): Freshservice API Key. See here. The key is case sensitive.\n start_date (str): UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Freshservice", name)
\n\n\n
[docs]class ZenloopSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n date_from: Optional[str] = None,\n survey_id: Optional[str] = None,\n survey_group_id: Optional[str] = None,\n ):\n """Airbyte Source for Zenloop.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zenloop\n\n Args:\n name (str): The name of the destination.\n api_token (str): Zenloop API Token. You can get the API token in settings page here\n date_from (Optional[str]): Zenloop date_from. Format: 2021-10-24T03:30:30Z or 2021-10-24. Leave empty if only data from current data should be synced\n survey_id (Optional[str]): Zenloop Survey ID. Can be found here. Leave empty to pull answers from all surveys\n survey_group_id (Optional[str]): Zenloop Survey Group ID. Can be found by pulling All Survey Groups via SurveyGroups stream. Leave empty to pull answers from all survey groups\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.date_from = check.opt_str_param(date_from, "date_from")\n self.survey_id = check.opt_str_param(survey_id, "survey_id")\n self.survey_group_id = check.opt_str_param(survey_group_id, "survey_group_id")\n super().__init__("Zenloop", name)
\n\n\n
[docs]class OracleSource(GeneratedAirbyteSource):\n
[docs] class ServiceName:\n
[docs] @public\n def __init__(self, service_name: str, connection_type: Optional[str] = None):\n self.connection_type = check.opt_str_param(connection_type, "connection_type")\n self.service_name = check.str_param(service_name, "service_name")
\n\n
[docs] class SystemIDSID:\n
[docs] @public\n def __init__(self, sid: str, connection_type: Optional[str] = None):\n self.connection_type = check.opt_str_param(connection_type, "connection_type")\n self.sid = check.str_param(sid, "sid")
\n\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class NativeNetworkEncryptionNNE:\n
[docs] @public\n def __init__(self, encryption_algorithm: Optional[str] = None):\n self.encryption_method = "client_nne"\n self.encryption_algorithm = check.opt_str_param(\n encryption_algorithm, "encryption_algorithm"\n )
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n connection_data: Union["OracleSource.ServiceName", "OracleSource.SystemIDSID"],\n username: str,\n encryption: Union[\n "OracleSource.Unencrypted",\n "OracleSource.NativeNetworkEncryptionNNE",\n "OracleSource.TLSEncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n schemas: Optional[List[str]] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Oracle.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/oracle\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database. Oracle Corporations recommends the following port numbers: 1521 - Default listening port for client connections to the listener. 2484 - Recommended and officially registered listening port for client connections to the listener using TCP/IP with SSL\n connection_data (Union[OracleSource.ServiceName, OracleSource.SystemIDSID]): Connect data that will be used for DB connection\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n schemas (Optional[List[str]]): The list of schemas to sync from. Defaults to user. Case sensitive.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. 
(example: key1=value1&key2=value2&key3=value3).\n encryption (Union[OracleSource.Unencrypted, OracleSource.NativeNetworkEncryptionNNE, OracleSource.TLSEncryptedVerifyCertificate]): The encryption method with is used when communicating with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.connection_data = check.inst_param(\n connection_data, "connection_data", (OracleSource.ServiceName, OracleSource.SystemIDSID)\n )\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (\n OracleSource.Unencrypted,\n OracleSource.NativeNetworkEncryptionNNE,\n OracleSource.TLSEncryptedVerifyCertificate,\n ),\n )\n super().__init__("Oracle", name)
\n\n\n
[docs]class KlaviyoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Klaviyo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/klaviyo\n\n Args:\n name (str): The name of the destination.\n api_key (str): Klaviyo API Key. See our docs if you need help finding this key.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Klaviyo", name)
\n\n\n
[docs]class GoogleDirectorySource(GeneratedAirbyteSource):\n
[docs] class SignInViaGoogleOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n credentials_title: Optional[str] = None,\n ):\n self.credentials_title = check.opt_str_param(credentials_title, "credentials_title")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKey:\n
[docs] @public\n def __init__(\n self, credentials_json: str, email: str, credentials_title: Optional[str] = None\n ):\n self.credentials_title = check.opt_str_param(credentials_title, "credentials_title")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "GoogleDirectorySource.SignInViaGoogleOAuth", "GoogleDirectorySource.ServiceAccountKey"\n ],\n ):\n """Airbyte Source for Google Directory.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-directory\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GoogleDirectorySource.SignInViaGoogleOAuth, GoogleDirectorySource.ServiceAccountKey]): Google APIs use the OAuth 2.0 protocol for authentication and authorization. The Source supports Web server application and Service accounts scenarios.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (GoogleDirectorySource.SignInViaGoogleOAuth, GoogleDirectorySource.ServiceAccountKey),\n )\n super().__init__("Google Directory", name)
\n\n\n
[docs]class InstagramSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Instagram.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/instagram\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for User Insights, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n access_token (str): The value of the access token generated. See the docs for more information\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Instagram", name)
\n\n\n
[docs]class ShortioSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_id: str, secret_key: str, start_date: str):\n """Airbyte Source for Shortio.\n\n Documentation can be found at https://developers.short.io/reference\n\n Args:\n name (str): The name of the destination.\n secret_key (str): Short.io Secret Key\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_id = check.str_param(domain_id, "domain_id")\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Shortio", name)
\n\n\n
[docs]class SquareSource(GeneratedAirbyteSource):\n
[docs] class OauthAuthentication:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Oauth"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, api_key: str):\n self.auth_type = "Apikey"\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n is_sandbox: bool,\n credentials: Union["SquareSource.OauthAuthentication", "SquareSource.APIKey"],\n start_date: Optional[str] = None,\n include_deleted_objects: Optional[bool] = None,\n ):\n """Airbyte Source for Square.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/square\n\n Args:\n name (str): The name of the destination.\n is_sandbox (bool): Determines whether to use the sandbox or production environment.\n start_date (Optional[str]): UTC date in the format YYYY-MM-DD. Any data before this date will not be replicated. If not set, all data will be replicated.\n include_deleted_objects (Optional[bool]): In some streams there is an option to include deleted objects (Items, Categories, Discounts, Taxes)\n """\n self.is_sandbox = check.bool_param(is_sandbox, "is_sandbox")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.include_deleted_objects = check.opt_bool_param(\n include_deleted_objects, "include_deleted_objects"\n )\n self.credentials = check.inst_param(\n credentials, "credentials", (SquareSource.OauthAuthentication, SquareSource.APIKey)\n )\n super().__init__("Square", name)
\n\n\n
[docs]class DelightedSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, since: str, api_key: str):\n """Airbyte Source for Delighted.\n\n Args:\n name (str): The name of the destination.\n since (str): The date from which you'd like to replicate the data\n api_key (str): A Delighted API key.\n """\n self.since = check.str_param(since, "since")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Delighted", name)
\n\n\n
[docs]class AmazonSqsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n queue_url: str,\n region: str,\n delete_messages: bool,\n max_batch_size: Optional[int] = None,\n max_wait_time: Optional[int] = None,\n attributes_to_return: Optional[str] = None,\n visibility_timeout: Optional[int] = None,\n access_key: Optional[str] = None,\n secret_key: Optional[str] = None,\n ):\n """Airbyte Source for Amazon Sqs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-sqs\n\n Args:\n name (str): The name of the destination.\n queue_url (str): URL of the SQS Queue\n region (str): AWS Region of the SQS Queue\n delete_messages (bool): If Enabled, messages will be deleted from the SQS Queue after being read. If Disabled, messages are left in the queue and can be read more than once. WARNING: Enabling this option can result in data loss in cases of failure, use with caution, see documentation for more detail.\n max_batch_size (Optional[int]): Max amount of messages to get in one batch (10 max)\n max_wait_time (Optional[int]): Max amount of time in seconds to wait for messages in a single poll (20 max)\n attributes_to_return (Optional[str]): Comma separated list of Mesage Attribute names to return\n visibility_timeout (Optional[int]): Modify the Visibility Timeout of the individual message from the Queue's default (seconds).\n access_key (Optional[str]): The Access Key ID of the AWS IAM Role to use for pulling messages\n secret_key (Optional[str]): The Secret Key of the AWS IAM Role to use for pulling messages\n """\n self.queue_url = check.str_param(queue_url, "queue_url")\n self.region = check.str_param(region, "region")\n self.delete_messages = check.bool_param(delete_messages, "delete_messages")\n self.max_batch_size = check.opt_int_param(max_batch_size, "max_batch_size")\n self.max_wait_time = check.opt_int_param(max_wait_time, "max_wait_time")\n self.attributes_to_return = check.opt_str_param(\n attributes_to_return, "attributes_to_return"\n )\n 
self.visibility_timeout = check.opt_int_param(visibility_timeout, "visibility_timeout")\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.secret_key = check.opt_str_param(secret_key, "secret_key")\n super().__init__("Amazon Sqs", name)
\n\n\n
[docs]class YoutubeAnalyticsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaOAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(self, name: str, credentials: "YoutubeAnalyticsSource.AuthenticateViaOAuth20"):\n """Airbyte Source for Youtube Analytics.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/youtube-analytics\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", YoutubeAnalyticsSource.AuthenticateViaOAuth20\n )\n super().__init__("Youtube Analytics", name)
\n\n\n
[docs]class ScaffoldSourcePythonSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, fix_me: Optional[str] = None):\n """Airbyte Source for Scaffold Source Python.\n\n Args:\n name (str): The name of the destination.\n fix_me (Optional[str]): describe me\n """\n self.fix_me = check.opt_str_param(fix_me, "fix_me")\n super().__init__("Scaffold Source Python", name)
\n\n\n
[docs]class LookerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n client_id: str,\n client_secret: str,\n run_look_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Looker.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/looker\n\n Args:\n name (str): The name of the destination.\n domain (str): Domain for your Looker account, e.g. airbyte.cloud.looker.com,looker.[clientname].com,IP address\n client_id (str): The Client ID is first part of an API3 key that is specific to each Looker user. See the docs for more information on how to generate this key.\n client_secret (str): The Client Secret is second part of an API3 key.\n run_look_ids (Optional[List[str]]): The IDs of any Looks to run\n """\n self.domain = check.str_param(domain, "domain")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.run_look_ids = check.opt_nullable_list_param(run_look_ids, "run_look_ids", str)\n super().__init__("Looker", name)
\n\n\n
[docs]class GitlabSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_url: str,\n private_token: str,\n start_date: str,\n groups: Optional[str] = None,\n projects: Optional[str] = None,\n ):\n """Airbyte Source for Gitlab.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gitlab\n\n Args:\n name (str): The name of the destination.\n api_url (str): Please enter your basic URL from GitLab instance.\n private_token (str): Log into your GitLab account and then generate a personal Access Token.\n groups (Optional[str]): Space-delimited list of groups. e.g. airbyte.io.\n projects (Optional[str]): Space-delimited list of projects. e.g. airbyte.io/documentation meltano/tap-gitlab.\n start_date (str): The date from which you'd like to replicate data for GitLab API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.api_url = check.str_param(api_url, "api_url")\n self.private_token = check.str_param(private_token, "private_token")\n self.groups = check.opt_str_param(groups, "groups")\n self.projects = check.opt_str_param(projects, "projects")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Gitlab", name)
\n\n\n
[docs]class ExchangeRatesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n access_key: str,\n base: Optional[str] = None,\n ignore_weekends: Optional[bool] = None,\n ):\n """Airbyte Source for Exchange Rates.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi\n\n Args:\n name (str): The name of the destination.\n start_date (str): Start getting data from that date.\n access_key (str): Your API Key. See here. The key is case sensitive.\n base (Optional[str]): ISO reference currency. See here. Free plan doesn't support Source Currency Switching, default base currency is EUR\n ignore_weekends (Optional[bool]): Ignore weekends? (Exchanges don't run on weekends)\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_key = check.str_param(access_key, "access_key")\n self.base = check.opt_str_param(base, "base")\n self.ignore_weekends = check.opt_bool_param(ignore_weekends, "ignore_weekends")\n super().__init__("Exchange Rates", name)
\n\n\n
[docs]class AmazonAdsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n region: Optional[str] = None,\n report_wait_timeout: Optional[int] = None,\n report_generation_max_retries: Optional[int] = None,\n start_date: Optional[str] = None,\n profiles: Optional[List[int]] = None,\n state_filter: Optional[List[str]] = None,\n ):\n """Airbyte Source for Amazon Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-ads\n\n Args:\n name (str): The name of the destination.\n client_id (str): The client ID of your Amazon Ads developer application. See the docs for more information.\n client_secret (str): The client secret of your Amazon Ads developer application. See the docs for more information.\n refresh_token (str): Amazon Ads refresh token. See the docs for more information on how to obtain this token.\n region (Optional[str]): Region to pull data from (EU/NA/FE). See docs for more details.\n report_wait_timeout (Optional[int]): Timeout duration in minutes for Reports. Default is 60 minutes.\n report_generation_max_retries (Optional[int]): Maximum retries Airbyte will attempt for fetching report data. Default is 5.\n start_date (Optional[str]): The Start date for collecting reports, should not be more than 60 days in the past. In YYYY-MM-DD format\n profiles (Optional[List[int]]): Profile IDs you want to fetch data for. See docs for more details.\n state_filter (Optional[List[str]]): Reflects the state of the Display, Product, and Brand Campaign streams as enabled, paused, or archived. 
If you do not populate this field, it will be ignored completely.\n """\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.region = check.opt_str_param(region, "region")\n self.report_wait_timeout = check.opt_int_param(report_wait_timeout, "report_wait_timeout")\n self.report_generation_max_retries = check.opt_int_param(\n report_generation_max_retries, "report_generation_max_retries"\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.profiles = check.opt_nullable_list_param(profiles, "profiles", int)\n self.state_filter = check.opt_nullable_list_param(state_filter, "state_filter", str)\n super().__init__("Amazon Ads", name)
\n\n\n
[docs]class MixpanelSource(GeneratedAirbyteSource):\n
[docs] class ServiceAccount:\n
[docs] @public\n def __init__(self, username: str, secret: str):\n self.username = check.str_param(username, "username")\n self.secret = check.str_param(secret, "secret")
\n\n
[docs] class ProjectSecret:\n
[docs] @public\n def __init__(self, api_secret: str):\n self.api_secret = check.str_param(api_secret, "api_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["MixpanelSource.ServiceAccount", "MixpanelSource.ProjectSecret"],\n project_id: Optional[int] = None,\n attribution_window: Optional[int] = None,\n project_timezone: Optional[str] = None,\n select_properties_by_default: Optional[bool] = None,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n region: Optional[str] = None,\n date_window_size: Optional[int] = None,\n ):\n """Airbyte Source for Mixpanel.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mixpanel\n\n Args:\n name (str): The name of the destination.\n credentials (Union[MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret]): Choose how to authenticate to Mixpanel\n project_id (Optional[int]): Your project ID number. See the docs for more information on how to obtain this.\n attribution_window (Optional[int]): A period of time for attributing results to ads and the lookback period after those actions occur during which ad results are counted. Default attribution window is 5 days.\n project_timezone (Optional[str]): Time zone in which integer date times are stored. The project timezone may be found in the project settings in the Mixpanel console.\n select_properties_by_default (Optional[bool]): Setting this config parameter to TRUE ensures that new properties on events and engage records are captured. Otherwise new properties will be ignored.\n start_date (Optional[str]): The date in the format YYYY-MM-DD. Any data before this date will not be replicated. If this option is not set, the connector will replicate data from up to one year ago by default.\n end_date (Optional[str]): The date in the format YYYY-MM-DD. Any data after this date will not be replicated. 
Left empty to always sync to most recent date\n region (Optional[str]): The region of mixpanel domain instance either US or EU.\n date_window_size (Optional[int]): Defines window size in days, that used to slice through data. You can reduce it, if amount of data in each window is too big for your environment.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret),\n )\n self.project_id = check.opt_int_param(project_id, "project_id")\n self.attribution_window = check.opt_int_param(attribution_window, "attribution_window")\n self.project_timezone = check.opt_str_param(project_timezone, "project_timezone")\n self.select_properties_by_default = check.opt_bool_param(\n select_properties_by_default, "select_properties_by_default"\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.region = check.opt_str_param(region, "region")\n self.date_window_size = check.opt_int_param(date_window_size, "date_window_size")\n super().__init__("Mixpanel", name)
\n\n\n
[docs]class OrbitSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, workspace: str, start_date: Optional[str] = None):\n """Airbyte Source for Orbit.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/orbit\n\n Args:\n name (str): The name of the destination.\n api_token (str): Authorizes you to work with Orbit workspaces associated with the token.\n workspace (str): The unique name of the workspace that your API token is associated with.\n start_date (Optional[str]): Date in the format 2022-06-26. Only load members whose last activities are after this date.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.workspace = check.str_param(workspace, "workspace")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Orbit", name)
\n\n\n
[docs]class AmazonSellerPartnerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n lwa_app_id: str,\n lwa_client_secret: str,\n refresh_token: str,\n aws_access_key: str,\n aws_secret_key: str,\n role_arn: str,\n replication_start_date: str,\n aws_environment: str,\n region: str,\n app_id: Optional[str] = None,\n auth_type: Optional[str] = None,\n replication_end_date: Optional[str] = None,\n period_in_days: Optional[int] = None,\n report_options: Optional[str] = None,\n max_wait_seconds: Optional[int] = None,\n ):\n """Airbyte Source for Amazon Seller Partner.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-seller-partner\n\n Args:\n name (str): The name of the destination.\n app_id (Optional[str]): Your Amazon App ID\n lwa_app_id (str): Your Login with Amazon Client ID.\n lwa_client_secret (str): Your Login with Amazon Client Secret.\n refresh_token (str): The Refresh Token obtained via OAuth flow authorization.\n aws_access_key (str): Specifies the AWS access key used as part of the credentials to authenticate the user.\n aws_secret_key (str): Specifies the AWS secret key used as part of the credentials to authenticate the user.\n role_arn (str): Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. (Needs permission to 'Assume Role' STS).\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n replication_end_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data after this date will not be replicated.\n period_in_days (Optional[int]): Will be used for stream slicing for initial full_refresh sync when no updated state is present for reports that support sliced incremental sync.\n report_options (Optional[str]): Additional information passed to reports. This varies by report type. 
Must be a valid json string.\n max_wait_seconds (Optional[int]): Sometimes report can take up to 30 minutes to generate. This will set the limit for how long to wait for a successful report.\n aws_environment (str): An enumeration.\n region (str): An enumeration.\n """\n self.app_id = check.opt_str_param(app_id, "app_id")\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.lwa_app_id = check.str_param(lwa_app_id, "lwa_app_id")\n self.lwa_client_secret = check.str_param(lwa_client_secret, "lwa_client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.aws_access_key = check.str_param(aws_access_key, "aws_access_key")\n self.aws_secret_key = check.str_param(aws_secret_key, "aws_secret_key")\n self.role_arn = check.str_param(role_arn, "role_arn")\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n self.replication_end_date = check.opt_str_param(\n replication_end_date, "replication_end_date"\n )\n self.period_in_days = check.opt_int_param(period_in_days, "period_in_days")\n self.report_options = check.opt_str_param(report_options, "report_options")\n self.max_wait_seconds = check.opt_int_param(max_wait_seconds, "max_wait_seconds")\n self.aws_environment = check.str_param(aws_environment, "aws_environment")\n self.region = check.str_param(region, "region")\n super().__init__("Amazon Seller Partner", name)
\n\n\n
[docs]class CourierSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Courier.\n\n Documentation can be found at https://docs.airbyte.io/integrations/sources/courier\n\n Args:\n name (str): The name of the destination.\n api_key (str): Courier API Key to retrieve your data.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Courier", name)
\n\n\n
[docs]class CloseComSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: Optional[str] = None):\n r"""Airbyte Source for Close Com.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/close-com\n\n Args:\n name (str): The name of the destination.\n api_key (str): Close.com API key (usually starts with 'api\\\\_'; find yours here).\n start_date (Optional[str]): The start date to sync data. Leave blank for full sync. Format: YYYY-MM-DD.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Close Com", name)
\n\n\n
[docs]class BingAdsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n refresh_token: str,\n developer_token: str,\n reports_start_date: str,\n auth_method: Optional[str] = None,\n tenant_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n """Airbyte Source for Bing Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bing-ads\n\n Args:\n name (str): The name of the destination.\n tenant_id (Optional[str]): The Tenant ID of your Microsoft Advertising developer application. Set this to "common" unless you know you need a different value.\n client_id (str): The Client ID of your Microsoft Advertising developer application.\n client_secret (Optional[str]): The Client Secret of your Microsoft Advertising developer application.\n refresh_token (str): Refresh Token to renew the expired Access Token.\n developer_token (str): Developer token associated with user. See more info in the docs.\n reports_start_date (str): The start date from which to begin replicating report data. Any data generated before this date will not be replicated in reports. This is a UTC date in YYYY-MM-DD format.\n """\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.tenant_id = check.opt_str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.developer_token = check.str_param(developer_token, "developer_token")\n self.reports_start_date = check.str_param(reports_start_date, "reports_start_date")\n super().__init__("Bing Ads", name)
\n\n\n
[docs]class PrimetricSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, client_id: str, client_secret: str):\n """Airbyte Source for Primetric.\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Primetric developer application. The Client ID is visible here.\n client_secret (str): The Client Secret of your Primetric developer application. You can manage your client's credentials here.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Primetric", name)
\n\n\n
[docs]class PivotalTrackerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str):\n """Airbyte Source for Pivotal Tracker.\n\n Args:\n name (str): The name of the destination.\n api_token (str): Pivotal Tracker API token\n """\n self.api_token = check.str_param(api_token, "api_token")\n super().__init__("Pivotal Tracker", name)
\n\n\n
[docs]class ElasticsearchSource(GeneratedAirbyteSource):\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "none"
\n\n
[docs] class ApiKeySecret:\n
[docs] @public\n def __init__(self, apiKeyId: str, apiKeySecret: str):\n self.method = "secret"\n self.apiKeyId = check.str_param(apiKeyId, "apiKeyId")\n self.apiKeySecret = check.str_param(apiKeySecret, "apiKeySecret")
\n\n
[docs] class UsernamePassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.method = "basic"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n authenticationMethod: Union[\n "ElasticsearchSource.None_",\n "ElasticsearchSource.ApiKeySecret",\n "ElasticsearchSource.UsernamePassword",\n ],\n ):\n r"""Airbyte Source for Elasticsearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/source/elasticsearch\n\n Args:\n name (str): The name of the destination.\n endpoint (str): The full url of the Elasticsearch server\n authenticationMethod (Union[ElasticsearchSource.None\\\\_, ElasticsearchSource.ApiKeySecret, ElasticsearchSource.UsernamePassword]): The type of authentication to be used\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.authenticationMethod = check.inst_param(\n authenticationMethod,\n "authenticationMethod",\n (\n ElasticsearchSource.None_,\n ElasticsearchSource.ApiKeySecret,\n ElasticsearchSource.UsernamePassword,\n ),\n )\n super().__init__("Elasticsearch", name)
\n\n\n
[docs]class BigquerySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, project_id: str, credentials_json: str, dataset_id: Optional[str] = None\n ):\n """Airbyte Source for Bigquery.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset.\n dataset_id (Optional[str]): The dataset ID to search for tables and views. If you are only loading data from one dataset, setting this option could result in much faster schema discovery.\n credentials_json (str): The contents of your Service Account Key JSON file. See the docs for more information on how to obtain this key.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_id = check.opt_str_param(dataset_id, "dataset_id")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n super().__init__("Bigquery", name)
\n\n\n
[docs]class WoocommerceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n shop: str,\n start_date: str,\n api_key: str,\n api_secret: str,\n conversion_window_days: Optional[int] = None,\n ):\n """Airbyte Source for Woocommerce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/woocommerce\n\n Args:\n name (str): The name of the destination.\n shop (str): The name of the store. For https://EXAMPLE.com, the shop name is 'EXAMPLE.com'.\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n api_key (str): The CUSTOMER KEY for API in WooCommerce shop.\n api_secret (str): The CUSTOMER SECRET for API in WooCommerce shop.\n conversion_window_days (Optional[int]): A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads.\n """\n self.shop = check.str_param(shop, "shop")\n self.start_date = check.str_param(start_date, "start_date")\n self.api_key = check.str_param(api_key, "api_key")\n self.api_secret = check.str_param(api_secret, "api_secret")\n self.conversion_window_days = check.opt_int_param(\n conversion_window_days, "conversion_window_days"\n )\n super().__init__("Woocommerce", name)
\n\n\n
[docs]class SearchMetricsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, api_key: str, client_secret: str, country_code: str, start_date: str\n ):\n """Airbyte Source for Search Metrics.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/seacrh-metrics\n\n Args:\n name (str): The name of the destination.\n country_code (str): The region of the S3 staging bucket to use if utilising a copy strategy.\n start_date (str): Data generated in SearchMetrics after this date will be replicated. This date must be specified in the format YYYY-MM-DDT00:00:00Z.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.country_code = check.str_param(country_code, "country_code")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Search Metrics", name)
\n\n\n
[docs]class TypeformSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, start_date: str, token: str, form_ids: Optional[List[str]] = None\n ):\n """Airbyte Source for Typeform.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/typeform\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format: YYYY-MM-DDTHH:mm:ss[Z]. Any data before this date will not be replicated.\n token (str): The API Token for a Typeform account.\n form_ids (Optional[List[str]]): When this parameter is set, the connector will replicate data only from the input forms. Otherwise, all forms in your Typeform account will be replicated. You can find form IDs in your form URLs. For example, in the URL "https://mysite.typeform.com/to/u6nXL7" the form_id is u6nXL7. You can find form URLs on Share panel\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.token = check.str_param(token, "token")\n self.form_ids = check.opt_nullable_list_param(form_ids, "form_ids", str)\n super().__init__("Typeform", name)
\n\n\n
[docs]class WebflowSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, site_id: str, api_key: str):\n """Airbyte Source for Webflow.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/webflow\n\n Args:\n name (str): The name of the destination.\n site_id (str): The id of the Webflow site you are requesting data from. See https://developers.webflow.com/#sites\n api_key (str): The API token for authenticating to Webflow. See https://university.webflow.com/lesson/intro-to-the-webflow-api\n """\n self.site_id = check.str_param(site_id, "site_id")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Webflow", name)
\n\n\n
[docs]class FireboltSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n password: str,\n database: str,\n account: Optional[str] = None,\n host: Optional[str] = None,\n engine: Optional[str] = None,\n ):\n """Airbyte Source for Firebolt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/firebolt\n\n Args:\n name (str): The name of the destination.\n username (str): Firebolt email address you use to login.\n password (str): Firebolt password.\n account (Optional[str]): Firebolt account to login.\n host (Optional[str]): The host name of your Firebolt database.\n database (str): The database to connect to.\n engine (Optional[str]): Engine name or url to connect to.\n """\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.account = check.opt_str_param(account, "account")\n self.host = check.opt_str_param(host, "host")\n self.database = check.str_param(database, "database")\n self.engine = check.opt_str_param(engine, "engine")\n super().__init__("Firebolt", name)
\n\n\n
[docs]class FaunaSource(GeneratedAirbyteSource):\n
[docs] class Disabled:\n
[docs] @public\n def __init__(\n self,\n ):\n self.deletion_mode = "ignore"
\n\n
[docs] class Enabled:\n
[docs] @public\n def __init__(self, column: str):\n self.deletion_mode = "deleted_field"\n self.column = check.str_param(column, "column")
\n\n
[docs] class Collection:\n
[docs] @public\n def __init__(\n self, page_size: int, deletions: Union["FaunaSource.Disabled", "FaunaSource.Enabled"]\n ):\n self.page_size = check.int_param(page_size, "page_size")\n self.deletions = check.inst_param(\n deletions, "deletions", (FaunaSource.Disabled, FaunaSource.Enabled)\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n port: int,\n scheme: str,\n secret: str,\n collection: "FaunaSource.Collection",\n ):\n """Airbyte Source for Fauna.\n\n Documentation can be found at https://github.com/fauna/airbyte/blob/source-fauna/docs/integrations/sources/fauna.md\n\n Args:\n name (str): The name of the destination.\n domain (str): Domain of Fauna to query. Defaults db.fauna.com. See the docs.\n port (int): Endpoint port.\n scheme (str): URL scheme.\n secret (str): Fauna secret, used when authenticating with the database.\n collection (FaunaSource.Collection): Settings for the Fauna Collection.\n """\n self.domain = check.str_param(domain, "domain")\n self.port = check.int_param(port, "port")\n self.scheme = check.str_param(scheme, "scheme")\n self.secret = check.str_param(secret, "secret")\n self.collection = check.inst_param(collection, "collection", FaunaSource.Collection)\n super().__init__("Fauna", name)
\n\n\n
[docs]class IntercomSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Intercom.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/intercom\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n access_token (str): Access token for making authenticated requests. See the Intercom docs for more information.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Intercom", name)
\n\n\n
[docs]class FreshsalesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_name: str, api_key: str):\n """Airbyte Source for Freshsales.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshsales\n\n Args:\n name (str): The name of the destination.\n domain_name (str): The Name of your Freshsales domain\n api_key (str): Freshsales API Key. See here. The key is case sensitive.\n """\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Freshsales", name)
\n\n\n
[docs]class AdjustSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n dimensions: List[str],\n ingest_start: str,\n metrics: List[str],\n additional_metrics: Optional[List[str]] = None,\n until_today: Optional[bool] = None,\n ):\n """Airbyte Source for Adjust.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/adjust\n\n Args:\n name (str): The name of the destination.\n additional_metrics (Optional[List[str]]): Metrics names that are not pre-defined, such as cohort metrics or app specific metrics.\n api_token (str): Adjust API key, see https://help.adjust.com/en/article/report-service-api-authentication\n dimensions (List[str]): Dimensions allow a user to break down metrics into groups using one or several parameters. For example, the number of installs by date, country and network. See https://help.adjust.com/en/article/reports-endpoint#dimensions for more information about the dimensions.\n ingest_start (str): Data ingest start date.\n metrics (List[str]): Select at least one metric to query.\n until_today (Optional[bool]): Syncs data up until today. Useful when running daily incremental syncs, and duplicates are not desired.\n """\n self.additional_metrics = check.opt_nullable_list_param(\n additional_metrics, "additional_metrics", str\n )\n self.api_token = check.str_param(api_token, "api_token")\n self.dimensions = check.list_param(dimensions, "dimensions", str)\n self.ingest_start = check.str_param(ingest_start, "ingest_start")\n self.metrics = check.list_param(metrics, "metrics", str)\n self.until_today = check.opt_bool_param(until_today, "until_today")\n super().__init__("Adjust", name)
\n\n\n
[docs]class BambooHrSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n api_key: str,\n custom_reports_fields: Optional[str] = None,\n custom_reports_include_default_fields: Optional[bool] = None,\n ):\n """Airbyte Source for Bamboo Hr.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bamboo-hr\n\n Args:\n name (str): The name of the destination.\n subdomain (str): Sub Domain of bamboo hr\n api_key (str): Api key of bamboo hr\n custom_reports_fields (Optional[str]): Comma-separated list of fields to include in custom reports.\n custom_reports_include_default_fields (Optional[bool]): If true, the custom reports endpoint will include the default fields defined here: https://documentation.bamboohr.com/docs/list-of-field-names.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.api_key = check.str_param(api_key, "api_key")\n self.custom_reports_fields = check.opt_str_param(\n custom_reports_fields, "custom_reports_fields"\n )\n self.custom_reports_include_default_fields = check.opt_bool_param(\n custom_reports_include_default_fields, "custom_reports_include_default_fields"\n )\n super().__init__("Bamboo Hr", name)
\n\n\n
[docs]class GoogleAdsSource(GeneratedAirbyteSource):\n
[docs] class GoogleCredentials:\n
[docs] @public\n def __init__(\n self,\n developer_token: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n access_token: Optional[str] = None,\n ):\n self.developer_token = check.str_param(developer_token, "developer_token")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class CustomGAQLQueriesEntry:\n
[docs] @public\n def __init__(self, query: str, table_name: str):\n self.query = check.str_param(query, "query")\n self.table_name = check.str_param(table_name, "table_name")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "GoogleAdsSource.GoogleCredentials",\n customer_id: str,\n start_date: str,\n end_date: Optional[str] = None,\n custom_queries: Optional[List[CustomGAQLQueriesEntry]] = None,\n login_customer_id: Optional[str] = None,\n conversion_window_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-ads\n\n Args:\n name (str): The name of the destination.\n customer_id (str): Comma separated list of (client) customer IDs. Each customer ID must be specified as a 10-digit number without dashes. More instruction on how to find this value in our docs. Metrics streams like AdGroupAdReport cannot be requested for a manager account.\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.\n end_date (Optional[str]): UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.\n login_customer_id (Optional[str]): If your access to the customer account is through a manager account, this field is required and must be set to the customer ID of the manager account (10-digit number without dashes). More information about this field you can see here\n conversion_window_days (Optional[int]): A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads. 
For more information, see Google's documentation.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", GoogleAdsSource.GoogleCredentials\n )\n self.customer_id = check.str_param(customer_id, "customer_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.custom_queries = check.opt_nullable_list_param(\n custom_queries, "custom_queries", GoogleAdsSource.CustomGAQLQueriesEntry\n )\n self.login_customer_id = check.opt_str_param(login_customer_id, "login_customer_id")\n self.conversion_window_days = check.opt_int_param(\n conversion_window_days, "conversion_window_days"\n )\n super().__init__("Google Ads", name)
\n\n\n
[docs]class HellobatonSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, company: str):\n """Airbyte Source for Hellobaton.\n\n Args:\n name (str): The name of the destination.\n api_key (str): authentication key required to access the api endpoints\n company (str): Company name that generates your base api url\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.company = check.str_param(company, "company")\n super().__init__("Hellobaton", name)
\n\n\n
[docs]class SendgridSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, apikey: str, start_time: Union[int, str]):\n """Airbyte Source for Sendgrid.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/sendgrid\n\n Args:\n name (str): The name of the destination.\n apikey (str): API Key, use admin to generate this key.\n start_time (Union[int, str]): Start time in ISO8601 format. Any data before this time point will not be replicated.\n """\n self.apikey = check.str_param(apikey, "apikey")\n self.start_time = check.inst_param(start_time, "start_time", (int, str))\n super().__init__("Sendgrid", name)
\n\n\n
[docs]class MondaySource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n subdomain: Optional[str] = None,\n ):\n self.auth_type = "oauth2.0"\n self.subdomain = check.opt_str_param(subdomain, "subdomain")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "api_token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["MondaySource.OAuth20", "MondaySource.APIToken"]\n ):\n """Airbyte Source for Monday.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/monday\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (MondaySource.OAuth20, MondaySource.APIToken)\n )\n super().__init__("Monday", name)
\n\n\n
[docs]class DixaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, api_token: str, start_date: str, batch_size: Optional[int] = None\n ):\n """Airbyte Source for Dixa.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/dixa\n\n Args:\n name (str): The name of the destination.\n api_token (str): Dixa API token\n start_date (str): The connector pulls records updated from this date onwards.\n batch_size (Optional[int]): Number of days to batch into one request. Max 31.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.batch_size = check.opt_int_param(batch_size, "batch_size")\n super().__init__("Dixa", name)
\n\n\n
[docs]class SalesforceSource(GeneratedAirbyteSource):\n
[docs] class FilterSalesforceObjectsEntry:\n
[docs] @public\n def __init__(self, criteria: str, value: str):\n self.criteria = check.str_param(criteria, "criteria")\n self.value = check.str_param(value, "value")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n is_sandbox: Optional[bool] = None,\n auth_type: Optional[str] = None,\n start_date: Optional[str] = None,\n streams_criteria: Optional[List[FilterSalesforceObjectsEntry]] = None,\n ):\n """Airbyte Source for Salesforce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/salesforce\n\n Args:\n name (str): The name of the destination.\n is_sandbox (Optional[bool]): Toggle if you're using a Salesforce Sandbox\n client_id (str): Enter your Salesforce developer application's Client ID\n client_secret (str): Enter your Salesforce developer application's Client secret\n refresh_token (str): Enter your application's Salesforce Refresh Token used for Airbyte to access your Salesforce account.\n start_date (Optional[str]): Enter the date in the YYYY-MM-DD format. Airbyte will replicate the data added on and after this date. If this field is blank, Airbyte will replicate all data.\n streams_criteria (Optional[List[SalesforceSource.FilterSalesforceObjectsEntry]]): Filter streams relevant to you\n """\n self.is_sandbox = check.opt_bool_param(is_sandbox, "is_sandbox")\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.streams_criteria = check.opt_nullable_list_param(\n streams_criteria, "streams_criteria", SalesforceSource.FilterSalesforceObjectsEntry\n )\n super().__init__("Salesforce", name)
\n\n\n
[docs]class PipedriveSource(GeneratedAirbyteSource):\n
[docs] class SignInViaPipedriveOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKeyAuthentication:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "Token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n authorization: Union[\n "PipedriveSource.SignInViaPipedriveOAuth", "PipedriveSource.APIKeyAuthentication"\n ],\n replication_start_date: str,\n ):\n """Airbyte Source for Pipedrive.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pipedrive\n\n Args:\n name (str): The name of the destination.\n authorization (Union[PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication]): Choose one of the possible authorization method\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. When specified and not None, then stream will behave as incremental\n """\n self.authorization = check.inst_param(\n authorization,\n "authorization",\n (PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication),\n )\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n super().__init__("Pipedrive", name)
\n\n\n
[docs]class FileSource(GeneratedAirbyteSource):\n
[docs] class HTTPSPublicWeb:\n
[docs] @public\n def __init__(self, user_agent: Optional[bool] = None):\n self.storage = "HTTPS"\n self.user_agent = check.opt_bool_param(user_agent, "user_agent")
\n\n
[docs] class GCSGoogleCloudStorage:\n
[docs] @public\n def __init__(self, service_account_json: Optional[str] = None):\n self.storage = "GCS"\n self.service_account_json = check.opt_str_param(\n service_account_json, "service_account_json"\n )
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n ):\n self.storage = "S3"\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] class AzBlobAzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n storage_account: str,\n sas_token: Optional[str] = None,\n shared_key: Optional[str] = None,\n ):\n self.storage = "AzBlob"\n self.storage_account = check.str_param(storage_account, "storage_account")\n self.sas_token = check.opt_str_param(sas_token, "sas_token")\n self.shared_key = check.opt_str_param(shared_key, "shared_key")
\n\n
[docs] class SSHSecureShell:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SSH"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SCPSecureCopyProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SCP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SFTPSecureFileTransferProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SFTP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class LocalFilesystemLimited:\n
[docs] @public\n def __init__(\n self,\n ):\n self.storage = "local"
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset_name: str,\n format: str,\n url: str,\n provider: Union[\n "FileSource.HTTPSPublicWeb",\n "FileSource.GCSGoogleCloudStorage",\n "FileSource.S3AmazonWebServices",\n "FileSource.AzBlobAzureBlobStorage",\n "FileSource.SSHSecureShell",\n "FileSource.SCPSecureCopyProtocol",\n "FileSource.SFTPSecureFileTransferProtocol",\n "FileSource.LocalFilesystemLimited",\n ],\n reader_options: Optional[str] = None,\n ):\n """Airbyte Source for File.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/file\n\n Args:\n name (str): The name of the destination.\n dataset_name (str): The Name of the final table to replicate this file into (should include letters, numbers dash and underscores only).\n format (str): The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs).\n reader_options (Optional[str]): This should be a string in JSON format. It depends on the chosen file format to provide additional options and tune its behavior.\n url (str): The URL path to access the file which should be replicated.\n provider (Union[FileSource.HTTPSPublicWeb, FileSource.GCSGoogleCloudStorage, FileSource.S3AmazonWebServices, FileSource.AzBlobAzureBlobStorage, FileSource.SSHSecureShell, FileSource.SCPSecureCopyProtocol, FileSource.SFTPSecureFileTransferProtocol, FileSource.LocalFilesystemLimited]): The storage Provider or Location of the file(s) which should be replicated.\n """\n self.dataset_name = check.str_param(dataset_name, "dataset_name")\n self.format = check.str_param(format, "format")\n self.reader_options = check.opt_str_param(reader_options, "reader_options")\n self.url = check.str_param(url, "url")\n self.provider = check.inst_param(\n provider,\n "provider",\n (\n FileSource.HTTPSPublicWeb,\n FileSource.GCSGoogleCloudStorage,\n FileSource.S3AmazonWebServices,\n FileSource.AzBlobAzureBlobStorage,\n FileSource.SSHSecureShell,\n 
FileSource.SCPSecureCopyProtocol,\n FileSource.SFTPSecureFileTransferProtocol,\n FileSource.LocalFilesystemLimited,\n ),\n )\n super().__init__("File", name)
\n\n\n
[docs]class GlassfrogSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Glassfrog.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/glassfrog\n\n Args:\n name (str): The name of the destination.\n api_key (str): API key provided by Glassfrog\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Glassfrog", name)
\n\n\n
[docs]class ChartmogulSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str, interval: str):\n """Airbyte Source for Chartmogul.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/chartmogul\n\n Args:\n name (str): The name of the destination.\n api_key (str): Chartmogul API key\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. When feasible, any data before this date will not be replicated.\n interval (str): Some APIs such as Metrics require intervals to cluster data.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.interval = check.str_param(interval, "interval")\n super().__init__("Chartmogul", name)
\n\n\n
[docs]class OrbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_key: str,\n start_date: Optional[str] = None,\n lookback_window_days: Optional[int] = None,\n string_event_properties_keys: Optional[List[str]] = None,\n numeric_event_properties_keys: Optional[List[str]] = None,\n ):\n """Airbyte Source for Orb.\n\n Documentation can be found at https://docs.withorb.com/\n\n Args:\n name (str): The name of the destination.\n api_key (str): Orb API Key, issued from the Orb admin console.\n start_date (Optional[str]): UTC date and time in the format 2022-03-01T00:00:00Z. Any data with created_at before this data will not be synced.\n lookback_window_days (Optional[int]): When set to N, the connector will always refresh resources created within the past N days. By default, updated objects that are not newly created are not incrementally synced.\n string_event_properties_keys (Optional[List[str]]): Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.\n numeric_event_properties_keys (Optional[List[str]]): Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n self.string_event_properties_keys = check.opt_nullable_list_param(\n string_event_properties_keys, "string_event_properties_keys", str\n )\n self.numeric_event_properties_keys = check.opt_nullable_list_param(\n numeric_event_properties_keys, "numeric_event_properties_keys", str\n )\n super().__init__("Orb", name)
\n\n\n
[docs]class CockroachdbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Cockroachdb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/cockroachdb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt client/server communications for increased security.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Cockroachdb", name)
\n\n\n
[docs]class ConfluenceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, domain_name: str, email: str):\n """Airbyte Source for Confluence.\n\n Args:\n name (str): The name of the destination.\n api_token (str): Please follow the Jira confluence for generating an API token: https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/\n domain_name (str): Your Confluence domain name\n email (str): Your Confluence login email\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.email = check.str_param(email, "email")\n super().__init__("Confluence", name)
\n\n\n
[docs]class PlaidSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n api_key: str,\n client_id: str,\n plaid_env: str,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Plaid.\n\n Documentation can be found at https://plaid.com/docs/api/\n\n Args:\n name (str): The name of the destination.\n access_token (str): The end-user's Link access token.\n api_key (str): The Plaid API key to use to hit the API.\n client_id (str): The Plaid client id\n plaid_env (str): The Plaid environment\n start_date (Optional[str]): The date from which you'd like to replicate data for Plaid in the format YYYY-MM-DD. All data generated after this date will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.api_key = check.str_param(api_key, "api_key")\n self.client_id = check.str_param(client_id, "client_id")\n self.plaid_env = check.str_param(plaid_env, "plaid_env")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Plaid", name)
\n\n\n
[docs]class SnapchatMarketingSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n ):\n """Airbyte Source for Snapchat Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/snapchat-marketing\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Snapchat developer application.\n client_secret (str): The Client Secret of your Snapchat developer application.\n refresh_token (str): Refresh Token to renew the expired Access Token.\n start_date (Optional[str]): Date in the format 2022-01-01. Any data before this date will not be replicated.\n end_date (Optional[str]): Date in the format 2017-01-25. Any data after this date will not be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n super().__init__("Snapchat Marketing", name)
\n\n\n
[docs]class MicrosoftTeamsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaMicrosoftOAuth20:\n
[docs] @public\n def __init__(\n self,\n tenant_id: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.tenant_id = check.str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateViaMicrosoft:\n
[docs] @public\n def __init__(\n self,\n tenant_id: str,\n client_id: str,\n client_secret: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.tenant_id = check.str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n period: str,\n credentials: Union[\n "MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20",\n "MicrosoftTeamsSource.AuthenticateViaMicrosoft",\n ],\n ):\n """Airbyte Source for Microsoft Teams.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/microsoft-teams\n\n Args:\n name (str): The name of the destination.\n period (str): Specifies the length of time over which the Team Device Report stream is aggregated. The supported values are: D7, D30, D90, and D180.\n credentials (Union[MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20, MicrosoftTeamsSource.AuthenticateViaMicrosoft]): Choose how to authenticate to Microsoft\n """\n self.period = check.str_param(period, "period")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20,\n MicrosoftTeamsSource.AuthenticateViaMicrosoft,\n ),\n )\n super().__init__("Microsoft Teams", name)
\n\n\n
[docs]class LeverHiringSource(GeneratedAirbyteSource):\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(\n self,\n refresh_token: str,\n auth_type: Optional[str] = None,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "LeverHiringSource.OAuthCredentials",\n start_date: str,\n environment: Optional[str] = None,\n ):\n """Airbyte Source for Lever Hiring.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/lever-hiring\n\n Args:\n name (str): The name of the destination.\n credentials (LeverHiringSource.OAuthCredentials): Choose how to authenticate to Lever Hiring.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Note that it will be used only in the following incremental streams: comments, commits, and issues.\n environment (Optional[str]): The environment in which you'd like to replicate data for Lever. This is used to determine which Lever API endpoint to use.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", LeverHiringSource.OAuthCredentials\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.environment = check.opt_str_param(environment, "environment")\n super().__init__("Lever Hiring", name)
\n\n\n
[docs]class TwilioSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_sid: str,\n auth_token: str,\n start_date: str,\n lookback_window: Optional[int] = None,\n ):\n """Airbyte Source for Twilio.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/twilio\n\n Args:\n name (str): The name of the destination.\n account_sid (str): Twilio account SID\n auth_token (str): Twilio Auth Token.\n start_date (str): UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.\n lookback_window (Optional[int]): How far into the past to look for records. (in minutes)\n """\n self.account_sid = check.str_param(account_sid, "account_sid")\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window = check.opt_int_param(lookback_window, "lookback_window")\n super().__init__("Twilio", name)
\n\n\n
[docs]class StripeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n client_secret: str,\n start_date: str,\n lookback_window_days: Optional[int] = None,\n slice_range: Optional[int] = None,\n ):\n r"""Airbyte Source for Stripe.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/stripe\n\n Args:\n name (str): The name of the destination.\n account_id (str): Your Stripe account ID (starts with 'acct\\\\_', find yours here).\n client_secret (str): Stripe API key (usually starts with 'sk_live\\\\_'; find yours here).\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Only data generated after this date will be replicated.\n lookback_window_days (Optional[int]): When set, the connector will always re-export data from the past N days, where N is the value set here. This is useful if your data is frequently updated after creation. More info here\n slice_range (Optional[int]): The time increment used by the connector when requesting data from the Stripe API. The bigger the value is, the less requests will be made and faster the sync will be. On the other hand, the more seldom the state is persisted.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n self.slice_range = check.opt_int_param(slice_range, "slice_range")\n super().__init__("Stripe", name)
\n\n\n
[docs]class Db2Source(GeneratedAirbyteSource):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str, key_store_password: Optional[str] = None):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")\n self.key_store_password = check.opt_str_param(key_store_password, "key_store_password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n db: str,\n username: str,\n password: str,\n encryption: Union["Db2Source.Unencrypted", "Db2Source.TLSEncryptedVerifyCertificate"],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Db2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/db2\n\n Args:\n name (str): The name of the destination.\n host (str): Host of the Db2.\n port (int): Port of the database.\n db (str): Name of the database.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n encryption (Union[Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate]): Encryption method to use when communicating with the database\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.db = check.str_param(db, "db")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate),\n )\n super().__init__("Db2", name)
\n\n\n
[docs]class SlackSource(GeneratedAirbyteSource):\n
[docs] class DefaultOAuth20Authorization:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n refresh_token: Optional[str] = None,\n ):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class APITokenCredentials:\n
[docs] @public\n def __init__(self, api_token: str):\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n lookback_window: int,\n join_channels: bool,\n credentials: Union[\n "SlackSource.DefaultOAuth20Authorization", "SlackSource.APITokenCredentials"\n ],\n channel_filter: Optional[List[str]] = None,\n ):\n """Airbyte Source for Slack.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/slack\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n lookback_window (int): How far into the past to look for messages in threads.\n join_channels (bool): Whether to join all channels or to sync data only from channels the bot is already in. If false, you'll need to manually add the bot to all the channels from which you'd like to sync messages.\n channel_filter (Optional[List[str]]): A channel name list (without leading '#' char) which limit the channels from which you'd like to sync. Empty list means no filter.\n credentials (Union[SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials]): Choose how to authenticate into Slack\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window = check.int_param(lookback_window, "lookback_window")\n self.join_channels = check.bool_param(join_channels, "join_channels")\n self.channel_filter = check.opt_nullable_list_param(channel_filter, "channel_filter", str)\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials),\n )\n super().__init__("Slack", name)
\n\n\n
[docs]class RechargeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Recharge.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/recharge\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Recharge API, in the format YYYY-MM-DDT00:00:00Z. Any data before this date will not be replicated.\n access_token (str): The value of the Access Token generated. See the docs for more information.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Recharge", name)
\n\n\n
[docs]class OpenweatherSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n lat: str,\n lon: str,\n appid: str,\n units: Optional[str] = None,\n lang: Optional[str] = None,\n ):\n """Airbyte Source for Openweather.\n\n Args:\n name (str): The name of the destination.\n lat (str): Latitude for which you want to get weather condition from. (min -90, max 90)\n lon (str): Longitude for which you want to get weather condition from. (min -180, max 180)\n appid (str): Your OpenWeather API Key. See here. The key is case sensitive.\n units (Optional[str]): Units of measurement. standard, metric and imperial units are available. If you do not use the units parameter, standard units will be applied by default.\n lang (Optional[str]): You can use lang parameter to get the output in your language. The contents of the description field will be translated. See here for the list of supported languages.\n """\n self.lat = check.str_param(lat, "lat")\n self.lon = check.str_param(lon, "lon")\n self.appid = check.str_param(appid, "appid")\n self.units = check.opt_str_param(units, "units")\n self.lang = check.opt_str_param(lang, "lang")\n super().__init__("Openweather", name)
\n\n\n
[docs]class RetentlySource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaRetentlyOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateWithAPIToken:\n
[docs] @public\n def __init__(self, api_key: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "RetentlySource.AuthenticateViaRetentlyOAuth", "RetentlySource.AuthenticateWithAPIToken"\n ],\n ):\n """Airbyte Source for Retently.\n\n Args:\n name (str): The name of the destination.\n credentials (Union[RetentlySource.AuthenticateViaRetentlyOAuth, RetentlySource.AuthenticateWithAPIToken]): Choose how to authenticate to Retently\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (RetentlySource.AuthenticateViaRetentlyOAuth, RetentlySource.AuthenticateWithAPIToken),\n )\n super().__init__("Retently", name)
\n\n\n
[docs]class ScaffoldSourceHttpSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, TODO: str):\n """Airbyte Source for Scaffold Source Http.\n\n Args:\n name (str): The name of the destination.\n TODO (str): describe me\n """\n self.TODO = check.str_param(TODO, "TODO")\n super().__init__("Scaffold Source Http", name)
\n\n\n
[docs]class YandexMetricaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, auth_token: str, counter_id: str, start_date: str, end_date: str):\n """Airbyte Source for Yandex Metrica.\n\n Args:\n name (str): The name of the destination.\n auth_token (str): Your Yandex Metrica API access token\n counter_id (str): Counter ID\n start_date (str): UTC date and time in the format YYYY-MM-DD.\n end_date (str): UTC date and time in the format YYYY-MM-DD.\n """\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.counter_id = check.str_param(counter_id, "counter_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.str_param(end_date, "end_date")\n super().__init__("Yandex Metrica", name)
\n\n\n
[docs]class TalkdeskExploreSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n auth_url: str,\n api_key: str,\n timezone: Optional[str] = None,\n ):\n """Airbyte Source for Talkdesk Explore.\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Talkdesk Explore API, in the format YYYY-MM-DDT00:00:00. All data generated after this date will be replicated.\n timezone (Optional[str]): Timezone to use when generating reports. Only IANA timezones are supported (https://nodatime.org/TimeZones)\n auth_url (str): Talkdesk Auth URL. Only 'client_credentials' auth type supported at the moment.\n api_key (str): Talkdesk API key.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.timezone = check.opt_str_param(timezone, "timezone")\n self.auth_url = check.str_param(auth_url, "auth_url")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Talkdesk Explore", name)
\n\n\n
[docs]class ChargifySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, domain: str):\n """Airbyte Source for Chargify.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/chargify\n\n Args:\n name (str): The name of the destination.\n api_key (str): Chargify API Key.\n domain (str): Chargify domain. Normally this domain follows the following format companyname.chargify.com\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.domain = check.str_param(domain, "domain")\n super().__init__("Chargify", name)
\n\n\n
[docs]class RkiCovidSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str):\n """Airbyte Source for Rki Covid.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/rki-covid\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date in the format 2017-01-25. Any data before this date will not be replicated.\n """\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Rki Covid", name)
\n\n\n
[docs]class PostgresSource(GeneratedAirbyteSource):\n
[docs] class Disable:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "disable"
\n\n
[docs] class Allow:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "allow"
\n\n
[docs] class Prefer:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "prefer"
\n\n
[docs] class Require:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "require"
\n\n
[docs] class VerifyCa:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyFull:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-full"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self,\n replication_slot: str,\n publication: str,\n plugin: Optional[str] = None,\n initial_waiting_seconds: Optional[int] = None,\n ):\n self.method = "CDC"\n self.plugin = check.opt_str_param(plugin, "plugin")\n self.replication_slot = check.str_param(replication_slot, "replication_slot")\n self.publication = check.str_param(publication, "publication")\n self.initial_waiting_seconds = check.opt_int_param(\n initial_waiting_seconds, "initial_waiting_seconds"\n )
\n\n
[docs] class NoTunnel:\n
[docs] @public\n def __init__(\n self,\n ):\n self.tunnel_method = "NO_TUNNEL"
\n\n
[docs] class SSHKeyAuthentication:\n
[docs] @public\n def __init__(self, tunnel_host: str, tunnel_port: int, tunnel_user: str, ssh_key: str):\n self.tunnel_method = "SSH_KEY_AUTH"\n self.tunnel_host = check.str_param(tunnel_host, "tunnel_host")\n self.tunnel_port = check.int_param(tunnel_port, "tunnel_port")\n self.tunnel_user = check.str_param(tunnel_user, "tunnel_user")\n self.ssh_key = check.str_param(ssh_key, "ssh_key")
\n\n
[docs] class PasswordAuthentication:\n
[docs] @public\n def __init__(\n self, tunnel_host: str, tunnel_port: int, tunnel_user: str, tunnel_user_password: str\n ):\n self.tunnel_method = "SSH_PASSWORD_AUTH"\n self.tunnel_host = check.str_param(tunnel_host, "tunnel_host")\n self.tunnel_port = check.int_param(tunnel_port, "tunnel_port")\n self.tunnel_user = check.str_param(tunnel_user, "tunnel_user")\n self.tunnel_user_password = check.str_param(\n tunnel_user_password, "tunnel_user_password"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_mode: Union[\n "PostgresSource.Disable",\n "PostgresSource.Allow",\n "PostgresSource.Prefer",\n "PostgresSource.Require",\n "PostgresSource.VerifyCa",\n "PostgresSource.VerifyFull",\n ],\n replication_method: Union[\n "PostgresSource.Standard", "PostgresSource.LogicalReplicationCDC"\n ],\n tunnel_method: Union[\n "PostgresSource.NoTunnel",\n "PostgresSource.SSHKeyAuthentication",\n "PostgresSource.PasswordAuthentication",\n ],\n schemas: Optional[List[str]] = None,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Postgres.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n schemas (Optional[List[str]]): The list of schemas (case sensitive) to sync from. Defaults to public.\n username (str): Username to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL. When activating SSL, please select one of the connection modes.\n ssl_mode (Union[PostgresSource.Disable, PostgresSource.Allow, PostgresSource.Prefer, PostgresSource.Require, PostgresSource.VerifyCa, PostgresSource.VerifyFull]): SSL connection modes. 
disable - Disables encryption of communication between Airbyte and source database allow - Enables encryption only when required by the source database prefer - allows unencrypted connection only if the source database does not support encryption require - Always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Always require encryption and verifies that the source database server has a valid SSL certificate verify-full - This is the most secure mode. Always require encryption and verifies the identity of the source database server Read more in the docs.\n replication_method (Union[PostgresSource.Standard, PostgresSource.LogicalReplicationCDC]): Replication method for extracting data from the database.\n tunnel_method (Union[PostgresSource.NoTunnel, PostgresSource.SSHKeyAuthentication, PostgresSource.PasswordAuthentication]): Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n PostgresSource.Disable,\n PostgresSource.Allow,\n PostgresSource.Prefer,\n PostgresSource.Require,\n PostgresSource.VerifyCa,\n PostgresSource.VerifyFull,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (PostgresSource.Standard, PostgresSource.LogicalReplicationCDC),\n )\n self.tunnel_method = check.inst_param(\n tunnel_method,\n "tunnel_method",\n (\n PostgresSource.NoTunnel,\n 
PostgresSource.SSHKeyAuthentication,\n PostgresSource.PasswordAuthentication,\n ),\n )\n super().__init__("Postgres", name)
\n\n\n
[docs]class TrelloSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n token: str,\n key: str,\n start_date: str,\n board_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Trello.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/trello\n\n Args:\n name (str): The name of the destination.\n token (str): Trello v API token. See the docs for instructions on how to generate it.\n key (str): Trello API key. See the docs for instructions on how to generate it.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n board_ids (Optional[List[str]]): IDs of the boards to replicate data from. If left empty, data from all boards to which you have access will be replicated.\n """\n self.token = check.str_param(token, "token")\n self.key = check.str_param(key, "key")\n self.start_date = check.str_param(start_date, "start_date")\n self.board_ids = check.opt_nullable_list_param(board_ids, "board_ids", str)\n super().__init__("Trello", name)
\n\n\n
[docs]class PrestashopSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, url: str, access_key: str):\n """Airbyte Source for Prestashop.\n\n Args:\n name (str): The name of the destination.\n url (str): Shop URL without trailing slash (domain name or IP address)\n access_key (str): Your PrestaShop access key. See the docs for info on how to obtain this.\n """\n self.url = check.str_param(url, "url")\n self.access_key = check.str_param(access_key, "access_key")\n super().__init__("Prestashop", name)
\n\n\n
[docs]class PaystackSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n secret_key: str,\n start_date: str,\n lookback_window_days: Optional[int] = None,\n ):\n r"""Airbyte Source for Paystack.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/paystack\n\n Args:\n name (str): The name of the destination.\n secret_key (str): The Paystack API key (usually starts with 'sk_live\\\\_'; find yours here).\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n lookback_window_days (Optional[int]): When set, the connector will always reload data from the past N days, where N is the value set here. This is useful if your data is updated after creation.\n """\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n super().__init__("Paystack", name)
\n\n\n
[docs]class S3Source(GeneratedAirbyteSource):\n
[docs] class CSV:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n delimiter: Optional[str] = None,\n infer_datatypes: Optional[bool] = None,\n quote_char: Optional[str] = None,\n escape_char: Optional[str] = None,\n encoding: Optional[str] = None,\n double_quote: Optional[bool] = None,\n newlines_in_values: Optional[bool] = None,\n additional_reader_options: Optional[str] = None,\n advanced_options: Optional[str] = None,\n block_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.delimiter = check.opt_str_param(delimiter, "delimiter")\n self.infer_datatypes = check.opt_bool_param(infer_datatypes, "infer_datatypes")\n self.quote_char = check.opt_str_param(quote_char, "quote_char")\n self.escape_char = check.opt_str_param(escape_char, "escape_char")\n self.encoding = check.opt_str_param(encoding, "encoding")\n self.double_quote = check.opt_bool_param(double_quote, "double_quote")\n self.newlines_in_values = check.opt_bool_param(newlines_in_values, "newlines_in_values")\n self.additional_reader_options = check.opt_str_param(\n additional_reader_options, "additional_reader_options"\n )\n self.advanced_options = check.opt_str_param(advanced_options, "advanced_options")\n self.block_size = check.opt_int_param(block_size, "block_size")
\n\n
[docs] class Parquet:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n columns: Optional[List[str]] = None,\n batch_size: Optional[int] = None,\n buffer_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.columns = check.opt_nullable_list_param(columns, "columns", str)\n self.batch_size = check.opt_int_param(batch_size, "batch_size")\n self.buffer_size = check.opt_int_param(buffer_size, "buffer_size")
\n\n
[docs] class Avro:\n
[docs] @public\n def __init__(self, filetype: Optional[str] = None):\n self.filetype = check.opt_str_param(filetype, "filetype")
\n\n
[docs] class Jsonl:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n newlines_in_values: Optional[bool] = None,\n unexpected_field_behavior: Optional[str] = None,\n block_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.newlines_in_values = check.opt_bool_param(newlines_in_values, "newlines_in_values")\n self.unexpected_field_behavior = check.opt_str_param(\n unexpected_field_behavior, "unexpected_field_behavior"\n )\n self.block_size = check.opt_int_param(block_size, "block_size")
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n bucket: str,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n path_prefix: Optional[str] = None,\n endpoint: Optional[str] = None,\n ):\n self.bucket = check.str_param(bucket, "bucket")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n self.path_prefix = check.opt_str_param(path_prefix, "path_prefix")\n self.endpoint = check.opt_str_param(endpoint, "endpoint")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset: str,\n path_pattern: str,\n format: Union["S3Source.CSV", "S3Source.Parquet", "S3Source.Avro", "S3Source.Jsonl"],\n provider: "S3Source.S3AmazonWebServices",\n schema: Optional[str] = None,\n ):\n """Airbyte Source for S3.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/s3\n\n Args:\n name (str): The name of the destination.\n dataset (str): The name of the stream you would like this source to output. Can contain letters, numbers, or underscores.\n path_pattern (str): A regular expression which tells the connector which files to replicate. All files which match this pattern will be replicated. Use | to separate multiple patterns. See this page to understand pattern syntax (GLOBSTAR and SPLIT flags are enabled). Use pattern ** to pick up all files.\n format (Union[S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl]): The format of the files you'd like to replicate\n schema (Optional[str]): Optionally provide a schema to enforce, as a valid JSON string. Ensure this is a mapping of { "column" : "type" }, where types are valid JSON Schema datatypes. Leave as {} to auto-infer the schema.\n provider (S3Source.S3AmazonWebServices): Use this to load files from S3 or S3-compatible services\n """\n self.dataset = check.str_param(dataset, "dataset")\n self.path_pattern = check.str_param(path_pattern, "path_pattern")\n self.format = check.inst_param(\n format, "format", (S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl)\n )\n self.schema = check.opt_str_param(schema, "schema")\n self.provider = check.inst_param(provider, "provider", S3Source.S3AmazonWebServices)\n super().__init__("S3", name)
\n\n\n
[docs]class SnowflakeSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n self.auth_type = "OAuth"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class UsernameAndPassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.auth_type = "username/password"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["SnowflakeSource.OAuth20", "SnowflakeSource.UsernameAndPassword"],\n host: str,\n role: str,\n warehouse: str,\n database: str,\n schema: str,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Snowflake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/snowflake\n\n Args:\n name (str): The name of the destination.\n host (str): The host domain of the snowflake instance (must include the account, region, cloud environment, and end with snowflakecomputing.com).\n role (str): The role you created for Airbyte to access Snowflake.\n warehouse (str): The warehouse you created for Airbyte to access data.\n database (str): The database you created for Airbyte to access data.\n schema (str): The source Snowflake schema tables.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SnowflakeSource.OAuth20, SnowflakeSource.UsernameAndPassword),\n )\n self.host = check.str_param(host, "host")\n self.role = check.str_param(role, "role")\n self.warehouse = check.str_param(warehouse, "warehouse")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Snowflake", name)
\n\n\n
[docs]class AmplitudeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, secret_key: str, start_date: str):\n """Airbyte Source for Amplitude.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amplitude\n\n Args:\n name (str): The name of the destination.\n api_key (str): Amplitude API Key. See the setup guide for more information on how to obtain this key.\n secret_key (str): Amplitude Secret Key. See the setup guide for more information on how to obtain this key.\n start_date (str): UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Amplitude", name)
\n\n\n
[docs]class PosthogSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, api_key: str, base_url: Optional[str] = None):\n """Airbyte Source for Posthog.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/posthog\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate the data. Any data before this date will not be replicated.\n api_key (str): API Key. See the docs for information on how to generate this key.\n base_url (Optional[str]): Base PostHog url. Defaults to PostHog Cloud (https://app.posthog.com).\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.api_key = check.str_param(api_key, "api_key")\n self.base_url = check.opt_str_param(base_url, "base_url")\n super().__init__("Posthog", name)
\n\n\n
[docs]class PaypalTransactionSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n is_sandbox: bool,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n """Airbyte Source for Paypal Transaction.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/paypal-transactions\n\n Args:\n name (str): The name of the destination.\n client_id (Optional[str]): The Client ID of your Paypal developer application.\n client_secret (Optional[str]): The Client Secret of your Paypal developer application.\n refresh_token (Optional[str]): The key to refresh the expired access token.\n start_date (str): Start Date for data extraction in ISO format. Date must be in range from 3 years till 12 hrs before present time.\n is_sandbox (bool): Determines whether to use the sandbox or production environment.\n """\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.is_sandbox = check.bool_param(is_sandbox, "is_sandbox")\n super().__init__("Paypal Transaction", name)
\n\n\n
[docs]class MssqlSource(GeneratedAirbyteSource):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "unencrypted"
\n\n
[docs] class EncryptedTrustServerCertificate:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "encrypted_trust_server_certificate"
\n\n
[docs] class EncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, hostNameInCertificate: Optional[str] = None):\n self.ssl_method = "encrypted_verify_certificate"\n self.hostNameInCertificate = check.opt_str_param(\n hostNameInCertificate, "hostNameInCertificate"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "STANDARD"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self, data_to_sync: Optional[str] = None, snapshot_isolation: Optional[str] = None\n ):\n self.method = "CDC"\n self.data_to_sync = check.opt_str_param(data_to_sync, "data_to_sync")\n self.snapshot_isolation = check.opt_str_param(snapshot_isolation, "snapshot_isolation")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_method: Union[\n "MssqlSource.Unencrypted",\n "MssqlSource.EncryptedTrustServerCertificate",\n "MssqlSource.EncryptedVerifyCertificate",\n ],\n replication_method: Union["MssqlSource.Standard", "MssqlSource.LogicalReplicationCDC"],\n schemas: Optional[List[str]] = None,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Mssql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql\n\n Args:\n name (str): The name of the destination.\n host (str): The hostname of the database.\n port (int): The port of the database.\n database (str): The name of the database.\n schemas (Optional[List[str]]): The list of schemas to sync from. Defaults to user. Case sensitive.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n ssl_method (Union[MssqlSource.Unencrypted, MssqlSource.EncryptedTrustServerCertificate, MssqlSource.EncryptedVerifyCertificate]): The encryption method which is used when communicating with the database.\n replication_method (Union[MssqlSource.Standard, MssqlSource.LogicalReplicationCDC]): The replication method used for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses {TBC} to detect inserts, updates, and deletes. 
This needs to be configured on the source database itself.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl_method = check.inst_param(\n ssl_method,\n "ssl_method",\n (\n MssqlSource.Unencrypted,\n MssqlSource.EncryptedTrustServerCertificate,\n MssqlSource.EncryptedVerifyCertificate,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (MssqlSource.Standard, MssqlSource.LogicalReplicationCDC),\n )\n super().__init__("Mssql", name)
\n\n\n
[docs]class ZohoCrmSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n dc_region: str,\n environment: str,\n edition: str,\n start_datetime: Optional[str] = None,\n ):\n """Airbyte Source for Zoho Crm.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zoho-crm\n\n Args:\n name (str): The name of the destination.\n client_id (str): OAuth2.0 Client ID\n client_secret (str): OAuth2.0 Client Secret\n refresh_token (str): OAuth2.0 Refresh Token\n dc_region (str): Please choose the region of your Data Center location. More info by this Link\n environment (str): Please choose the environment\n start_datetime (Optional[str]): ISO 8601, for instance: `YYYY-MM-DD`, `YYYY-MM-DD HH:MM:SS+HH:MM`\n edition (str): Choose your Edition of Zoho CRM to determine API Concurrency Limits\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.dc_region = check.str_param(dc_region, "dc_region")\n self.environment = check.str_param(environment, "environment")\n self.start_datetime = check.opt_str_param(start_datetime, "start_datetime")\n self.edition = check.str_param(edition, "edition")\n super().__init__("Zoho Crm", name)
\n\n\n
[docs]class RedshiftSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: str,\n schemas: Optional[List[str]] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Redshift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift\n\n Args:\n name (str): The name of the destination.\n host (str): Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com).\n port (int): Port of the database.\n database (str): Name of the database.\n schemas (Optional[List[str]]): The list of schemas to sync from. Specify one or more explicitly or keep empty to process all schemas. Schema names are case sensitive.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Redshift", name)
\n\n\n
[docs]class AsanaSource(GeneratedAirbyteSource):\n
[docs] class PATCredentials:\n
[docs] @public\n def __init__(self, personal_access_token: str):\n self.personal_access_token = check.str_param(\n personal_access_token, "personal_access_token"\n )
\n\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["AsanaSource.PATCredentials", "AsanaSource.OAuthCredentials"],\n ):\n """Airbyte Source for Asana.\n\n Args:\n name (str): The name of the destination.\n credentials (Union[AsanaSource.PATCredentials, AsanaSource.OAuthCredentials]): Choose how to authenticate to Github\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (AsanaSource.PATCredentials, AsanaSource.OAuthCredentials)\n )\n super().__init__("Asana", name)
\n\n\n
[docs]class SmartsheetsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n spreadsheet_id: str,\n start_datetime: Optional[str] = None,\n ):\n """Airbyte Source for Smartsheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/smartsheets\n\n Args:\n name (str): The name of the destination.\n access_token (str): The access token to use for accessing your data from Smartsheets. This access token must be generated by a user with at least read access to the data you'd like to replicate. Generate an access token in the Smartsheets main menu by clicking Account > Apps & Integrations > API Access. See the setup guide for information on how to obtain this token.\n spreadsheet_id (str): The spreadsheet ID. Find it by opening the spreadsheet then navigating to File > Properties\n start_datetime (Optional[str]): Only rows modified after this date/time will be replicated. This should be an ISO 8601 string, for instance: `2000-01-01T13:00:00`\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.start_datetime = check.opt_str_param(start_datetime, "start_datetime")\n super().__init__("Smartsheets", name)
\n\n\n
[docs]class MailchimpSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, apikey: str):\n self.auth_type = "apikey"\n self.apikey = check.str_param(apikey, "apikey")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["MailchimpSource.OAuth20", "MailchimpSource.APIKey"]\n ):\n """Airbyte Source for Mailchimp.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mailchimp\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (MailchimpSource.OAuth20, MailchimpSource.APIKey)\n )\n super().__init__("Mailchimp", name)
\n\n\n
[docs]class SentrySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n auth_token: str,\n organization: str,\n project: str,\n hostname: Optional[str] = None,\n discover_fields: Optional[List[str]] = None,\n ):\n """Airbyte Source for Sentry.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/sentry\n\n Args:\n name (str): The name of the destination.\n auth_token (str): Log into Sentry and then create authentication tokens.For self-hosted, you can find or create authentication tokens by visiting "{instance_url_prefix}/settings/account/api/auth-tokens/"\n hostname (Optional[str]): Host name of Sentry API server.For self-hosted, specify your host name here. Otherwise, leave it empty.\n organization (str): The slug of the organization the groups belong to.\n project (str): The name (slug) of the Project you want to sync.\n discover_fields (Optional[List[str]]): Fields to retrieve when fetching discover events\n """\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.hostname = check.opt_str_param(hostname, "hostname")\n self.organization = check.str_param(organization, "organization")\n self.project = check.str_param(project, "project")\n self.discover_fields = check.opt_nullable_list_param(\n discover_fields, "discover_fields", str\n )\n super().__init__("Sentry", name)
\n\n\n
[docs]class MailgunSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n private_key: str,\n domain_region: Optional[str] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Mailgun.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mailgun\n\n Args:\n name (str): The name of the destination.\n private_key (str): Primary account API key to access your Mailgun data.\n domain_region (Optional[str]): Domain region code. 'EU' or 'US' are possible values. The default is 'US'.\n start_date (Optional[str]): UTC date and time in the format 2020-10-01 00:00:00. Any data before this date will not be replicated. If omitted, defaults to 3 days ago.\n """\n self.private_key = check.str_param(private_key, "private_key")\n self.domain_region = check.opt_str_param(domain_region, "domain_region")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Mailgun", name)
\n\n\n
[docs]class OnesignalSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, user_auth_key: str, start_date: str, outcome_names: str):\n """Airbyte Source for Onesignal.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/onesignal\n\n Args:\n name (str): The name of the destination.\n user_auth_key (str): OneSignal User Auth Key, see the docs for more information on how to obtain this key.\n start_date (str): The date from which you'd like to replicate data for OneSignal API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n outcome_names (str): Comma-separated list of names and the value (sum/count) for the returned outcome data. See the docs for more details\n """\n self.user_auth_key = check.str_param(user_auth_key, "user_auth_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.outcome_names = check.str_param(outcome_names, "outcome_names")\n super().__init__("Onesignal", name)
\n\n\n
[docs]class PythonHttpTutorialSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, base: str, access_key: Optional[str] = None):\n """Airbyte Source for Python Http Tutorial.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi\n\n Args:\n name (str): The name of the destination.\n access_key (Optional[str]): API access key used to retrieve data from the Exchange Rates API.\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.\n base (str): ISO reference currency. See here.\n """\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.base = check.str_param(base, "base")\n super().__init__("Python Http Tutorial", name)
\n\n\n
[docs]class AirtableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, base_id: str, tables: List[str]):\n """Airbyte Source for Airtable.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/airtable\n\n Args:\n name (str): The name of the destination.\n api_key (str): The API Key for the Airtable account. See the Support Guide for more information on how to obtain this key.\n base_id (str): The Base ID to integrate the data from. You can find the Base ID following the link Airtable API, log in to your account, select the base you need and find Base ID in the docs.\n tables (List[str]): The list of Tables to integrate.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.base_id = check.str_param(base_id, "base_id")\n self.tables = check.list_param(tables, "tables", str)\n super().__init__("Airtable", name)
\n\n\n
[docs]class MongodbV2Source(GeneratedAirbyteSource):\n
[docs] class StandaloneMongoDbInstance:\n
[docs] @public\n def __init__(self, instance: str, host: str, port: int, tls: Optional[bool] = None):\n self.instance = check.str_param(instance, "instance")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.tls = check.opt_bool_param(tls, "tls")
\n\n
[docs] class ReplicaSet:\n
[docs] @public\n def __init__(self, instance: str, server_addresses: str, replica_set: Optional[str] = None):\n self.instance = check.str_param(instance, "instance")\n self.server_addresses = check.str_param(server_addresses, "server_addresses")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")
\n\n
[docs] class MongoDBAtlas:\n
[docs] @public\n def __init__(self, instance: str, cluster_url: str):\n self.instance = check.str_param(instance, "instance")\n self.cluster_url = check.str_param(cluster_url, "cluster_url")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_type: Union[\n "MongodbV2Source.StandaloneMongoDbInstance",\n "MongodbV2Source.ReplicaSet",\n "MongodbV2Source.MongoDBAtlas",\n ],\n database: str,\n user: Optional[str] = None,\n password: Optional[str] = None,\n auth_source: Optional[str] = None,\n ):\n """Airbyte Source for Mongodb V2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb-v2\n\n Args:\n name (str): The name of the destination.\n instance_type (Union[MongodbV2Source.StandaloneMongoDbInstance, MongodbV2Source.ReplicaSet, MongodbV2Source.MongoDBAtlas]): The MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.\n database (str): The database you want to replicate.\n user (Optional[str]): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n auth_source (Optional[str]): The authentication source where the user information is stored.\n """\n self.instance_type = check.inst_param(\n instance_type,\n "instance_type",\n (\n MongodbV2Source.StandaloneMongoDbInstance,\n MongodbV2Source.ReplicaSet,\n MongodbV2Source.MongoDBAtlas,\n ),\n )\n self.database = check.str_param(database, "database")\n self.user = check.opt_str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.auth_source = check.opt_str_param(auth_source, "auth_source")\n super().__init__("Mongodb V2", name)
\n\n\n
[docs]class FileSecureSource(GeneratedAirbyteSource):\n
[docs] class HTTPSPublicWeb:\n
[docs] @public\n def __init__(self, user_agent: Optional[bool] = None):\n self.storage = "HTTPS"\n self.user_agent = check.opt_bool_param(user_agent, "user_agent")
\n\n
[docs] class GCSGoogleCloudStorage:\n
[docs] @public\n def __init__(self, service_account_json: Optional[str] = None):\n self.storage = "GCS"\n self.service_account_json = check.opt_str_param(\n service_account_json, "service_account_json"\n )
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n ):\n self.storage = "S3"\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] class AzBlobAzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n storage_account: str,\n sas_token: Optional[str] = None,\n shared_key: Optional[str] = None,\n ):\n self.storage = "AzBlob"\n self.storage_account = check.str_param(storage_account, "storage_account")\n self.sas_token = check.opt_str_param(sas_token, "sas_token")\n self.shared_key = check.opt_str_param(shared_key, "shared_key")
\n\n
[docs] class SSHSecureShell:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SSH"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SCPSecureCopyProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SCP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SFTPSecureFileTransferProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SFTP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset_name: str,\n format: str,\n url: str,\n provider: Union[\n "FileSecureSource.HTTPSPublicWeb",\n "FileSecureSource.GCSGoogleCloudStorage",\n "FileSecureSource.S3AmazonWebServices",\n "FileSecureSource.AzBlobAzureBlobStorage",\n "FileSecureSource.SSHSecureShell",\n "FileSecureSource.SCPSecureCopyProtocol",\n "FileSecureSource.SFTPSecureFileTransferProtocol",\n ],\n reader_options: Optional[str] = None,\n ):\n """Airbyte Source for File Secure.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/file\n\n Args:\n name (str): The name of the destination.\n dataset_name (str): The Name of the final table to replicate this file into (should include letters, numbers dash and underscores only).\n format (str): The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs).\n reader_options (Optional[str]): This should be a string in JSON format. It depends on the chosen file format to provide additional options and tune its behavior.\n url (str): The URL path to access the file which should be replicated.\n provider (Union[FileSecureSource.HTTPSPublicWeb, FileSecureSource.GCSGoogleCloudStorage, FileSecureSource.S3AmazonWebServices, FileSecureSource.AzBlobAzureBlobStorage, FileSecureSource.SSHSecureShell, FileSecureSource.SCPSecureCopyProtocol, FileSecureSource.SFTPSecureFileTransferProtocol]): The storage Provider or Location of the file(s) which should be replicated.\n """\n self.dataset_name = check.str_param(dataset_name, "dataset_name")\n self.format = check.str_param(format, "format")\n self.reader_options = check.opt_str_param(reader_options, "reader_options")\n self.url = check.str_param(url, "url")\n self.provider = check.inst_param(\n provider,\n "provider",\n (\n FileSecureSource.HTTPSPublicWeb,\n FileSecureSource.GCSGoogleCloudStorage,\n FileSecureSource.S3AmazonWebServices,\n FileSecureSource.AzBlobAzureBlobStorage,\n 
FileSecureSource.SSHSecureShell,\n FileSecureSource.SCPSecureCopyProtocol,\n FileSecureSource.SFTPSecureFileTransferProtocol,\n ),\n )\n super().__init__("File Secure", name)
\n\n\n
[docs]class ZendeskSupportSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, access_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, email: str, api_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.email = check.str_param(email, "email")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n subdomain: str,\n credentials: Union["ZendeskSupportSource.OAuth20", "ZendeskSupportSource.APIToken"],\n ):\n """Airbyte Source for Zendesk Support.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-support\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Zendesk Support API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n subdomain (str): This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.\n credentials (Union[ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken]): Zendesk service provides two authentication methods. Choose between: `OAuth2.0` or `API token`.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken),\n )\n super().__init__("Zendesk Support", name)
\n\n\n
[docs]class TempoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str):\n """Airbyte Source for Tempo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/\n\n Args:\n name (str): The name of the destination.\n api_token (str): Tempo API Token. Go to Tempo>Settings, scroll down to Data Access and select API integration.\n """\n self.api_token = check.str_param(api_token, "api_token")\n super().__init__("Tempo", name)
\n\n\n
[docs]class BraintreeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n merchant_id: str,\n public_key: str,\n private_key: str,\n environment: str,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Braintree.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/braintree\n\n Args:\n name (str): The name of the destination.\n merchant_id (str): The unique identifier for your entire gateway account. See the docs for more information on how to obtain this ID.\n public_key (str): Braintree Public Key. See the docs for more information on how to obtain this key.\n private_key (str): Braintree Private Key. See the docs for more information on how to obtain this key.\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n environment (str): Environment specifies where the data will come from.\n """\n self.merchant_id = check.str_param(merchant_id, "merchant_id")\n self.public_key = check.str_param(public_key, "public_key")\n self.private_key = check.str_param(private_key, "private_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.environment = check.str_param(environment, "environment")\n super().__init__("Braintree", name)
\n\n\n
[docs]class SalesloftSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, client_id: str, client_secret: str, refresh_token: str, start_date: str\n ):\n """Airbyte Source for Salesloft.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/salesloft\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Salesloft developer application.\n client_secret (str): The Client Secret of your Salesloft developer application.\n refresh_token (str): The token for obtaining a new access token.\n start_date (str): The date from which you'd like to replicate data for Salesloft API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Salesloft", name)
\n\n\n
[docs]class LinnworksSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, application_id: str, application_secret: str, token: str, start_date: str\n ):\n """Airbyte Source for Linnworks.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linnworks\n\n Args:\n name (str): The name of the destination.\n application_id (str): Linnworks Application ID\n application_secret (str): Linnworks Application Secret\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.application_id = check.str_param(application_id, "application_id")\n self.application_secret = check.str_param(application_secret, "application_secret")\n self.token = check.str_param(token, "token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Linnworks", name)
\n\n\n
[docs]class ChargebeeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, site: str, site_api_key: str, start_date: str, product_catalog: str\n ):\n """Airbyte Source for Chargebee.\n\n Documentation can be found at https://apidocs.chargebee.com/docs/api\n\n Args:\n name (str): The name of the destination.\n site (str): The site prefix for your Chargebee instance.\n site_api_key (str): Chargebee API Key. See the docs for more information on how to obtain this key.\n start_date (str): UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.\n product_catalog (str): Product Catalog version of your Chargebee site. Instructions on how to find your version you may find here under `API Version` section.\n """\n self.site = check.str_param(site, "site")\n self.site_api_key = check.str_param(site_api_key, "site_api_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.product_catalog = check.str_param(product_catalog, "product_catalog")\n super().__init__("Chargebee", name)
\n\n\n
[docs]class GoogleAnalyticsDataApiSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOauth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, credentials_json: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n property_id: str,\n credentials: Union[\n "GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth",\n "GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication",\n ],\n date_ranges_start_date: str,\n custom_reports: Optional[str] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Analytics Data Api.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-v4\n\n Args:\n name (str): The name of the destination.\n property_id (str): A Google Analytics GA4 property identifier whose events are tracked. Specified in the URL path and not the body\n credentials (Union[GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth, GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication]): Credentials for the service\n date_ranges_start_date (str): The start date. One of the values Ndaysago, yesterday, today or in the format YYYY-MM-DD\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.\n window_in_days (Optional[int]): The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. 
The minimum allowed value for this field is 1, and the maximum is 364.\n """\n self.property_id = check.str_param(property_id, "property_id")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth,\n GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication,\n ),\n )\n self.date_ranges_start_date = check.str_param(\n date_ranges_start_date, "date_ranges_start_date"\n )\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Google Analytics Data Api", name)
\n\n\n
[docs]class OutreachSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n redirect_uri: str,\n start_date: str,\n ):\n """Airbyte Source for Outreach.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/outreach\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Outreach developer application.\n client_secret (str): The Client Secret of your Outreach developer application.\n refresh_token (str): The token for obtaining the new access token.\n redirect_uri (str): A Redirect URI is the location where the authorization server sends the user once the app has been successfully authorized and granted an authorization code or access token.\n start_date (str): The date from which you'd like to replicate data for Outreach API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.redirect_uri = check.str_param(redirect_uri, "redirect_uri")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Outreach", name)
\n\n\n
[docs]class LemlistSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Lemlist.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/lemlist\n\n Args:\n name (str): The name of the destination.\n api_key (str): Lemlist API key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Lemlist", name)
\n\n\n
[docs]class ApifyDatasetSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, datasetId: str, clean: Optional[bool] = None):\n """Airbyte Source for Apify Dataset.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/apify-dataset\n\n Args:\n name (str): The name of the destination.\n datasetId (str): ID of the dataset you would like to load to Airbyte.\n clean (Optional[bool]): If set to true, only clean items will be downloaded from the dataset. See description of what clean means in Apify API docs. If not sure, set clean to false.\n """\n self.datasetId = check.str_param(datasetId, "datasetId")\n self.clean = check.opt_bool_param(clean, "clean")\n super().__init__("Apify Dataset", name)
\n\n\n
[docs]class RecurlySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_key: str,\n begin_time: Optional[str] = None,\n end_time: Optional[str] = None,\n ):\n """Airbyte Source for Recurly.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/recurly\n\n Args:\n name (str): The name of the destination.\n api_key (str): Recurly API Key. See the docs for more information on how to generate this key.\n begin_time (Optional[str]): ISO8601 timestamp from which the replication from Recurly API will start from.\n end_time (Optional[str]): ISO8601 timestamp to which the replication from Recurly API will stop. Records after that date won't be imported.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.begin_time = check.opt_str_param(begin_time, "begin_time")\n self.end_time = check.opt_str_param(end_time, "end_time")\n super().__init__("Recurly", name)
\n\n\n
[docs]class ZendeskTalkSource(GeneratedAirbyteSource):\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, email: str, api_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.email = check.str_param(email, "email")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, access_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n credentials: Union["ZendeskTalkSource.APIToken", "ZendeskTalkSource.OAuth20"],\n start_date: str,\n ):\n """Airbyte Source for Zendesk Talk.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-talk\n\n Args:\n name (str): The name of the destination.\n subdomain (str): This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.\n credentials (Union[ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20]): Zendesk service provides two authentication methods. Choose between: `OAuth2.0` or `API token`.\n start_date (str): The date from which you'd like to replicate data for Zendesk Talk API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials, "credentials", (ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20)\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Zendesk Talk", name)
\n\n\n
[docs]class SftpSource(GeneratedAirbyteSource):\n
[docs] class PasswordAuthentication:\n
[docs] @public\n def __init__(self, auth_user_password: str):\n self.auth_method = "SSH_PASSWORD_AUTH"\n self.auth_user_password = check.str_param(auth_user_password, "auth_user_password")
\n\n
[docs] class SSHKeyAuthentication:\n
[docs] @public\n def __init__(self, auth_ssh_key: str):\n self.auth_method = "SSH_KEY_AUTH"\n self.auth_ssh_key = check.str_param(auth_ssh_key, "auth_ssh_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n user: str,\n host: str,\n port: int,\n credentials: Union["SftpSource.PasswordAuthentication", "SftpSource.SSHKeyAuthentication"],\n file_types: Optional[str] = None,\n folder_path: Optional[str] = None,\n file_pattern: Optional[str] = None,\n ):\n """Airbyte Source for Sftp.\n\n Documentation can be found at https://docs.airbyte.com/integrations/source/sftp\n\n Args:\n name (str): The name of the destination.\n user (str): The server user\n host (str): The server host address\n port (int): The server port\n credentials (Union[SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication]): The server authentication method\n file_types (Optional[str]): Comma separated file types. Currently only 'csv' and 'json' types are supported.\n folder_path (Optional[str]): The directory to search files for sync\n file_pattern (Optional[str]): The regular expression to specify files for sync in a chosen Folder Path\n """\n self.user = check.str_param(user, "user")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication),\n )\n self.file_types = check.opt_str_param(file_types, "file_types")\n self.folder_path = check.opt_str_param(folder_path, "folder_path")\n self.file_pattern = check.opt_str_param(file_pattern, "file_pattern")\n super().__init__("Sftp", name)
\n\n\n
[docs]class WhiskyHunterSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n ):\n """Airbyte Source for Whisky Hunter.\n\n Documentation can be found at https://docs.airbyte.io/integrations/sources/whisky-hunter\n\n Args:\n name (str): The name of the destination.\n\n """\n super().__init__("Whisky Hunter", name)
\n\n\n
[docs]class FreshdeskSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n api_key: str,\n requests_per_minute: Optional[int] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Freshdesk.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshdesk\n\n Args:\n name (str): The name of the destination.\n domain (str): Freshdesk domain\n api_key (str): Freshdesk API Key. See the docs for more information on how to obtain this key.\n requests_per_minute (Optional[int]): The number of requests per minute that this source allowed to use. There is a rate limit of 50 requests per minute per app per account.\n start_date (Optional[str]): UTC date and time. Any data created after this date will be replicated. If this parameter is not set, all data will be replicated.\n """\n self.domain = check.str_param(domain, "domain")\n self.api_key = check.str_param(api_key, "api_key")\n self.requests_per_minute = check.opt_int_param(requests_per_minute, "requests_per_minute")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Freshdesk", name)
\n\n\n
[docs]class GocardlessSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n gocardless_environment: str,\n gocardless_version: str,\n start_date: str,\n ):\n """Airbyte Source for Gocardless.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gocardless\n\n Args:\n name (str): The name of the destination.\n access_token (str): Gocardless API TOKEN\n gocardless_environment (str): Environment you are trying to connect to.\n gocardless_version (str): GoCardless version. This is a date. You can find the latest here: https://developer.gocardless.com/api-reference/#api-usage-making-requests\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.gocardless_environment = check.str_param(\n gocardless_environment, "gocardless_environment"\n )\n self.gocardless_version = check.str_param(gocardless_version, "gocardless_version")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Gocardless", name)
\n\n\n
[docs]class ZuoraSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n tenant_endpoint: str,\n data_query: str,\n client_id: str,\n client_secret: str,\n window_in_days: Optional[str] = None,\n ):\n """Airbyte Source for Zuora.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zuora\n\n Args:\n name (str): The name of the destination.\n start_date (str): Start Date in format: YYYY-MM-DD\n window_in_days (Optional[str]): The amount of days for each data-chunk beginning from start_date. Bigger the value - faster the fetch. (0.1 - as for couple of hours, 1 - as for a Day; 364 - as for a Year).\n tenant_endpoint (str): Please choose the right endpoint where your Tenant is located. More info by this Link\n data_query (str): Choose between `Live`, or `Unlimited` - the optimized, replicated database at 12 hours freshness for high volume extraction Link\n client_id (str): Your OAuth user Client ID\n client_secret (str): Your OAuth user Client Secret\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.window_in_days = check.opt_str_param(window_in_days, "window_in_days")\n self.tenant_endpoint = check.str_param(tenant_endpoint, "tenant_endpoint")\n self.data_query = check.str_param(data_query, "data_query")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Zuora", name)
\n\n\n
[docs]class MarketoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, domain_url: str, client_id: str, client_secret: str, start_date: str\n ):\n """Airbyte Source for Marketo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/marketo\n\n Args:\n name (str): The name of the destination.\n domain_url (str): Your Marketo Base URL. See the docs for info on how to obtain this.\n client_id (str): The Client ID of your Marketo developer application. See the docs for info on how to obtain this.\n client_secret (str): The Client Secret of your Marketo developer application. See the docs for info on how to obtain this.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_url = check.str_param(domain_url, "domain_url")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Marketo", name)
\n\n\n
[docs]class DriftSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n refresh_token: str,\n credentials: Optional[str] = None,\n ):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["DriftSource.OAuth20", "DriftSource.AccessToken"]\n ):\n """Airbyte Source for Drift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/drift\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (DriftSource.OAuth20, DriftSource.AccessToken)\n )\n super().__init__("Drift", name)
\n\n\n
[docs]class PokeapiSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, pokemon_name: str):\n """Airbyte Source for Pokeapi.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pokeapi\n\n Args:\n name (str): The name of the destination.\n pokemon_name (str): Pokemon requested from the API.\n """\n self.pokemon_name = check.str_param(pokemon_name, "pokemon_name")\n super().__init__("Pokeapi", name)
\n\n\n
[docs]class NetsuiteSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n realm: str,\n consumer_key: str,\n consumer_secret: str,\n token_key: str,\n token_secret: str,\n start_datetime: str,\n object_types: Optional[List[str]] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Netsuite.\n\n Args:\n name (str): The name of the destination.\n realm (str): Netsuite realm e.g. 2344535, as for `production` or 2344535_SB1, as for the `sandbox`\n consumer_key (str): Consumer key associated with your integration\n consumer_secret (str): Consumer secret associated with your integration\n token_key (str): Access token key\n token_secret (str): Access token secret\n object_types (Optional[List[str]]): The API names of the Netsuite objects you want to sync. Setting this speeds up the connection setup process by limiting the number of schemas that need to be retrieved from Netsuite.\n start_datetime (str): Starting point for your data replication, in format of "YYYY-MM-DDTHH:mm:ssZ"\n window_in_days (Optional[int]): The amount of days used to query the data with date chunks. Set smaller value, if you have lots of data.\n """\n self.realm = check.str_param(realm, "realm")\n self.consumer_key = check.str_param(consumer_key, "consumer_key")\n self.consumer_secret = check.str_param(consumer_secret, "consumer_secret")\n self.token_key = check.str_param(token_key, "token_key")\n self.token_secret = check.str_param(token_secret, "token_secret")\n self.object_types = check.opt_nullable_list_param(object_types, "object_types", str)\n self.start_datetime = check.str_param(start_datetime, "start_datetime")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Netsuite", name)
\n\n\n
[docs]class HubplannerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Hubplanner.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/hubplanner\n\n Args:\n name (str): The name of the destination.\n api_key (str): Hubplanner API key. See https://github.com/hubplanner/API#authentication for more details.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Hubplanner", name)
\n\n\n
[docs]class Dv360Source(GeneratedAirbyteSource):\n
[docs] class Oauth2Credentials:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n refresh_token: str,\n token_uri: str,\n client_id: str,\n client_secret: str,\n ):\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.token_uri = check.str_param(token_uri, "token_uri")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "Dv360Source.Oauth2Credentials",\n partner_id: int,\n start_date: str,\n end_date: Optional[str] = None,\n filters: Optional[List[str]] = None,\n ):\n """Airbyte Source for Dv 360.\n\n Args:\n name (str): The name of the destination.\n credentials (Dv360Source.Oauth2Credentials): Oauth2 credentials\n partner_id (int): Partner ID\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated\n end_date (Optional[str]): UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.\n filters (Optional[List[str]]): filters for the dimensions. each filter object had 2 keys: 'type' for the name of the dimension to be used as. and 'value' for the value of the filter\n """\n self.credentials = check.inst_param(\n credentials, "credentials", Dv360Source.Oauth2Credentials\n )\n self.partner_id = check.int_param(partner_id, "partner_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.filters = check.opt_nullable_list_param(filters, "filters", str)\n super().__init__("Dv 360", name)
\n\n\n
[docs]class NotionSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, access_token: str):\n self.auth_type = "OAuth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, token: str):\n self.auth_type = "token"\n self.token = check.str_param(token, "token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["NotionSource.OAuth20", "NotionSource.AccessToken"],\n ):\n """Airbyte Source for Notion.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/notion\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00.000Z. Any data before this date will not be replicated.\n credentials (Union[NotionSource.OAuth20, NotionSource.AccessToken]): Pick an authentication method.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (NotionSource.OAuth20, NotionSource.AccessToken)\n )\n super().__init__("Notion", name)
\n\n\n
[docs]class ZendeskSunshineSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, access_token: str):\n self.auth_method = "oauth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str, email: str):\n self.auth_method = "api_token"\n self.api_token = check.str_param(api_token, "api_token")\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n start_date: str,\n credentials: Union["ZendeskSunshineSource.OAuth20", "ZendeskSunshineSource.APIToken"],\n ):\n """Airbyte Source for Zendesk Sunshine.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk_sunshine\n\n Args:\n name (str): The name of the destination.\n subdomain (str): The subdomain for your Zendesk Account.\n start_date (str): The date from which you'd like to replicate data for Zendesk Sunshine API, in the format YYYY-MM-DDT00:00:00Z.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (ZendeskSunshineSource.OAuth20, ZendeskSunshineSource.APIToken),\n )\n super().__init__("Zendesk Sunshine", name)
\n\n\n
[docs]class PinterestSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n refresh_token: str,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_method = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str):\n self.auth_method = "access_token"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["PinterestSource.OAuth20", "PinterestSource.AccessToken"],\n ):\n """Airbyte Source for Pinterest.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pinterest\n\n Args:\n name (str): The name of the destination.\n start_date (str): A date in the format YYYY-MM-DD. If you have not set a date, it would be defaulted to latest allowed date by api (914 days from today).\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (PinterestSource.OAuth20, PinterestSource.AccessToken)\n )\n super().__init__("Pinterest", name)
\n\n\n
[docs]class MetabaseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_api_url: str,\n username: Optional[str] = None,\n password: Optional[str] = None,\n session_token: Optional[str] = None,\n ):\n r"""Airbyte Source for Metabase.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/metabase\n\n Args:\n name (str): The name of the destination.\n instance_api_url (str): URL to your metabase instance API\n session_token (Optional[str]): To generate your session token, you need to run the following command: ``` curl -X POST \\\\ -H "Content-Type: application/json" \\\\ -d '{"username": "person@metabase.com", "password": "fakepassword"}' \\\\ http://localhost:3000/api/session ``` Then copy the value of the `id` field returned by a successful call to that API. Note that by default, sessions are good for 14 days and needs to be regenerated.\n """\n self.instance_api_url = check.str_param(instance_api_url, "instance_api_url")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.session_token = check.opt_str_param(session_token, "session_token")\n super().__init__("Metabase", name)
\n\n\n
[docs]class HubspotSource(GeneratedAirbyteSource):\n
[docs] class OAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.credentials_title = "OAuth Credentials"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, api_key: str):\n self.credentials_title = "API Key Credentials"\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] class PrivateAPP:\n
[docs] @public\n def __init__(self, access_token: str):\n self.credentials_title = "Private App Credentials"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union[\n "HubspotSource.OAuth", "HubspotSource.APIKey", "HubspotSource.PrivateAPP"\n ],\n ):\n """Airbyte Source for Hubspot.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/hubspot\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n credentials (Union[HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP]): Choose how to authenticate to HubSpot.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP),\n )\n super().__init__("Hubspot", name)
\n\n\n
[docs]class HarvestSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaHarvestOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateWithPersonalAccessToken:\n
[docs] @public\n def __init__(self, api_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n replication_start_date: str,\n credentials: Union[\n "HarvestSource.AuthenticateViaHarvestOAuth",\n "HarvestSource.AuthenticateWithPersonalAccessToken",\n ],\n ):\n """Airbyte Source for Harvest.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/harvest\n\n Args:\n name (str): The name of the destination.\n account_id (str): Harvest account ID. Required for all Harvest requests in pair with Personal Access Token\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n credentials (Union[HarvestSource.AuthenticateViaHarvestOAuth, HarvestSource.AuthenticateWithPersonalAccessToken]): Choose how to authenticate to Harvest.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n HarvestSource.AuthenticateViaHarvestOAuth,\n HarvestSource.AuthenticateWithPersonalAccessToken,\n ),\n )\n super().__init__("Harvest", name)
\n\n\n
[docs]class GithubSource(GeneratedAirbyteSource):\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(self, access_token: str):\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class PATCredentials:\n
[docs] @public\n def __init__(self, personal_access_token: str):\n self.personal_access_token = check.str_param(\n personal_access_token, "personal_access_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["GithubSource.OAuthCredentials", "GithubSource.PATCredentials"],\n start_date: str,\n repository: str,\n branch: Optional[str] = None,\n page_size_for_large_streams: Optional[int] = None,\n ):\n """Airbyte Source for Github.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/github\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GithubSource.OAuthCredentials, GithubSource.PATCredentials]): Choose how to authenticate to GitHub\n start_date (str): The date from which you'd like to replicate data from GitHub in the format YYYY-MM-DDT00:00:00Z. For the streams which support this configuration, only data generated on or after the start date will be replicated. This field doesn't apply to all streams, see the docs for more info\n repository (str): Space-delimited list of GitHub organizations/repositories, e.g. `airbytehq/airbyte` for single repository, `airbytehq/*` for get all repositories from organization and `airbytehq/airbyte airbytehq/another-repo` for multiple repositories.\n branch (Optional[str]): Space-delimited list of GitHub repository branches to pull commits for, e.g. `airbytehq/airbyte/master`. If no branches are specified for a repository, the default branch will be pulled.\n page_size_for_large_streams (Optional[int]): The Github connector contains several streams with a large amount of data. The page size of such streams depends on the size of your repository. 
We recommended that you specify values between 10 and 30.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (GithubSource.OAuthCredentials, GithubSource.PATCredentials)\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.repository = check.str_param(repository, "repository")\n self.branch = check.opt_str_param(branch, "branch")\n self.page_size_for_large_streams = check.opt_int_param(\n page_size_for_large_streams, "page_size_for_large_streams"\n )\n super().__init__("Github", name)
\n\n\n
[docs]class E2eTestSource(GeneratedAirbyteSource):\n
[docs] class SingleSchema:\n
[docs] @public\n def __init__(\n self, stream_name: str, stream_schema: str, stream_duplication: Optional[int] = None\n ):\n self.type = "SINGLE_STREAM"\n self.stream_name = check.str_param(stream_name, "stream_name")\n self.stream_schema = check.str_param(stream_schema, "stream_schema")\n self.stream_duplication = check.opt_int_param(stream_duplication, "stream_duplication")
\n\n
[docs] class MultiSchema:\n
[docs] @public\n def __init__(self, stream_schemas: str):\n self.type = "MULTI_STREAM"\n self.stream_schemas = check.str_param(stream_schemas, "stream_schemas")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n max_messages: int,\n mock_catalog: Union["E2eTestSource.SingleSchema", "E2eTestSource.MultiSchema"],\n type: Optional[str] = None,\n seed: Optional[int] = None,\n message_interval_ms: Optional[int] = None,\n ):\n """Airbyte Source for E2e Test.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/e2e-test\n\n Args:\n name (str): The name of the destination.\n max_messages (int): Number of records to emit per stream. Min 1. Max 100 billion.\n seed (Optional[int]): When the seed is unspecified, the current time millis will be used as the seed. Range: [0, 1000000].\n message_interval_ms (Optional[int]): Interval between messages in ms. Min 0 ms. Max 60000 ms (1 minute).\n """\n self.type = check.opt_str_param(type, "type")\n self.max_messages = check.int_param(max_messages, "max_messages")\n self.seed = check.opt_int_param(seed, "seed")\n self.message_interval_ms = check.opt_int_param(message_interval_ms, "message_interval_ms")\n self.mock_catalog = check.inst_param(\n mock_catalog, "mock_catalog", (E2eTestSource.SingleSchema, E2eTestSource.MultiSchema)\n )\n super().__init__("E2e Test", name)
\n\n\n
[docs]class MysqlSource(GeneratedAirbyteSource):\n
[docs] class Preferred:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "preferred"
\n\n
[docs] class Required:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "required"
\n\n
[docs] class VerifyCA:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify_ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyIdentity:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify_identity"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "STANDARD"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self,\n initial_waiting_seconds: Optional[int] = None,\n server_time_zone: Optional[str] = None,\n ):\n self.method = "CDC"\n self.initial_waiting_seconds = check.opt_int_param(\n initial_waiting_seconds, "initial_waiting_seconds"\n )\n self.server_time_zone = check.opt_str_param(server_time_zone, "server_time_zone")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_mode: Union[\n "MysqlSource.Preferred",\n "MysqlSource.Required",\n "MysqlSource.VerifyCA",\n "MysqlSource.VerifyIdentity",\n ],\n replication_method: Union["MysqlSource.Standard", "MysqlSource.LogicalReplicationCDC"],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Mysql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mysql\n\n Args:\n name (str): The name of the destination.\n host (str): The host name of the database.\n port (int): The port to connect to.\n database (str): The database name.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL.\n ssl_mode (Union[MysqlSource.Preferred, MysqlSource.Required, MysqlSource.VerifyCA, MysqlSource.VerifyIdentity]): SSL connection modes. preferred - Automatically attempt SSL connection. If the MySQL server does not support SSL, continue with a regular connection.required - Always connect with SSL. If the MySQL server doesn`t support SSL, the connection will not be established. Certificate Authority (CA) and Hostname are not verified.verify-ca - Always connect with SSL. Verifies CA, but allows connection even if Hostname does not match.Verify Identity - Always connect with SSL. 
Verify both CA and Hostname.Read more in the docs.\n replication_method (Union[MysqlSource.Standard, MysqlSource.LogicalReplicationCDC]): Replication method to use for extracting data from the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n MysqlSource.Preferred,\n MysqlSource.Required,\n MysqlSource.VerifyCA,\n MysqlSource.VerifyIdentity,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (MysqlSource.Standard, MysqlSource.LogicalReplicationCDC),\n )\n super().__init__("Mysql", name)
\n\n\n
[docs]class MyHoursSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n email: str,\n password: str,\n start_date: str,\n logs_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for My Hours.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/my-hours\n\n Args:\n name (str): The name of the destination.\n email (str): Your My Hours username\n password (str): The password associated to the username\n start_date (str): Start date for collecting time logs\n logs_batch_size (Optional[int]): Pagination size used for retrieving logs in days\n """\n self.email = check.str_param(email, "email")\n self.password = check.str_param(password, "password")\n self.start_date = check.str_param(start_date, "start_date")\n self.logs_batch_size = check.opt_int_param(logs_batch_size, "logs_batch_size")\n super().__init__("My Hours", name)
\n\n\n
[docs]class KyribaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n username: str,\n password: str,\n start_date: str,\n end_date: Optional[str] = None,\n ):\n """Airbyte Source for Kyriba.\n\n Args:\n name (str): The name of the destination.\n domain (str): Kyriba domain\n username (str): Username to be used in basic auth\n password (str): Password to be used in basic auth\n start_date (str): The date the sync should start from.\n end_date (Optional[str]): The date the sync should end. If let empty the sync will run to the current date.\n """\n self.domain = check.str_param(domain, "domain")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n super().__init__("Kyriba", name)
\n\n\n
[docs]class GoogleSearchConsoleSource(GeneratedAirbyteSource):\n
[docs] class OAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n access_token: Optional[str] = None,\n ):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, service_account_info: str, email: str):\n self.auth_type = "Service"\n self.service_account_info = check.str_param(\n service_account_info, "service_account_info"\n )\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n site_urls: List[str],\n start_date: str,\n authorization: Union[\n "GoogleSearchConsoleSource.OAuth",\n "GoogleSearchConsoleSource.ServiceAccountKeyAuthentication",\n ],\n end_date: Optional[str] = None,\n custom_reports: Optional[str] = None,\n ):\n """Airbyte Source for Google Search Console.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-search-console\n\n Args:\n name (str): The name of the destination.\n site_urls (List[str]): The URLs of the website property attached to your GSC account. Read more here.\n start_date (str): UTC date in the format 2017-01-25. Any data before this date will not be replicated.\n end_date (Optional[str]): UTC date in the format 2017-01-25. Any data after this date will not be replicated. Must be greater or equal to the start date field.\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Search Console. See the docs for more information about the exact format you can use to fill out this field.\n """\n self.site_urls = check.list_param(site_urls, "site_urls", str)\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.authorization = check.inst_param(\n authorization,\n "authorization",\n (\n GoogleSearchConsoleSource.OAuth,\n GoogleSearchConsoleSource.ServiceAccountKeyAuthentication,\n ),\n )\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n super().__init__("Google Search Console", name)
\n\n\n
[docs]class FacebookMarketingSource(GeneratedAirbyteSource):\n
[docs] class InsightConfig:\n
[docs] @public\n def __init__(\n self,\n name: str,\n fields: Optional[List[str]] = None,\n breakdowns: Optional[List[str]] = None,\n action_breakdowns: Optional[List[str]] = None,\n time_increment: Optional[int] = None,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n insights_lookback_window: Optional[int] = None,\n ):\n self.name = check.str_param(name, "name")\n self.fields = check.opt_nullable_list_param(fields, "fields", str)\n self.breakdowns = check.opt_nullable_list_param(breakdowns, "breakdowns", str)\n self.action_breakdowns = check.opt_nullable_list_param(\n action_breakdowns, "action_breakdowns", str\n )\n self.time_increment = check.opt_int_param(time_increment, "time_increment")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.insights_lookback_window = check.opt_int_param(\n insights_lookback_window, "insights_lookback_window"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n start_date: str,\n access_token: str,\n end_date: Optional[str] = None,\n include_deleted: Optional[bool] = None,\n fetch_thumbnail_images: Optional[bool] = None,\n custom_insights: Optional[List[InsightConfig]] = None,\n page_size: Optional[int] = None,\n insights_lookback_window: Optional[int] = None,\n max_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for Facebook Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-marketing\n\n Args:\n name (str): The name of the destination.\n account_id (str): The Facebook Ad account ID to use when pulling data from the Facebook Marketing API.\n start_date (str): The date from which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n end_date (Optional[str]): The date until which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the latest data.\n access_token (str): The value of the access token generated. See the docs for more information\n include_deleted (Optional[bool]): Include data from deleted Campaigns, Ads, and AdSets\n fetch_thumbnail_images (Optional[bool]): In each Ad Creative, fetch the thumbnail_url and store the result in thumbnail_data_url\n custom_insights (Optional[List[FacebookMarketingSource.InsightConfig]]): A list which contains insights entries, each entry must have a name and can contains fields, breakdowns or action_breakdowns)\n page_size (Optional[int]): Page size used when sending requests to Facebook API to specify number of records per page when response has pagination. 
Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.\n insights_lookback_window (Optional[int]): The attribution window\n max_batch_size (Optional[int]): Maximum batch size used when sending batch requests to Facebook API. Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.access_token = check.str_param(access_token, "access_token")\n self.include_deleted = check.opt_bool_param(include_deleted, "include_deleted")\n self.fetch_thumbnail_images = check.opt_bool_param(\n fetch_thumbnail_images, "fetch_thumbnail_images"\n )\n self.custom_insights = check.opt_nullable_list_param(\n custom_insights, "custom_insights", FacebookMarketingSource.InsightConfig\n )\n self.page_size = check.opt_int_param(page_size, "page_size")\n self.insights_lookback_window = check.opt_int_param(\n insights_lookback_window, "insights_lookback_window"\n )\n self.max_batch_size = check.opt_int_param(max_batch_size, "max_batch_size")\n super().__init__("Facebook Marketing", name)
\n\n\n
[docs]class SurveymonkeySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, access_token: str, start_date: str, survey_ids: Optional[List[str]] = None\n ):\n """Airbyte Source for Surveymonkey.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/surveymonkey\n\n Args:\n name (str): The name of the destination.\n access_token (str): Access Token for making authenticated requests. See the docs for information on how to generate this key.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n survey_ids (Optional[List[str]]): IDs of the surveys from which you'd like to replicate data. If left empty, data from all boards to which you have access will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.survey_ids = check.opt_nullable_list_param(survey_ids, "survey_ids", str)\n super().__init__("Surveymonkey", name)
\n\n\n
[docs]class PardotSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n pardot_business_unit_id: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n start_date: Optional[str] = None,\n is_sandbox: Optional[bool] = None,\n ):\n """Airbyte Source for Pardot.\n\n Args:\n name (str): The name of the destination.\n pardot_business_unit_id (str): Pardot Business ID, can be found at Setup > Pardot > Pardot Account Setup\n client_id (str): The Consumer Key that can be found when viewing your app in Salesforce\n client_secret (str): The Consumer Secret that can be found when viewing your app in Salesforce\n refresh_token (str): Salesforce Refresh Token used for Airbyte to access your Salesforce account. If you don't know what this is, follow this guide to retrieve it.\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Leave blank to skip this filter\n is_sandbox (Optional[bool]): Whether or not the the app is in a Salesforce sandbox. If you do not know what this, assume it is false.\n """\n self.pardot_business_unit_id = check.str_param(\n pardot_business_unit_id, "pardot_business_unit_id"\n )\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.is_sandbox = check.opt_bool_param(is_sandbox, "is_sandbox")\n super().__init__("Pardot", name)
\n\n\n
[docs]class FlexportSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Flexport.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/flexport\n\n Args:\n name (str): The name of the destination.\n\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Flexport", name)
\n\n\n
[docs]class ZenefitsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, token: str):\n """Airbyte Source for Zenefits.\n\n Args:\n name (str): The name of the destination.\n token (str): Use Sync with Zenefits button on the link given on the readme file, and get the token to access the api\n """\n self.token = check.str_param(token, "token")\n super().__init__("Zenefits", name)
\n\n\n
[docs]class KafkaSource(GeneratedAirbyteSource):\n
[docs] class JSON:\n
[docs] @public\n def __init__(self, deserialization_type: Optional[str] = None):\n self.deserialization_type = check.opt_str_param(\n deserialization_type, "deserialization_type"\n )
\n\n
[docs] class AVRO:\n
[docs] @public\n def __init__(\n self,\n deserialization_type: Optional[str] = None,\n deserialization_strategy: Optional[str] = None,\n schema_registry_url: Optional[str] = None,\n schema_registry_username: Optional[str] = None,\n schema_registry_password: Optional[str] = None,\n ):\n self.deserialization_type = check.opt_str_param(\n deserialization_type, "deserialization_type"\n )\n self.deserialization_strategy = check.opt_str_param(\n deserialization_strategy, "deserialization_strategy"\n )\n self.schema_registry_url = check.opt_str_param(\n schema_registry_url, "schema_registry_url"\n )\n self.schema_registry_username = check.opt_str_param(\n schema_registry_username, "schema_registry_username"\n )\n self.schema_registry_password = check.opt_str_param(\n schema_registry_password, "schema_registry_password"\n )
\n\n
[docs] class ManuallyAssignAListOfPartitions:\n
[docs] @public\n def __init__(self, topic_partitions: str):\n self.subscription_type = "assign"\n self.topic_partitions = check.str_param(topic_partitions, "topic_partitions")
\n\n
[docs] class SubscribeToAllTopicsMatchingSpecifiedPattern:\n
[docs] @public\n def __init__(self, topic_pattern: str):\n self.subscription_type = "subscribe"\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")
\n\n
[docs] class PLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")
\n\n
[docs] class SASLPLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] class SASLSSL:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n MessageFormat: Union["KafkaSource.JSON", "KafkaSource.AVRO"],\n bootstrap_servers: str,\n subscription: Union[\n "KafkaSource.ManuallyAssignAListOfPartitions",\n "KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern",\n ],\n protocol: Union[\n "KafkaSource.PLAINTEXT", "KafkaSource.SASLPLAINTEXT", "KafkaSource.SASLSSL"\n ],\n test_topic: Optional[str] = None,\n group_id: Optional[str] = None,\n max_poll_records: Optional[int] = None,\n polling_time: Optional[int] = None,\n client_id: Optional[str] = None,\n enable_auto_commit: Optional[bool] = None,\n auto_commit_interval_ms: Optional[int] = None,\n client_dns_lookup: Optional[str] = None,\n retry_backoff_ms: Optional[int] = None,\n request_timeout_ms: Optional[int] = None,\n receive_buffer_bytes: Optional[int] = None,\n auto_offset_reset: Optional[str] = None,\n repeated_calls: Optional[int] = None,\n max_records_process: Optional[int] = None,\n ):\n """Airbyte Source for Kafka.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/kafka\n\n Args:\n name (str): The name of the destination.\n MessageFormat (Union[KafkaSource.JSON, KafkaSource.AVRO]): The serialization used based on this\n bootstrap_servers (str): A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping&mdash;this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,.... 
Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).\n subscription (Union[KafkaSource.ManuallyAssignAListOfPartitions, KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern]): You can choose to manually assign a list of partitions, or subscribe to all topics matching specified pattern to get dynamically assigned partitions.\n test_topic (Optional[str]): The Topic to test in case the Airbyte can consume messages.\n group_id (Optional[str]): The Group ID is how you distinguish different consumer groups.\n max_poll_records (Optional[int]): The maximum number of records returned in a single call to poll(). Note, that max_poll_records does not impact the underlying fetching behavior. The consumer will cache the records from each fetch request and returns them incrementally from each poll.\n polling_time (Optional[int]): Amount of time Kafka connector should try to poll for messages.\n protocol (Union[KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL]): The Protocol used to communicate with brokers.\n client_id (Optional[str]): An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.\n enable_auto_commit (Optional[bool]): If true, the consumer's offset will be periodically committed in the background.\n auto_commit_interval_ms (Optional[int]): The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if enable.auto.commit is set to true.\n client_dns_lookup (Optional[str]): Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. 
After a disconnection, the next IP is used. Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.\n retry_backoff_ms (Optional[int]): The amount of time to wait before attempting to retry a failed request to a given topic partition. This avoids repeatedly sending requests in a tight loop under some failure scenarios.\n request_timeout_ms (Optional[int]): The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.\n receive_buffer_bytes (Optional[int]): The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. 
If the value is -1, the OS default will be used.\n auto_offset_reset (Optional[str]): What to do when there is no initial offset in Kafka or if the current offset does not exist any more on the server - earliest: automatically reset the offset to the earliest offset, latest: automatically reset the offset to the latest offset, none: throw exception to the consumer if no previous offset is found for the consumer's group, anything else: throw exception to the consumer.\n repeated_calls (Optional[int]): The number of repeated calls to poll() if no messages were received.\n max_records_process (Optional[int]): The Maximum to be processed per execution\n """\n self.MessageFormat = check.inst_param(\n MessageFormat, "MessageFormat", (KafkaSource.JSON, KafkaSource.AVRO)\n )\n self.bootstrap_servers = check.str_param(bootstrap_servers, "bootstrap_servers")\n self.subscription = check.inst_param(\n subscription,\n "subscription",\n (\n KafkaSource.ManuallyAssignAListOfPartitions,\n KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern,\n ),\n )\n self.test_topic = check.opt_str_param(test_topic, "test_topic")\n self.group_id = check.opt_str_param(group_id, "group_id")\n self.max_poll_records = check.opt_int_param(max_poll_records, "max_poll_records")\n self.polling_time = check.opt_int_param(polling_time, "polling_time")\n self.protocol = check.inst_param(\n protocol,\n "protocol",\n (KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL),\n )\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.enable_auto_commit = check.opt_bool_param(enable_auto_commit, "enable_auto_commit")\n self.auto_commit_interval_ms = check.opt_int_param(\n auto_commit_interval_ms, "auto_commit_interval_ms"\n )\n self.client_dns_lookup = check.opt_str_param(client_dns_lookup, "client_dns_lookup")\n self.retry_backoff_ms = check.opt_int_param(retry_backoff_ms, "retry_backoff_ms")\n self.request_timeout_ms = check.opt_int_param(request_timeout_ms, 
"request_timeout_ms")\n self.receive_buffer_bytes = check.opt_int_param(\n receive_buffer_bytes, "receive_buffer_bytes"\n )\n self.auto_offset_reset = check.opt_str_param(auto_offset_reset, "auto_offset_reset")\n self.repeated_calls = check.opt_int_param(repeated_calls, "repeated_calls")\n self.max_records_process = check.opt_int_param(max_records_process, "max_records_process")\n super().__init__("Kafka", name)
\n
", "current_page_name": "_modules/dagster_airbyte/managed/generated/sources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.generated.sources"}}, "reconciliation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.reconciliation

\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster import AssetKey\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\nfrom dagster._core.definitions.events import CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.execution.context.init import build_init_resource_context\nfrom dagster._utils.merger import deep_merge_dicts\nfrom dagster_managed_elements import (\n    ManagedElementCheckResult,\n    ManagedElementDiff,\n    ManagedElementError,\n)\nfrom dagster_managed_elements.types import (\n    SECRET_MASK_VALUE,\n    ManagedElementReconciler,\n    is_key_secret,\n)\nfrom dagster_managed_elements.utils import UNSET, diff_dicts\n\nfrom dagster_airbyte.asset_defs import (\n    AirbyteConnectionMetadata,\n    AirbyteInstanceCacheableAssetsDefinition,\n    _clean_name,\n)\nfrom dagster_airbyte.managed.types import (\n    AirbyteConnection,\n    AirbyteDestination,\n    AirbyteDestinationNamespace,\n    AirbyteSource,\n    AirbyteSyncMode,\n    InitializedAirbyteConnection,\n    InitializedAirbyteDestination,\n    InitializedAirbyteSource,\n)\nfrom dagster_airbyte.resources import AirbyteResource\nfrom dagster_airbyte.utils import is_basic_normalization_operation\n\n\ndef gen_configured_stream_json(\n    source_stream: Mapping[str, Any], user_stream_config: Mapping[str, AirbyteSyncMode]\n) -> Mapping[str, Any]:\n    """Generates an Airbyte API stream defintiion based on the succinct user-provided config and the\n    full stream definition from the source.\n    """\n    config = user_stream_config[source_stream["stream"]["name"]]\n    return deep_merge_dicts(\n        
source_stream,\n        {"config": config.to_json()},\n    )\n\n\ndef _ignore_secrets_compare_fn(k: str, _cv: Any, dv: Any) -> Optional[bool]:\n    if is_key_secret(k):\n        return dv == SECRET_MASK_VALUE\n    return None\n\n\ndef _diff_configs(\n    config_dict: Mapping[str, Any], dst_dict: Mapping[str, Any], ignore_secrets: bool = True\n) -> ManagedElementDiff:\n    return diff_dicts(\n        config_dict=config_dict,\n        dst_dict=dst_dict,\n        custom_compare_fn=_ignore_secrets_compare_fn if ignore_secrets else None,\n    )\n\n\ndef diff_sources(\n    config_src: Optional[AirbyteSource],\n    curr_src: Optional[AirbyteSource],\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteSource objects."""\n    diff = _diff_configs(\n        config_src.source_configuration if config_src else {},\n        curr_src.source_configuration if curr_src else {},\n        ignore_secrets,\n    )\n    if not diff.is_empty():\n        name = config_src.name if config_src else curr_src.name if curr_src else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef diff_destinations(\n    config_dst: Optional[AirbyteDestination],\n    curr_dst: Optional[AirbyteDestination],\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteDestination objects."""\n    diff = _diff_configs(\n        config_dst.destination_configuration if config_dst else {},\n        curr_dst.destination_configuration if curr_dst else {},\n        ignore_secrets,\n    )\n    if not diff.is_empty():\n        name = config_dst.name if config_dst else curr_dst.name if curr_dst else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef conn_dict(conn: Optional[AirbyteConnection]) -> Mapping[str, Any]:\n    if not conn:\n        return {}\n    return {\n        "source": conn.source.name if 
conn.source else "Unknown",\n        "destination": conn.destination.name if conn.destination else "Unknown",\n        "normalize data": conn.normalize_data,\n        "streams": {k: v.to_json() for k, v in conn.stream_config.items()},\n        "destination namespace": (\n            conn.destination_namespace.name\n            if isinstance(conn.destination_namespace, AirbyteDestinationNamespace)\n            else conn.destination_namespace\n        ),\n        "prefix": conn.prefix,\n    }\n\n\nOPTIONAL_STREAM_SETTINGS = ("cursorField", "primaryKey")\n\n\ndef _compare_stream_values(k: str, cv: str, _dv: str):\n    """Don't register a diff for optional stream settings if the value is not set\n    in the user-provided config, this means it will default to the value in the\n    source.\n    """\n    return True if k in OPTIONAL_STREAM_SETTINGS and cv == UNSET else None\n\n\ndef diff_connections(\n    config_conn: Optional[AirbyteConnection], curr_conn: Optional[AirbyteConnection]\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteConnection objects."""\n    diff = diff_dicts(\n        conn_dict(config_conn),\n        conn_dict(curr_conn),\n        custom_compare_fn=_compare_stream_values,\n    )\n    if not diff.is_empty():\n        name = config_conn.name if config_conn else curr_conn.name if curr_conn else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef reconcile_sources(\n    res: AirbyteResource,\n    config_sources: Mapping[str, AirbyteSource],\n    existing_sources: Mapping[str, InitializedAirbyteSource],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n    ignore_secrets: bool,\n) -> Tuple[Mapping[str, InitializedAirbyteSource], ManagedElementCheckResult]:\n    """Generates a diff of the configured and existing sources and reconciles them to match the\n    configured state if dry_run is False.\n    """\n    diff = ManagedElementDiff()\n\n    
initialized_sources: Dict[str, InitializedAirbyteSource] = {}\n    for source_name in set(config_sources.keys()).union(existing_sources.keys()):\n        configured_source = config_sources.get(source_name)\n        existing_source = existing_sources.get(source_name)\n\n        # Ignore sources not mentioned in the user config unless the user specifies to delete\n        if not should_delete and existing_source and not configured_source:\n            initialized_sources[source_name] = existing_source\n            continue\n\n        diff = diff.join(\n            diff_sources(  # type: ignore\n                configured_source,\n                existing_source.source if existing_source else None,\n                ignore_secrets,\n            )\n        )\n\n        if existing_source and (\n            not configured_source or (configured_source.must_be_recreated(existing_source.source))\n        ):\n            initialized_sources[source_name] = existing_source\n            if not dry_run:\n                res.make_request(\n                    endpoint="/sources/delete",\n                    data={"sourceId": existing_source.source_id},\n                )\n            existing_source = None\n\n        if configured_source:\n            defn_id = check.not_none(\n                res.get_source_definition_by_name(configured_source.source_type)\n            )\n            base_source_defn_dict = {\n                "name": configured_source.name,\n                "connectionConfiguration": configured_source.source_configuration,\n            }\n            source_id = ""\n            if existing_source:\n                source_id = existing_source.source_id\n                if not dry_run:\n                    res.make_request(\n                        endpoint="/sources/update",\n                        data={"sourceId": source_id, **base_source_defn_dict},\n                    )\n            else:\n                if not dry_run:\n                    create_result = 
cast(\n                        Dict[str, str],\n                        check.not_none(\n                            res.make_request(\n                                endpoint="/sources/create",\n                                data={\n                                    "sourceDefinitionId": defn_id,\n                                    "workspaceId": workspace_id,\n                                    **base_source_defn_dict,\n                                },\n                            )\n                        ),\n                    )\n                    source_id = create_result["sourceId"]\n\n            if source_name in initialized_sources:\n                # Preserve to be able to initialize old connection object\n                initialized_sources[f"{source_name}_old"] = initialized_sources[source_name]\n            initialized_sources[source_name] = InitializedAirbyteSource(\n                source=configured_source,\n                source_id=source_id,\n                source_definition_id=defn_id,\n            )\n    return initialized_sources, diff\n\n\ndef reconcile_destinations(\n    res: AirbyteResource,\n    config_destinations: Mapping[str, AirbyteDestination],\n    existing_destinations: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n    ignore_secrets: bool,\n) -> Tuple[Mapping[str, InitializedAirbyteDestination], ManagedElementCheckResult]:\n    """Generates a diff of the configured and existing destinations and reconciles them to match the\n    configured state if dry_run is False.\n    """\n    diff = ManagedElementDiff()\n\n    initialized_destinations: Dict[str, InitializedAirbyteDestination] = {}\n    for destination_name in set(config_destinations.keys()).union(existing_destinations.keys()):\n        configured_destination = config_destinations.get(destination_name)\n        existing_destination = existing_destinations.get(destination_name)\n\n        # Ignore 
destinations not mentioned in the user config unless the user specifies to delete\n        if not should_delete and existing_destination and not configured_destination:\n            initialized_destinations[destination_name] = existing_destination\n            continue\n\n        diff = diff.join(\n            diff_destinations(  # type: ignore\n                configured_destination,\n                existing_destination.destination if existing_destination else None,\n                ignore_secrets,\n            )\n        )\n\n        if existing_destination and (\n            not configured_destination\n            or (configured_destination.must_be_recreated(existing_destination.destination))\n        ):\n            initialized_destinations[destination_name] = existing_destination\n            if not dry_run:\n                res.make_request(\n                    endpoint="/destinations/delete",\n                    data={"destinationId": existing_destination.destination_id},\n                )\n            existing_destination = None\n\n        if configured_destination:\n            defn_id = res.get_destination_definition_by_name(\n                configured_destination.destination_type\n            )\n            base_destination_defn_dict = {\n                "name": configured_destination.name,\n                "connectionConfiguration": configured_destination.destination_configuration,\n            }\n            destination_id = ""\n            if existing_destination:\n                destination_id = existing_destination.destination_id\n                if not dry_run:\n                    res.make_request(\n                        endpoint="/destinations/update",\n                        data={"destinationId": destination_id, **base_destination_defn_dict},\n                    )\n            else:\n                if not dry_run:\n                    create_result = cast(\n                        Dict[str, str],\n                        
check.not_none(\n                            res.make_request(\n                                endpoint="/destinations/create",\n                                data={\n                                    "destinationDefinitionId": defn_id,\n                                    "workspaceId": workspace_id,\n                                    **base_destination_defn_dict,\n                                },\n                            )\n                        ),\n                    )\n                    destination_id = create_result["destinationId"]\n\n            if destination_name in initialized_destinations:\n                # Preserve to be able to initialize old connection object\n                initialized_destinations[f"{destination_name}_old"] = initialized_destinations[\n                    destination_name\n                ]\n            initialized_destinations[destination_name] = InitializedAirbyteDestination(\n                destination=configured_destination,\n                destination_id=destination_id,\n                destination_definition_id=defn_id,\n            )\n    return initialized_destinations, diff\n\n\ndef reconcile_config(\n    res: AirbyteResource,\n    objects: Sequence[AirbyteConnection],\n    dry_run: bool = False,\n    should_delete: bool = False,\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Main entry point for the reconciliation process. 
Takes a list of AirbyteConnection objects\n    and a pointer to an Airbyte instance and returns a diff, along with applying the diff\n    if dry_run is False.\n    """\n    with res.cache_requests():\n        config_connections = {conn.name: conn for conn in objects}\n        config_sources = {conn.source.name: conn.source for conn in objects}\n        config_dests = {conn.destination.name: conn.destination for conn in objects}\n\n        workspace_id = res.get_default_workspace()\n\n        existing_sources_raw = cast(\n            Dict[str, List[Dict[str, Any]]],\n            check.not_none(\n                res.make_request(endpoint="/sources/list", data={"workspaceId": workspace_id})\n            ),\n        )\n        existing_dests_raw = cast(\n            Dict[str, List[Dict[str, Any]]],\n            check.not_none(\n                res.make_request(endpoint="/destinations/list", data={"workspaceId": workspace_id})\n            ),\n        )\n\n        existing_sources: Dict[str, InitializedAirbyteSource] = {\n            source_json["name"]: InitializedAirbyteSource.from_api_json(source_json)\n            for source_json in existing_sources_raw.get("sources", [])\n        }\n        existing_dests: Dict[str, InitializedAirbyteDestination] = {\n            destination_json["name"]: InitializedAirbyteDestination.from_api_json(destination_json)\n            for destination_json in existing_dests_raw.get("destinations", [])\n        }\n\n        # First, remove any connections that need to be deleted, so that we can\n        # safely delete any sources/destinations that are no longer referenced\n        # or that need to be recreated.\n        connections_diff = reconcile_connections_pre(\n            res,\n            config_connections,\n            existing_sources,\n            existing_dests,\n            workspace_id,\n            dry_run,\n            should_delete,\n        )\n\n        all_sources, sources_diff = reconcile_sources(\n            res,\n  
          config_sources,\n            existing_sources,\n            workspace_id,\n            dry_run,\n            should_delete,\n            ignore_secrets,\n        )\n        all_dests, dests_diff = reconcile_destinations(\n            res, config_dests, existing_dests, workspace_id, dry_run, should_delete, ignore_secrets\n        )\n\n        # Now that we have updated the set of sources and destinations, we can\n        # recreate or update any connections which depend on them.\n        reconcile_connections_post(\n            res,\n            config_connections,\n            all_sources,\n            all_dests,\n            workspace_id,\n            dry_run,\n        )\n\n        return ManagedElementDiff().join(sources_diff).join(dests_diff).join(connections_diff)  # type: ignore\n\n\ndef reconcile_normalization(\n    res: AirbyteResource,\n    existing_connection_id: Optional[str],\n    destination: InitializedAirbyteDestination,\n    normalization_config: Optional[bool],\n    workspace_id: str,\n) -> Optional[str]:\n    """Reconciles the normalization configuration for a connection.\n\n    If normalization_config is None, then defaults to True on destinations that support normalization\n    and False on destinations that do not.\n    """\n    existing_basic_norm_op_id = None\n    if existing_connection_id:\n        operations = cast(\n            Dict[str, List[Dict[str, str]]],\n            check.not_none(\n                res.make_request(\n                    endpoint="/operations/list",\n                    data={"connectionId": existing_connection_id},\n                )\n            ),\n        )\n        existing_basic_norm_op = next(\n            (\n                operation\n                for operation in operations["operations"]\n                if is_basic_normalization_operation(operation)\n            ),\n            None,\n        )\n        existing_basic_norm_op_id = (\n            existing_basic_norm_op["operationId"] if 
existing_basic_norm_op else None\n        )\n\n    if normalization_config is not False:\n        if destination.destination_definition_id and res.does_dest_support_normalization(\n            destination.destination_definition_id, workspace_id\n        ):\n            if existing_basic_norm_op_id:\n                return existing_basic_norm_op_id\n            else:\n                return cast(\n                    Dict[str, str],\n                    check.not_none(\n                        res.make_request(\n                            endpoint="/operations/create",\n                            data={\n                                "workspaceId": workspace_id,\n                                "name": "Normalization",\n                                "operatorConfiguration": {\n                                    "operatorType": "normalization",\n                                    "normalization": {"option": "basic"},\n                                },\n                            },\n                        )\n                    ),\n                )["operationId"]\n        elif normalization_config is True:\n            raise Exception(\n                f"Destination {destination.destination.name} does not support normalization."\n            )\n\n    return None\n\n\ndef reconcile_connections_pre(\n    res: AirbyteResource,\n    config_connections: Mapping[str, AirbyteConnection],\n    existing_sources: Mapping[str, InitializedAirbyteSource],\n    existing_destinations: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n) -> ManagedElementCheckResult:\n    """Generates the diff for connections, and deletes any connections that are not in the config if\n    dry_run is False.\n\n    It's necessary to do this in two steps because we need to remove connections that depend on\n    sources and destinations that are being deleted or recreated before Airbyte will allow us to\n    delete or recreate 
them.\n    """\n    diff = ManagedElementDiff()\n\n    existing_connections_raw = cast(\n        Dict[str, List[Dict[str, Any]]],\n        check.not_none(\n            res.make_request(endpoint="/connections/list", data={"workspaceId": workspace_id})\n        ),\n    )\n    existing_connections: Dict[str, InitializedAirbyteConnection] = {\n        connection_json["name"]: InitializedAirbyteConnection.from_api_json(\n            connection_json, existing_sources, existing_destinations\n        )\n        for connection_json in existing_connections_raw.get("connections", [])\n    }\n\n    for conn_name in set(config_connections.keys()).union(existing_connections.keys()):\n        config_conn = config_connections.get(conn_name)\n        existing_conn = existing_connections.get(conn_name)\n\n        # Ignore connections not mentioned in the user config unless the user specifies to delete\n        if not should_delete and not config_conn:\n            continue\n\n        diff = diff.join(\n            diff_connections(config_conn, existing_conn.connection if existing_conn else None)  # type: ignore\n        )\n\n        if existing_conn and (\n            not config_conn or config_conn.must_be_recreated(existing_conn.connection)\n        ):\n            if not dry_run:\n                res.make_request(\n                    endpoint="/connections/delete",\n                    data={"connectionId": existing_conn.connection_id},\n                )\n    return diff\n\n\ndef reconcile_connections_post(\n    res: AirbyteResource,\n    config_connections: Mapping[str, AirbyteConnection],\n    init_sources: Mapping[str, InitializedAirbyteSource],\n    init_dests: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n) -> None:\n    """Creates new and modifies existing connections based on the config if dry_run is False."""\n    existing_connections_raw = cast(\n        Dict[str, List[Dict[str, Any]]],\n        check.not_none(\n            
res.make_request(endpoint="/connections/list", data={"workspaceId": workspace_id})\n        ),\n    )\n    existing_connections = {\n        connection_json["name"]: InitializedAirbyteConnection.from_api_json(\n            connection_json, init_sources, init_dests\n        )\n        for connection_json in existing_connections_raw.get("connections", [])\n    }\n\n    for conn_name, config_conn in config_connections.items():\n        existing_conn = existing_connections.get(conn_name)\n\n        normalization_operation_id = None\n        if not dry_run:\n            destination = init_dests[config_conn.destination.name]\n\n            # Enable or disable basic normalization based on config\n            normalization_operation_id = reconcile_normalization(\n                res,\n                existing_connections.get("name", {}).get("connectionId"),\n                destination,\n                config_conn.normalize_data,\n                workspace_id,\n            )\n\n        configured_streams = []\n        if not dry_run:\n            source = init_sources[config_conn.source.name]\n            schema = res.get_source_schema(source.source_id)\n            base_streams = schema["catalog"]["streams"]\n\n            configured_streams = [\n                gen_configured_stream_json(stream, config_conn.stream_config)\n                for stream in base_streams\n                if stream["stream"]["name"] in config_conn.stream_config\n            ]\n\n        connection_base_json = {\n            "name": conn_name,\n            "namespaceDefinition": "source",\n            "namespaceFormat": "${SOURCE_NAMESPACE}",\n            "prefix": "",\n            "operationIds": [normalization_operation_id] if normalization_operation_id else [],\n            "syncCatalog": {"streams": configured_streams},\n            "scheduleType": "manual",\n            "status": "active",\n        }\n\n        if isinstance(config_conn.destination_namespace, 
AirbyteDestinationNamespace):\n            connection_base_json["namespaceDefinition"] = config_conn.destination_namespace.value\n        else:\n            connection_base_json["namespaceDefinition"] = "customformat"\n            connection_base_json["namespaceFormat"] = cast(str, config_conn.destination_namespace)\n\n        if config_conn.prefix:\n            connection_base_json["prefix"] = config_conn.prefix\n\n        if existing_conn:\n            if not dry_run:\n                source = init_sources[config_conn.source.name]\n                res.make_request(\n                    endpoint="/connections/update",\n                    data={\n                        **connection_base_json,\n                        "sourceCatalogId": res.get_source_catalog_id(source.source_id),\n                        "connectionId": existing_conn.connection_id,\n                    },\n                )\n        else:\n            if not dry_run:\n                source = init_sources[config_conn.source.name]\n                destination = init_dests[config_conn.destination.name]\n\n                res.make_request(\n                    endpoint="/connections/create",\n                    data={\n                        **connection_base_json,\n                        "sourceCatalogId": res.get_source_catalog_id(source.source_id),\n                        "sourceId": source.source_id,\n                        "destinationId": destination.destination_id,\n                    },\n                )\n\n\n
[docs]@experimental\nclass AirbyteManagedElementReconciler(ManagedElementReconciler):\n """Reconciles Python-specified Airbyte connections with an Airbyte instance.\n\n Passing the module containing an AirbyteManagedElementReconciler to the dagster-airbyte\n CLI will allow you to check the state of your Python-code-specified Airbyte connections\n against an Airbyte instance, and reconcile them if necessary.\n\n This functionality is experimental and subject to change.\n """\n\n
[docs] @public\n def __init__(\n self,\n airbyte: Union[AirbyteResource, ResourceDefinition],\n connections: Iterable[AirbyteConnection],\n delete_unmentioned_resources: bool = False,\n ):\n """Reconciles Python-specified Airbyte connections with an Airbyte instance.\n\n Args:\n airbyte (Union[AirbyteResource, ResourceDefinition]): The Airbyte resource definition to reconcile against.\n connections (Iterable[AirbyteConnection]): The Airbyte connection objects to reconcile.\n delete_unmentioned_resources (bool): Whether to delete resources that are not mentioned in\n the set of connections provided. When True, all Airbyte instance contents are effectively\n managed by the reconciler. Defaults to False.\n """\n # airbyte = check.inst_param(airbyte, "airbyte", ResourceDefinition)\n\n self._airbyte_instance: AirbyteResource = (\n airbyte\n if isinstance(airbyte, AirbyteResource)\n else airbyte(build_init_resource_context())\n )\n self._connections = list(\n check.iterable_param(connections, "connections", of_type=AirbyteConnection)\n )\n self._delete_unmentioned_resources = check.bool_param(\n delete_unmentioned_resources, "delete_unmentioned_resources"\n )\n\n super().__init__()
\n\n def check(self, **kwargs) -> ManagedElementCheckResult:\n return reconcile_config(\n self._airbyte_instance,\n self._connections,\n dry_run=True,\n should_delete=self._delete_unmentioned_resources,\n ignore_secrets=(not kwargs.get("include_all_secrets", False)),\n )\n\n def apply(self, **kwargs) -> ManagedElementCheckResult:\n return reconcile_config(\n self._airbyte_instance,\n self._connections,\n dry_run=False,\n should_delete=self._delete_unmentioned_resources,\n ignore_secrets=(not kwargs.get("include_all_secrets", False)),\n )
\n\n\nclass AirbyteManagedElementCacheableAssetsDefinition(AirbyteInstanceCacheableAssetsDefinition):\n def __init__(\n self,\n airbyte_resource_def: AirbyteResource,\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connections: Iterable[AirbyteConnection],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n ):\n defined_conn_names = {conn.name for conn in connections}\n super().__init__(\n airbyte_resource_def=airbyte_resource_def,\n workspace_id=None,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=lambda conn: conn.name in defined_conn_names,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n )\n self._connections: List[AirbyteConnection] = list(connections)\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n diff = reconcile_config(self._airbyte_instance, self._connections, dry_run=True)\n if isinstance(diff, ManagedElementDiff) and not diff.is_empty():\n raise ValueError(\n "Airbyte connections are not in sync with provided configuration, diff:\\n{}".format(\n str(diff)\n )\n )\n elif isinstance(diff, ManagedElementError):\n raise ValueError(f"Error checking Airbyte connections: {diff}")\n\n return super()._get_connections()\n\n\n
[docs]@experimental\ndef load_assets_from_connections(\n airbyte: Union[AirbyteResource, ResourceDefinition],\n connections: Iterable[AirbyteConnection],\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads Airbyte connection assets from a configured AirbyteResource instance, checking against a list of AirbyteConnection objects.\n This method will raise an error on repo load if the passed AirbyteConnection objects are not in sync with the Airbyte instance.\n\n Args:\n airbyte (Union[AirbyteResource, ResourceDefinition]): An AirbyteResource configured with the appropriate connection\n details.\n connections (Iterable[AirbyteConnection]): A list of AirbyteConnection objects to build assets for.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The IO manager key to use for all assets. 
Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n IO manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]): Optional function which\n takes in connection metadata and returns a freshness policy for the connection. If None, no freshness policy will be applied.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster_airbyte import (\n AirbyteConnection,\n AirbyteResource,\n load_assets_from_connections,\n )\n\n airbyte_instance = AirbyteResource(\n host: "localhost",\n port: "8000",\n )\n airbyte_connections = [\n AirbyteConnection(...),\n AirbyteConnection(...)\n ]\n airbyte_assets = load_assets_from_connections(airbyte_instance, airbyte_connections)\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteManagedElementCacheableAssetsDefinition(\n airbyte_resource_def=(\n airbyte\n if isinstance(airbyte, AirbyteResource)\n else airbyte(build_init_resource_context())\n ),\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=check.bool_param(\n create_assets_for_normalization_tables, "create_assets_for_normalization_tables"\n ),\n connection_to_group_fn=check.opt_callable_param(\n connection_to_group_fn, "connection_to_group_fn"\n ),\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connections=check.iterable_param(connections, "connections", of_type=AirbyteConnection),\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n )
\n
", "current_page_name": "_modules/dagster_airbyte/managed/reconciliation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.reconciliation"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.types

\nimport json\nfrom abc import ABC\nfrom enum import Enum\nfrom typing import Any, Dict, List, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\n\n
[docs]class AirbyteSyncMode(ABC):\n """Represents the sync mode for a given Airbyte stream, which governs how Airbyte reads\n from a source and writes to a destination.\n\n For more information, see https://docs.airbyte.com/understanding-airbyte/connections/.\n """\n\n def __eq__(self, other: Any) -> bool:\n return isinstance(other, AirbyteSyncMode) and self.to_json() == other.to_json()\n\n def __init__(self, json_repr: Dict[str, Any]):\n self.json_repr = json_repr\n\n def to_json(self) -> Dict[str, Any]:\n return self.json_repr\n\n @classmethod\n def from_json(cls, json_repr: Dict[str, Any]) -> "AirbyteSyncMode":\n return cls(\n {\n k: v\n for k, v in json_repr.items()\n if k in ("syncMode", "destinationSyncMode", "cursorField", "primaryKey")\n }\n )\n\n
[docs] @public\n @classmethod\n def full_refresh_append(cls) -> "AirbyteSyncMode":\n """Syncs the entire data stream from the source, appending rows to the destination.\n\n https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-append/\n """\n return cls({"syncMode": "full_refresh", "destinationSyncMode": "append"})
\n\n
[docs] @public\n @classmethod\n def full_refresh_overwrite(cls) -> "AirbyteSyncMode":\n """Syncs the entire data stream from the source, replaces data in the destination by\n overwriting it.\n\n https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-overwrite\n """\n return cls({"syncMode": "full_refresh", "destinationSyncMode": "overwrite"})
\n\n
[docs] @public\n @classmethod\n def incremental_append(\n cls,\n cursor_field: Optional[str] = None,\n ) -> "AirbyteSyncMode":\n """Syncs only new records from the source, appending rows to the destination.\n May optionally specify the cursor field used to determine which records\n are new.\n\n https://docs.airbyte.com/understanding-airbyte/connections/incremental-append/\n """\n cursor_field = check.opt_str_param(cursor_field, "cursor_field")\n\n return cls(\n {\n "syncMode": "incremental",\n "destinationSyncMode": "append",\n **({"cursorField": [cursor_field]} if cursor_field else {}),\n }\n )
\n\n
[docs] @public\n @classmethod\n def incremental_append_dedup(\n cls,\n cursor_field: Optional[str] = None,\n primary_key: Optional[Union[str, List[str]]] = None,\n ) -> "AirbyteSyncMode":\n """Syncs new records from the source, appending to an append-only history\n table in the destination. Also generates a deduplicated view mirroring the\n source table. May optionally specify the cursor field used to determine\n which records are new, and the primary key used to determine which records\n are duplicates.\n\n https://docs.airbyte.com/understanding-airbyte/connections/incremental-append-dedup/\n """\n cursor_field = check.opt_str_param(cursor_field, "cursor_field")\n if isinstance(primary_key, str):\n primary_key = [primary_key]\n primary_key = check.opt_list_param(primary_key, "primary_key", of_type=str)\n\n return cls(\n {\n "syncMode": "incremental",\n "destinationSyncMode": "append_dedup",\n **({"cursorField": [cursor_field]} if cursor_field else {}),\n **({"primaryKey": [[x] for x in primary_key]} if primary_key else {}),\n }\n )
\n\n\n
[docs]class AirbyteSource:\n """Represents a user-defined Airbyte source.\n\n Args:\n name (str): The display name of the source.\n source_type (str): The type of the source, from Airbyte's list\n of sources https://airbytehq.github.io/category/sources/.\n source_configuration (Mapping[str, Any]): The configuration for the\n source, as defined by Airbyte's API.\n """\n\n
[docs] @public\n def __init__(self, name: str, source_type: str, source_configuration: Mapping[str, Any]):\n self.name = check.str_param(name, "name")\n self.source_type = check.str_param(source_type, "source_type")\n self.source_configuration = check.mapping_param(\n source_configuration, "source_configuration", key_type=str\n )
\n\n def must_be_recreated(self, other: "AirbyteSource") -> bool:\n return self.name != other.name or self.source_type != other.source_type
\n\n\nclass InitializedAirbyteSource:\n """User-defined Airbyte source bound to actual created Airbyte source."""\n\n def __init__(self, source: AirbyteSource, source_id: str, source_definition_id: Optional[str]):\n self.source = source\n self.source_id = source_id\n self.source_definition_id = source_definition_id\n\n @classmethod\n def from_api_json(cls, api_json: Mapping[str, Any]):\n return cls(\n source=AirbyteSource(\n name=api_json["name"],\n source_type=api_json["sourceName"],\n source_configuration=api_json["connectionConfiguration"],\n ),\n source_id=api_json["sourceId"],\n source_definition_id=None,\n )\n\n\n
[docs]class AirbyteDestination:\n """Represents a user-defined Airbyte destination.\n\n Args:\n name (str): The display name of the destination.\n destination_type (str): The type of the destination, from Airbyte's list\n of destinations https://airbytehq.github.io/category/destinations/.\n destination_configuration (Mapping[str, Any]): The configuration for the\n destination, as defined by Airbyte's API.\n """\n\n
[docs] @public\n def __init__(\n self, name: str, destination_type: str, destination_configuration: Mapping[str, Any]\n ):\n self.name = check.str_param(name, "name")\n self.destination_type = check.str_param(destination_type, "destination_type")\n self.destination_configuration = check.mapping_param(\n destination_configuration, "destination_configuration", key_type=str\n )
\n\n def must_be_recreated(self, other: "AirbyteDestination") -> bool:\n return self.name != other.name or self.destination_type != other.destination_type
\n\n\nclass InitializedAirbyteDestination:\n """User-defined Airbyte destination bound to actual created Airbyte destination."""\n\n def __init__(\n self,\n destination: AirbyteDestination,\n destination_id: str,\n destination_definition_id: Optional[str],\n ):\n self.destination = destination\n self.destination_id = destination_id\n self.destination_definition_id = destination_definition_id\n\n @classmethod\n def from_api_json(cls, api_json: Mapping[str, Any]):\n return cls(\n destination=AirbyteDestination(\n name=api_json["name"],\n destination_type=api_json["destinationName"],\n destination_configuration=api_json["connectionConfiguration"],\n ),\n destination_id=api_json["destinationId"],\n destination_definition_id=None,\n )\n\n\nclass AirbyteDestinationNamespace(Enum):\n """Represents the sync mode for a given Airbyte stream."""\n\n SAME_AS_SOURCE = "source"\n DESTINATION_DEFAULT = "destination"\n\n\n
[docs]class AirbyteConnection:\n """A user-defined Airbyte connection, pairing an Airbyte source and destination and configuring\n which streams to sync.\n\n Args:\n name (str): The display name of the connection.\n source (AirbyteSource): The source to sync from.\n destination (AirbyteDestination): The destination to sync to.\n stream_config (Mapping[str, AirbyteSyncMode]): A mapping from stream name to\n the sync mode for that stream, including any additional configuration\n of primary key or cursor field.\n normalize_data (Optional[bool]): Whether to normalize the data in the\n destination.\n destination_namespace (Optional[Union[AirbyteDestinationNamespace, str]]):\n The namespace to sync to in the destination. If set to\n AirbyteDestinationNamespace.SAME_AS_SOURCE, the namespace will be the\n same as the source namespace. If set to\n AirbyteDestinationNamespace.DESTINATION_DEFAULT, the namespace will be\n the default namespace for the destination. If set to a string, the\n namespace will be that string.\n prefix (Optional[str]): A prefix to add to the table names in the destination.\n\n Example:\n .. code-block:: python\n\n from dagster_airbyte.managed.generated.sources import FileSource\n from dagster_airbyte.managed.generated.destinations import LocalJsonDestination\n from dagster_airbyte import AirbyteConnection, AirbyteSyncMode\n\n cereals_csv_source = FileSource(...)\n local_json_destination = LocalJsonDestination(...)\n\n cereals_connection = AirbyteConnection(\n name="download-cereals",\n source=cereals_csv_source,\n destination=local_json_destination,\n stream_config={"cereals": AirbyteSyncMode.full_refresh_overwrite()},\n )\n """\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n source: AirbyteSource,\n destination: AirbyteDestination,\n stream_config: Mapping[str, AirbyteSyncMode],\n normalize_data: Optional[bool] = None,\n destination_namespace: Optional[\n Union[AirbyteDestinationNamespace, str]\n ] = AirbyteDestinationNamespace.SAME_AS_SOURCE,\n prefix: Optional[str] = None,\n ):\n self.name = check.str_param(name, "name")\n self.source = check.inst_param(source, "source", AirbyteSource)\n self.destination = check.inst_param(destination, "destination", AirbyteDestination)\n self.stream_config = check.mapping_param(\n stream_config, "stream_config", key_type=str, value_type=AirbyteSyncMode\n )\n self.normalize_data = check.opt_bool_param(normalize_data, "normalize_data")\n self.destination_namespace = check.opt_inst_param(\n destination_namespace, "destination_namespace", (str, AirbyteDestinationNamespace)\n )\n self.prefix = check.opt_str_param(prefix, "prefix")
\n\n def must_be_recreated(self, other: Optional["AirbyteConnection"]) -> bool:\n return (\n not other\n or self.source.must_be_recreated(other.source)\n or self.destination.must_be_recreated(other.destination)\n )
\n\n\nclass InitializedAirbyteConnection:\n """User-defined Airbyte connection bound to actual created Airbyte connection."""\n\n def __init__(\n self,\n connection: AirbyteConnection,\n connection_id: str,\n ):\n self.connection = connection\n self.connection_id = connection_id\n\n @classmethod\n def from_api_json(\n cls,\n api_dict: Mapping[str, Any],\n init_sources: Mapping[str, InitializedAirbyteSource],\n init_dests: Mapping[str, InitializedAirbyteDestination],\n ):\n source = next(\n (\n source.source\n for source in init_sources.values()\n if source.source_id == api_dict["sourceId"]\n ),\n None,\n )\n dest = next(\n (\n dest.destination\n for dest in init_dests.values()\n if dest.destination_id == api_dict["destinationId"]\n ),\n None,\n )\n\n source = check.not_none(source, f"Could not find source with id {api_dict['sourceId']}")\n dest = check.not_none(\n dest, f"Could not find destination with id {api_dict['destinationId']}"\n )\n\n streams = {\n stream["stream"]["name"]: AirbyteSyncMode.from_json(stream["config"])\n for stream in api_dict["syncCatalog"]["streams"]\n }\n return cls(\n AirbyteConnection(\n name=api_dict["name"],\n source=source,\n destination=dest,\n stream_config=streams,\n normalize_data=len(api_dict["operationIds"]) > 0,\n destination_namespace=(\n api_dict["namespaceFormat"]\n if api_dict["namespaceDefinition"] == "customformat"\n else AirbyteDestinationNamespace(api_dict["namespaceDefinition"])\n ),\n prefix=api_dict["prefix"] if api_dict.get("prefix") else None,\n ),\n api_dict["connectionId"],\n )\n\n\ndef _remove_none_values(obj: Dict[str, Any]) -> Dict[str, Any]:\n return {k: v for k, v in obj.items() if v is not None}\n\n\ndef _dump_class(obj: Any) -> Dict[str, Any]:\n return json.loads(json.dumps(obj, default=lambda o: _remove_none_values(o.__dict__)))\n\n\nclass GeneratedAirbyteSource(AirbyteSource):\n """Base class used by the codegen Airbyte sources. 
This class is not intended to be used directly.\n\n Converts all of its attributes into a source configuration dict which is passed down to the base\n AirbyteSource class.\n """\n\n def __init__(self, source_type: str, name: str):\n source_configuration = _dump_class(self)\n super().__init__(\n name=name, source_type=source_type, source_configuration=source_configuration\n )\n\n\nclass GeneratedAirbyteDestination(AirbyteDestination):\n """Base class used by the codegen Airbyte destinations. This class is not intended to be used directly.\n\n Converts all of its attributes into a destination configuration dict which is passed down to the\n base AirbyteDestination class.\n """\n\n def __init__(self, source_type: str, name: str):\n destination_configuration = _dump_class(self)\n super().__init__(\n name=name,\n destination_type=source_type,\n destination_configuration=destination_configuration,\n )\n
", "current_page_name": "_modules/dagster_airbyte/managed/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.types"}}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.ops

\nfrom typing import Any, Iterable, List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom dagster_airbyte.types import AirbyteOutput\nfrom dagster_airbyte.utils import _get_attempt, generate_materializations\n\nfrom .resources import DEFAULT_POLL_INTERVAL_SECONDS, BaseAirbyteResource\n\n\nclass AirbyteSyncConfig(Config):\n    connection_id: str = Field(\n        ...,\n        description=(\n            "Parsed json dictionary representing the details of the Airbyte connector after the"\n            " sync successfully completes. See the [Airbyte API"\n            " Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview)"\n            " to see detailed information on this response."\n        ),\n    )\n    poll_interval: float = Field(\n        DEFAULT_POLL_INTERVAL_SECONDS,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    poll_timeout: Optional[float] = Field(\n        None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        True,\n        description=(\n            "If True, materializations corresponding to the results of the Airbyte sync will "\n            "be yielded when the op executes."\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        ["airbyte"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n AirbyteOutput,\n description=(\n "Parsed json dictionary representing the details of the Airbyte connector after the"\n " sync successfully completes. See the [Airbyte API"\n " Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview)"\n " to see detailed information on this response."\n ),\n ),\n tags={"kind": "airbyte"},\n)\ndef airbyte_sync_op(\n context, config: AirbyteSyncConfig, airbyte: BaseAirbyteResource\n) -> Iterable[Any]:\n """Executes a Airbyte job sync for a given ``connection_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a AirbyteOutput which contains\n the job details for a given ``connection_id``.\n\n It requires the use of the :py:class:`~dagster_airbyte.airbyte_resource`, which allows it to\n communicate with the Airbyte API.\n\n Examples:\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource, airbyte_sync_op\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n }\n )\n\n sync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_simple_airbyte_job():\n sync_foobar()\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_composed_airbyte_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n airbyte_output = airbyte.sync_and_poll(\n connection_id=config.connection_id,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n yield from generate_materializations(\n airbyte_output, asset_key_prefix=config.asset_key_prefix\n )\n yield Output(\n airbyte_output,\n metadata={\n **_get_attempt(airbyte_output.job_details.get("attempts", [{}])[-1]).get(\n "totalStats", {}\n )\n 
},\n )
\n
", "current_page_name": "_modules/dagster_airbyte/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.resources

\nimport hashlib\nimport json\nimport logging\nimport sys\nimport time\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, List, Mapping, Optional, cast\n\nimport requests\nfrom dagster import (\n    ConfigurableResource,\n    Failure,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._config.pythonic_config import infer_schema_from_config_class\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.merger import deep_merge_dicts\nfrom pydantic import Field\nfrom requests.exceptions import RequestException\n\nfrom dagster_airbyte.types import AirbyteOutput\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n\n\nclass AirbyteState:\n    RUNNING = "running"\n    SUCCEEDED = "succeeded"\n    CANCELLED = "cancelled"\n    PENDING = "pending"\n    FAILED = "failed"\n    ERROR = "error"\n    INCOMPLETE = "incomplete"\n\n\nclass AirbyteResourceState:\n    def __init__(self) -> None:\n        self.request_cache: Dict[str, Optional[Mapping[str, object]]] = {}\n        # Int in case we nest contexts\n        self.cache_enabled = 0\n\n\nclass BaseAirbyteResource(ConfigurableResource):\n    request_max_retries: int = Field(\n        default=3,\n        description=(\n            "The maximum number of times requests to the Airbyte API should be retried "\n            "before failing."\n        ),\n    )\n    request_retry_delay: float = Field(\n        default=0.25,\n        description="Time (in seconds) to wait between each request retry.",\n    )\n    request_timeout: int = Field(\n        default=15,\n        description="Time (in seconds) after which the requests to Airbyte are declared timed out.",\n    )\n    cancel_sync_on_run_termination: bool = Field(\n        default=True,\n        description=(\n            "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. 
This may"\n            " be useful to disable if using Airbyte sources that cannot be cancelled and"\n            " resumed easily, or if your Dagster deployment may experience runner interruptions"\n            " that do not impact your Airbyte deployment."\n        ),\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL_SECONDS,\n        description="Time (in seconds) to wait between checking a sync's status.",\n    )\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    @property\n    @cached_method\n    def _log(self) -> logging.Logger:\n        return get_dagster_logger()\n\n    @property\n    @abstractmethod\n    def api_base_url(self) -> str:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def all_additional_request_params(self) -> Mapping[str, Any]:\n        raise NotImplementedError()\n\n    def make_request(\n        self, endpoint: str, data: Optional[Mapping[str, object]] = None, method: str = "POST"\n    ) -> Optional[Mapping[str, object]]:\n        """Creates and sends a request to the desired Airbyte REST API endpoint.\n\n        Args:\n            endpoint (str): The Airbyte API endpoint to send this request to.\n            data (Optional[str]): JSON-formatted data string to be included in the request.\n\n        Returns:\n            Optional[Dict[str, Any]]: Parsed json data from the response to this request\n        """\n        url = self.api_base_url + endpoint\n        headers = {"accept": "application/json"}\n\n        num_retries = 0\n        while True:\n            try:\n                request_args: Dict[str, Any] = dict(\n                    method=method,\n                    url=url,\n                    headers=headers,\n                    timeout=self.request_timeout,\n                )\n                if data:\n                    request_args["json"] = data\n\n                request_args = deep_merge_dicts(\n                    
request_args,\n                    self.all_additional_request_params,\n                )\n\n                response = requests.request(\n                    **request_args,\n                )\n                response.raise_for_status()\n                if response.status_code == 204:\n                    return None\n                return response.json()\n            except RequestException as e:\n                self._log.error("Request to Airbyte API failed: %s", e)\n                if num_retries == self.request_max_retries:\n                    break\n                num_retries += 1\n                time.sleep(self.request_retry_delay)\n\n        raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n    @abstractmethod\n    def start_sync(self, connection_id: str) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def cancel_job(self, job_id: int):\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def _should_forward_logs(self) -> bool:\n        raise NotImplementedError()\n\n    def sync_and_poll(\n        self,\n        connection_id: str,\n        poll_interval: Optional[float] = None,\n        poll_timeout: Optional[float] = None,\n    ) -> AirbyteOutput:\n        """Initializes a sync operation for the given connector, and polls until it completes.\n\n        Args:\n            connection_id (str): The Airbyte Connector ID. 
You can retrieve this value from the\n                "Connection" tab of a given connection in the Arbyte UI.\n            poll_interval (float): The time (in seconds) that will be waited between successive polls.\n            poll_timeout (float): The maximum time that will waited before this operation is timed\n                out. By default, this will never time out.\n\n        Returns:\n            :py:class:`~AirbyteOutput`:\n                Details of the sync job.\n        """\n        connection_details = self.get_connection_details(connection_id)\n        job_details = self.start_sync(connection_id)\n        job_info = cast(Dict[str, object], job_details.get("job", {}))\n        job_id = cast(int, job_info.get("id"))\n\n        self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n        start = time.monotonic()\n        logged_attempts = 0\n        logged_lines = 0\n        state = None\n\n        try:\n            while True:\n                if poll_timeout and start + poll_timeout < time.monotonic():\n                    raise Failure(\n                        f"Timeout: Airbyte job {job_id} is not ready after the timeout"\n                        f" {poll_timeout} seconds"\n                    )\n                time.sleep(poll_interval or self.poll_interval)\n                job_details = self.get_job_status(connection_id, job_id)\n                attempts = cast(List, job_details.get("attempts", []))\n                cur_attempt = len(attempts)\n                # spit out the available Airbyte log info\n                if cur_attempt:\n                    if self._should_forward_logs:\n                        log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n                        for line in log_lines[logged_lines:]:\n                            sys.stdout.write(line + "\\n")\n                            sys.stdout.flush()\n                        logged_lines = len(log_lines)\n\n             
       # if there's a next attempt, this one will have no more log messages\n                    if logged_attempts < cur_attempt - 1:\n                        logged_lines = 0\n                        logged_attempts += 1\n\n                job_info = cast(Dict[str, object], job_details.get("job", {}))\n                state = job_info.get("status")\n\n                if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n                    continue\n                elif state == AirbyteState.SUCCEEDED:\n                    break\n                elif state == AirbyteState.ERROR:\n                    raise Failure(f"Job failed: {job_id}")\n                elif state == AirbyteState.CANCELLED:\n                    raise Failure(f"Job was cancelled: {job_id}")\n                else:\n                    raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n        finally:\n            # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n            # the python process\n            if (\n                state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)\n                and self.cancel_sync_on_run_termination\n            ):\n                self.cancel_job(job_id)\n\n        return AirbyteOutput(job_details=job_details, connection_details=connection_details)\n\n\nclass AirbyteCloudResource(BaseAirbyteResource):\n    """This resource allows users to programatically interface with the Airbyte Cloud API to launch\n    syncs and monitor their progress.\n\n    **Examples:**\n\n    .. 
code-block:: python\n\n        from dagster import job, EnvVar\n        from dagster_airbyte import AirbyteResource\n\n        my_airbyte_resource = AirbyteCloudResource(\n            api_key=EnvVar("AIRBYTE_API_KEY"),\n        )\n\n        airbyte_assets = build_airbyte_assets(\n            connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n            destination_tables=["releases", "tags", "teams"],\n        )\n\n        defs = Definitions(\n            assets=[airbyte_assets],\n            resources={"airbyte": my_airbyte_resource},\n        )\n    """\n\n    api_key: str = Field(..., description="The Airbyte Cloud API key.")\n\n    @property\n    def api_base_url(self) -> str:\n        return "https://api.airbyte.com/v1"\n\n    @property\n    def all_additional_request_params(self) -> Mapping[str, Any]:\n        return {"headers": {"Authorization": f"Bearer {self.api_key}", "User-Agent": "dagster"}}\n\n    def start_sync(self, connection_id: str) -> Mapping[str, object]:\n        job_sync = check.not_none(\n            self.make_request(\n                endpoint="/jobs",\n                data={\n                    "connectionId": connection_id,\n                    "jobType": "sync",\n                },\n            )\n        )\n        return {"job": {"id": job_sync["jobId"], "status": job_sync["status"]}}\n\n    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n        return {}\n\n    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n        job_status = check.not_none(self.make_request(endpoint=f"/jobs/{job_id}", method="GET"))\n        return {"job": {"id": job_status["jobId"], "status": job_status["status"]}}\n\n    def cancel_job(self, job_id: int):\n        self.make_request(endpoint=f"/jobs/{job_id}", method="DELETE")\n\n    @property\n    def _should_forward_logs(self) -> bool:\n        # Airbyte Cloud does not support streaming logs yet\n        return False\n\n\n
[docs]class AirbyteResource(BaseAirbyteResource):\n """This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job, EnvVar\n from dagster_airbyte import AirbyteResource\n\n my_airbyte_resource = AirbyteResource(\n host=EnvVar("AIRBYTE_HOST"),\n port=EnvVar("AIRBYTE_PORT"),\n # If using basic auth\n username=EnvVar("AIRBYTE_USERNAME"),\n password=EnvVar("AIRBYTE_PASSWORD"),\n )\n\n airbyte_assets = build_airbyte_assets(\n connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n destination_tables=["releases", "tags", "teams"],\n )\n\n defs = Definitions(\n assets=[airbyte_assets],\n resources={"airbyte": my_airbyte_resource},\n )\n """\n\n host: str = Field(description="The Airbyte server address.")\n port: str = Field(description="Port used for the Airbyte server.")\n username: Optional[str] = Field(default=None, description="Username if using basic auth.")\n password: Optional[str] = Field(default=None, description="Password if using basic auth.")\n use_https: bool = Field(\n default=False, description="Whether to use HTTPS to connect to the Airbyte server."\n )\n forward_logs: bool = Field(\n default=True,\n description=(\n "Whether to forward Airbyte logs to the compute log, can be expensive for"\n " long-running syncs."\n ),\n )\n request_additional_params: Mapping[str, Any] = Field(\n default=dict(),\n description=(\n "Any additional kwargs to pass to the requests library when making requests to Airbyte."\n ),\n )\n\n @property\n @cached_method\n def _state(self) -> AirbyteResourceState:\n return AirbyteResourceState()\n\n @property\n @cached_method\n def _log(self) -> logging.Logger:\n return get_dagster_logger()\n\n @property\n def api_base_url(self) -> str:\n return (\n ("https://" if self.use_https else "http://")\n + (f"{self.host}:{self.port}" if self.port else self.host)\n + "/api/v1"\n )\n\n @property\n def 
_should_forward_logs(self) -> bool:\n return self.forward_logs\n\n @contextmanager\n def cache_requests(self):\n """Context manager that enables caching certain requests to the Airbyte API,\n cleared when the context is exited.\n """\n self.clear_request_cache()\n self._state.cache_enabled += 1\n try:\n yield\n finally:\n self.clear_request_cache()\n self._state.cache_enabled -= 1\n\n def clear_request_cache(self) -> None:\n self._state.request_cache = {}\n\n def make_request_cached(self, endpoint: str, data: Optional[Mapping[str, object]]):\n if not self._state.cache_enabled > 0:\n return self.make_request(endpoint, data)\n data_json = json.dumps(data, sort_keys=True)\n sha = hashlib.sha1()\n sha.update(endpoint.encode("utf-8"))\n sha.update(data_json.encode("utf-8"))\n digest = sha.hexdigest()\n\n if digest not in self._state.request_cache:\n self._state.request_cache[digest] = self.make_request(endpoint, data)\n return self._state.request_cache[digest]\n\n @property\n def all_additional_request_params(self) -> Mapping[str, Any]:\n auth_param = (\n {"auth": (self.username, self.password)} if self.username and self.password else {}\n )\n return {**auth_param, **self.request_additional_params}\n\n def make_request(\n self, endpoint: str, data: Optional[Mapping[str, object]]\n ) -> Optional[Mapping[str, object]]:\n """Creates and sends a request to the desired Airbyte REST API endpoint.\n\n Args:\n endpoint (str): The Airbyte API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Optional[Dict[str, Any]]: Parsed json data from the response to this request\n """\n url = self.api_base_url + endpoint\n headers = {"accept": "application/json"}\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n **deep_merge_dicts( # type: ignore\n dict(\n method="POST",\n url=url,\n headers=headers,\n json=data,\n timeout=self.request_timeout,\n auth=(\n (self.username, self.password)\n 
if self.username and self.password\n else None\n ),\n ),\n self.request_additional_params,\n ),\n )\n response.raise_for_status()\n if response.status_code == 204:\n return None\n return response.json()\n except RequestException as e:\n self._log.error("Request to Airbyte API failed: %s", e)\n if num_retries == self.request_max_retries:\n break\n num_retries += 1\n time.sleep(self.request_retry_delay)\n\n raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n def cancel_job(self, job_id: int):\n self.make_request(endpoint="/jobs/cancel", data={"id": job_id})\n\n def get_default_workspace(self) -> str:\n workspaces = cast(\n List[Dict[str, Any]],\n check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(\n "workspaces", []\n ),\n )\n return workspaces[0]["workspaceId"]\n\n def get_source_definition_by_name(self, name: str) -> Optional[str]:\n name_lower = name.lower()\n definitions = self.make_request_cached(endpoint="/source_definitions/list", data={})\n\n return next(\n (\n definition["sourceDefinitionId"]\n for definition in definitions["sourceDefinitions"]\n if definition["name"].lower() == name_lower\n ),\n None,\n )\n\n def get_destination_definition_by_name(self, name: str):\n name_lower = name.lower()\n definitions = cast(\n Dict[str, List[Dict[str, str]]],\n check.not_none(\n self.make_request_cached(endpoint="/destination_definitions/list", data={})\n ),\n )\n return next(\n (\n definition["destinationDefinitionId"]\n for definition in definitions["destinationDefinitions"]\n if definition["name"].lower() == name_lower\n ),\n None,\n )\n\n def get_source_catalog_id(self, source_id: str):\n result = cast(\n Dict[str, Any],\n check.not_none(\n self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})\n ),\n )\n return result["catalogId"]\n\n def get_source_schema(self, source_id: str) -> Mapping[str, Any]:\n return cast(\n Dict[str, Any],\n check.not_none(\n 
self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})\n ),\n )\n\n def does_dest_support_normalization(\n self, destination_definition_id: str, workspace_id: str\n ) -> bool:\n # Airbyte API changed source of truth for normalization in PR\n # https://github.com/airbytehq/airbyte/pull/21005\n norm_dest_def_spec: bool = cast(\n Dict[str, Any],\n check.not_none(\n self.make_request_cached(\n endpoint="/destination_definition_specifications/get",\n data={\n "destinationDefinitionId": destination_definition_id,\n "workspaceId": workspace_id,\n },\n )\n ),\n ).get("supportsNormalization", False)\n\n norm_dest_def: bool = (\n cast(\n Dict[str, Any],\n check.not_none(\n self.make_request_cached(\n endpoint="/destination_definitions/get",\n data={\n "destinationDefinitionId": destination_definition_id,\n },\n )\n ),\n )\n .get("normalizationConfig", {})\n .get("supported", False)\n )\n\n return any([norm_dest_def_spec, norm_dest_def])\n\n def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n if self.forward_logs:\n return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))\n else:\n # the "list all jobs" endpoint doesn't return logs, which actually makes it much more\n # lightweight for long-running syncs with many logs\n out = check.not_none(\n self.make_request(\n endpoint="/jobs/list",\n data={\n "configTypes": ["sync"],\n "configId": connection_id,\n # sync should be the most recent, so pageSize 5 is sufficient\n "pagination": {"pageSize": 5},\n },\n )\n )\n job = next((job for job in cast(List, out["jobs"]) if job["job"]["id"] == job_id), None)\n\n return check.not_none(job)\n\n def start_sync(self, connection_id: str) -> Mapping[str, object]:\n return check.not_none(\n self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})\n )\n\n def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n return check.not_none(\n 
self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})\n )\n\n def sync_and_poll(\n self,\n connection_id: str,\n poll_interval: Optional[float] = None,\n poll_timeout: Optional[float] = None,\n ) -> AirbyteOutput:\n """Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connection_id (str): The Airbyte Connector ID. You can retrieve this value from the\n "Connection" tab of a given connection in the Arbyte UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~AirbyteOutput`:\n Details of the sync job.\n """\n connection_details = self.get_connection_details(connection_id)\n job_details = self.start_sync(connection_id)\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n job_id = cast(int, job_info.get("id"))\n\n self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n start = time.monotonic()\n logged_attempts = 0\n logged_lines = 0\n state = None\n\n try:\n while True:\n if poll_timeout and start + poll_timeout < time.monotonic():\n raise Failure(\n f"Timeout: Airbyte job {job_id} is not ready after the timeout"\n f" {poll_timeout} seconds"\n )\n time.sleep(poll_interval or self.poll_interval)\n job_details = self.get_job_status(connection_id, job_id)\n attempts = cast(List, job_details.get("attempts", []))\n cur_attempt = len(attempts)\n # spit out the available Airbyte log info\n if cur_attempt:\n if self.forward_logs:\n log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n for line in log_lines[logged_lines:]:\n sys.stdout.write(line + "\\n")\n sys.stdout.flush()\n logged_lines = len(log_lines)\n\n # if there's a next attempt, this one will have no more log messages\n if logged_attempts < cur_attempt - 1:\n logged_lines = 0\n 
logged_attempts += 1\n\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n state = job_info.get("status")\n\n if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n continue\n elif state == AirbyteState.SUCCEEDED:\n break\n elif state == AirbyteState.ERROR:\n raise Failure(f"Job failed: {job_id}")\n elif state == AirbyteState.CANCELLED:\n raise Failure(f"Job was cancelled: {job_id}")\n else:\n raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n finally:\n # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n # the python process\n if (\n state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)\n and self.cancel_sync_on_run_termination\n ):\n self.cancel_job(job_id)\n\n return AirbyteOutput(job_details=job_details, connection_details=connection_details)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=AirbyteResource.to_config_schema())\ndef airbyte_resource(context) -> AirbyteResource:\n """This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Airbyte REST API, including expected response JSON\n schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n # If using basic auth\n "username": {"env": "AIRBYTE_USERNAME"},\n "password": {"env": "AIRBYTE_PASSWORD"},\n }\n )\n\n @job(resource_defs={"airbyte":my_airbyte_resource})\n def my_airbyte_job():\n ...\n\n """\n return AirbyteResource.from_resource_context(context)
\n\n\n@dagster_maintained_resource\n@resource(config_schema=infer_schema_from_config_class(AirbyteCloudResource))\ndef airbyte_cloud_resource(context) -> AirbyteCloudResource:\n """This resource allows users to programatically interface with the Airbyte Cloud REST API to launch\n syncs and monitor their progress. Currently, this resource may only be used with the more basic\n `dagster-airbyte` APIs, including the ops and assets.\n\n """\n return AirbyteCloudResource.from_resource_context(context)\n
", "current_page_name": "_modules/dagster_airbyte/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.resources"}}, "dagster_airflow": {"dagster_asset_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_asset_factory

\nfrom typing import AbstractSet, List, Mapping, Optional, Set, Tuple\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dag import DAG\nfrom dagster import (\n    AssetKey,\n    AssetsDefinition,\n    GraphDefinition,\n    OutputMapping,\n    TimeWindowPartitionsDefinition,\n)\nfrom dagster._core.definitions.graph_definition import create_adjacency_lists\nfrom dagster._utils.schedules import is_valid_cron_schedule\n\nfrom dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag\nfrom dagster_airflow.utils import (\n    DagsterAirflowError,\n    normalized_name,\n)\n\n\ndef _build_asset_dependencies(\n    dag: DAG,\n    graph: GraphDefinition,\n    task_ids_by_asset_key: Mapping[AssetKey, AbstractSet[str]],\n    upstream_dependencies_by_asset_key: Mapping[AssetKey, AbstractSet[AssetKey]],\n) -> Tuple[AbstractSet[OutputMapping], Mapping[str, AssetKey], Mapping[str, Set[AssetKey]]]:\n    """Builds the asset dependency graph for a given set of airflow task mappings and a dagster graph."""\n    output_mappings = set()\n    keys_by_output_name = {}\n    internal_asset_deps: dict[str, Set[AssetKey]] = {}\n\n    visited_nodes: dict[str, bool] = {}\n    upstream_deps = set()\n\n    def find_upstream_dependency(node_name: str) -> None:\n        """Uses Depth-Firs-Search to find all upstream asset dependencies\n        as described in task_ids_by_asset_key.\n        """\n        # node has been visited\n        if visited_nodes[node_name]:\n            return\n        # mark node as visted\n        visited_nodes[node_name] = True\n        # traverse upstream nodes\n        for output_handle in graph.dependency_structure.all_upstream_outputs_from_node(node_name):\n            forward_node = output_handle.node_name\n            match = False\n            # find any assets produced by upstream nodes and add them to the internal asset deps\n            for asset_key in task_ids_by_asset_key:\n                if (\n                    
forward_node.replace(f"{normalized_name(dag.dag_id)}__", "")\n                    in task_ids_by_asset_key[asset_key]\n                ):\n                    upstream_deps.add(asset_key)\n                    match = True\n            # don't traverse past nodes that have assets\n            if not match:\n                find_upstream_dependency(forward_node)\n\n    # iterate through each asset to find all upstream asset dependencies\n    for asset_key in task_ids_by_asset_key:\n        asset_upstream_deps = set()\n        for task_id in task_ids_by_asset_key[asset_key]:\n            visited_nodes = {s.name: False for s in graph.nodes}\n            upstream_deps = set()\n            find_upstream_dependency(normalized_name(dag.dag_id, task_id))\n            for dep in upstream_deps:\n                asset_upstream_deps.add(dep)\n            keys_by_output_name[f"result_{normalized_name(dag.dag_id, task_id)}"] = asset_key\n            output_mappings.add(\n                OutputMapping(\n                    graph_output_name=f"result_{normalized_name(dag.dag_id, task_id)}",\n                    mapped_node_name=normalized_name(dag.dag_id, task_id),\n                    mapped_node_output_name="airflow_task_complete",  # Default output name\n                )\n            )\n\n        # the tasks for a given asset should have the same internal deps\n        for task_id in task_ids_by_asset_key[asset_key]:\n            if f"result_{normalized_name(dag.dag_id, task_id)}" in internal_asset_deps:\n                internal_asset_deps[f"result_{normalized_name(dag.dag_id, task_id)}"].update(\n                    asset_upstream_deps\n                )\n            else:\n                internal_asset_deps[f"result_{normalized_name(dag.dag_id, task_id)}"] = (\n                    asset_upstream_deps\n                )\n\n    # add new upstream asset dependencies to the internal deps\n    for asset_key in upstream_dependencies_by_asset_key:\n        for key in 
keys_by_output_name:\n            if keys_by_output_name[key] == asset_key:\n                internal_asset_deps[key].update(upstream_dependencies_by_asset_key[asset_key])\n\n    return (output_mappings, keys_by_output_name, internal_asset_deps)\n\n\n
[docs]def load_assets_from_airflow_dag(\n dag: DAG,\n task_ids_by_asset_key: Mapping[AssetKey, AbstractSet[str]] = {},\n upstream_dependencies_by_asset_key: Mapping[AssetKey, AbstractSet[AssetKey]] = {},\n connections: Optional[List[Connection]] = None,\n) -> List[AssetsDefinition]:\n """[Experimental] Construct Dagster Assets for a given Airflow DAG.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n task_ids_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[str]]]): A mapping from asset\n keys to task ids. Used break up the Airflow Dag into multiple SDAs\n upstream_dependencies_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[AssetKey]]]): A\n mapping from upstream asset keys to assets provided in task_ids_by_asset_key. Used to\n declare new upstream SDA depenencies.\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n List[AssetsDefinition]\n """\n cron_schedule = dag.normalized_schedule_interval\n if cron_schedule is not None and not is_valid_cron_schedule(str(cron_schedule)):\n raise DagsterAirflowError(f"Invalid cron schedule: {cron_schedule} in DAG {dag.dag_id}")\n\n job = make_dagster_job_from_airflow_dag(dag, connections=connections)\n graph = job._graph_def # noqa: SLF001\n start_date = dag.start_date if dag.start_date else dag.default_args.get("start_date")\n if start_date is None:\n raise DagsterAirflowError(f"Invalid start_date: {start_date} in DAG {dag.dag_id}")\n\n # leaf nodes have no downstream nodes\n forward_edges, _ = create_adjacency_lists(graph.nodes, graph.dependency_structure)\n leaf_nodes = {\n node_name.replace(f"{normalized_name(dag.dag_id)}__", "")\n for node_name, downstream_nodes in forward_edges.items()\n if not downstream_nodes\n }\n\n mutated_task_ids_by_asset_key: dict[AssetKey, set[str]] = {}\n\n if task_ids_by_asset_key is None or task_ids_by_asset_key == {}:\n # if no mappings are provided the dag becomes a single SDA\n task_ids_by_asset_key = 
{AssetKey(dag.dag_id): leaf_nodes}\n else:\n # if mappings were provide any unmapped leaf nodes are added to a default asset\n used_nodes: set[str] = set()\n for key in task_ids_by_asset_key:\n used_nodes.update(task_ids_by_asset_key[key])\n\n mutated_task_ids_by_asset_key[AssetKey(dag.dag_id)] = leaf_nodes - used_nodes\n\n for key in task_ids_by_asset_key:\n if key not in mutated_task_ids_by_asset_key:\n mutated_task_ids_by_asset_key[key] = set(task_ids_by_asset_key[key])\n else:\n mutated_task_ids_by_asset_key[key].update(task_ids_by_asset_key[key])\n\n output_mappings, keys_by_output_name, internal_asset_deps = _build_asset_dependencies(\n dag, graph, mutated_task_ids_by_asset_key, upstream_dependencies_by_asset_key\n )\n\n new_graph = graph.copy(\n output_mappings=list(output_mappings),\n )\n\n asset_def = AssetsDefinition.from_graph(\n graph_def=new_graph,\n partitions_def=(\n TimeWindowPartitionsDefinition(\n cron_schedule=str(cron_schedule),\n timezone=dag.timezone.name,\n start=start_date.strftime("%Y-%m-%dT%H:%M:%S"),\n fmt="%Y-%m-%dT%H:%M:%S",\n )\n if cron_schedule is not None\n else None\n ),\n group_name=dag.dag_id,\n keys_by_output_name=keys_by_output_name,\n internal_asset_deps=internal_asset_deps,\n can_subset=True,\n )\n return [asset_def]
\n
", "current_page_name": "_modules/dagster_airflow/dagster_asset_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_asset_factory"}, "dagster_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_factory

\nimport os\nfrom typing import List, Mapping, Optional, Tuple\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dagbag import DagBag\nfrom dagster import (\n    Definitions,\n    JobDefinition,\n    ResourceDefinition,\n    ScheduleDefinition,\n    _check as check,\n)\n\nfrom dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag\nfrom dagster_airflow.dagster_schedule_factory import (\n    _is_dag_is_schedule,\n    make_dagster_schedule_from_airflow_dag,\n)\nfrom dagster_airflow.patch_airflow_example_dag import patch_airflow_example_dag\nfrom dagster_airflow.resources import (\n    make_ephemeral_airflow_db_resource as make_ephemeral_airflow_db_resource,\n)\nfrom dagster_airflow.resources.airflow_ephemeral_db import AirflowEphemeralDatabase\nfrom dagster_airflow.resources.airflow_persistent_db import AirflowPersistentDatabase\nfrom dagster_airflow.utils import (\n    is_airflow_2_loaded_in_environment,\n)\n\n\n
[docs]def make_dagster_definitions_from_airflow_dag_bag(\n dag_bag: DagBag,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster definition corresponding to Airflow DAGs in DagBag.\n\n Usage:\n Create `make_dagster_definition.py`:\n from dagster_airflow import make_dagster_definition_from_airflow_dag_bag\n from airflow_home import my_dag_bag\n\n def make_definition_from_dag_bag():\n return make_dagster_definition_from_airflow_dag_bag(my_dag_bag)\n\n Use Definitions as usual, for example:\n `dagster-webserver -f path/to/make_dagster_definition.py`\n\n Args:\n dag_bag (DagBag): Airflow DagBag Model\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n Definitions\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n schedules, jobs = make_schedules_and_jobs_from_airflow_dag_bag(\n dag_bag=dag_bag,\n connections=connections,\n resource_defs=resource_defs,\n )\n\n return Definitions(\n schedules=schedules,\n jobs=jobs,\n resources=resource_defs,\n )
\n\n\n
[docs]def make_dagster_definitions_from_airflow_dags_path(\n dag_path: str,\n safe_mode: bool = True,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster repository corresponding to Airflow DAGs in dag_path.\n\n Usage:\n Create ``make_dagster_definitions.py``:\n\n .. code-block:: python\n\n from dagster_airflow import make_dagster_definitions_from_airflow_dags_path\n\n def make_definitions_from_dir():\n return make_dagster_definitions_from_airflow_dags_path(\n '/path/to/dags/',\n )\n\n Use RepositoryDefinition as usual, for example:\n ``dagster-webserver -f path/to/make_dagster_repo.py -n make_repo_from_dir``\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n include_examples (bool): True to include Airflow's example DAGs. (default: False)\n safe_mode (bool): True to use Airflow's default heuristic to find files that contain DAGs\n (ie find files that contain both b'DAG' and b'airflow') (default: True)\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n Definitions\n """\n check.str_param(dag_path, "dag_path")\n check.bool_param(safe_mode, "safe_mode")\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n if (\n resource_defs["airflow_db"].resource_fn.__qualname__.split(".")[0]\n == "AirflowEphemeralDatabase"\n ):\n AirflowEphemeralDatabase._initialize_database(connections=connections) # noqa: SLF001\n elif (\n resource_defs["airflow_db"].resource_fn.__qualname__.split(".")[0]\n == "AirflowPersistentDatabase"\n ):\n 
AirflowPersistentDatabase._initialize_database( # noqa: SLF001\n uri=(\n os.getenv("AIRFLOW__DATABASE__SQL_ALCHEMY_CONN", "")\n if is_airflow_2_loaded_in_environment()\n else os.getenv("AIRFLOW__CORE__SQL_ALCHEMY_CONN", "")\n ),\n connections=connections,\n )\n\n dag_bag = DagBag(\n dag_folder=dag_path,\n include_examples=False, # Exclude Airflow example dags\n safe_mode=safe_mode,\n )\n\n return make_dagster_definitions_from_airflow_dag_bag(\n dag_bag=dag_bag,\n connections=connections,\n resource_defs=resource_defs,\n )
\n\n\ndef make_dagster_definitions_from_airflow_example_dags(\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster repository for Airflow's example DAGs.\n\n Usage:\n\n Create `make_dagster_definitions.py`:\n from dagster_airflow import make_dagster_definitions_from_airflow_example_dags\n\n def make_airflow_example_dags():\n return make_dagster_definitions_from_airflow_example_dags()\n\n Use Definitions as usual, for example:\n `dagster-webserver -f path/to/make_dagster_definitions.py`\n\n Args:\n resource_defs: Optional[Mapping[str, ResourceDefinition]]\n Resource definitions to be used with the definitions\n\n Returns:\n Definitions\n """\n dag_bag = DagBag(\n dag_folder="some/empty/folder/with/no/dags", # prevent defaulting to settings.DAGS_FOLDER\n include_examples=True,\n )\n\n # There is a bug in Airflow v1 where the python_callable for task\n # 'search_catalog' is missing a required position argument '_'. It is fixed in airflow v2\n patch_airflow_example_dag(dag_bag)\n\n return make_dagster_definitions_from_airflow_dag_bag(\n dag_bag=dag_bag, resource_defs=resource_defs\n )\n\n\n
[docs]def make_schedules_and_jobs_from_airflow_dag_bag(\n dag_bag: DagBag,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Tuple[List[ScheduleDefinition], List[JobDefinition]]:\n """Construct Dagster Schedules and Jobs corresponding to Airflow DagBag.\n\n Args:\n dag_bag (DagBag): Airflow DagBag Model\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n - List[ScheduleDefinition]: The generated Dagster Schedules\n - List[JobDefinition]: The generated Dagster Jobs\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n\n job_defs = []\n schedule_defs = []\n count = 0\n # To enforce predictable iteration order\n sorted_dag_ids = sorted(dag_bag.dag_ids)\n for dag_id in sorted_dag_ids:\n dag = dag_bag.dags.get(dag_id)\n if not dag:\n continue\n if _is_dag_is_schedule(dag):\n schedule_defs.append(\n make_dagster_schedule_from_airflow_dag(\n dag=dag, tags=None, connections=connections, resource_defs=resource_defs\n )\n )\n else:\n job_defs.append(\n make_dagster_job_from_airflow_dag(\n dag=dag, tags=None, connections=connections, resource_defs=resource_defs\n )\n )\n\n count += 1\n\n return schedule_defs, job_defs
\n
", "current_page_name": "_modules/dagster_airflow/dagster_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_factory"}, "dagster_job_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_job_factory

\nfrom typing import List, Mapping, Optional\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dag import DAG\nfrom dagster import (\n    GraphDefinition,\n    JobDefinition,\n    ResourceDefinition,\n    _check as check,\n)\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.instance import IS_AIRFLOW_INGEST_PIPELINE_STR\n\nfrom dagster_airflow.airflow_dag_converter import get_graph_definition_args\nfrom dagster_airflow.resources import (\n    make_ephemeral_airflow_db_resource as make_ephemeral_airflow_db_resource,\n)\nfrom dagster_airflow.utils import (\n    normalized_name,\n)\n\n\n
[docs]def make_dagster_job_from_airflow_dag(\n dag: DAG,\n tags: Optional[Mapping[str, str]] = None,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> JobDefinition:\n """Construct a Dagster job corresponding to a given Airflow DAG.\n\n Tasks in the resulting job will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Execute job directly. This will set execution_date to the\n time (in UTC) of the run.\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the job tags. This will override\n behavior from (1).\n\n .. code-block:: python\n\n my_dagster_job = make_dagster_job_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n my_dagster_job.execute_in_process()\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the run tags,\n such as in the Dagster UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating job name and op\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n tags (Dict[str, Field]): Job tags. 
Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n connections (List[Connection]): List of Airflow Connections to be created in the Ephemeral\n Airflow DB, if use_emphemeral_airflow_db is False this will be ignored.\n\n Returns:\n JobDefinition: The generated Dagster job\n\n """\n check.inst_param(dag, "dag", DAG)\n tags = check.opt_mapping_param(tags, "tags")\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n\n mutated_tags = dict(tags)\n if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:\n mutated_tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = "true"\n\n mutated_tags = validate_tags(mutated_tags)\n\n node_dependencies, node_defs = get_graph_definition_args(dag=dag)\n\n graph_def = GraphDefinition(\n name=normalized_name(dag.dag_id),\n description="",\n node_defs=node_defs,\n dependencies=node_dependencies,\n tags=mutated_tags,\n )\n\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n job_def = JobDefinition(\n name=normalized_name(dag.dag_id),\n description="",\n graph_def=graph_def,\n resource_defs=resource_defs,\n tags=mutated_tags,\n metadata={},\n op_retry_policy=None,\n version_strategy=None,\n )\n return job_def
\n
", "current_page_name": "_modules/dagster_airflow/dagster_job_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_job_factory"}, "operators": {"dagster_operator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.operators.dagster_operator

\nimport json\n\nfrom airflow.models import BaseOperator\nfrom airflow.utils.decorators import apply_defaults\n\nfrom dagster_airflow.hooks.dagster_hook import DagsterHook\nfrom dagster_airflow.links.dagster_link import LINK_FMT, DagsterLink\nfrom dagster_airflow.utils import is_airflow_2_loaded_in_environment\n\n\n
[docs]class DagsterOperator(BaseOperator):\n """DagsterOperator.\n\n Uses the dagster graphql api to run and monitor dagster jobs on remote dagster infrastructure\n\n Parameters:\n repository_name (str): the name of the repository to use\n repostitory_location_name (str): the name of the repostitory location to use\n job_name (str): the name of the job to run\n run_config (Optional[Dict[str, Any]]): the run config to use for the job run\n dagster_conn_id (Optional[str]): the id of the dagster connection, airflow 2.0+ only\n organization_id (Optional[str]): the id of the dagster cloud organization\n deployment_name (Optional[str]): the name of the dagster cloud deployment\n user_token (Optional[str]): the dagster cloud user token to use\n """\n\n template_fields = ["run_config"]\n template_ext = (".yaml", ".yml", ".json")\n ui_color = "#663399"\n ui_fgcolor = "#e0e3fc"\n operator_extra_links = (DagsterLink(),)\n\n @apply_defaults\n def __init__(\n self,\n dagster_conn_id="dagster_default",\n run_config=None,\n repository_name="",\n repostitory_location_name="",\n job_name="",\n # params for airflow < 2.0.0 were custom connections aren't supported\n deployment_name="prod",\n user_token=None,\n organization_id="",\n url="https://dagster.cloud/",\n *args,\n **kwargs,\n ) -> None:\n super().__init__(*args, **kwargs)\n self.run_id = None\n self.dagster_conn_id = dagster_conn_id if is_airflow_2_loaded_in_environment() else None\n self.run_config = run_config or {}\n self.repository_name = repository_name\n self.repostitory_location_name = repostitory_location_name\n self.job_name = job_name\n\n self.user_token = user_token\n self.url = url\n self.organization_id = organization_id\n self.deployment_name = deployment_name\n\n self.hook = DagsterHook(\n dagster_conn_id=self.dagster_conn_id,\n user_token=self.user_token,\n url=f"{self.url}{self.organization_id}/{self.deployment_name}/graphql",\n )\n\n def _is_json(self, blob):\n try:\n json.loads(blob)\n except ValueError:\n 
return False\n return True\n\n def pre_execute(self, context):\n # force re-rendering to ensure run_config renders any templated\n # content from run_config that couldn't be accessed on init\n setattr(\n self,\n "run_config",\n self.render_template(self.run_config, context),\n )\n\n def on_kill(self):\n self.log.info("Terminating Run")\n self.hook.terminate_run(\n run_id=self.run_id,\n )\n\n def execute(self, context):\n try:\n return self._execute(context)\n except Exception as e:\n raise e\n\n def _execute(self, context):\n self.run_id = self.hook.launch_run(\n repository_name=self.repository_name,\n repostitory_location_name=self.repostitory_location_name,\n job_name=self.job_name,\n run_config=self.run_config,\n )\n # save relevant info in xcom for use in links\n context["task_instance"].xcom_push(key="run_id", value=self.run_id)\n context["task_instance"].xcom_push(\n key="organization_id",\n value=self.hook.organization_id if self.dagster_conn_id else self.organization_id,\n )\n context["task_instance"].xcom_push(\n key="deployment_name",\n value=self.hook.deployment_name if self.dagster_conn_id else self.deployment_name,\n )\n\n self.log.info("Run Starting....")\n self.log.info(\n "Run tracking: %s",\n LINK_FMT.format(\n organization_id=self.hook.organization_id,\n deployment_name=self.hook.deployment_name,\n run_id=self.run_id,\n ),\n )\n self.hook.wait_for_run(\n run_id=self.run_id,\n )
\n\n\n
[docs]class DagsterCloudOperator(DagsterOperator):\n """DagsterCloudOperator.\n\n Uses the dagster cloud graphql api to run and monitor dagster jobs on dagster cloud\n\n Parameters:\n repository_name (str): the name of the repository to use\n repostitory_location_name (str): the name of the repostitory location to use\n job_name (str): the name of the job to run\n run_config (Optional[Dict[str, Any]]): the run config to use for the job run\n dagster_conn_id (Optional[str]): the id of the dagster connection, airflow 2.0+ only\n organization_id (Optional[str]): the id of the dagster cloud organization\n deployment_name (Optional[str]): the name of the dagster cloud deployment\n user_token (Optional[str]): the dagster cloud user token to use\n """
\n
", "current_page_name": "_modules/dagster_airflow/operators/dagster_operator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.operators.dagster_operator"}}, "resources": {"airflow_ephemeral_db": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.resources.airflow_ephemeral_db

\nimport importlib\nimport os\nimport tempfile\nfrom typing import List, Optional\n\nimport airflow\nfrom airflow.models.connection import Connection\nfrom airflow.utils import db\nfrom dagster import (\n    Array,\n    DagsterRun,\n    Field,\n    InitResourceContext,\n    Noneable,\n    ResourceDefinition,\n    _check as check,\n)\n\nfrom dagster_airflow.resources.airflow_db import AirflowDatabase\nfrom dagster_airflow.utils import (\n    Locker,\n    create_airflow_connections,\n    is_airflow_2_loaded_in_environment,\n    serialize_connections,\n)\n\n\nclass AirflowEphemeralDatabase(AirflowDatabase):\n    """A ephemeral Airflow database Dagster resource."""\n\n    def __init__(\n        self, airflow_home_path: str, dagster_run: DagsterRun, dag_run_config: Optional[dict] = None\n    ):\n        self.airflow_home_path = airflow_home_path\n        super().__init__(dagster_run=dagster_run, dag_run_config=dag_run_config)\n\n    @staticmethod\n    def _initialize_database(\n        airflow_home_path: str = os.path.join(tempfile.gettempdir(), "dagster_airflow"),\n        connections: List[Connection] = [],\n    ):\n        os.environ["AIRFLOW_HOME"] = airflow_home_path\n        os.makedirs(airflow_home_path, exist_ok=True)\n        with Locker(airflow_home_path):\n            airflow_initialized = os.path.exists(f"{airflow_home_path}/airflow.db")\n            # because AIRFLOW_HOME has been overriden airflow needs to be reloaded\n            if is_airflow_2_loaded_in_environment():\n                importlib.reload(airflow.configuration)\n                importlib.reload(airflow.settings)\n                importlib.reload(airflow)\n            else:\n                importlib.reload(airflow)\n            if not airflow_initialized:\n                db.initdb()\n                create_airflow_connections(connections)\n\n    @staticmethod\n    def from_resource_context(context: InitResourceContext) -> "AirflowEphemeralDatabase":\n        airflow_home_path = 
os.path.join(tempfile.gettempdir(), f"dagster_airflow_{context.run_id}")\n        AirflowEphemeralDatabase._initialize_database(\n            airflow_home_path=airflow_home_path,\n            connections=[Connection(**c) for c in context.resource_config["connections"]],\n        )\n        return AirflowEphemeralDatabase(\n            airflow_home_path=airflow_home_path,\n            dagster_run=check.not_none(context.dagster_run, "Context must have run"),\n            dag_run_config=context.resource_config.get("dag_run_config"),\n        )\n\n\n
[docs]def make_ephemeral_airflow_db_resource(\n connections: List[Connection] = [], dag_run_config: Optional[dict] = None\n) -> ResourceDefinition:\n """Creates a Dagster resource that provides an ephemeral Airflow database.\n\n Args:\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n dag_run_config (Optional[dict]): dag_run configuration to be used when creating a DagRun\n\n Returns:\n ResourceDefinition: The ephemeral Airflow DB resource\n\n """\n serialized_connections = serialize_connections(connections)\n airflow_db_resource_def = ResourceDefinition(\n resource_fn=AirflowEphemeralDatabase.from_resource_context,\n config_schema={\n "connections": Field(\n Array(inner_type=dict),\n default_value=serialized_connections,\n is_required=False,\n ),\n "dag_run_config": Field(\n Noneable(dict),\n default_value=dag_run_config,\n is_required=False,\n ),\n },\n description="Ephemeral Airflow DB to be used by dagster-airflow ",\n )\n return airflow_db_resource_def
\n
", "current_page_name": "_modules/dagster_airflow/resources/airflow_ephemeral_db", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.resources.airflow_ephemeral_db"}, "airflow_persistent_db": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.resources.airflow_persistent_db

\nimport importlib\nimport os\nfrom typing import List, Optional\n\nimport airflow\nfrom airflow.models.connection import Connection\nfrom dagster import (\n    Array,\n    DagsterRun,\n    Field,\n    InitResourceContext,\n    ResourceDefinition,\n    StringSource,\n    _check as check,\n)\n\nfrom dagster_airflow.resources.airflow_db import AirflowDatabase\nfrom dagster_airflow.utils import (\n    create_airflow_connections,\n    is_airflow_2_loaded_in_environment,\n    serialize_connections,\n)\n\n\nclass AirflowPersistentDatabase(AirflowDatabase):\n    """A persistent Airflow database Dagster resource."""\n\n    def __init__(self, dagster_run: DagsterRun, uri: str, dag_run_config: Optional[dict] = None):\n        self.uri = uri\n        super().__init__(dagster_run=dagster_run, dag_run_config=dag_run_config)\n\n    @staticmethod\n    def _initialize_database(uri: str, connections: List[Connection] = []):\n        if is_airflow_2_loaded_in_environment("2.3.0"):\n            os.environ["AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"] = uri\n            importlib.reload(airflow.configuration)\n            importlib.reload(airflow.settings)\n            importlib.reload(airflow)\n        else:\n            os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = uri\n            importlib.reload(airflow)\n        create_airflow_connections(connections)\n\n    @staticmethod\n    def from_resource_context(context: InitResourceContext) -> "AirflowPersistentDatabase":\n        uri = context.resource_config["uri"]\n        AirflowPersistentDatabase._initialize_database(\n            uri=uri, connections=[Connection(**c) for c in context.resource_config["connections"]]\n        )\n        return AirflowPersistentDatabase(\n            dagster_run=check.not_none(context.dagster_run, "Context must have run"),\n            uri=uri,\n            dag_run_config=context.resource_config["dag_run_config"],\n        )\n\n\n
[docs]def make_persistent_airflow_db_resource(\n uri: str = "",\n connections: List[Connection] = [],\n dag_run_config: Optional[dict] = {},\n) -> ResourceDefinition:\n """Creates a Dagster resource that provides an persistent Airflow database.\n\n\n Usage:\n .. code-block:: python\n\n from dagster_airflow import (\n make_dagster_definitions_from_airflow_dags_path,\n make_persistent_airflow_db_resource,\n )\n postgres_airflow_db = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"\n airflow_db = make_persistent_airflow_db_resource(uri=postgres_airflow_db)\n definitions = make_dagster_definitions_from_airflow_example_dags(\n '/path/to/dags/',\n resource_defs={"airflow_db": airflow_db}\n )\n\n\n Args:\n uri: SQLAlchemy URI of the Airflow DB to be used\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n dag_run_config (Optional[dict]): dag_run configuration to be used when creating a DagRun\n\n Returns:\n ResourceDefinition: The persistent Airflow DB resource\n\n """\n if is_airflow_2_loaded_in_environment():\n os.environ["AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"] = uri\n else:\n os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = uri\n\n serialized_connections = serialize_connections(connections)\n\n airflow_db_resource_def = ResourceDefinition(\n resource_fn=AirflowPersistentDatabase.from_resource_context,\n config_schema={\n "uri": Field(\n StringSource,\n default_value=uri,\n is_required=False,\n ),\n "connections": Field(\n Array(inner_type=dict),\n default_value=serialized_connections,\n is_required=False,\n ),\n "dag_run_config": Field(\n dict,\n default_value=dag_run_config,\n is_required=False,\n ),\n },\n description="Persistent Airflow DB to be used by dagster-airflow ",\n )\n return airflow_db_resource_def
\n
", "current_page_name": "_modules/dagster_airflow/resources/airflow_persistent_db", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.resources.airflow_persistent_db"}}}, "dagster_aws": {"ecs": {"launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.ecs.launcher

\nimport json\nimport logging\nimport os\nimport uuid\nimport warnings\nfrom collections import namedtuple\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import (\n    Array,\n    DagsterRunStatus,\n    Field,\n    Noneable,\n    Permissive,\n    ScalarUnion,\n    StringSource,\n    _check as check,\n)\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.instance import T_DagsterInstance\nfrom dagster._core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import RUN_WORKER_ID_TAG\nfrom dagster._grpc.types import ExecuteRunArgs\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._utils.backoff import backoff\nfrom typing_extensions import Self\n\nfrom ..secretsmanager import get_secrets_from_arns\nfrom .container_context import SHARED_ECS_SCHEMA, SHARED_TASK_DEFINITION_FIELDS, EcsContainerContext\nfrom .tasks import (\n    DagsterEcsTaskDefinitionConfig,\n    get_current_ecs_task,\n    get_current_ecs_task_metadata,\n    get_task_definition_dict_from_current_task,\n    get_task_kwargs_from_current_task,\n)\nfrom .utils import get_task_definition_family, get_task_logs, task_definitions_match\n\nTags = namedtuple("Tags", ["arn", "cluster", "cpu", "memory"])\n\nRUNNING_STATUSES = [\n    "PROVISIONING",\n    "PENDING",\n    "ACTIVATING",\n    "RUNNING",\n    "DEACTIVATING",\n    "STOPPING",\n    "DEPROVISIONING",\n]\nSTOPPED_STATUSES = ["STOPPED"]\n\nDEFAULT_WINDOWS_RESOURCES = {"cpu": "1024", "memory": "2048"}\n\nDEFAULT_LINUX_RESOURCES = {"cpu": "256", "memory": "512"}\n\n\n
[docs]class EcsRunLauncher(RunLauncher[T_DagsterInstance], ConfigurableClass):\n """RunLauncher that starts a task in ECS for each Dagster job run."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n task_definition=None,\n container_name="run",\n secrets=None,\n secrets_tag="dagster",\n env_vars=None,\n include_sidecars=False,\n use_current_ecs_task_config: bool = True,\n run_task_kwargs: Optional[Mapping[str, Any]] = None,\n run_resources: Optional[Dict[str, Any]] = None,\n run_ecs_tags: Optional[List[Dict[str, Optional[str]]]] = None,\n ):\n self._inst_data = inst_data\n self.ecs = boto3.client("ecs")\n self.ec2 = boto3.resource("ec2")\n self.secrets_manager = boto3.client("secretsmanager")\n self.logs = boto3.client("logs")\n\n self.task_definition = None\n self.task_definition_dict = {}\n if isinstance(task_definition, str):\n self.task_definition = task_definition\n elif task_definition and "env" in task_definition:\n check.invariant(\n len(task_definition) == 1,\n "If `task_definition` is set to a dictionary with `env`, `env` must be the only"\n " key.",\n )\n env_var = task_definition["env"]\n self.task_definition = os.getenv(env_var)\n if not self.task_definition:\n raise Exception(\n f"You have attempted to fetch the environment variable {env_var} which is not"\n " set."\n )\n else:\n self.task_definition_dict = task_definition or {}\n\n self.container_name = container_name\n\n self.secrets = check.opt_list_param(secrets, "secrets")\n\n self.env_vars = check.opt_list_param(env_vars, "env_vars")\n\n if self.secrets and all(isinstance(secret, str) for secret in self.secrets):\n warnings.warn(\n "Setting secrets as a list of ARNs is deprecated. 
"\n "Secrets should instead follow the same structure as the ECS API: "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html",\n DeprecationWarning,\n )\n self.secrets = [\n {"name": name, "valueFrom": value_from}\n for name, value_from in get_secrets_from_arns(\n self.secrets_manager, self.secrets\n ).items()\n ]\n\n self.secrets_tags = [secrets_tag] if secrets_tag else []\n self.include_sidecars = include_sidecars\n\n if self.task_definition:\n task_definition = self.ecs.describe_task_definition(taskDefinition=self.task_definition)\n container_names = [\n container.get("name")\n for container in task_definition["taskDefinition"]["containerDefinitions"]\n ]\n check.invariant(\n container_name in container_names,\n f"Cannot override container '{container_name}' in task definition "\n f"'{self.task_definition}' because the container is not defined.",\n )\n self.task_definition = task_definition["taskDefinition"]["taskDefinitionArn"]\n\n self.use_current_ecs_task_config = check.opt_bool_param(\n use_current_ecs_task_config, "use_current_ecs_task_config"\n )\n\n self.run_task_kwargs = check.opt_mapping_param(run_task_kwargs, "run_task_kwargs")\n if run_task_kwargs:\n check.invariant(\n "taskDefinition" not in run_task_kwargs,\n "Use the `taskDefinition` config field to pass in a task definition to run.",\n )\n check.invariant(\n "overrides" not in run_task_kwargs,\n "Task overrides are set by the run launcher and cannot be set in run_task_kwargs.",\n )\n\n expected_keys = [\n key for key in self.ecs.meta.service_model.shape_for("RunTaskRequest").members\n ]\n\n for key in run_task_kwargs:\n check.invariant(\n key in expected_keys, f"Found an unexpected key {key} in run_task_kwargs"\n )\n\n self.run_resources = check.opt_mapping_param(run_resources, "run_resources")\n\n self.run_ecs_tags = check.opt_sequence_param(run_ecs_tags, "run_ecs_tags")\n\n self._current_task_metadata = None\n self._current_task = None\n\n @property\n def 
inst_data(self):\n return self._inst_data\n\n @property\n def task_role_arn(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("task_role_arn")\n\n @property\n def execution_role_arn(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("execution_role_arn")\n\n @property\n def runtime_platform(self) -> Optional[Mapping[str, Any]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("runtime_platform")\n\n @property\n def mount_points(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("mount_points")\n\n @property\n def volumes(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("volumes")\n\n @property\n def repository_credentials(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("repository_credentials")\n\n @property\n def run_sidecar_containers(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("sidecar_containers")\n\n @classmethod\n def config_type(cls):\n return {\n "task_definition": Field(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={\n "log_group": Field(StringSource, is_required=False),\n "sidecar_containers": Field(Array(Permissive({})), is_required=False),\n "requires_compatibilities": Field(Array(str), is_required=False),\n "env": Field(\n str,\n is_required=False,\n description=(\n "Backwards-compatibility for when task_definition was a"\n " StringSource.Can be used to source the task_definition scalar"\n " from an environment variable."\n ),\n ),\n **SHARED_TASK_DEFINITION_FIELDS,\n },\n ),\n is_required=False,\n description=(\n "Either the short name of an existing task 
definition to use when launching new"\n " tasks, or a dictionary configuration to use when creating a task definition"\n " for the run.If neither is provided, the task definition will be created based"\n " on the current task's task definition."\n ),\n ),\n "container_name": Field(\n StringSource,\n is_required=False,\n default_value="run",\n description=(\n "The container name to use when launching new tasks. Defaults to 'run'."\n ),\n ),\n "secrets": Field(\n Array(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={"name": StringSource, "valueFrom": StringSource},\n )\n ),\n is_required=False,\n description=(\n "An array of AWS Secrets Manager secrets. These secrets will "\n "be mounted as environment variables in the container. See "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html."\n ),\n ),\n "secrets_tag": Field(\n Noneable(StringSource),\n is_required=False,\n default_value="dagster",\n description=(\n "AWS Secrets Manager secrets with this tag will be mounted as "\n "environment variables in the container. Defaults to 'dagster'."\n ),\n ),\n "include_sidecars": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "Whether each run should use the same sidecars as the task that launches it. "\n "Defaults to False."\n ),\n ),\n "use_current_ecs_task_config": Field(\n bool,\n is_required=False,\n default_value=True,\n description=(\n "Whether to use the run launcher's current ECS task in order to determine "\n "the cluster and networking configuration for the launched task. Defaults to "\n "True. Should only be called if the run launcher is running within an ECS "\n "task."\n ),\n ),\n "run_task_kwargs": Field(\n Permissive(\n {\n "cluster": Field(\n StringSource,\n is_required=False,\n description="Name of the ECS cluster to launch ECS tasks in.",\n ),\n }\n ),\n is_required=False,\n description=(\n "Additional arguments to include while running the task. 
See"\n " https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ecs.html#ECS.Client.run_task"\n " for the available parameters. The overrides and taskDefinition arguments will"\n " always be set by the run launcher."\n ),\n ),\n **SHARED_ECS_SCHEMA,\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return EcsRunLauncher(inst_data=inst_data, **config_value)\n\n def _set_run_tags(self, run_id: str, cluster: str, task_arn: str):\n tags = {\n "ecs/task_arn": task_arn,\n "ecs/cluster": cluster,\n RUN_WORKER_ID_TAG: str(uuid.uuid4().hex)[0:6],\n }\n self._instance.add_run_tags(run_id, tags)\n\n def build_ecs_tags_for_run_task(self, run, container_context: EcsContainerContext):\n if any(tag["key"] == "dagster/run_id" for tag in container_context.run_ecs_tags):\n raise Exception("Cannot override system ECS tag: dagster/run_id")\n\n return [{"key": "dagster/run_id", "value": run.run_id}, *container_context.run_ecs_tags]\n\n def _get_run_tags(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n tags = run.tags if run else {}\n arn = tags.get("ecs/task_arn")\n cluster = tags.get("ecs/cluster")\n cpu = tags.get("ecs/cpu")\n memory = tags.get("ecs/memory")\n\n return Tags(arn, cluster, cpu, memory)\n\n def _get_command_args(self, run_args: ExecuteRunArgs, context: LaunchRunContext):\n return run_args.get_command_args()\n\n def _get_image_for_run(self, context: LaunchRunContext) -> Optional[str]:\n job_origin = check.not_none(context.job_code_origin)\n return job_origin.repository_origin.container_image\n\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run in an ECS task."""\n run = context.dagster_run\n container_context = EcsContainerContext.create_for_run(run, self)\n\n job_origin = check.not_none(context.job_code_origin)\n\n # ECS limits overrides to 8192 characters including json formatting\n # 
https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_RunTask.html\n # When container_context is serialized as part of the ExecuteRunArgs, we risk\n # going over this limit (for example, if many secrets have been set). This strips\n # the container context off of our job origin because we don't actually need\n # it to launch the run; we only needed it to create the task definition.\n repository_origin = job_origin.repository_origin\n\n stripped_repository_origin = repository_origin._replace(container_context={})\n stripped_job_origin = job_origin._replace(repository_origin=stripped_repository_origin)\n\n args = ExecuteRunArgs(\n job_origin=stripped_job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n )\n command = self._get_command_args(args, context)\n image = self._get_image_for_run(context)\n\n run_task_kwargs = self._run_task_kwargs(run, image, container_context)\n\n # Set cpu or memory overrides\n # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html\n cpu_and_memory_overrides = self.get_cpu_and_memory_overrides(container_context, run)\n\n task_overrides = self._get_task_overrides(container_context, run)\n\n container_overrides: List[Dict[str, Any]] = [\n {\n "name": self._get_container_name(container_context),\n "command": command,\n # containerOverrides expects cpu/memory as integers\n **{k: int(v) for k, v in cpu_and_memory_overrides.items()},\n }\n ]\n\n run_task_kwargs["overrides"] = {\n "containerOverrides": container_overrides,\n # taskOverrides expects cpu/memory as strings\n **cpu_and_memory_overrides,\n **task_overrides,\n }\n run_task_kwargs["tags"] = [\n *run_task_kwargs.get("tags", []),\n *self.build_ecs_tags_for_run_task(run, container_context),\n ]\n\n run_task_kwargs_from_run = self._get_run_task_kwargs_from_run(run)\n run_task_kwargs.update(run_task_kwargs_from_run)\n\n # launchType and capacityProviderStrategy are incompatible - prefer the latter if it is set\n if "launchType" 
in run_task_kwargs and run_task_kwargs.get("capacityProviderStrategy"):\n del run_task_kwargs["launchType"]\n\n # Run a task using the same network configuration as this processes's task.\n response = self.ecs.run_task(**run_task_kwargs)\n\n tasks = response["tasks"]\n\n if not tasks:\n failures = response["failures"]\n failure_messages = []\n for failure in failures:\n arn = failure.get("arn")\n reason = failure.get("reason")\n detail = failure.get("detail")\n\n failure_message = (\n "Task"\n + (f" {arn}" if arn else "")\n + " failed."\n + (f" Failure reason: {reason}" if reason else "")\n + (f" Failure details: {detail}" if detail else "")\n )\n failure_messages.append(failure_message)\n\n raise Exception("\\n".join(failure_messages) if failure_messages else "Task failed.")\n\n arn = tasks[0]["taskArn"]\n cluster_arn = tasks[0]["clusterArn"]\n self._set_run_tags(run.run_id, cluster=cluster_arn, task_arn=arn)\n self.report_launch_events(run, arn, cluster_arn)\n\n def report_launch_events(\n self, run: DagsterRun, arn: Optional[str] = None, cluster: Optional[str] = None\n ):\n # Extracted method to allow for subclasses to customize the launch reporting behavior\n\n metadata = {}\n if arn:\n metadata["ECS Task ARN"] = arn\n if cluster:\n metadata["ECS Cluster"] = cluster\n\n metadata["Run ID"] = run.run_id\n self._instance.report_engine_event(\n message="Launching run in ECS task",\n dagster_run=run,\n engine_event_data=EngineEventData(metadata),\n cls=self.__class__,\n )\n\n def get_cpu_and_memory_overrides(\n self, container_context: EcsContainerContext, run: DagsterRun\n ) -> Mapping[str, str]:\n overrides = {}\n\n cpu = run.tags.get("ecs/cpu", container_context.run_resources.get("cpu"))\n memory = run.tags.get("ecs/memory", container_context.run_resources.get("memory"))\n\n if cpu:\n overrides["cpu"] = cpu\n if memory:\n overrides["memory"] = memory\n\n return overrides\n\n def _get_task_overrides(\n self, container_context: EcsContainerContext, run: 
DagsterRun\n ) -> Mapping[str, Any]:\n tag_overrides = run.tags.get("ecs/task_overrides")\n\n overrides = {}\n\n if tag_overrides:\n overrides = json.loads(tag_overrides)\n\n ephemeral_storage = run.tags.get(\n "ecs/ephemeral_storage", container_context.run_resources.get("ephemeral_storage")\n )\n if ephemeral_storage:\n overrides["ephemeralStorage"] = {"sizeInGiB": int(ephemeral_storage)}\n\n return overrides\n\n def _get_run_task_kwargs_from_run(self, run: DagsterRun) -> Mapping[str, Any]:\n run_task_kwargs = run.tags.get("ecs/run_task_kwargs")\n if run_task_kwargs:\n return json.loads(run_task_kwargs)\n return {}\n\n def terminate(self, run_id):\n tags = self._get_run_tags(run_id)\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n if not (tags.arn and tags.cluster):\n return False\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return False\n\n status = tasks[0].get("lastStatus")\n if status == "STOPPED":\n return False\n\n self.ecs.stop_task(task=tags.arn, cluster=tags.cluster)\n return True\n\n def _get_current_task_metadata(self):\n if self._current_task_metadata is None:\n self._current_task_metadata = get_current_ecs_task_metadata()\n return self._current_task_metadata\n\n def _get_current_task(self):\n if self._current_task is None:\n current_task_metadata = self._get_current_task_metadata()\n self._current_task = get_current_ecs_task(\n self.ecs, current_task_metadata.task_arn, current_task_metadata.cluster\n )\n\n return self._current_task\n\n def _get_run_task_definition_family(self, run: DagsterRun) -> str:\n return get_task_definition_family("run", check.not_none(run.external_job_origin))\n\n def _get_container_name(self, container_context) -> str:\n return container_context.container_name or self.container_name\n\n def _run_task_kwargs(self, run, image, container_context) -> Dict[str, Any]:\n """Return a dictionary of 
args to launch the ECS task, registering a new task\n definition if needed.\n """\n environment = self._environment(container_context)\n environment.append({"name": "DAGSTER_RUN_JOB_NAME", "value": run.job_name})\n\n secrets = self._secrets(container_context)\n\n if container_context.task_definition_arn:\n task_definition = container_context.task_definition_arn\n else:\n family = self._get_run_task_definition_family(run)\n\n if self.task_definition_dict or not self.use_current_ecs_task_config:\n runtime_platform = container_context.runtime_platform\n is_windows = container_context.runtime_platform.get(\n "operatingSystemFamily"\n ) not in {None, "LINUX"}\n\n default_resources = (\n DEFAULT_WINDOWS_RESOURCES if is_windows else DEFAULT_LINUX_RESOURCES\n )\n task_definition_config = DagsterEcsTaskDefinitionConfig(\n family,\n image,\n self._get_container_name(container_context),\n command=None,\n log_configuration=(\n {\n "logDriver": "awslogs",\n "options": {\n "awslogs-group": self.task_definition_dict["log_group"],\n "awslogs-region": self.ecs.meta.region_name,\n "awslogs-stream-prefix": family,\n },\n }\n if self.task_definition_dict.get("log_group")\n else None\n ),\n secrets=secrets if secrets else [],\n environment=environment,\n execution_role_arn=container_context.execution_role_arn,\n task_role_arn=container_context.task_role_arn,\n sidecars=container_context.run_sidecar_containers,\n requires_compatibilities=self.task_definition_dict.get(\n "requires_compatibilities", []\n ),\n cpu=container_context.run_resources.get("cpu", default_resources["cpu"]),\n memory=container_context.run_resources.get(\n "memory", default_resources["memory"]\n ),\n ephemeral_storage=container_context.run_resources.get("ephemeral_storage"),\n runtime_platform=runtime_platform,\n volumes=container_context.volumes,\n mount_points=container_context.mount_points,\n repository_credentials=container_context.repository_credentials,\n )\n task_definition_dict = 
task_definition_config.task_definition_dict()\n else:\n task_definition_dict = get_task_definition_dict_from_current_task(\n self.ecs,\n family,\n self._get_current_task(),\n image,\n self._get_container_name(container_context),\n environment=environment,\n secrets=secrets if secrets else {},\n include_sidecars=self.include_sidecars,\n task_role_arn=container_context.task_role_arn,\n execution_role_arn=container_context.execution_role_arn,\n cpu=container_context.run_resources.get("cpu"),\n memory=container_context.run_resources.get("memory"),\n runtime_platform=container_context.runtime_platform,\n ephemeral_storage=container_context.run_resources.get("ephemeral_storage"),\n volumes=container_context.volumes,\n mount_points=container_context.mount_points,\n additional_sidecars=container_context.run_sidecar_containers,\n repository_credentials=container_context.repository_credentials,\n )\n\n task_definition_config = DagsterEcsTaskDefinitionConfig.from_task_definition_dict(\n task_definition_dict,\n self._get_container_name(container_context),\n )\n\n container_name = self._get_container_name(container_context)\n\n backoff(\n self._reuse_or_register_task_definition,\n retry_on=(Exception,),\n kwargs={\n "desired_task_definition_config": task_definition_config,\n "container_name": container_name,\n "task_definition_dict": task_definition_dict,\n },\n max_retries=5,\n )\n\n task_definition = family\n\n if self.use_current_ecs_task_config:\n current_task_metadata = get_current_ecs_task_metadata()\n current_task = get_current_ecs_task(\n self.ecs, current_task_metadata.task_arn, current_task_metadata.cluster\n )\n task_kwargs = get_task_kwargs_from_current_task(\n self.ec2,\n current_task_metadata.cluster,\n current_task,\n )\n else:\n task_kwargs = {}\n\n return {**task_kwargs, **self.run_task_kwargs, "taskDefinition": task_definition}\n\n def _reuse_task_definition(\n self, desired_task_definition_config: DagsterEcsTaskDefinitionConfig, container_name: str\n ):\n 
family = desired_task_definition_config.family\n\n try:\n existing_task_definition = self.ecs.describe_task_definition(taskDefinition=family)[\n "taskDefinition"\n ]\n except ClientError:\n # task definition does not exist, do not reuse\n return False\n\n return task_definitions_match(\n desired_task_definition_config,\n existing_task_definition,\n container_name=container_name,\n )\n\n def _reuse_or_register_task_definition(\n self,\n desired_task_definition_config: DagsterEcsTaskDefinitionConfig,\n container_name: str,\n task_definition_dict: dict,\n ):\n if not self._reuse_task_definition(desired_task_definition_config, container_name):\n self.ecs.register_task_definition(**task_definition_dict)\n\n def _environment(self, container_context):\n return [\n {"name": key, "value": value}\n for key, value in container_context.get_environment_dict().items()\n ]\n\n def _secrets(self, container_context):\n secrets = container_context.get_secrets_dict(self.secrets_manager)\n return (\n [{"name": key, "valueFrom": value} for key, value in secrets.items()] if secrets else []\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n @property\n def include_cluster_info_in_failure_messages(self):\n return True\n\n def _is_transient_startup_failure(self, run, task):\n if not task.get("stoppedReason"):\n return False\n return (\n run.status == DagsterRunStatus.STARTING\n and "Timeout waiting for network interface provisioning to complete"\n in task.get("stoppedReason")\n )\n\n def check_run_worker_health(self, run: DagsterRun):\n run_worker_id = run.tags.get(RUN_WORKER_ID_TAG)\n\n tags = self._get_run_tags(run.run_id)\n container_context = EcsContainerContext.create_for_run(run, self)\n\n if not (tags.arn and tags.cluster):\n return CheckRunHealthResult(WorkerStatus.UNKNOWN, "", run_worker_id=run_worker_id)\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return 
CheckRunHealthResult(WorkerStatus.UNKNOWN, "", run_worker_id=run_worker_id)\n\n t = tasks[0]\n\n if t.get("lastStatus") in RUNNING_STATUSES:\n return CheckRunHealthResult(WorkerStatus.RUNNING, run_worker_id=run_worker_id)\n elif t.get("lastStatus") in STOPPED_STATUSES:\n failed_containers = []\n for c in t.get("containers"):\n if c.get("exitCode") != 0:\n failed_containers.append(c)\n if len(failed_containers) > 0:\n if len(failed_containers) > 1:\n container_str = "Containers"\n else:\n container_str = "Container"\n\n failure_text = []\n\n cluster_failure_info = (\n f"Task {t.get('taskArn')} failed. Stop code: {t.get('stopCode')}. Stop"\n + f" reason: {t.get('stoppedReason')}."\n + f" {container_str} {[c.get('name') for c in failed_containers]} failed."\n )\n\n logging.warning(\n "Run monitoring detected run worker failure: " + cluster_failure_info\n )\n\n if self.include_cluster_info_in_failure_messages:\n failure_text.append(cluster_failure_info)\n\n logs = []\n\n try:\n logs = get_task_logs(\n self.ecs,\n logs_client=self.logs,\n cluster=tags.cluster,\n task_arn=tags.arn,\n container_name=self._get_container_name(container_context),\n )\n except:\n logging.exception(f"Error trying to get logs for failed task {tags.arn}")\n\n if logs:\n failure_text.append("Run worker logs:\\n" + "\\n".join(logs))\n\n return CheckRunHealthResult(\n WorkerStatus.FAILED,\n "\\n\\n".join(failure_text),\n transient=self._is_transient_startup_failure(run, t),\n run_worker_id=run_worker_id,\n )\n\n return CheckRunHealthResult(WorkerStatus.SUCCESS, run_worker_id=run_worker_id)\n\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, "ECS task health status is unknown.", run_worker_id=run_worker_id\n )
\n
", "current_page_name": "_modules/dagster_aws/ecs/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.ecs.launcher"}}, "emr": {"emr": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.emr

\n# Portions of this file are copied from the Yelp MRJob project:\n#\n#   https://github.com/Yelp/mrjob\n#\n#\n# Copyright 2009-2013 Yelp, David Marin\n# Copyright 2015 Yelp\n# Copyright 2017 Yelp\n# Copyright 2018 Contributors\n# Copyright 2019 Yelp and Contributors\n#\n# Licensed under the Apache License, Version 2.0 (the "License");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an "AS IS" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport gzip\nimport re\nfrom io import BytesIO\nfrom urllib.parse import urlparse\n\nimport boto3\nimport dagster\nimport dagster._check as check\nfrom botocore.exceptions import WaiterError\n\nfrom dagster_aws.utils.mrjob.utils import _boto3_now, _wrap_aws_client, strip_microseconds\n\nfrom .types import EMR_CLUSTER_TERMINATED_STATES, EmrClusterState, EmrStepState\n\n# if we can't create or find our own service role, use the one\n# created by the AWS console and CLI\n_FALLBACK_SERVICE_ROLE = "EMR_DefaultRole"\n\n# if we can't create or find our own instance profile, use the one\n# created by the AWS console and CLI\n_FALLBACK_INSTANCE_PROFILE = "EMR_EC2_DefaultRole"\n\n\n
[docs]class EmrError(Exception):\n pass
\n\n\n
[docs]class EmrJobRunner:\n def __init__(\n self,\n region,\n check_cluster_every=30,\n aws_access_key_id=None,\n aws_secret_access_key=None,\n ):\n """This object encapsulates various utilities for interacting with EMR clusters and invoking\n steps (jobs) on them.\n\n See also :py:class:`~dagster_aws.emr.EmrPySparkResource`, which wraps this job runner in a\n resource for pyspark workloads.\n\n Args:\n region (str): AWS region to use\n check_cluster_every (int, optional): How frequently to poll boto3 APIs for updates.\n Defaults to 30 seconds.\n aws_access_key_id ([type], optional): AWS access key ID. Defaults to None, which will\n use the default boto3 credentials chain.\n aws_secret_access_key ([type], optional): AWS secret access key. Defaults to None, which\n will use the default boto3 credentials chain.\n """\n self.region = check.str_param(region, "region")\n\n # This is in seconds\n self.check_cluster_every = check.int_param(check_cluster_every, "check_cluster_every")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n\n def make_emr_client(self):\n """Creates a boto3 EMR client. 
Construction is wrapped in retries in case client connection\n fails transiently.\n\n Returns:\n botocore.client.EMR: An EMR client\n """\n raw_emr_client = boto3.client(\n "emr",\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n region_name=self.region,\n )\n return _wrap_aws_client(raw_emr_client, min_backoff=self.check_cluster_every)\n\n def cluster_id_from_name(self, cluster_name):\n """Get a cluster ID in the format "j-123ABC123ABC1" given a cluster name "my cool cluster".\n\n Args:\n cluster_name (str): The name of the cluster for which to find an ID\n\n Returns:\n str: The ID of the cluster\n\n Raises:\n EmrError: No cluster with the specified name exists\n """\n check.str_param(cluster_name, "cluster_name")\n\n response = self.make_emr_client().list_clusters().get("Clusters", [])\n for cluster in response:\n if cluster["Name"] == cluster_name:\n return cluster["Id"]\n\n raise EmrError(f"cluster {cluster_name} not found in region {self.region}")\n\n @staticmethod\n def construct_step_dict_for_command(step_name, command, action_on_failure="CONTINUE"):\n """Construct an EMR step definition which uses command-runner.jar to execute a shell command\n on the EMR master.\n\n Args:\n step_name (str): The name of the EMR step (will show up in the EMR UI)\n command (str): The shell command to execute with command-runner.jar\n action_on_failure (str, optional): Configure action on failure (e.g., continue, or\n terminate the cluster). 
Defaults to 'CONTINUE'.\n\n Returns:\n dict: Step definition dict\n """\n check.str_param(step_name, "step_name")\n check.list_param(command, "command", of_type=str)\n check.str_param(action_on_failure, "action_on_failure")\n\n return {\n "Name": step_name,\n "ActionOnFailure": action_on_failure,\n "HadoopJarStep": {"Jar": "command-runner.jar", "Args": command},\n }\n\n def add_tags(self, log, tags, cluster_id):\n """Add tags in the dict tags to cluster cluster_id.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n tags (dict): Dictionary of {'key': 'value'} tags\n cluster_id (str): The ID of the cluster to tag\n """\n check.dict_param(tags, "tags")\n check.str_param(cluster_id, "cluster_id")\n\n tags_items = sorted(tags.items())\n\n self.make_emr_client().add_tags(\n ResourceId=cluster_id, Tags=[dict(Key=k, Value=v) for k, v in tags_items]\n )\n\n log.info(\n "Added EMR tags to cluster %s: %s"\n % (cluster_id, ", ".join("%s=%s" % (tag, value) for tag, value in tags_items))\n )\n\n def run_job_flow(self, log, cluster_config):\n """Create an empty cluster on EMR, and return the ID of that job flow.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_config (dict): Configuration for this EMR job flow. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html\n\n Returns:\n str: The cluster ID, e.g. 
"j-ZKIY4CKQRX72"\n """\n check.dict_param(cluster_config, "cluster_config")\n\n log.debug("Creating Elastic MapReduce cluster")\n emr_client = self.make_emr_client()\n\n log.debug(\n "Calling run_job_flow(%s)"\n % (", ".join("%s=%r" % (k, v) for k, v in sorted(cluster_config.items())))\n )\n cluster_id = emr_client.run_job_flow(**cluster_config)["JobFlowId"]\n\n log.info("Created new cluster %s" % cluster_id)\n\n # set EMR tags for the cluster\n tags_items = cluster_config.get("Tags", [])\n tags = {k: v for k, v in tags_items}\n tags["__dagster_version"] = dagster.__version__\n self.add_tags(log, tags, cluster_id)\n return cluster_id\n\n def describe_cluster(self, cluster_id):\n """Thin wrapper over boto3 describe_cluster.\n\n Args:\n cluster_id (str): Cluster to inspect\n\n Returns:\n dict: The cluster info. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeCluster.html\n """\n check.str_param(cluster_id, "cluster_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_cluster(ClusterId=cluster_id)\n\n def describe_step(self, cluster_id, step_id):\n """Thin wrapper over boto3 describe_step.\n\n Args:\n cluster_id (str): Cluster to inspect\n step_id (str): Step ID to describe\n\n Returns:\n dict: The step info. 
See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeStep.html\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_step(ClusterId=cluster_id, StepId=step_id)\n\n def add_job_flow_steps(self, log, cluster_id, step_defs):\n """Submit the constructed job flow steps to EMR for execution.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): The ID of the cluster\n step_defs (List[dict]): List of steps; see also `construct_step_dict_for_command`\n\n Returns:\n List[str]: list of step IDs.\n """\n check.str_param(cluster_id, "cluster_id")\n check.list_param(step_defs, "step_defs", of_type=dict)\n\n emr_client = self.make_emr_client()\n\n steps_kwargs = dict(JobFlowId=cluster_id, Steps=step_defs)\n log.debug(\n "Calling add_job_flow_steps(%s)"\n % ",".join(("%s=%r" % (k, v)) for k, v in steps_kwargs.items())\n )\n return emr_client.add_job_flow_steps(**steps_kwargs)["StepIds"]\n\n def is_emr_step_complete(self, log, cluster_id, emr_step_id):\n step = self.describe_step(cluster_id, emr_step_id)["Step"]\n step_state = EmrStepState(step["Status"]["State"])\n\n if step_state == EmrStepState.Pending:\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n\n log.info("PENDING (cluster is %s%s)" % (cluster["Status"]["State"], reason_desc))\n return False\n\n elif step_state == EmrStepState.Running:\n time_running_desc = ""\n\n start = step["Status"]["Timeline"].get("StartDateTime")\n if start:\n time_running_desc = " for %s" % strip_microseconds(_boto3_now() - start)\n\n log.info("RUNNING%s" % time_running_desc)\n return False\n\n # we're done, will return at the end of this\n elif step_state == EmrStepState.Completed:\n log.info("COMPLETED")\n return True\n else:\n # step has failed somehow. 
*reason* seems to only be set\n # when job is cancelled (e.g. 'Job terminated')\n reason = _get_reason(step)\n reason_desc = (" (%s)" % reason) if reason else ""\n\n log.info("%s%s" % (step_state.value, reason_desc))\n\n # print cluster status; this might give more context\n # why step didn't succeed\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n log.info(\n "Cluster %s %s %s%s"\n % (\n cluster["Id"],\n "was" if "ED" in cluster["Status"]["State"] else "is",\n cluster["Status"]["State"],\n reason_desc,\n )\n )\n\n if EmrClusterState(cluster["Status"]["State"]) in EMR_CLUSTER_TERMINATED_STATES:\n # was it caused by IAM roles?\n self._check_for_missing_default_iam_roles(log, cluster)\n\n # TODO: extract logs here to surface failure reason\n # See: https://github.com/dagster-io/dagster/issues/1954\n\n if step_state == EmrStepState.Failed:\n log.error("EMR step %s failed" % emr_step_id)\n\n raise EmrError("EMR step %s failed" % emr_step_id)\n\n def _check_for_missing_default_iam_roles(self, log, cluster):\n """If cluster couldn't start due to missing IAM roles, tell user what to do."""\n check.dict_param(cluster, "cluster")\n\n reason = _get_reason(cluster)\n if any(\n reason.endswith("/%s is invalid" % role)\n for role in (_FALLBACK_INSTANCE_PROFILE, _FALLBACK_SERVICE_ROLE)\n ):\n log.warning(\n "IAM roles are missing. See documentation for IAM roles on EMR here: "\n "https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html"\n )\n\n def log_location_for_cluster(self, cluster_id):\n """EMR clusters are typically launched with S3 logging configured. 
This method inspects a\n cluster using boto3 describe_cluster to retrieve the log URI.\n\n Args:\n cluster_id (str): The cluster to inspect.\n\n Raises:\n EmrError: the log URI was missing (S3 log mirroring not enabled for this cluster)\n\n Returns:\n (str, str): log bucket and key\n """\n check.str_param(cluster_id, "cluster_id")\n\n # The S3 log URI is specified per job flow (cluster)\n log_uri = self.describe_cluster(cluster_id)["Cluster"].get("LogUri", None)\n\n # ugh, seriously boto3?! This will come back as string "None"\n if log_uri == "None" or log_uri is None:\n raise EmrError("Log URI not specified, cannot retrieve step execution logs")\n\n # For some reason the API returns an s3n:// protocol log URI instead of s3://\n log_uri = re.sub("^s3n", "s3", log_uri)\n log_uri_parsed = urlparse(log_uri)\n log_bucket = log_uri_parsed.netloc\n log_key_prefix = log_uri_parsed.path.lstrip("/")\n return log_bucket, log_key_prefix\n\n def retrieve_logs_for_step_id(self, log, cluster_id, step_id):\n """Retrieves stdout and stderr logs for the given step ID.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): EMR cluster ID\n step_id (str): EMR step ID for the job that was submitted.\n\n Returns:\n (str, str): Tuple of stdout log string contents, and stderr log string contents\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n log_bucket, log_key_prefix = self.log_location_for_cluster(cluster_id)\n\n prefix = f"{log_key_prefix}{cluster_id}/steps/{step_id}"\n stdout_log = self.wait_for_log(log, log_bucket, f"{prefix}/stdout.gz")\n stderr_log = self.wait_for_log(log, log_bucket, f"{prefix}/stderr.gz")\n return stdout_log, stderr_log\n\n def wait_for_log(self, log, log_bucket, log_key, waiter_delay=30, waiter_max_attempts=20):\n """Wait for gzipped EMR logs to appear on S3. 
Note that EMR syncs logs to S3 every 5\n minutes, so this may take a long time.\n\n Args:\n log_bucket (str): S3 bucket where log is expected to appear\n log_key (str): S3 key for the log file\n waiter_delay (int): How long to wait between attempts to check S3 for the log file\n waiter_max_attempts (int): Number of attempts before giving up on waiting\n\n Raises:\n EmrError: Raised if we waited the full duration and the logs did not appear\n\n Returns:\n str: contents of the log file\n """\n check.str_param(log_bucket, "log_bucket")\n check.str_param(log_key, "log_key")\n check.int_param(waiter_delay, "waiter_delay")\n check.int_param(waiter_max_attempts, "waiter_max_attempts")\n\n log.info(f"Attempting to get log: s3://{log_bucket}/{log_key}")\n\n s3 = _wrap_aws_client(boto3.client("s3"), min_backoff=self.check_cluster_every)\n waiter = s3.get_waiter("object_exists")\n try:\n waiter.wait(\n Bucket=log_bucket,\n Key=log_key,\n WaiterConfig={"Delay": waiter_delay, "MaxAttempts": waiter_max_attempts},\n )\n except WaiterError as err:\n raise EmrError("EMR log file did not appear on S3 after waiting") from err\n\n obj = BytesIO(s3.get_object(Bucket=log_bucket, Key=log_key)["Body"].read())\n gzip_file = gzip.GzipFile(fileobj=obj)\n return gzip_file.read().decode("utf-8")
\n\n\ndef _get_reason(cluster_or_step):\n """Get state change reason message."""\n # StateChangeReason is {} before the first state change\n return cluster_or_step["Status"]["StateChangeReason"].get("Message", "")\n
", "current_page_name": "_modules/dagster_aws/emr/emr", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.emr"}, "pyspark_step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.pyspark_step_launcher

\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport time\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import (\n    Field,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.errors import DagsterInvariantViolationError, raise_execution_interrupts\nfrom dagster._core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster._serdes import deserialize_value\n\nfrom dagster_aws.emr import EmrError, EmrJobRunner, emr_step_main\nfrom dagster_aws.emr.configs_spark import spark_config as get_spark_config\nfrom dagster_aws.utils.mrjob.log4j import parse_hadoop_log4j_records\n\n# On EMR, Spark is installed here\nEMR_SPARK_HOME = "/usr/lib/spark/"\n\nCODE_ZIP_NAME = "code.zip"\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n {\n "spark_config": get_spark_config(),\n "cluster_id": Field(\n StringSource, description="Name of the job flow (cluster) on which to execute."\n ),\n "region_name": Field(StringSource, description="The AWS region that the cluster is in."),\n "action_on_failure": Field(\n str,\n is_required=False,\n default_value="CANCEL_AND_WAIT",\n description=(\n "The EMR action to take when the cluster step fails: "\n "https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html"\n ),\n ),\n "staging_bucket": Field(\n StringSource,\n is_required=True,\n description=(\n "S3 bucket to use for passing files between the plan process and EMR process."\n ),\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="emr_staging",\n description=(\n "S3 key prefix inside the staging_bucket to use for files passed the plan "\n "process and EMR process"\n ),\n ),\n "wait_for_logs": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "If set, the system will wait for EMR logs to appear on S3. Note that logs "\n "are copied every 5 minutes, so enabling this will add several minutes to the job "\n "runtime."\n ),\n ),\n "local_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to the package that contains the job definition(s) whose steps will"\n " execute remotely on EMR. This is a path on the local fileystem of the process"\n " executing the job. 
The expectation is that this package will also be available on"\n " the python path of the launched process running the Spark step on EMR, either"\n " deployed on step launch via the deploy_local_job_package option, referenced on s3"\n " via the s3_job_package_path option, or installed on the cluster via bootstrap"\n " actions."\n ),\n ),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "(legacy) Absolute path to the package that contains the pipeline definition(s)"\n " whose steps will execute remotely on EMR. This is a path on the local fileystem"\n " of the process executing the pipeline. The expectation is that this package will"\n " also be available on the python path of the launched process running the Spark"\n " step on EMR, either deployed on step launch via the deploy_local_pipeline_package"\n " option, referenced on s3 via the s3_pipeline_package_path option, or installed on"\n " the cluster via bootstrap actions."\n ),\n ),\n "deploy_local_job_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description=(\n "If set, before every step run, the launcher will zip up all the code in"\n " local_job_package_path, upload it to s3, and pass it to spark-submit's --py-files"\n " option. This gives the remote process access to up-to-date user code. If not set,"\n " the assumption is that some other mechanism is used for distributing code to the"\n " EMR cluster. If this option is set to True, s3_job_package_path should not also"\n " be set."\n ),\n ),\n "deploy_local_pipeline_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description=(\n "(legacy) If set, before every step run, the launcher will zip up all the code in"\n " local_job_package_path, upload it to s3, and pass it to spark-submit's --py-files"\n " option. This gives the remote process access to up-to-date user code. 
If not set,"\n " the assumption is that some other mechanism is used for distributing code to the"\n " EMR cluster. If this option is set to True, s3_job_package_path should not also"\n " be set."\n ),\n ),\n "s3_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_job_package should not be set to True."\n ),\n ),\n "s3_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_pipeline_package should not be set to True."\n ),\n ),\n }\n)\ndef emr_pyspark_step_launcher(context):\n # Resolve legacy arguments\n if context.resource_config.get("local_job_package_path") and context.resource_config.get(\n "local_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``local_job_package_path`` and legacy version "\n "``local_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. 
Please choose one or the other."\n )\n\n if not context.resource_config.get(\n "local_job_package_path"\n ) and not context.resource_config.get("local_pipeline_package_path"):\n raise DagsterInvariantViolationError(\n "For resource ``emr_pyspark_step_launcher``, no config value provided for required "\n "schema entry ``local_job_package_path``."\n )\n\n local_job_package_path = context.resource_config.get(\n "local_job_package_path"\n ) or context.resource_config.get("local_pipeline_package_path")\n\n if context.resource_config.get("deploy_local_job_package") and context.resource_config.get(\n "deploy_local_pipeline_package"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``deploy_local_job_package`` and legacy version "\n "``deploy_local_pipeline_package`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. Please choose one or the other."\n )\n\n deploy_local_job_package = context.resource_config.get(\n "deploy_local_job_package"\n ) or context.resource_config.get("deploy_local_pipeline_package")\n\n if context.resource_config.get("s3_job_package_path") and context.resource_config.get(\n "s3_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``s3_job_package_path`` and legacy version "\n "``s3_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. 
Please choose one or the other."\n )\n\n s3_job_package_path = context.resource_config.get(\n "s3_job_package_path"\n ) or context.resource_config.get("s3_pipeline_package_path")\n\n return EmrPySparkStepLauncher(\n region_name=context.resource_config.get("region_name"),\n staging_bucket=context.resource_config.get("staging_bucket"),\n staging_prefix=context.resource_config.get("staging_prefix"),\n wait_for_logs=context.resource_config.get("wait_for_logs"),\n action_on_failure=context.resource_config.get("action_on_failure"),\n cluster_id=context.resource_config.get("cluster_id"),\n spark_config=context.resource_config.get("spark_config"),\n local_job_package_path=local_job_package_path,\n deploy_local_job_package=deploy_local_job_package,\n s3_job_package_path=s3_job_package_path,\n )
\n\n\nemr_pyspark_step_launcher.__doc__ = "\\n".join(\n "- **" + option + "**: " + (field.description or "")\n for option, field in emr_pyspark_step_launcher.config_schema.config_type.fields.items() # type: ignore\n)\n\n\nclass EmrPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n region_name,\n staging_bucket,\n staging_prefix,\n wait_for_logs,\n action_on_failure,\n cluster_id,\n spark_config,\n local_job_package_path,\n deploy_local_job_package,\n s3_job_package_path=None,\n ):\n self.region_name = check.str_param(region_name, "region_name")\n self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n self.action_on_failure = check.str_param(action_on_failure, "action_on_failure")\n self.cluster_id = check.str_param(cluster_id, "cluster_id")\n self.spark_config = spark_config\n\n check.invariant(\n not deploy_local_job_package or not s3_job_package_path,\n "If deploy_local_job_package is set to True, s3_job_package_path should not "\n "also be set.",\n )\n\n self.local_job_package_path = check.str_param(\n local_job_package_path, "local_job_package_path"\n )\n self.deploy_local_job_package = check.bool_param(\n deploy_local_job_package, "deploy_local_job_package"\n )\n self.s3_job_package_path = check.opt_str_param(s3_job_package_path, "s3_job_package_path")\n\n self.emr_job_runner = EmrJobRunner(region=self.region_name)\n\n def _post_artifacts(self, log, step_run_ref, run_id, step_key):\n """Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.\n\n For the zip file, consider the following toy example:\n\n # Folder: my_pyspark_project/\n # a.py\n def foo():\n print(1)\n\n # b.py\n def bar():\n print(2)\n\n # main.py\n from a import foo\n from b import bar\n\n foo()\n bar()\n\n This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. 
Then, when running\n `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will\n print 1, 2.\n """\n from dagster_pyspark.utils import build_pyspark_zip\n\n with tempfile.TemporaryDirectory() as temp_dir:\n s3 = boto3.client("s3", region_name=self.region_name)\n\n # Upload step run ref\n def _upload_file_to_s3(local_path, s3_filename):\n key = self._artifact_s3_key(run_id, step_key, s3_filename)\n s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)\n log.debug(f"Uploading file {local_path} to {s3_uri}")\n s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)\n\n # Upload main file.\n # The remote Dagster installation should also have the file, but locating it there\n # could be a pain.\n main_local_path = self._main_file_local_path()\n _upload_file_to_s3(main_local_path, self._main_file_name())\n\n if self.deploy_local_job_package:\n # Zip and upload package containing job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n\n build_pyspark_zip(zip_local_path, self.local_job_package_path)\n _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)\n\n # Create step run ref pickle file\n step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)\n with open(step_run_ref_local_path, "wb") as step_pickle_file:\n pickle.dump(step_run_ref, step_pickle_file)\n\n _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)\n\n def launch_step(self, step_context):\n step_run_ref = step_context_to_step_run_ref(step_context, self.local_job_package_path)\n\n run_id = step_context.dagster_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._post_artifacts(log, step_run_ref, run_id, step_key)\n\n emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.op.name)\n emr_step_id = self.emr_job_runner.add_job_flow_steps(log, self.cluster_id, [emr_step_def])[\n 0\n ]\n\n yield from self.wait_for_completion_and_log(run_id, step_key, emr_step_id, 
step_context)\n\n def wait_for_completion_and_log(self, run_id, step_key, emr_step_id, step_context):\n s3 = boto3.resource("s3", region_name=self.region_name)\n try:\n for event in self.wait_for_completion(step_context, s3, run_id, step_key, emr_step_id):\n yield event\n except EmrError as emr_error:\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n raise emr_error\n\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n\n def wait_for_completion(\n self, step_context, s3, run_id, step_key, emr_step_id, check_interval=15\n ):\n """We want to wait for the EMR steps to complete, and while that's happening, we want to\n yield any events that have been written to S3 for us by the remote process.\n After the the EMR steps complete, we want a final chance to fetch events before finishing\n the step.\n """\n done = False\n all_events = []\n # If this is being called within a `capture_interrupts` context, allow interrupts\n # while waiting for the pyspark execution to complete, so that we can terminate slow or\n # hanging steps\n while not done:\n with raise_execution_interrupts():\n time.sleep(check_interval) # AWS rate-limits us if we poll it too often\n done = self.emr_job_runner.is_emr_step_complete(\n step_context.log, self.cluster_id, emr_step_id\n )\n\n all_events_new = self.read_events(s3, run_id, step_key)\n\n if len(all_events_new) > len(all_events):\n for i in range(len(all_events), len(all_events_new)):\n event = all_events_new[i]\n # write each event from the EMR instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.dagster_event\n all_events = all_events_new\n\n def read_events(self, s3, run_id, step_key):\n events_s3_obj = s3.Object(\n self.staging_bucket, self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME)\n )\n\n try:\n events_data = events_s3_obj.get()["Body"].read()\n return 
deserialize_value(pickle.loads(events_data))\n except ClientError as ex:\n # The file might not be there yet, which is fine\n if ex.response["Error"]["Code"] == "NoSuchKey":\n return []\n else:\n raise ex\n\n def _log_logs_from_s3(self, log, emr_step_id):\n """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs\n them to the given log.\n """\n stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(\n log, self.cluster_id, emr_step_id\n )\n # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for\n # Dagster's logging system.\n records = parse_hadoop_log4j_records(stderr_log)\n for record in records:\n if record.level:\n log.log(\n level=record.level,\n msg="".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),\n )\n else:\n log.debug(f"Spark Driver stderr: {record.message}")\n\n sys.stdout.write(\n "---------- Spark Driver stdout: ----------\\n"\n + stdout_log\n + "\\n"\n + "---------- End of Spark Driver stdout ----------\\n"\n )\n\n def _get_emr_step_def(self, run_id, step_key, solid_name):\n """From the local Dagster instance, construct EMR steps that will kick off execution on a\n remote EMR cluster.\n """\n from dagster_spark.utils import flatten_dict, format_for_cli\n\n action_on_failure = self.action_on_failure\n\n # Execute Solid via spark-submit\n conf = dict(flatten_dict(self.spark_config))\n conf["spark.app.name"] = conf.get("spark.app.name", solid_name)\n\n check.invariant(\n conf.get("spark.master", "yarn") == "yarn",\n desc=(\n "spark.master is configured as %s; cannot set Spark master on EMR to anything "\n 'other than "yarn"'\n )\n % conf.get("spark.master"),\n )\n\n command = (\n [\n EMR_SPARK_HOME + "bin/spark-submit",\n "--master",\n "yarn",\n "--deploy-mode",\n conf.get("spark.submit.deployMode", "client"),\n ]\n + format_for_cli(list(flatten_dict(conf)))\n + [\n "--py-files",\n self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),\n 
self._artifact_s3_uri(run_id, step_key, self._main_file_name()),\n self.staging_bucket,\n self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n ]\n )\n\n return EmrJobRunner.construct_step_dict_for_command(\n "Execute Solid/Op %s" % solid_name, command, action_on_failure=action_on_failure\n )\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return emr_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _artifact_s3_uri(self, run_id, step_key, filename):\n key = self._artifact_s3_key(run_id, self._sanitize_step_key(step_key), filename)\n return f"s3://{self.staging_bucket}/{key}"\n\n def _artifact_s3_key(self, run_id, step_key, filename):\n return "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n
", "current_page_name": "_modules/dagster_aws/emr/pyspark_step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.pyspark_step_launcher"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.types

\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\n\nEbsVolumeType = Enum(\n    name="EbsVolumeType", enum_values=[EnumValue("gp2"), EnumValue("io1"), EnumValue("standard")]\n)\n\n\n
[docs]class EmrClusterState(PyEnum):\n """Cluster state for EMR."""\n\n Starting = "STARTING"\n Bootstrapping = "BOOTSTRAPPING"\n Running = "RUNNING"\n Waiting = "WAITING"\n Terminating = "TERMINATING"\n Terminated = "TERMINATED"\n TerminatedWithErrors = "TERMINATED_WITH_ERRORS"
\n\n\nEMR_CLUSTER_TERMINATED_STATES = [\n EmrClusterState.Terminating,\n EmrClusterState.Terminated,\n EmrClusterState.TerminatedWithErrors,\n]\n\nEMR_CLUSTER_DONE_STATES = EMR_CLUSTER_TERMINATED_STATES + [EmrClusterState.Waiting]\n\n\n
[docs]class EmrStepState(PyEnum):\n """Step state for EMR."""\n\n Pending = "PENDING"\n Running = "RUNNING"\n Continue = "CONTINUE"\n Completed = "COMPLETED"\n Cancelled = "CANCELLED"\n Failed = "FAILED"\n Interrupted = "INTERRUPTED"
\n\n\nEmrActionOnFailure = Enum(\n name="EmrActionOnFailure",\n enum_values=[\n EnumValue("TERMINATE_JOB_FLOW"),\n EnumValue("TERMINATE_CLUSTER"),\n EnumValue("CANCEL_AND_WAIT"),\n EnumValue("CONTINUE"),\n ],\n)\n\nEmrAdjustmentType = Enum(\n name="EmrAdjustmentType",\n enum_values=[\n EnumValue("CHANGE_IN_CAPACITY"),\n EnumValue("PERCENT_CHANGE_IN_CAPACITY"),\n EnumValue("EXACT_CAPACITY"),\n ],\n)\n\nEmrComparisonOperator = Enum(\n name="EmrComparisonOperator",\n enum_values=[\n EnumValue("GREATER_THAN_OR_EQUAL"),\n EnumValue("GREATER_THAN"),\n EnumValue("LESS_THAN"),\n EnumValue("LESS_THAN_OR_EQUAL"),\n ],\n)\n\nEmrInstanceRole = Enum(\n name="EmrInstanceRole", enum_values=[EnumValue("MASTER"), EnumValue("CORE"), EnumValue("TASK")]\n)\n\nEmrMarket = Enum(name="EmrMarket", enum_values=[EnumValue("ON_DEMAND"), EnumValue("SPOT")])\n\nEmrRepoUpgradeOnBoot = Enum(\n name="EmrRepoUpgradeOnBoot", enum_values=[EnumValue("SECURITY"), EnumValue("NONE")]\n)\n\nEmrScaleDownBehavior = Enum(\n name="EmrScaleDownBehavior",\n enum_values=[\n EnumValue("TERMINATE_AT_INSTANCE_HOUR"),\n EnumValue("TERMINATE_AT_TASK_COMPLETION"),\n ],\n)\n\nEmrStatistic = Enum(\n name="EmrStatistic",\n enum_values=[\n EnumValue("SAMPLE_COUNT"),\n EnumValue("AVERAGE"),\n EnumValue("SUM"),\n EnumValue("MINIMUM"),\n EnumValue("MAXIMUM"),\n ],\n)\n\nEmrSupportedProducts = Enum(\n name="EmrSupportedProducts", enum_values=[EnumValue("mapr-m3"), EnumValue("mapr-m5")]\n)\n\nEmrTimeoutAction = Enum(\n name="EmrTimeoutAction",\n enum_values=[EnumValue("SWITCH_TO_ON_DEMAND"), EnumValue("TERMINATE_CLUSTER")],\n)\n\nEmrUnit = Enum(\n name="EmrUnit",\n enum_values=[\n EnumValue("NONE"),\n EnumValue("SECONDS"),\n EnumValue("MICRO_SECONDS"),\n EnumValue("MILLI_SECONDS"),\n EnumValue("BYTES"),\n EnumValue("KILO_BYTES"),\n EnumValue("MEGA_BYTES"),\n EnumValue("GIGA_BYTES"),\n EnumValue("TERA_BYTES"),\n EnumValue("BITS"),\n EnumValue("KILO_BITS"),\n EnumValue("MEGA_BITS"),\n EnumValue("GIGA_BITS"),\n 
EnumValue("TERA_BITS"),\n EnumValue("PERCENT"),\n EnumValue("COUNT"),\n EnumValue("BYTES_PER_SECOND"),\n EnumValue("KILO_BYTES_PER_SECOND"),\n EnumValue("MEGA_BYTES_PER_SECOND"),\n EnumValue("GIGA_BYTES_PER_SECOND"),\n EnumValue("TERA_BYTES_PER_SECOND"),\n EnumValue("BITS_PER_SECOND"),\n EnumValue("KILO_BITS_PER_SECOND"),\n EnumValue("MEGA_BITS_PER_SECOND"),\n EnumValue("GIGA_BITS_PER_SECOND"),\n EnumValue("TERA_BITS_PER_SECOND"),\n EnumValue("COUNT_PER_SECOND"),\n ],\n)\n
", "current_page_name": "_modules/dagster_aws/emr/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.types"}}, "redshift": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.redshift.resources

\nimport abc\nfrom contextlib import contextmanager\nfrom logging import Logger\nfrom typing import Any, Dict, Optional, cast\n\nimport psycopg2\nimport psycopg2.extensions\nfrom dagster import (\n    ConfigurableResource,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\n\nclass RedshiftError(Exception):\n    pass\n\n\nclass BaseRedshiftClient(abc.ABC):\n    @abc.abstractmethod\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        pass\n\n    @abc.abstractmethod\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        pass\n\n\nclass RedshiftClient(BaseRedshiftClient):\n    def __init__(self, conn_args: Dict[str, Any], autocommit: Optional[bool], log: Logger):\n        # Extract parameters from resource config\n        self.conn_args = conn_args\n\n        self.autocommit = autocommit\n        self.log = log\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Synchronously execute a single query against Redshift. Will return a list of rows, where\n        each row is a tuple of values, e.g. SELECT 1 will return [(1,)].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. 
Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                try:\n                    self.log.info(f"Executing query '{query}'")\n                    cursor.execute(query)\n\n                    if fetch_results and cursor.rowcount > 0:\n                        return cursor.fetchall()\n                    else:\n                        self.log.info("Empty result from query")\n\n                except Exception as e:\n                    # If autocommit is disabled or not set (it is disabled by default), Redshift\n                    # will be in the middle of a transaction at exception time, and because of\n                    # the failure the current transaction will not accept any further queries.\n                    #\n                    # This conn.commit() call closes the open transaction before handing off\n                    # control to the error callback, so that the 
user can issue additional\n                    # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n                    # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                    # things are in a usable state in the error callback.\n                    if not self.autocommit:\n                        conn.commit()\n\n                    if error_callback is not None:\n                        error_callback(e, cursor, self.log)\n                    else:\n                        raise\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Synchronously execute a list of queries against Redshift. Will return a list of list of\n        rows, where each row is a tuple of values, e.g. ['SELECT 1', 'SELECT 1'] will return\n        [[(1,)], [(1,)]].\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n            cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. 
If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        results = []\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                for query in queries:\n                    try:\n                        self.log.info(f"Executing query '{query}'")\n                        cursor.execute(query)\n\n                        if fetch_results and cursor.rowcount > 0:\n                            results.append(cursor.fetchall())\n                        else:\n                            results.append([])\n                            self.log.info("Empty result from query")\n\n                    except Exception as e:\n                        # If autocommit is disabled or not set (it is disabled by default), Redshift\n                        # will be in the middle of a transaction at exception time, and because of\n                        # the failure the current transaction will not accept any further queries.\n                        #\n                        # This conn.commit() call closes the open transaction before handing off\n                        # control to the error callback, so that the user can issue additional\n                        # queries. Notably, for e.g. 
pg_last_copy_id() to work, it requires you to\n                        # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                        # things are in a usable state in the error callback.\n                        if not self.autocommit:\n                            conn.commit()\n\n                        if error_callback is not None:\n                            error_callback(e, cursor, self.log)\n                        else:\n                            raise\n\n        if fetch_results:\n            return results\n\n    @contextmanager\n    def _get_conn(self):\n        conn = None\n        try:\n            conn = psycopg2.connect(**self.conn_args)\n            yield conn\n        finally:\n            if conn:\n                conn.close()\n\n    @contextmanager\n    def _get_cursor(self, conn, cursor_factory=None):\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n\n        # Could be none, in which case we should respect the connection default. Otherwise\n        # explicitly set to true/false.\n        if self.autocommit is not None:\n            conn.autocommit = self.autocommit\n\n        with conn:\n            with conn.cursor(cursor_factory=cursor_factory) as cursor:\n                yield cursor\n\n            # If autocommit is set, we'll commit after each and every query execution. 
Otherwise, we\n            # want to do a final commit after we're wrapped up executing the full set of one or more\n            # queries.\n            if not self.autocommit:\n                conn.commit()\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use RedshiftClientResource instead.")\nclass RedshiftResource(RedshiftClient):\n    """This class was used by the function-style Redshift resource."""\n\n\nclass FakeRedshiftClient(BaseRedshiftClient):\n    QUERY_RESULT = [(1,)]\n\n    def __init__(self, log: Logger):\n        # Extract parameters from resource config\n\n        self.log = log\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Fake for execute_query; returns [self.QUERY_RESULT].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. 
Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        self.log.info(f"Executing query '{query}'")\n        if fetch_results:\n            return self.QUERY_RESULT\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Fake for execute_queries; returns [self.QUERY_RESULT] * 3.\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. 
Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        for query in queries:\n            self.log.info(f"Executing query '{query}'")\n        if fetch_results:\n            return [self.QUERY_RESULT] * 3\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use FakeRedshiftClientResource instead.")\nclass FakeRedshiftResource(FakeRedshiftClient):\n    """This class was used by the function-style fake Redshift resource."""\n\n\n
[docs]class RedshiftClientResource(ConfigurableResource):\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n .. code-block:: python\n\n from dagster import Definitions, asset, EnvVar\n from dagster_aws.redshift import RedshiftClientResource\n\n @asset\n def example_redshift_asset(context, redshift: RedshiftClientResource):\n redshift.get_client().execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = RedshiftClientResource(\n host='my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n port=5439,\n user='dagster',\n password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"),\n database='dev',\n )\n\n defs = Definitions(\n assets=[example_redshift_asset],\n resources={'redshift': redshift_configured},\n )\n\n """\n\n host: str = Field(description="Redshift host")\n port: int = Field(default=5439, description="Redshift port")\n user: Optional[str] = Field(default=None, description="Username for Redshift connection")\n password: Optional[str] = Field(default=None, description="Password for Redshift connection")\n database: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default database to use. After login, you can use USE DATABASE to change"\n " the database."\n ),\n )\n autocommit: Optional[bool] = Field(default=None, description="Whether to autocommit queries")\n connect_timeout: int = Field(\n default=5, description="Timeout for connection to Redshift cluster. Defaults to 5 seconds."\n )\n sslmode: str = Field(\n default="require",\n description=(\n "SSL mode to use. 
See the Redshift documentation for reference:"\n " https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> RedshiftClient:\n conn_args = {\n k: getattr(self, k, None)\n for k in (\n "host",\n "port",\n "user",\n "password",\n "database",\n "connect_timeout",\n "sslmode",\n )\n if getattr(self, k, None) is not None\n }\n\n return RedshiftClient(conn_args, self.autocommit, get_dagster_logger())
\n\n\n
[docs]class FakeRedshiftClientResource(RedshiftClientResource):\n def get_client(self) -> FakeRedshiftClient:\n return FakeRedshiftClient(get_dagster_logger())
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=RedshiftClientResource.to_config_schema(),\n description="Resource for connecting to the Redshift data warehouse",\n)\ndef redshift_resource(context) -> RedshiftClient:\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, op\n from dagster_aws.redshift import redshift_resource\n\n @op(required_resource_keys={'redshift'})\n def example_redshift_op(context):\n return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = redshift_resource.configured({\n 'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n 'port': 5439,\n 'user': 'dagster',\n 'password': 'dagster',\n 'database': 'dev',\n })\n context = build_op_context(resources={'redshift': redshift_configured})\n assert example_redshift_op(context) == [(1,)]\n\n """\n return RedshiftClientResource.from_resource_context(context).get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=FakeRedshiftClientResource.to_config_schema(),\n description=(\n "Fake resource for connecting to the Redshift data warehouse. Usage is identical "\n "to the real redshift_resource. Will always return [(1,)] for the single query case and "\n "[[(1,)], [(1,)], [(1,)]] for the multi query case."\n ),\n)\ndef fake_redshift_resource(context) -> FakeRedshiftClient:\n return cast(\n FakeRedshiftClient,\n FakeRedshiftClientResource.from_resource_context(context).get_client(),\n )
\n
", "current_page_name": "_modules/dagster_aws/redshift/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.redshift.resources"}}, "s3": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Mapping, Optional, Sequence\n\nimport boto3\nimport dagster._seven as seven\nfrom botocore.errorfactory import ClientError\nfrom dagster import (\n    Field,\n    Permissive,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_type import Noneable\nfrom dagster._core.storage.captured_log_manager import CapturedLogContext\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom typing_extensions import Self\n\nPOLLING_INTERVAL = 5\n\n\n
[docs]class S3ComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs compute function stdout and stderr to S3.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_aws.s3.compute_log_manager\n class: S3ComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n use_ssl: true\n verify: true\n verify_cert_path: "/path/to/cert/bundle.pem"\n endpoint_url: "http://alternate-s3-host.io"\n skip_empty_files: true\n upload_interval: 30\n upload_extra_args:\n ServerSideEncryption: "AES256"\n show_url_only: false\n region: "us-west-1"\n\n Args:\n bucket (str): The name of the s3 bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n use_ssl (Optional[bool]): Whether or not to use SSL. Default True.\n verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.\n verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if\n `verify` set to False.\n endpoint_url (Optional[str]): Override for the S3 endpoint url.\n skip_empty_files: (Optional[bool]): Skip upload of empty log files.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files to S3. By default, will only upload when the capture is complete.\n upload_extra_args: (Optional[dict]): Extra args for S3 file upload\n show_url_only: (Optional[bool]): Only show the URL of the log file in the UI, instead of fetching and displaying the full content. Default False.\n region: (Optional[str]): The region of the S3 bucket. 
If not specified, will use the default region of the AWS session.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n use_ssl=True,\n verify=True,\n verify_cert_path=None,\n endpoint_url=None,\n skip_empty_files=False,\n upload_interval=None,\n upload_extra_args=None,\n show_url_only=False,\n region=None,\n ):\n _verify = False if not verify else verify_cert_path\n self._s3_session = boto3.resource(\n "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url\n ).meta.client\n self._s3_bucket = check.str_param(bucket, "bucket")\n self._s3_prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n check.opt_dict_param(upload_extra_args, "upload_extra_args")\n self._upload_extra_args = upload_extra_args\n self._show_url_only = show_url_only\n if region is None:\n # if unspecified, use the current session name\n self._region = self._s3_session.meta.region_name\n else:\n self._region = region\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "use_ssl": Field(bool, is_required=False, default_value=True),\n 
"verify": Field(bool, is_required=False, default_value=True),\n "verify_cert_path": Field(StringSource, is_required=False),\n "endpoint_url": Field(StringSource, is_required=False),\n "skip_empty_files": Field(bool, is_required=False, default_value=False),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n "upload_extra_args": Field(\n Permissive(), is_required=False, description="Extra args for S3 file upload"\n ),\n "show_url_only": Field(bool, is_required=False, default_value=False),\n "region": Field(StringSource, is_required=False),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return S3ComputeLogManager(inst_data=inst_data, **config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _s3_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._s3_prefix, "storage", *namespace, filename]\n return "/".join(paths) # s3 path delimiter\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Iterator[CapturedLogContext]:\n with super().capture_logs(log_key) as local_context:\n if not self._show_url_only:\n yield local_context\n else:\n out_key = self._s3_key(log_key, ComputeIOType.STDOUT)\n err_key = self._s3_key(log_key, ComputeIOType.STDERR)\n s3_base = f"https://s3.console.aws.amazon.com/s3/object/{self._s3_bucket}?region={self._region}"\n yield CapturedLogContext(\n local_context.log_key,\n 
external_stdout_url=f"{s3_base}&prefix={out_key}",\n external_stderr_url=f"{s3_base}&prefix={err_key}",\n )\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self.local_manager.delete_logs(log_key=log_key, prefix=prefix)\n\n s3_keys_to_remove = None\n if log_key:\n s3_keys_to_remove = [\n self._s3_key(log_key, ComputeIOType.STDOUT),\n self._s3_key(log_key, ComputeIOType.STDERR),\n self._s3_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._s3_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n elif prefix:\n # add the trailing '' to make sure that ['a'] does not match ['apple']\n s3_prefix = "/".join([self._s3_prefix, "storage", *prefix, ""])\n matching = self._s3_session.list_objects(Bucket=self._s3_bucket, Prefix=s3_prefix)\n s3_keys_to_remove = [obj["Key"] for obj in matching.get("Contents", [])]\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n if s3_keys_to_remove:\n to_delete = [{"Key": key} for key in s3_keys_to_remove]\n self._s3_session.delete_objects(Bucket=self._s3_bucket, Delete={"Objects": to_delete})\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n s3_key = self._s3_key(log_key, io_type)\n return self._s3_session.generate_presigned_url(\n ClientMethod="get_object", Params={"Bucket": self._s3_bucket, "Key": s3_key}\n )\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n s3_key = self._s3_key(log_key, io_type)\n return f"s3://{self._s3_bucket}/{s3_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n try: # https://stackoverflow.com/a/38376288/14656695\n self._s3_session.head_object(Bucket=self._s3_bucket, 
Key=s3_key)\n except ClientError:\n return False\n return True\n\n def upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n\n if (self._skip_empty_files or partial) and os.stat(path).st_size == 0:\n return\n\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n extra_args = {\n "ContentType": "text/plain",\n **(self._upload_extra_args if self._upload_extra_args else {}),\n }\n self._s3_session.upload_fileobj(data, self._s3_bucket, s3_key, ExtraArgs=extra_args)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self._local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n self._s3_session.download_fileobj(self._s3_bucket, s3_key, fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n self._local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_aws/s3/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.compute_log_manager"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class S3FileHandle(FileHandle):\n """A reference to a file on S3."""\n\n def __init__(self, s3_bucket: str, s3_key: str):\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_key = check.str_param(s3_key, "s3_key")\n\n @property\n def s3_bucket(self) -> str:\n """str: The name of the S3 bucket."""\n return self._s3_bucket\n\n @property\n def s3_key(self) -> str:\n """str: The S3 key."""\n return self._s3_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's S3 URL."""\n return self.s3_path\n\n @property\n def s3_path(self) -> str:\n """str: The file's S3 URL."""\n return f"s3://{self.s3_bucket}/{self.s3_key}"
\n\n\nclass S3FileManager(FileManager):\n def __init__(self, s3_session, s3_bucket, s3_base_key):\n self._s3_session = s3_session\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_base_key = check.str_param(s3_base_key, "s3_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n self._s3_session.download_file(\n Bucket=file_handle.s3_bucket, Key=file_handle.s3_key, Filename=temp_name\n )\n self._local_handle_cache[file_handle.s3_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", S3FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.s3_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.s3_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n s3_key = self.get_full_key(str(uuid.uuid4()) + (("." 
+ ext) if ext is not None else ""))\n self._s3_session.put_object(Body=file_obj, Bucket=self._s3_bucket, Key=s3_key)\n return S3FileHandle(self._s3_bucket, s3_key)\n\n def get_full_key(self, file_key):\n return f"{self._s3_base_key}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_aws/s3/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.io_manager

\nimport io\nimport pickle\nfrom typing import Any, Dict, Optional, Union\n\nfrom dagster import (\n    ConfigurableIOManager,\n    InputContext,\n    MetadataValue,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom .resources import S3Resource\n\n\nclass PickledObjectS3IOManager(UPathIOManager):\n    def __init__(\n        self,\n        s3_bucket: str,\n        s3_session: Any,\n        s3_prefix: Optional[str] = None,\n    ):\n        self.bucket = check.str_param(s3_bucket, "s3_bucket")\n        check.opt_str_param(s3_prefix, "s3_prefix")\n        self.s3 = s3_session\n        self.s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, MaxKeys=1)\n        base_path = UPath(s3_prefix) if s3_prefix else None\n        super().__init__(base_path=base_path)\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        try:\n            s3_obj = self.s3.get_object(Bucket=self.bucket, Key=str(path))["Body"].read()\n            return pickle.loads(s3_obj)\n        except self.s3.exceptions.NoSuchKey:\n            raise FileNotFoundError(f"Could not find file {path} in S3 bucket {self.bucket}")\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing S3 object: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        pickled_obj_bytes = io.BytesIO(pickled_obj)\n        self.s3.upload_fileobj(pickled_obj_bytes, self.bucket, str(path))\n\n    def path_exists(self, path: UPath) -> bool:\n        try:\n       
     self.s3.get_object(Bucket=self.bucket, Key=str(path))\n        except self.s3.exceptions.NoSuchKey:\n            return False\n        return True\n\n    def get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading S3 object from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing S3 object at: {self._uri_for_path(path)}"\n\n    def unlink(self, path: UPath) -> None:\n        self.s3.delete_object(Bucket=self.bucket, Key=str(path))\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in S3\n        return None\n\n    def get_metadata(self, context: OutputContext, obj: Any) -> Dict[str, MetadataValue]:\n        path = self._get_path(context)\n        return {"uri": MetadataValue.path(self._uri_for_path(path))}\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        return UPath("storage", super().get_op_output_relative_path(context))\n\n    def _uri_for_path(self, path: UPath) -> str:\n        return f"s3://{self.bucket}/{path}"\n\n\n
[docs]class S3PickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import asset, Definitions\n from dagster_aws.s3 import S3PickleIOManager, S3Resource\n\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": S3PickleIOManager(\n s3_resource=S3Resource(),\n s3_bucket="my-cool-bucket",\n s3_prefix="my-cool-prefix",\n )\n }\n )\n\n """\n\n s3_resource: ResourceDependency[S3Resource]\n s3_bucket: str = Field(description="S3 bucket to use for the file manager.")\n s3_prefix: str = Field(\n default="dagster", description="Prefix to use for the S3 bucket for this file manager."\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @cached_method\n def inner_io_manager(self) -> PickledObjectS3IOManager:\n return PickledObjectS3IOManager(\n s3_bucket=self.s3_bucket,\n s3_session=self.s3_resource.get_client(),\n s3_prefix=self.s3_prefix,\n )\n\n def load_input(self, context: InputContext) -> 
Any:\n return self.inner_io_manager().load_input(context)\n\n def handle_output(self, context: OutputContext, obj: Any) -> None:\n return self.inner_io_manager().handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use S3PickleIOManager instead.",\n)\nclass ConfigurablePickledObjectS3IOManager(S3PickleIOManager):\n """Renamed to S3PickleIOManager. See S3PickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=S3PickleIOManager.to_config_schema(),\n required_resource_keys={"s3"},\n)\ndef s3_pickle_io_manager(init_context):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": s3_pickle_io_manager.configured(\n {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n ),\n "s3": s3_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n @job(\n resource_defs={\n "io_manager": s3_pickle_io_manager.configured(\n {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n ),\n "s3": s3_resource,\n },\n )\n def my_job():\n ...\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3IOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_aws/s3/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.io_manager"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.ops

\nfrom typing import Any, Generator, Mapping\n\nfrom dagster import (\n    AssetMaterialization,\n    Field,\n    FileHandle,\n    In,\n    MetadataValue,\n    Out,\n    Output,\n    StringSource,\n    _check as check,\n    dagster_type_loader,\n    op,\n)\nfrom dagster._core.types.dagster_type import PythonObjectDagsterType\n\nfrom .file_manager import S3FileHandle\n\n\ndef dict_with_fields(name: str, fields: Mapping[str, object]):\n    check.str_param(name, "name")\n    check.mapping_param(fields, "fields", key_type=str)\n    field_names = set(fields.keys())\n\n    @dagster_type_loader(fields)\n    def _input_schema(_context, value):\n        check.dict_param(value, "value")\n        check.param_invariant(set(value.keys()) == field_names, "value")\n        return value\n\n    class _DictWithSchema(PythonObjectDagsterType):\n        def __init__(self):\n            super(_DictWithSchema, self).__init__(python_type=dict, name=name, loader=_input_schema)\n\n    return _DictWithSchema()\n\n\nS3Coordinate = dict_with_fields(\n    "S3Coordinate",\n    fields={\n        "bucket": Field(StringSource, description="S3 bucket name"),\n        "key": Field(StringSource, description="S3 key name"),\n    },\n)\n\n\ndef last_key(key: str) -> str:\n    if "/" not in key:\n        return key\n    comps = key.split("/")\n    return comps[-1]\n\n\n@op(\n    config_schema={\n        "Bucket": Field(\n            StringSource, description="The name of the bucket to upload to.", is_required=True\n        ),\n        "Key": Field(\n            StringSource, description="The name of the key to upload to.", is_required=True\n        ),\n    },\n    ins={"file_handle": In(FileHandle, description="The file to upload.")},\n    out={"s3_file_handle": Out(S3FileHandle)},\n    description="""Take a file handle and upload it to s3. 
Returns an S3FileHandle.""",\n    required_resource_keys={"s3", "file_manager"},\n)\ndef file_handle_to_s3(context, file_handle) -> Generator[Any, None, None]:\n    bucket = context.op_config["Bucket"]\n    key = context.op_config["Key"]\n\n    file_manager = context.resources.file_manager\n    s3 = context.resources.s3\n\n    with file_manager.read(file_handle, "rb") as fileobj:\n        s3.upload_fileobj(fileobj, bucket, key)\n        s3_file_handle = S3FileHandle(bucket, key)\n\n        yield AssetMaterialization(\n            asset_key=s3_file_handle.s3_path,\n            metadata={last_key(key): MetadataValue.path(s3_file_handle.s3_path)},\n        )\n\n        yield Output(value=s3_file_handle, output_name="s3_file_handle")\n
", "current_page_name": "_modules/dagster_aws/s3/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.resources

\nfrom typing import Any, Optional, TypeVar\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\nfrom .file_manager import S3FileManager\nfrom .utils import construct_s3_client\n\nT = TypeVar("T")\n\n\nclass ResourceWithS3Configuration(ConfigurableResource):\n    use_unsigned_session: bool = Field(\n        default=False, description="Specifies whether to use an unsigned S3 session."\n    )\n    region_name: Optional[str] = Field(\n        default=None, description="Specifies a custom region for the S3 session."\n    )\n    endpoint_url: Optional[str] = Field(\n        default=None, description="Specifies a custom endpoint for the S3 session."\n    )\n    max_attempts: int = Field(\n        default=5,\n        description=(\n            "This provides Boto3's retry handler with a value of maximum retry attempts, where the"\n            " initial call counts toward the max_attempts value that you provide."\n        ),\n    )\n    profile_name: Optional[str] = Field(\n        default=None, description="Specifies a profile to connect that session."\n    )\n    use_ssl: bool = Field(\n        default=True, description="Whether or not to use SSL. By default, SSL is used."\n    )\n    verify: Optional[str] = Field(\n        default=None,\n        description=(\n            "Whether or not to verify SSL certificates. 
By default SSL certificates are verified."\n            " You can also specify this argument if you want to use a different CA cert bundle than"\n            " the one used by botocore."\n        ),\n    )\n    aws_access_key_id: Optional[str] = Field(\n        default=None, description="AWS access key ID to use when creating the boto3 session."\n    )\n    aws_secret_access_key: Optional[str] = Field(\n        default=None, description="AWS secret access key to use when creating the boto3 session."\n    )\n    aws_session_token: str = Field(\n        default=None, description="AWS session token to use when creating the boto3 session."\n    )\n\n\n
[docs]class S3Resource(ResourceWithS3Configuration, IAttachDifferentObjectToOpContext):\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n .. code-block:: python\n\n from dagster import job, op, Definitions\n from dagster_aws.s3 import S3Resource\n\n @op\n def example_s3_op(s3: S3Resource):\n return s3.get_client().list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job\n def example_job():\n example_s3_op()\n\n defs = Definitions(\n jobs=[example_job],\n resources={'s3': S3Resource(region_name='us-west-1')}\n )\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> Any:\n return construct_s3_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n endpoint_url=self.endpoint_url,\n use_unsigned_session=self.use_unsigned_session,\n profile_name=self.profile_name,\n use_ssl=self.use_ssl,\n verify=self.verify,\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n aws_session_token=self.aws_session_token,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=S3Resource.to_config_schema())\ndef s3_resource(context) -> Any:\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.s3 import s3_resource\n\n @op(required_resource_keys={'s3'})\n def example_s3_op(context):\n return context.resources.s3.list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job(resource_defs={'s3': s3_resource})\n def example_job():\n example_s3_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 's3': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n s3:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n # through the ordinary boto credential chain.\n use_unsigned_session: false\n # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: True\n endpoint_url: "http://localhost"\n # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for S3 session. Default is default\n # profile as specified in ~/.aws/credentials file\n use_ssl: true\n # Optional[bool]: Whether or not to use SSL. By default, SSL is used.\n verify: None\n # Optional[str]: Whether or not to verify SSL certificates. 
By default SSL certificates are verified.\n # You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore."\n aws_access_key_id: None\n # Optional[str]: The access key to use when creating the client.\n aws_secret_access_key: None\n # Optional[str]: The secret key to use when creating the client.\n aws_session_token: None\n # Optional[str]: The session token to use when creating the client.\n """\n return S3Resource.from_resource_context(context).get_client()
\n\n\n
[docs]class S3FileManagerResource(ResourceWithS3Configuration, IAttachDifferentObjectToOpContext):\n s3_bucket: str = Field(description="S3 bucket to use for the file manager.")\n s3_prefix: str = Field(\n default="dagster", description="Prefix to use for the S3 bucket for this file manager."\n )\n\n def get_client(self) -> S3FileManager:\n return S3FileManager(\n s3_session=construct_s3_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n endpoint_url=self.endpoint_url,\n use_unsigned_session=self.use_unsigned_session,\n profile_name=self.profile_name,\n use_ssl=self.use_ssl,\n verify=self.verify,\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n aws_session_token=self.aws_session_token,\n ),\n s3_bucket=self.s3_bucket,\n s3_base_key=self.s3_prefix,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=S3FileManagerResource.to_config_schema(),\n)\ndef s3_file_manager(context) -> S3FileManager:\n """FileManager that provides abstract access to S3.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n return S3FileManagerResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_aws/s3/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.resources"}}, "secretsmanager": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.secretsmanager.resources

\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Dict, Generator, List, Optional, cast\n\nfrom dagster import (\n    Field as LegacyDagsterField,\n    resource,\n)\nfrom dagster._config.field_utils import Shape\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.test_utils import environ\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\n\nfrom dagster_aws.utils import ResourceWithBoto3Configuration\n\nfrom .secrets import construct_secretsmanager_client, get_secrets_from_arns, get_tagged_secrets\n\nif TYPE_CHECKING:\n    import botocore\n\n\n
[docs]class SecretsManagerResource(ResourceWithBoto3Configuration):\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import SecretsManagerResource\n\n @op\n def example_secretsmanager_op(secretsmanager: SecretsManagerResource):\n return secretsmanager.get_client().get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job\n def example_job():\n example_secretsmanager_op()\n\n defs = Definitions(\n jobs=[example_job],\n resources={\n 'secretsmanager': SecretsManagerResource(\n region_name='us-west-1'\n )\n }\n )\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> "botocore.client.SecretsManager":\n return construct_secretsmanager_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n profile_name=self.profile_name,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(SecretsManagerResource.to_config_schema())\ndef secretsmanager_resource(context) -> "botocore.client.SecretsManager":\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_resource\n\n @op(required_resource_keys={'secretsmanager'})\n def example_secretsmanager_op(context):\n return context.resources.secretsmanager.get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job(resource_defs={'secretsmanager': secretsmanager_resource})\n def example_job():\n example_secretsmanager_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secretsmanager': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n # profile as specified in ~/.aws/credentials file\n\n """\n return SecretsManagerResource.from_resource_context(context).get_client()
\n\n\n
[docs]class SecretsManagerSecretsResource(ResourceWithBoto3Configuration):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op, ResourceParam\n from dagster_aws.secretsmanager import SecretsManagerSecretsResource\n\n @op\n def example_secretsmanager_secrets_op(secrets: SecretsManagerSecretsResource):\n return secrets.fetch_secrets().get("my-secret-name")\n\n @op\n def example_secretsmanager_secrets_op_2(secrets: SecretsManagerSecretsResource):\n with secrets.secrets_in_environment():\n return os.getenv("my-other-secret-name")\n\n @job\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n defs = Definitions(\n jobs=[example_job],\n resources={\n 'secrets': SecretsManagerSecretsResource(\n region_name='us-west-1',\n secrets_tag="dagster",\n add_to_environment=True,\n )\n }\n )\n\n Note that your ops must also declare that they require this resource with or it will not be initialized\n for the execution of their compute functions.\n """\n\n secrets: List[str] = Field(\n default=[], description="An array of AWS Secrets Manager secrets arns to fetch."\n )\n secrets_tag: Optional[str] = Field(\n default=None,\n description="AWS Secrets Manager secrets with this tag will be fetched and made available.",\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def secrets_in_environment(\n self,\n secrets: Optional[List[str]] = None,\n secrets_tag: Optional[str] = None,\n ) -> Generator[Dict[str, str], None, None]:\n """Yields a dict which maps selected SecretsManager secrets to their string values. 
Also\n sets chosen secrets as environment variables.\n\n Args:\n secrets (Optional[List[str]]): An array of AWS Secrets Manager secrets arns to fetch.\n Note that this will override the secrets specified in the resource config.\n secrets_tag (Optional[str]): AWS Secrets Manager secrets with this tag will be fetched\n and made available. Note that this will override the secrets_tag specified in the\n resource config.\n """\n secrets_manager = construct_secretsmanager_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n profile_name=self.profile_name,\n )\n\n secrets_tag_to_fetch = secrets_tag if secrets_tag is not None else self.secrets_tag\n secrets_to_fetch = secrets if secrets is not None else self.secrets\n\n secret_arns = merge_dicts(\n (\n get_tagged_secrets(secrets_manager, [secrets_tag_to_fetch])\n if secrets_tag_to_fetch\n else {}\n ),\n get_secrets_from_arns(secrets_manager, secrets_to_fetch),\n )\n\n secrets_map = {\n name: secrets_manager.get_secret_value(SecretId=arn).get("SecretString")\n for name, arn in secret_arns.items()\n }\n with environ(secrets_map):\n yield secrets_map\n\n def fetch_secrets(\n self,\n secrets: Optional[List[str]] = None,\n secrets_tag: Optional[str] = None,\n ) -> Dict[str, str]:\n """Fetches secrets from AWS Secrets Manager and returns them as a dict.\n\n Args:\n secrets (Optional[List[str]]): An array of AWS Secrets Manager secrets arns to fetch.\n Note that this will override the secrets specified in the resource config.\n secrets_tag (Optional[str]): AWS Secrets Manager secrets with this tag will be fetched\n and made available. Note that this will override the secrets_tag specified in the\n resource config.\n """\n with self.secrets_in_environment(secrets=secrets, secrets_tag=secrets_tag) as secret_values:\n return secret_values
\n\n\nLEGACY_SECRETSMANAGER_SECRETS_SCHEMA = {\n **cast(Shape, SecretsManagerSecretsResource.to_config_schema().as_field().config_type).fields,\n "add_to_environment": LegacyDagsterField(\n bool,\n default_value=False,\n description="Whether to add the secrets to the environment. Defaults to False.",\n ),\n}\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=LEGACY_SECRETSMANAGER_SECRETS_SCHEMA)\n@contextmanager\ndef secretsmanager_secrets_resource(context):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op(context):\n return context.resources.secrets.get("my-secret-name")\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op_2(context):\n return os.getenv("my-other-secret-name")\n\n @job(resource_defs={'secrets': secretsmanager_secrets_resource})\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secrets': {\n 'config': {\n 'region_name': 'us-west-1',\n 'secrets_tag': 'dagster',\n 'add_to_environment': True,\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. 
Default is default\n # profile as specified in ~/.aws/credentials file\n secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n secrets_tag: "dagster"\n # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n # from SecretsManager.\n add_to_environment: true\n # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults\n # to false.\n\n """\n add_to_environment = context.resource_config.get("add_to_environment", False)\n if add_to_environment:\n with SecretsManagerSecretsResource.from_resource_context(\n context\n ).secrets_in_environment() as secrets:\n yield secrets\n else:\n yield SecretsManagerSecretsResource.from_resource_context(context).fetch_secrets()
\n
", "current_page_name": "_modules/dagster_aws/secretsmanager/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.secretsmanager.resources"}}}, "dagster_azure": {"adls2": {"fake_adls2_resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.fake_adls2_resource

\nimport io\nimport random\nfrom typing import Any, Dict, Optional\nfrom unittest import mock\n\nfrom dagster import resource\nfrom dagster._config.pythonic_config import ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\n\nfrom dagster_azure.blob import FakeBlobServiceClient\n\nfrom .utils import ResourceNotFoundError\n\n\n@dagster_maintained_resource\n@resource({"account_name": str})\ndef fake_adls2_resource(context):\n    return FakeADLS2Resource(account_name=context.resource_config["account_name"])\n\n\n
[docs]class FakeADLS2Resource(ConfigurableResource):\n """Stateful mock of an ADLS2Resource for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n account_name: str\n storage_account: Optional[str] = None\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def adls2_client(self) -> "FakeADLS2ServiceClient":\n return FakeADLS2ServiceClient(self.account_name)\n\n @property\n @cached_method\n def blob_client(self) -> FakeBlobServiceClient:\n return FakeBlobServiceClient(self.account_name)\n\n @property\n def lease_client_constructor(self) -> Any:\n return FakeLeaseClient
\n\n\nclass FakeLeaseClient:\n def __init__(self, client):\n self.client = client\n self.id = None\n\n # client needs a ref to self to check if a given lease is valid\n self.client._lease = self # noqa: SLF001\n\n def acquire(self, lease_duration=-1):\n if self.id is None:\n self.id = random.randint(0, 2**9)\n else:\n raise Exception("Lease already held")\n\n def release(self):\n self.id = None\n\n def is_valid(self, lease):\n if self.id is None:\n # no lease is held so any operation is valid\n return True\n return lease == self.id\n\n\nclass FakeADLS2ServiceClient:\n """Stateful mock of an ADLS2 service client for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n def __init__(self, account_name, credential="fake-creds"):\n self._account_name = account_name\n self._credential = mock.MagicMock()\n self._credential.account_key = credential\n self._file_systems = {}\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def credential(self):\n return self._credential\n\n @property\n def file_systems(self):\n return self._file_systems\n\n def get_file_system_client(self, file_system):\n return self._file_systems.setdefault(\n file_system, FakeADLS2FilesystemClient(self.account_name, file_system)\n )\n\n def get_file_client(self, file_system, file_path):\n return self.get_file_system_client(file_system).get_file_client(file_path)\n\n\nclass FakeADLS2FilesystemClient:\n """Stateful mock of an ADLS2 filesystem client for testing."""\n\n def __init__(self, account_name, file_system_name):\n self._file_system: Dict[str, FakeADLS2FileClient] = {}\n self._account_name = account_name\n self._file_system_name = file_system_name\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def file_system_name(self):\n return self._file_system_name\n\n def keys(self):\n return self._file_system.keys()\n\n def get_file_system_properties(self):\n return {"account_name": 
self.account_name, "file_system_name": self.file_system_name}\n\n def has_file(self, path):\n return bool(self._file_system.get(path))\n\n def get_file_client(self, file_path):\n # pass fileclient a ref to self and its name so the file can delete itself\n self._file_system.setdefault(file_path, FakeADLS2FileClient(self, file_path))\n return self._file_system[file_path]\n\n def create_file(self, file):\n # pass fileclient a ref to self and the file's name so the file can delete itself by\n # accessing the self._file_system dict\n self._file_system.setdefault(file, FakeADLS2FileClient(fs_client=self, name=file))\n return self._file_system[file]\n\n def delete_file(self, file):\n for k in list(self._file_system.keys()):\n if k.startswith(file):\n del self._file_system[k]\n\n\nclass FakeADLS2FileClient:\n """Stateful mock of an ADLS2 file client for testing."""\n\n def __init__(self, name, fs_client):\n self.name = name\n self.contents = None\n self._lease = None\n self.fs_client = fs_client\n\n @property\n def lease(self):\n return self._lease if self._lease is None else self._lease.id\n\n def get_file_properties(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n lease_id = None if self._lease is None else self._lease.id\n return {"lease": lease_id}\n\n def upload_data(self, contents, overwrite=False, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n if self.contents is not None or overwrite is True:\n if isinstance(contents, str):\n self.contents = contents.encode("utf8")\n elif isinstance(contents, io.BytesIO):\n self.contents = contents.read()\n elif isinstance(contents, io.StringIO):\n self.contents = contents.read().encode("utf8")\n elif isinstance(contents, bytes):\n self.contents = contents\n else:\n self.contents = contents\n\n def download_file(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n return 
FakeADLS2FileDownloader(contents=self.contents)\n\n def delete_file(self, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n self.fs_client.delete_file(self.name)\n\n\nclass FakeADLS2FileDownloader:\n """Mock of an ADLS2 file downloader for testing."""\n\n def __init__(self, contents):\n self.contents = contents\n\n def readall(self):\n return self.contents\n\n def readinto(self, fileobj):\n fileobj.write(self.contents)\n
", "current_page_name": "_modules/dagster_azure/adls2/fake_adls2_resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.fake_adls2_resource"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class ADLS2FileHandle(FileHandle):\n """A reference to a file on ADLS2."""\n\n def __init__(self, account: str, file_system: str, key: str):\n self._account = check.str_param(account, "account")\n self._file_system = check.str_param(file_system, "file_system")\n self._key = check.str_param(key, "key")\n\n @property\n def account(self):\n """str: The name of the ADLS2 account."""\n return self._account\n\n @property\n def file_system(self):\n """str: The name of the ADLS2 file system."""\n return self._file_system\n\n @property\n def key(self):\n """str: The ADLS2 key."""\n return self._key\n\n @property\n def path_desc(self):\n """str: The file's ADLS2 URL."""\n return self.adls2_path\n\n @property\n def adls2_path(self):\n """str: The file's ADLS2 URL."""\n return f"adfss://{self.file_system}@{self.account}.dfs.core.windows.net/{self.key}"
\n\n\nclass ADLS2FileManager(FileManager):\n def __init__(self, adls2_client, file_system, prefix):\n self._client = adls2_client\n self._file_system = check.str_param(file_system, "file_system")\n self._prefix = check.str_param(prefix, "prefix")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n file = self._client.get_file_client(\n file_system=file_handle.file_system,\n file_path=file_handle.key,\n )\n download = file.download_file()\n with open(temp_name, "wb") as file_obj:\n download.readinto(file_obj)\n self._local_handle_cache[file_handle.adls2_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", ADLS2FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if "b" in mode else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.adls2_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.adls2_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n adls2_key = self.get_full_key(str(uuid.uuid4()) + (("." 
+ ext) if ext is not None else ""))\n adls2_file = self._client.get_file_client(\n file_system=self._file_system, file_path=adls2_key\n )\n adls2_file.upload_data(file_obj, overwrite=True)\n return ADLS2FileHandle(self._client.account_name, self._file_system, adls2_key)\n\n def get_full_key(self, file_key):\n return f"{self._prefix}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_azure/adls2/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.io_manager

\nimport pickle\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Union\n\nfrom dagster import (\n    InputContext,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._config.pythonic_config import ConfigurableIOManager\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom dagster_azure.adls2.resources import ADLS2Resource\nfrom dagster_azure.adls2.utils import ResourceNotFoundError\n\n_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectADLS2IOManager(UPathIOManager):\n    def __init__(\n        self,\n        file_system: Any,\n        adls2_client: Any,\n        blob_client: Any,\n        lease_client_constructor: Any,\n        prefix: str = "dagster",\n    ):\n        self.adls2_client = adls2_client\n        self.file_system_client = self.adls2_client.get_file_system_client(file_system)\n        # We also need a blob client to handle copying as ADLS doesn't have a copy API yet\n        self.blob_client = blob_client\n        self.blob_container_client = self.blob_client.get_container_client(file_system)\n        self.prefix = check.str_param(prefix, "prefix")\n\n        self.lease_client_constructor = lease_client_constructor\n        self.lease_duration = _LEASE_DURATION\n        self.file_system_client.get_file_system_properties()\n        super().__init__(base_path=UPath(self.prefix))\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return UPath("storage", run_id, "files", *output_parts)\n\n    def 
get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading ADLS2 object from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing ADLS2 object at: {self._uri_for_path(path)}"\n\n    def unlink(self, path: UPath) -> None:\n        file_client = self.file_system_client.get_file_client(str(path))\n        with self._acquire_lease(file_client, is_rm=True) as lease:\n            file_client.delete_file(lease=lease, recursive=True)\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in ADLS2\n        return None\n\n    def path_exists(self, path: UPath) -> bool:\n        try:\n            self.file_system_client.get_file_client(str(path)).get_file_properties()\n        except ResourceNotFoundError:\n            return False\n        return True\n\n    def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str:\n        return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format(\n            protocol=protocol,\n            filesystem=self.file_system_client.file_system_name,\n            account=self.file_system_client.account_name,\n            key=path,\n        )\n\n    @contextmanager\n    def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]:\n        lease_client = self.lease_client_constructor(client=client)\n        try:\n            lease_client.acquire(lease_duration=self.lease_duration)\n            yield lease_client.id\n        finally:\n            # cannot release a lease on a file that no longer exists, so need to check\n            if not is_rm:\n                lease_client.release()\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        if context.dagster_type.typing_type == type(None):\n            return None\n        file = self.file_system_client.get_file_client(str(path))\n        stream = file.download_file()\n        
return pickle.loads(stream.readall())\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing ADLS2 key: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        file = self.file_system_client.create_file(str(path))\n        with self._acquire_lease(file) as lease:\n            file.upload_data(pickled_obj, lease=lease, overwrite=True)\n\n\n
[docs]class ADLS2PickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return df[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": ADLS2PickleIOManager(\n adls2_file_system="my-cool-fs",\n adls2_prefix="my-cool-prefix"\n ),\n "adls2": adls2_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n @job(\n resource_defs={\n "io_manager": ADLS2PickleIOManager(\n adls2_file_system="my-cool-fs",\n adls2_prefix="my-cool-prefix"\n ),\n "adls2": adls2_resource,\n },\n )\n def my_job():\n ...\n """\n\n adls2: ResourceDependency[ADLS2Resource]\n adls2_file_system: str = Field(description="ADLS Gen2 file system name.")\n adls2_prefix: str = Field(\n default="dagster", description="ADLS Gen2 file system prefix to write to."\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _internal_io_manager(self) -> PickledObjectADLS2IOManager:\n return PickledObjectADLS2IOManager(\n self.adls2_file_system,\n self.adls2.adls2_client,\n self.adls2.blob_client,\n self.adls2.lease_client_constructor,\n self.adls2_prefix,\n )\n\n def load_input(self, context: "InputContext") -> Any:\n return self._internal_io_manager.load_input(context)\n\n def handle_output(self, context: "OutputContext", obj: Any) -> None:\n self._internal_io_manager.handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use GCSPickleIOManager instead.",\n)\nclass ConfigurablePickledObjectADLS2IOManager(ADLS2PickleIOManager):\n """Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=ADLS2PickleIOManager.to_config_schema(),\n required_resource_keys={"adls2"},\n)\ndef adls2_pickle_io_manager(init_context):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return df[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": adls2_pickle_io_manager.configured(\n {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n ),\n "adls2": adls2_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n @job(\n resource_defs={\n "io_manager": adls2_pickle_io_manager.configured(\n {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n ),\n "adls2": adls2_resource,\n },\n )\n def my_job():\n ...\n """\n adls_resource = init_context.resources.adls2\n adls2_client = adls_resource.adls2_client\n blob_client = adls_resource.blob_client\n lease_client = adls_resource.lease_client_constructor\n pickled_io_manager = PickledObjectADLS2IOManager(\n init_context.resource_config["adls2_file_system"],\n adls2_client,\n blob_client,\n lease_client,\n init_context.resource_config.get("adls2_prefix"),\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_azure/adls2/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.io_manager"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.resources

\nfrom typing import Any, Dict, Union\n\nfrom azure.identity import DefaultAzureCredential\nfrom azure.storage.filedatalake import DataLakeLeaseClient\nfrom dagster import (\n    Config,\n    ConfigurableResource,\n    Field as DagsterField,\n    Permissive,\n    Selector,\n    StringSource,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\nfrom typing_extensions import Literal\n\nfrom dagster_azure.blob.utils import BlobServiceClient, create_blob_client\n\nfrom .file_manager import ADLS2FileManager\nfrom .utils import DataLakeServiceClient, create_adls2_client\n\n\nclass ADLS2SASToken(Config):\n    credential_type: Literal["sas"] = "sas"\n    token: str\n\n\nclass ADLS2Key(Config):\n    credential_type: Literal["key"] = "key"\n    key: str\n\n\nclass ADLS2DefaultAzureCredential(Config):\n    credential_type: Literal["default_azure_credential"] = "default_azure_credential"\n    kwargs: Dict[str, Any]\n\n\nclass ADLS2BaseResource(ConfigurableResource):\n    storage_account: str = Field(description="The storage account name.")\n    credential: Union[ADLS2SASToken, ADLS2Key, ADLS2DefaultAzureCredential] = Field(\n        discriminator="credential_type", description="The credentials with which to authenticate."\n    )\n\n\nDEFAULT_AZURE_CREDENTIAL_CONFIG = DagsterField(\n    Permissive(\n        description="Uses DefaultAzureCredential to authenticate and passed as keyword arguments",\n    )\n)\n\nADLS2_CLIENT_CONFIG = {\n    "storage_account": DagsterField(StringSource, description="The storage account name."),\n    "credential": DagsterField(\n        Selector(\n            {\n                "sas": DagsterField(StringSource, description="SAS token for the account."),\n                "key": DagsterField(StringSource, description="Shared Access Key for the account."),\n               
 "DefaultAzureCredential": DEFAULT_AZURE_CREDENTIAL_CONFIG,\n            }\n        ),\n        description="The credentials with which to authenticate.",\n    ),\n}\n\n\n
[docs]class ADLS2Resource(ADLS2BaseResource):\n """Resource containing clients to access Azure Data Lake Storage Gen2.\n\n Contains a client for both the Data Lake and Blob APIs, to work around the limitations\n of each.\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _raw_credential(self) -> Any:\n if isinstance(self.credential, ADLS2Key):\n return self.credential.key\n elif isinstance(self.credential, ADLS2SASToken):\n return self.credential.token\n else:\n return DefaultAzureCredential(**self.credential.kwargs)\n\n @property\n @cached_method\n def adls2_client(self) -> DataLakeServiceClient:\n return create_adls2_client(self.storage_account, self._raw_credential)\n\n @property\n @cached_method\n def blob_client(self) -> BlobServiceClient:\n return create_blob_client(self.storage_account, self._raw_credential)\n\n @property\n def lease_client_constructor(self) -> Any:\n return DataLakeLeaseClient
\n\n\n# Due to a limitation of the discriminated union type, we can't directly mirror these old\n# config fields in the new resource config. Instead, we'll just use the old config fields\n# to construct the new config and then use that to construct the resource.\n
[docs]@dagster_maintained_resource\n@resource(ADLS2_CLIENT_CONFIG)\ndef adls2_resource(context):\n """Resource that gives ops access to Azure Data Lake Storage Gen2.\n\n The underlying client is a :py:class:`~azure.storage.filedatalake.DataLakeServiceClient`.\n\n Attach this resource definition to a :py:class:`~dagster.JobDefinition` in order to make it\n available to your ops.\n\n Example:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_azure.adls2 import adls2_resource\n\n @op(required_resource_keys={'adls2'})\n def example_adls2_op(context):\n return list(context.resources.adls2.adls2_client.list_file_systems())\n\n @job(resource_defs={"adls2": adls2_resource})\n def my_job():\n example_adls2_op()\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may pass credentials to this resource using either a SAS token, a key or by passing the\n `DefaultAzureCredential` object.\n\n .. code-block:: YAML\n\n resources:\n adls2:\n config:\n storage_account: my_storage_account\n # str: The storage account name.\n credential:\n sas: my_sas_token\n # str: the SAS token for the account.\n key:\n env: AZURE_DATA_LAKE_STORAGE_KEY\n # str: The shared access key for the account.\n DefaultAzureCredential: {}\n # dict: The keyword arguments used for DefaultAzureCredential\n # or leave the object empty for no arguments\n DefaultAzureCredential:\n exclude_environment_credential: true\n\n """\n return _adls2_resource_from_config(context.resource_config)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n merge_dicts(\n ADLS2_CLIENT_CONFIG,\n {\n "adls2_file_system": DagsterField(\n StringSource, description="ADLS Gen2 file system name"\n ),\n "adls2_prefix": DagsterField(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef adls2_file_manager(context):\n """FileManager that provides abstract access to ADLS2.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n adls2_client = _adls2_resource_from_config(context.resource_config).adls2_client\n\n return ADLS2FileManager(\n adls2_client=adls2_client,\n file_system=context.resource_config["adls2_file_system"],\n prefix=context.resource_config["adls2_prefix"],\n )
\n\n\ndef _adls2_resource_from_config(config) -> ADLS2Resource:\n """Args:\n config: A configuration containing the fields in ADLS2_CLIENT_CONFIG.\n\n Returns: An adls2 client.\n """\n storage_account = config["storage_account"]\n if "DefaultAzureCredential" in config["credential"]:\n credential = ADLS2DefaultAzureCredential(\n kwargs=config["credential"]["DefaultAzureCredential"]\n )\n elif "sas" in config["credential"]:\n credential = ADLS2SASToken(token=config["credential"]["sas"])\n else:\n credential = ADLS2Key(key=config["credential"]["key"])\n\n return ADLS2Resource(storage_account=storage_account, credential=credential)\n
", "current_page_name": "_modules/dagster_azure/adls2/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.resources"}}, "blob": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.blob.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._seven as seven\nfrom azure.identity import DefaultAzureCredential\nfrom dagster import (\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    _check as check,\n)\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom typing_extensions import Self\n\nfrom .utils import create_blob_client, generate_blob_sas\n\n\n
[docs]class AzureBlobComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to Azure Blob Storage.\n\n This is also compatible with Azure Data Lake Storage.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_azure.blob.compute_log_manager\n class: AzureBlobComputeLogManager\n config:\n storage_account: my-storage-account\n container: my-container\n credential: sas-token-or-secret-key\n default_azure_credential:\n exclude_environment_credential: true\n prefix: "dagster-test-"\n local_dir: "/tmp/cool"\n upload_interval: 30\n\n Args:\n storage_account (str): The storage account name to which to log.\n container (str): The container (or ADLS2 filesystem) to which to log.\n secret_key (Optional[str]): Secret key for the storage account. SAS tokens are not\n supported because we need a secret key to generate a SAS token for a download URL.\n default_azure_credential (Optional[dict]): Use and configure DefaultAzureCredential.\n Cannot be used with sas token or secret key config.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files blob storage. 
By default, will only upload when the capture is complete.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n storage_account,\n container,\n secret_key=None,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n upload_interval=None,\n default_azure_credential=None,\n ):\n self._storage_account = check.str_param(storage_account, "storage_account")\n self._container = check.str_param(container, "container")\n self._blob_prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n self._default_azure_credential = check.opt_dict_param(\n default_azure_credential, "default_azure_credential"\n )\n check.opt_str_param(secret_key, "secret_key")\n check.invariant(\n secret_key is not None or default_azure_credential is not None,\n "Missing config: need to provide one of secret_key or default_azure_credential",\n )\n\n if default_azure_credential is None:\n self._blob_client = create_blob_client(storage_account, secret_key)\n else:\n credential = DefaultAzureCredential(**self._default_azure_credential)\n self._blob_client = create_blob_client(storage_account, credential)\n\n self._container_client = self._blob_client.get_container_client(container)\n self._download_urls = {}\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, dagster_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs(dagster_run, 
step_key): # noqa: SLF001\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "storage_account": StringSource,\n "container": StringSource,\n "secret_key": Field(StringSource, is_required=False),\n "default_azure_credential": Field(\n Noneable(Permissive(description="keyword arguments for DefaultAzureCredential")),\n is_required=False,\n default_value=None,\n ),\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return AzureBlobComputeLogManager(inst_data=inst_data, **config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _blob_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._blob_prefix, "storage", *namespace, filename]\n return "/".join(paths) # blob path delimiter\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self.local_manager.delete_logs(log_key=log_key, prefix=prefix)\n if log_key:\n prefix_path = "/".join([self._blob_prefix, "storage", *log_key])\n elif prefix:\n # add the trailing '/' to make sure that ['a'] does not match ['apple']\n prefix_path = "/".join([self._blob_prefix, "storage", *prefix, 
""])\n else:\n prefix_path = None\n\n blob_list = {\n b.name for b in list(self._container_client.list_blobs(name_starts_with=prefix_path))\n }\n\n to_remove = None\n if log_key:\n # filter to the known set of keys\n known_keys = [\n self._blob_key(log_key, ComputeIOType.STDOUT),\n self._blob_key(log_key, ComputeIOType.STDERR),\n self._blob_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._blob_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n to_remove = [key for key in known_keys if key in blob_list]\n elif prefix:\n to_remove = list(blob_list)\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n if to_remove:\n self._container_client.delete_blobs(*to_remove)\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n blob_key = self._blob_key(log_key, io_type)\n if blob_key in self._download_urls:\n return self._download_urls[blob_key]\n blob = self._container_client.get_blob_client(blob_key)\n sas = generate_blob_sas(\n self._storage_account,\n self._container,\n blob_key,\n account_key=self._blob_client.credential.account_key,\n )\n url = blob.url + sas\n self._download_urls[blob_key] = url\n return url\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n\n blob_key = self._blob_key(log_key, io_type)\n return f"https://{self._storage_account}.blob.core.windows.net/{self._container}/{blob_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n blob_objects = self._container_client.list_blobs(blob_key)\n exact_matches = [blob for blob in blob_objects if blob.name == blob_key]\n return len(exact_matches) > 0\n\n def 
upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n blob = self._container_client.get_blob_client(blob_key)\n blob.upload_blob(data)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n blob = self._container_client.get_blob_client(blob_key)\n blob.download_blob().readinto(fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)
\n
", "current_page_name": "_modules/dagster_azure/blob/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.blob.compute_log_manager"}}}, "dagster_celery": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery.executor

\nfrom dagster import (\n    Executor,\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._grpc.types import ExecuteStepArgs\nfrom dagster._serdes import pack_value\n\nfrom .config import DEFAULT_CONFIG, dict_wrapper\nfrom .defaults import broker_url, result_backend\n\nCELERY_CONFIG = {\n    "broker": Field(\n        Noneable(StringSource),\n        is_required=False,\n        description=(\n            "The URL of the Celery broker. Default: "\n            "'pyamqp://guest@{os.getenv('DAGSTER_CELERY_BROKER_HOST',"\n            "'localhost')}//'."\n        ),\n    ),\n    "backend": Field(\n        Noneable(StringSource),\n        is_required=False,\n        default_value="rpc://",\n        description="The URL of the Celery results backend. Default: 'rpc://'.",\n    ),\n    "include": Field(\n        [str], is_required=False, description="List of modules every worker should import"\n    ),\n    "config_source": Field(\n        Noneable(Permissive()),\n        is_required=False,\n        description="Additional settings for the Celery app.",\n    ),\n    "retries": get_retries_config(),\n}\n\n\n
[docs]@executor(\n name="celery",\n config_schema=CELERY_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_executor(init_context):\n """Celery-based executor.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery import celery_executor\n\n @job(executor_def=celery_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... 
# argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n """\n return CeleryExecutor(\n broker=init_context.executor_config.get("broker"),\n backend=init_context.executor_config.get("backend"),\n config_source=init_context.executor_config.get("config_source"),\n include=init_context.executor_config.get("include"),\n retries=RetryMode.from_config(init_context.executor_config["retries"]),\n )
\n\n\ndef _submit_task(app, plan_context, step, queue, priority, known_state):\n from .tasks import create_task\n\n execute_step_args = ExecuteStepArgs(\n job_origin=plan_context.reconstructable_job.get_python_origin(),\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n print_serialized_events=True, # Not actually checked by the celery task\n )\n\n task = create_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n executable_dict=plan_context.reconstructable_job.to_dict(),\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_plan",\n )\n\n\nclass CeleryExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self._retries = check.inst_param(retries, "retries", RetryMode)\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from .core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task\n )\n\n @staticmethod\n def for_cli(broker=None, backend=None, include=None, config_source=None):\n return CeleryExecutor(\n retries=RetryMode(RetryMode.DISABLED),\n broker=broker,\n backend=backend,\n include=include,\n config_source=config_source,\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": 
self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n
", "current_page_name": "_modules/dagster_celery/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery.executor"}}, "dagster_celery_docker": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_docker.executor

\nimport os\n\nimport docker.client\nfrom dagster import (\n    DagsterInstance,\n    Executor,\n    Field,\n    Permissive,\n    StringSource,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._cli.api import ExecuteStepArgs\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._serdes import pack_value, serialize_value, unpack_value\nfrom dagster._utils.merger import merge_dicts\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER, core_celery_execution_loop\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_celery.executor import CELERY_CONFIG\n\nCELERY_DOCKER_CONFIG_KEY = "celery-docker"\n\n\ndef celery_docker_config():\n    additional_config = {\n        "docker": Field(\n            {\n                "image": Field(\n                    StringSource,\n                    is_required=False,\n                    description="The docker image to be used for step execution.",\n                ),\n                "registry": Field(\n                    {\n                        "url": Field(StringSource),\n                        "username": Field(StringSource),\n                        "password": Field(StringSource),\n                    },\n                    is_required=False,\n                    description="Information for using a non local/public docker registry",\n                ),\n                "env_vars": Field(\n                    [str],\n                    is_required=False,\n                    description=(\n                        "The list of environment variables names to forward from the celery worker"\n                        " in to the docker container"\n                    ),\n             
   ),\n                "network": Field(\n                    str,\n                    is_required=False,\n                    description=(\n                        "Name of the network this container will be connected to at creation time"\n                    ),\n                ),\n                "container_kwargs": Field(\n                    Permissive(),\n                    is_required=False,\n                    description="Additional keyword args for the docker container",\n                ),\n            },\n            is_required=True,\n            description="The configuration for interacting with docker in the celery worker.",\n        ),\n    }\n\n    cfg = merge_dicts(CELERY_CONFIG, additional_config)\n    return cfg\n\n\n
[docs]@executor(\n name=CELERY_DOCKER_CONFIG_KEY,\n config_schema=celery_docker_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_docker_executor(init_context):\n """Celery-based executor which launches tasks in docker containers.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_docker_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery_docker.executor import celery_docker_executor\n\n @job(executor_def=celery_docker_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n docker:\n image: 'my_repo.com/image_name:latest'\n registry:\n url: 'my_repo.com'\n username: 'my_user'\n password: {env: 'DOCKER_PASSWORD'}\n env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n container_kwargs: # keyword args to be passed to the container. 
example:\n volumes: ['/home/user1/:/mnt/vol2','/var/www:/mnt/vol1']\n\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_docker_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_docker.app` argument.\n """\n exc_cfg = init_context.executor_config\n\n return CeleryDockerExecutor(\n broker=exc_cfg.get("broker"),\n backend=exc_cfg.get("backend"),\n config_source=exc_cfg.get("config_source"),\n include=exc_cfg.get("include"),\n retries=RetryMode.from_config(exc_cfg.get("retries")),\n docker_config=exc_cfg.get("docker"),\n )
\n\n\nclass CeleryDockerExecutor(Executor):\n def __init__(\n self,\n retries,\n docker_config,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.docker_config = check.dict_param(docker_config, "docker_config")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_docker\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_docker(app, plan_context, step, queue, priority, known_state):\n execute_step_args = ExecuteStepArgs(\n job_origin=plan_context.reconstructable_job.get_python_origin(),\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n print_serialized_events=True,\n )\n\n task = create_docker_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n docker_config=plan_context.executor.docker_config,\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_step_docker",\n )\n\n\ndef create_docker_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_docker", **task_kwargs)\n def _execute_step_docker(\n self,\n execute_step_args_packed,\n 
docker_config,\n ):\n """Run step execution in a Docker container."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n ),\n as_type=ExecuteStepArgs,\n )\n\n check.dict_param(docker_config, "docker_config")\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n dagster_run = instance.get_run_by_id(execute_step_args.run_id)\n check.inst(\n dagster_run,\n DagsterRun,\n f"Could not load run {execute_step_args.run_id}",\n )\n step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)\n\n docker_image = (\n docker_config["image"]\n if docker_config.get("image")\n else dagster_run.job_code_origin.repository_origin.container_image\n )\n\n if not docker_image:\n raise Exception("No docker image specified by either the job or the repository")\n\n client = docker.client.from_env()\n\n if docker_config.get("registry"):\n client.login(\n registry=docker_config["registry"]["url"],\n username=docker_config["registry"]["username"],\n password=docker_config["registry"]["password"],\n )\n\n # Post event for starting execution\n engine_event = instance.report_engine_event(\n f"Executing steps {step_keys_str} in Docker container {docker_image}",\n dagster_run,\n EngineEventData(\n {\n "Step keys": step_keys_str,\n "Image": docker_image,\n "Celery worker": self.request.hostname,\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n\n serialized_events = [serialize_value(engine_event)]\n\n docker_env = {}\n if docker_config.get("env_vars"):\n docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}\n\n container_kwargs = check.opt_dict_param(\n docker_config.get("container_kwargs"), "container_kwargs", key_type=str\n )\n\n # set defaults for detach and auto_remove\n container_kwargs["detach"] = container_kwargs.get("detach", False)\n container_kwargs["auto_remove"] = 
container_kwargs.get("auto_remove", True)\n\n # if environment variables are provided via container_kwargs, merge with env_vars\n if container_kwargs.get("environment") is not None:\n e_vars = container_kwargs.get("environment")\n if isinstance(e_vars, dict):\n docker_env.update(e_vars)\n else:\n for v in e_vars:\n key, val = v.split("=")\n docker_env[key] = val\n del container_kwargs["environment"]\n\n try:\n docker_response = client.containers.run(\n docker_image,\n command=execute_step_args.get_command_args(),\n # pass through this worker's environment for things like AWS creds etc.\n environment=docker_env,\n network=docker_config.get("network", None),\n **container_kwargs,\n )\n\n res = docker_response.decode("utf-8")\n except docker.errors.ContainerError as err:\n metadata = {"Job image": docker_image}\n if err.stderr is not None:\n metadata["Docker stderr"] = err.stderr\n\n instance.report_engine_event(\n f"Failed to run steps {step_keys_str} in Docker container {docker_image}",\n dagster_run,\n EngineEventData(metadata),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n raise\n else:\n if res is None:\n raise Exception("No response from execute_step in CeleryDockerExecutor")\n\n events = filter_dagster_events_from_cli_logs(res.split("\\n"))\n serialized_events += [serialize_value(event) for event in events]\n\n return serialized_events\n\n return _execute_step_docker\n
", "current_page_name": "_modules/dagster_celery_docker/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_docker.executor"}}, "dagster_celery_k8s": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_k8s.executor

\nimport logging\nimport os\nimport sys\nimport time\n\nimport kubernetes\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    DagsterInstance,\n    Executor,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._cli.api import ExecuteStepArgs\nfrom dagster._core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster._core.execution.plan.objects import StepFailureData, UserFailureData\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import pack_value, serialize_value, unpack_value\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_k8s import DagsterK8sJobConfig, construct_dagster_k8s_job\nfrom dagster_k8s.client import (\n    DagsterK8sAPIRetryLimitExceeded,\n    DagsterK8sError,\n    DagsterK8sJobStatusException,\n    DagsterK8sTimeoutError,\n    DagsterK8sUnrecoverableAPIError,\n    DagsterKubernetesClient,\n)\nfrom dagster_k8s.job import (\n    UserDefinedDagsterK8sConfig,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\nfrom .launcher import CeleryK8sRunLauncher\n\n\n
[docs]@executor(\n name=CELERY_K8S_CONFIG_KEY,\n config_schema=celery_k8s_executor_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_k8s_job_executor(init_context):\n """Celery-based executor which launches tasks as Kubernetes Jobs.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute dagster jobs\n with variations on these settings.\n\n To use the `celery_k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-celery-k8s/dagster_celery_k8s_tests/example_celery_mode_def.py\n :language: python\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_image: 'my_repo.com/image_name:latest'\n job_namespace: 'some-namespace'\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. 
This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_k8s.app` argument.\n """\n run_launcher = init_context.instance.run_launcher\n exc_cfg = init_context.executor_config\n\n if not isinstance(run_launcher, CeleryK8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a CeleryK8sRunLauncher; configure the "\n "CeleryK8sRunLauncher on your instance to use it.",\n )\n\n job_config = run_launcher.get_k8s_job_config(\n job_image=exc_cfg.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"), exc_config=exc_cfg\n )\n\n # Set on the instance but overrideable here\n broker = run_launcher.broker or exc_cfg.get("broker")\n backend = run_launcher.backend or exc_cfg.get("backend")\n config_source = run_launcher.config_source or exc_cfg.get("config_source")\n include = run_launcher.include or exc_cfg.get("include")\n retries = run_launcher.retries or RetryMode.from_config(exc_cfg.get("retries"))\n\n return CeleryK8sJobExecutor(\n broker=broker,\n backend=backend,\n config_source=config_source,\n include=include,\n retries=retries,\n job_config=job_config,\n job_namespace=exc_cfg.get("job_namespace", run_launcher.job_namespace),\n load_incluster_config=exc_cfg.get("load_incluster_config"),\n kubeconfig_file=exc_cfg.get("kubeconfig_file"),\n repo_location_name=exc_cfg.get("repo_location_name"),\n job_wait_timeout=exc_cfg.get("job_wait_timeout"),\n )
\n\n\nclass CeleryK8sJobExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n job_config=None,\n job_namespace=None,\n load_incluster_config=False,\n kubeconfig_file=None,\n repo_location_name=None,\n job_wait_timeout=None,\n ):\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.job_config = check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = check.bool_param(\n load_incluster_config, "load_incluster_config"\n )\n\n self.kubeconfig_file = check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n self.repo_location_name = check.opt_str_param(repo_location_name, "repo_location_name")\n self.job_wait_timeout = check.float_param(job_wait_timeout, "job_wait_timeout")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from dagster_celery.core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_k8s_job\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_k8s_job(app, plan_context, step, queue, 
priority, known_state):\n user_defined_k8s_config = get_user_defined_k8s_config(step.tags)\n\n job_origin = plan_context.reconstructable_job.get_python_origin()\n\n execute_step_args = ExecuteStepArgs(\n job_origin=job_origin,\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n should_verify_step=True,\n print_serialized_events=True,\n )\n\n job_config = plan_context.executor.job_config\n if not job_config.job_image:\n job_config = job_config.with_image(job_origin.repository_origin.container_image)\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the dagster job")\n\n task = create_k8s_job_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n job_config_dict=job_config.to_dict(),\n job_namespace=plan_context.executor.job_namespace,\n user_defined_k8s_config_dict=user_defined_k8s_config.to_dict(),\n load_incluster_config=plan_context.executor.load_incluster_config,\n job_wait_timeout=plan_context.executor.job_wait_timeout,\n kubeconfig_file=plan_context.executor.kubeconfig_file,\n )\n\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_step_k8s_job",\n )\n\n\ndef construct_step_failure_event_and_handle(dagster_run, step_key, err, instance):\n step_failure_event = DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n job_name=dagster_run.job_name,\n step_key=step_key,\n event_specific_data=StepFailureData(\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n user_failure_data=UserFailureData(label="K8sError"),\n ),\n )\n event_record = EventLogEntry(\n user_message=str(err),\n level=logging.ERROR,\n job_name=dagster_run.job_name,\n run_id=dagster_run.run_id,\n error_info=None,\n step_key=step_key,\n timestamp=time.time(),\n 
dagster_event=step_failure_event,\n )\n instance.handle_new_event(event_record)\n return step_failure_event\n\n\ndef create_k8s_job_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_k8s_job", **task_kwargs)\n def _execute_step_k8s_job(\n self,\n execute_step_args_packed,\n job_config_dict,\n job_namespace,\n load_incluster_config,\n job_wait_timeout,\n user_defined_k8s_config_dict=None,\n kubeconfig_file=None,\n ):\n """Run step execution in a K8s job pod."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n )\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n check.invariant(\n len(execute_step_args.step_keys_to_execute) == 1,\n "Celery K8s task executor can only execute 1 step at a time",\n )\n\n # Celery will serialize this as a list\n job_config = DagsterK8sJobConfig.from_dict(job_config_dict)\n check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n check.str_param(job_namespace, "job_namespace")\n\n check.bool_param(load_incluster_config, "load_incluster_config")\n\n user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(\n user_defined_k8s_config_dict\n )\n check.opt_inst_param(\n user_defined_k8s_config,\n "user_defined_k8s_config",\n UserDefinedDagsterK8sConfig,\n )\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n # For when launched via DinD or running the cluster\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n api_client = DagsterKubernetesClient.production_client()\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n dagster_run = instance.get_run_by_id(execute_step_args.run_id)\n\n check.inst(\n dagster_run,\n DagsterRun,\n f"Could not load run {execute_step_args.run_id}",\n )\n step_key = execute_step_args.step_keys_to_execute[0]\n\n celery_worker_name = self.request.hostname\n 
celery_pod_name = os.environ.get("HOSTNAME")\n instance.report_engine_event(\n f"Task for step {step_key} picked up by Celery",\n dagster_run,\n EngineEventData(\n {\n "Celery worker name": celery_worker_name,\n "Celery worker Kubernetes Pod name": celery_pod_name,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n if dagster_run.status != DagsterRunStatus.STARTED:\n instance.report_engine_event(\n "Not scheduling step because dagster run status is not STARTED",\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Ensure we stay below k8s name length limits\n k8s_name_key = get_k8s_job_name(execute_step_args.run_id, step_key)\n\n retry_state = execute_step_args.known_state.get_retry_state()\n\n if retry_state.get_attempt_count(step_key):\n attempt_number = retry_state.get_attempt_count(step_key)\n job_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n pod_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n else:\n job_name = "dagster-step-%s" % (k8s_name_key)\n pod_name = "dagster-step-%s" % (k8s_name_key)\n\n args = execute_step_args.get_command_args()\n\n labels = {\n "dagster/job": dagster_run.job_name,\n "dagster/op": step_key,\n "dagster/run-id": execute_step_args.run_id,\n }\n if dagster_run.external_job_origin:\n labels["dagster/code-location"] = (\n dagster_run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n job = construct_dagster_k8s_job(\n job_config,\n args,\n job_name,\n user_defined_k8s_config,\n pod_name,\n component="step_worker",\n labels=labels,\n env_vars=[\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": dagster_run.job_name,\n },\n {"name": "DAGSTER_RUN_STEP_KEY", "value": step_key},\n ],\n )\n\n # Running list of events generated from this task execution\n events = []\n\n # Post event for starting execution\n job_name = job.metadata.name\n engine_event = instance.report_engine_event(\n 
f'Executing step "{step_key}" in Kubernetes job {job_name}.',\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n "Job image": job_config.job_image,\n "Image pull policy": job_config.image_pull_policy,\n "Image pull secrets": str(job_config.image_pull_secrets),\n "Service account name": str(job_config.service_account_name),\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n # validated above that step_keys is length 1, and it is not possible to use ETH or\n # execution plan in this function (Celery K8s workers should not access to user code)\n step_key=step_key,\n )\n events.append(engine_event)\n try:\n api_client.batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n if e.reason == "Conflict":\n # There is an existing job with the same name so proceed and see if the existing job succeeded\n instance.report_engine_event(\n "Did not create Kubernetes job {} for step {} since job name already "\n "exists, proceeding with existing job.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n else:\n instance.report_engine_event(\n "Encountered unexpected error while creating Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n api_client.wait_for_job_success(\n job_name=job_name,\n namespace=job_namespace,\n instance=instance,\n run_id=execute_step_args.run_id,\n wait_timeout=job_wait_timeout,\n )\n except (DagsterK8sError, DagsterK8sTimeoutError) as err:\n step_failure_event = construct_step_failure_event_and_handle(\n dagster_run, step_key, err, 
instance=instance\n )\n events.append(step_failure_event)\n except DagsterK8sJobStatusException:\n instance.report_engine_event(\n "Terminating Kubernetes Job because dagster run status is not STARTED",\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n "Kubernetes Job namespace": job_namespace,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n api_client.delete_job(job_name=job_name, namespace=job_namespace)\n return []\n except (\n DagsterK8sUnrecoverableAPIError,\n DagsterK8sAPIRetryLimitExceeded,\n # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in\n # a retry boundary. We still catch it here just in case we missed one so that we can\n # report it to the event log\n kubernetes.client.rest.ApiException,\n ):\n instance.report_engine_event(\n "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n pod_names = api_client.get_pod_names_in_job(job_name, namespace=job_namespace)\n except kubernetes.client.rest.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Post engine event for log retrieval\n engine_event = instance.report_engine_event(\n "Retrieving logs from Kubernetes Job pods",\n dagster_run,\n EngineEventData({"Pod names": "\\n".join(pod_names)}),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n events.append(engine_event)\n\n logs = []\n for pod_name in pod_names:\n try:\n 
raw_logs = api_client.retrieve_pod_logs(pod_name, namespace=job_namespace)\n logs += raw_logs.split("\\n")\n except kubernetes.client.exceptions.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "\n "Pod name {} for step {}. Will attempt to continue with other pods.".format(\n job_name, pod_name, step_key\n ),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n events += filter_dagster_events_from_cli_logs(logs)\n serialized_events = [serialize_value(event) for event in events]\n return serialized_events\n\n return _execute_step_k8s_job\n
", "current_page_name": "_modules/dagster_celery_k8s/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_k8s.executor"}, "launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_k8s.launcher

\nimport sys\nfrom typing import Optional, cast\n\nimport kubernetes\nfrom dagster import (\n    DagsterInvariantViolationError,\n    _check as check,\n)\nfrom dagster._config import process_config, resolve_to_config_type\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.launcher import LaunchRunContext, RunLauncher\nfrom dagster._core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.merger import merge_dicts\nfrom dagster_k8s.client import DagsterKubernetesClient\nfrom dagster_k8s.job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_job_name_from_run_id,\n    get_user_defined_k8s_config,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\n\n\n
[docs]class CeleryK8sRunLauncher(RunLauncher, ConfigurableClass):\n """In contrast to the :py:class:`K8sRunLauncher`, which launches dagster runs as single K8s\n Jobs, this run launcher is intended for use in concert with\n :py:func:`dagster_celery_k8s.celery_k8s_job_executor`.\n\n With this run launcher, execution is delegated to:\n\n 1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\n submits steps to Celery queues for execution;\n 2. The step executions which are submitted to Celery queues are picked up by Celery workers,\n and each step execution spawns a step execution Kubernetes Job. See the implementation\n defined in :py:func:`dagster_celery_k8.executor.create_k8s_job_task`.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: CeleryK8sRunLauncher\n config:\n instance_config_map: "dagster-k8s-instance-config-map"\n dagster_home: "/some/path"\n postgres_password_secret: "dagster-k8s-pg-password"\n broker: "some_celery_broker_url"\n backend: "some_celery_backend_url"\n\n """\n\n def __init__(\n self,\n instance_config_map,\n dagster_home,\n postgres_password_secret,\n load_incluster_config=True,\n kubeconfig_file=None,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n retries=None,\n inst_data: Optional[ConfigurableClassData] = None,\n k8s_client_batch_api=None,\n env_config_maps=None,\n env_secrets=None,\n volume_mounts=None,\n volumes=None,\n service_account_name=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n labels=None,\n fail_pod_on_run_failure=None,\n job_namespace=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n 
kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self.postgres_password_secret = check.str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self.broker = check.opt_str_param(broker, "broker")\n self.backend = check.opt_str_param(backend, "backend")\n self.include = check.opt_list_param(include, "include")\n self.config_source = check.opt_dict_param(config_source, "config_source")\n\n retries = check.opt_dict_param(retries, "retries") or {"enabled": {}}\n self.retries = RetryMode.from_config(retries)\n\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n\n self._service_account_name = check.opt_str_param(\n service_account_name, "service_account_name"\n )\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace", default="default")\n\n super().__init__()\n\n @classmethod\n def config_type(cls):\n from dagster_celery.executor import CELERY_CONFIG\n\n return 
merge_dicts(DagsterK8sJobConfig.config_type_run_launcher(), CELERY_CONFIG)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n\n job_name = get_job_name_from_run_id(run.run_id)\n pod_name = job_name\n exc_config = _get_validated_celery_k8s_executor_config(run.run_config)\n\n job_image_from_executor_config = exc_config.get("job_image")\n\n job_origin = cast(JobPythonOrigin, context.job_code_origin)\n repository_origin = job_origin.repository_origin\n\n job_image = repository_origin.container_image\n\n if job_image:\n if job_image_from_executor_config:\n job_image = job_image_from_executor_config\n self._instance.report_engine_event(\n f"You have specified a job_image {job_image_from_executor_config} in your"\n f" executor configuration, but also {job_image} in your user-code"\n f" deployment. Using the job image {job_image_from_executor_config} from"\n " executor configuration as it takes precedence.",\n run,\n cls=self.__class__,\n )\n else:\n if not job_image_from_executor_config:\n raise DagsterInvariantViolationError(\n "You have not specified a job_image in your executor configuration. To resolve"\n " this error, specify the job_image configuration in the executor config"\n " section in your run config. \\nNote: You may also be seeing this error because"\n " you are using the configured API. 
Using configured with the celery-k8s"\n " executor is not supported at this time, and the job_image must be configured"\n " at the top-level executor config without using configured."\n )\n\n job_image = job_image_from_executor_config\n\n job_config = self.get_k8s_job_config(job_image, exc_config)\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_config.job_image},\n )\n\n user_defined_k8s_config = get_user_defined_k8s_config(run.tags)\n\n from dagster._cli.api import ExecuteRunArgs\n\n run_args = ExecuteRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n labels = {\n "dagster/job": job_origin.job_name,\n "dagster/run-id": run.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config,\n args=run_args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n env_vars=[{"name": "DAGSTER_RUN_JOB_NAME", "value": job_origin.job_name}],\n )\n\n job_namespace = exc_config.get("job_namespace", self.job_namespace)\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": job_namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n self._api_client.batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": job_namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n def get_k8s_job_config(self, job_image, exc_config):\n return DagsterK8sJobConfig(\n 
dagster_home=self.dagster_home,\n instance_config_map=self.instance_config_map,\n postgres_password_secret=self.postgres_password_secret,\n job_image=check.opt_str_param(job_image, "job_image"),\n image_pull_policy=exc_config.get("image_pull_policy", self._image_pull_policy),\n image_pull_secrets=exc_config.get("image_pull_secrets", []) + self._image_pull_secrets,\n service_account_name=exc_config.get("service_account_name", self._service_account_name),\n env_config_maps=exc_config.get("env_config_maps", []) + self._env_config_maps,\n env_secrets=exc_config.get("env_secrets", []) + self._env_secrets,\n volume_mounts=exc_config.get("volume_mounts", []) + self._volume_mounts,\n volumes=exc_config.get("volumes", []) + self._volumes,\n labels=merge_dicts(self._labels, exc_config.get("labels", {})),\n )\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n job_name = get_job_name_from_run_id(run_id)\n\n job_namespace = self.get_namespace_from_run_config(run_id)\n\n try:\n termination_result = self._api_client.delete_job(\n job_name=job_name, namespace=job_namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Dagster Job was terminated successfully.",\n dagster_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message=(\n "Dagster Job was not terminated successfully; delete_job returned {}"\n .format(termination_result)\n ),\n dagster_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message=(\n "Dagster Job was not terminated successfully; encountered error in delete_job"\n ),\n dagster_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n def get_namespace_from_run_config(self, run_id):\n 
check.str_param(run_id, "run_id")\n\n dagster_run = self._instance.get_run_by_id(run_id)\n run_config = dagster_run.run_config\n executor_config = _get_validated_celery_k8s_executor_config(run_config)\n return executor_config.get("job_namespace", self.job_namespace)\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: DagsterRun):\n job_namespace = _get_validated_celery_k8s_executor_config(run.run_config).get(\n "job_namespace", self.job_namespace\n )\n job_name = get_job_name_from_run_id(run.run_id)\n try:\n status = self._api_client.get_job_status(namespace=job_namespace, job_name=job_name)\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n if status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n\n\ndef _get_validated_celery_k8s_executor_config(run_config):\n check.dict_param(run_config, "run_config")\n\n executor_config = run_config.get("execution", {})\n execution_config_schema = resolve_to_config_type(celery_k8s_executor_config())\n\n # In run config on jobs, we don't have an executor key\n if CELERY_K8S_CONFIG_KEY not in executor_config:\n execution_run_config = executor_config.get("config", {})\n else:\n execution_run_config = (run_config["execution"][CELERY_K8S_CONFIG_KEY] or {}).get(\n "config", {}\n )\n\n res = process_config(execution_config_schema, execution_run_config)\n\n check.invariant(\n res.success,\n "Incorrect execution schema provided. Note: You may also be seeing this error "\n "because you are using the configured API. "\n "Using configured with the {config_key} executor is not supported at this time, "\n "and all executor config must be directly in the run config without using configured."\n .format(\n config_key=CELERY_K8S_CONFIG_KEY,\n ),\n )\n\n return res.value\n
", "current_page_name": "_modules/dagster_celery_k8s/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_k8s.launcher"}}, "dagster_census": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.ops

\nfrom dagster import Array, Bool, Field, In, Noneable, Nothing, Out, Output, op\n\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import CensusOutput\nfrom .utils import generate_materialization\n\n\n
[docs]@op(\n required_resource_keys={"census"},\n ins={"start_after": In(Nothing)},\n out=Out(\n CensusOutput,\n description=(\n "Parsed json dictionary representing the details of the Census sync after "\n "the sync successfully completes."\n ),\n ),\n config_schema={\n "sync_id": Field(\n int,\n is_required=True,\n description="Id of the parent sync.",\n ),\n "force_full_sync": Field(\n config=Bool,\n default_value=False,\n description=(\n "If this trigger request should be a Full Sync. "\n "Note that some sync configurations such as Append do not support full syncs."\n ),\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) to wait between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description=(\n "The maximum time to wait before this operation is timed out. By "\n "default, this will never time out."\n ),\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Census sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["census"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "census"},\n)\ndef census_trigger_sync_op(context):\n """Executes a Census sync for a given ``sync_id`` and polls until that sync completes, raising\n an error if it is unsuccessful.\n\n It outputs a :py:class:`~dagster_census.CensusOutput` which contains the details of the Census\n sync after it successfully completes.\n\n It requires the use of the :py:class:`~dagster_census.census_resource`, which allows it to\n communicate with the Census API.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_census import census_resource, census_sync_op\n\n my_census_resource = census_resource.configured(\n {\n "api_key": {"env": "CENSUS_API_KEY"},\n }\n )\n\n sync_foobar = census_sync_op.configured({"sync_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"census": my_census_resource})\n def my_simple_census_job():\n sync_foobar()\n\n """\n census_output = context.resources.census.trigger_sync_and_poll(\n sync_id=context.op_config["sync_id"],\n force_full_sync=context.op_config["force_full_sync"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n yield generate_materialization(\n census_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(census_output)
\n
", "current_page_name": "_modules/dagster_census/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Mapping, Optional\n\nimport requests\nfrom dagster import Failure, Field, StringSource, __version__, get_dagster_logger, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom .types import CensusOutput\n\nCENSUS_API_BASE = "app.getcensus.com/api"\nCENSUS_VERSION = "v1"\n\nDEFAULT_POLL_INTERVAL = 10\n\nSYNC_RUN_STATUSES = {"completed", "failed", "queued", "skipped", "working"}\n\n\n
[docs]class CensusResource:\n """This class exposes methods on top of the Census REST API."""\n\n def __init__(\n self,\n api_key: str,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n log: logging.Logger = get_dagster_logger(),\n ):\n self.api_key = api_key\n\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._log = log\n\n @property\n def _api_key(self):\n if self.api_key.startswith("secret-token:"):\n return self.api_key\n return "secret-token:" + self.api_key\n\n @property\n def api_base_url(self) -> str:\n return f"https://{CENSUS_API_BASE}/{CENSUS_VERSION}"\n\n def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Creates and sends a request to the desired Census API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").\n endpoint (str): The Census API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n url = f"{self.api_base_url}/{endpoint}"\n headers = {\n "User-Agent": f"dagster-census/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n auth=HTTPBasicAuth("bearer", self._api_key),\n data=data,\n )\n response.raise_for_status()\n return response.json()\n except RequestException as e:\n self._log.error("Request to Census API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure(f"Max retries ({self._request_max_retries}) exceeded with url: {url}.")\n\n def get_sync(self, sync_id: int) -> Mapping[str, Any]:\n """Gets details about a given sync from the Census API.\n\n Args:\n sync_id (int): The Census 
Sync ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"syncs/{sync_id}")\n\n def get_source(self, source_id: int) -> Mapping[str, Any]:\n """Gets details about a given source from the Census API.\n\n Args:\n source_id (int): The Census Source ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"sources/{source_id}")\n\n def get_destination(self, destination_id: int) -> Mapping[str, Any]:\n """Gets details about a given destination from the Census API.\n\n Args:\n destination_id (int): The Census Destination ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"destinations/{destination_id}")\n\n def get_sync_run(self, sync_run_id: int) -> Mapping[str, Any]:\n """Gets details about a specific sync run from the Census API.\n\n Args:\n sync_run_id (int): The Census Sync Run ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"sync_runs/{sync_run_id}")\n\n def poll_sync_run(\n self,\n sync_run_id: int,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Mapping[str, Any]:\n """Given a Census sync run, poll until the run is complete.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. 
By default, this will never time out.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n log_url = f"https://app.getcensus.com/syncs_runs/{sync_run_id}"\n poll_start = datetime.datetime.now()\n\n while True:\n time.sleep(poll_interval)\n response_dict = self.get_sync_run(sync_run_id)\n if "data" not in response_dict.keys():\n raise ValueError(\n f"Getting status of sync failed, please visit Census Logs at {log_url} to see"\n " more."\n )\n\n sync_status = response_dict["data"]["status"]\n sync_id = response_dict["data"]["sync_id"]\n\n if sync_status not in SYNC_RUN_STATUSES:\n raise ValueError(\n f"Unexpected response status '{sync_status}'; "\n f"must be one of {','.join(sorted(SYNC_RUN_STATUSES))}. "\n "See Management API docs for more information: "\n "https://docs.getcensus.com/basics/developers/api/sync-runs"\n )\n\n if sync_status in {"queued", "working"}:\n self._log.debug(\n f"Sync {sync_id} still running after {datetime.datetime.now() - poll_start}."\n )\n continue\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for sync '{sync_id}' timed out after"\n f" {datetime.datetime.now() - poll_start}."\n )\n\n break\n\n self._log.debug(\n f"Sync {sync_id} has finished running after {datetime.datetime.now() - poll_start}."\n )\n self._log.info(f"View sync details here: {log_url}.")\n\n return response_dict\n\n def trigger_sync(self, sync_id: int, force_full_sync: bool = False) -> Mapping[str, Any]:\n """Trigger an asynchronous run for a specific sync.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n force_full_sync (bool): If the Sync should perform a full sync\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n data = {"force_full_sync": force_full_sync}\n return self.make_request(\n method="POST", endpoint=f"syncs/{sync_id}/trigger", data=json.dumps(data)\n )\n\n def trigger_sync_and_poll(\n self,\n sync_id: 
int,\n force_full_sync: bool = False,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> CensusOutput:\n """Trigger a run for a specific sync and poll until it has completed.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n force_full_sync (bool): If the Sync should perform a full sync\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~CensusOutput`:\n Object containing details about the sync run and the sync details\n """\n sync_details = self.get_sync(sync_id=sync_id)\n source_details = self.get_source(\n source_id=sync_details["data"]["source_attributes"]["connection_id"]\n )["data"]\n destination_details = self.get_destination(\n destination_id=sync_details["data"]["destination_attributes"]["connection_id"]\n )["data"]\n\n trigger_sync_resp = self.trigger_sync(sync_id=sync_id, force_full_sync=force_full_sync)\n sync_run_details = self.poll_sync_run(\n sync_run_id=trigger_sync_resp["data"]["sync_run_id"],\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )["data"]\n return CensusOutput(\n sync_run=sync_run_details,\n source=source_details,\n destination=destination_details,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n is_required=True,\n description="Census API Key.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description=(\n "The maximum number of times requests to the Census API should be retried "\n "before failing."\n ),\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps manage Census connectors",\n)\ndef census_resource(context) -> CensusResource:\n """This resource allows users to programatically interface with the Census REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_census import census_resource\n\n my_census_resource = census_resource.configured(\n {\n "api_key": {"env": "CENSUS_API_KEY"},\n }\n )\n\n @job(resource_defs={"census":my_census_resource})\n def my_census_job():\n ...\n\n """\n return CensusResource(\n api_key=context.resource_config["api_key"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
", "current_page_name": "_modules/dagster_census/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.types

\nfrom typing import Any, Mapping, NamedTuple\n\n\n
[docs]class CensusOutput(\n NamedTuple(\n "_CensusOutput",\n [\n ("sync_run", Mapping[str, Any]),\n ("source", Mapping[str, Any]),\n ("destination", Mapping[str, Any]),\n ],\n )\n):\n """Contains recorded information about the state of a Census sync after a sync completes.\n\n Attributes:\n sync_run (Dict[str, Any]):\n The details of the specific sync run.\n source (Dict[str, Any]):\n Information about the source for the Census sync.\n destination (Dict[str, Any]):\n Information about the destination for the Census sync.\n """
\n
", "current_page_name": "_modules/dagster_census/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.types"}}, "dagster_dask": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dask.executor

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dask\nimport dask.distributed\nfrom dagster import (\n    Executor,\n    Field,\n    Permissive,\n    Selector,\n    StringSource,\n    _check as check,\n    _seven,\n    multiple_process_executor_requirements,\n)\nfrom dagster._core.definitions.executor_definition import executor\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.errors import raise_execution_interrupts\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.api import create_execution_plan, execute_plan\nfrom dagster._core.execution.context.system import PlanOrchestrationContext\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._utils import iterate_with_context\n\n# Dask resource requirements are specified under this key\nDASK_RESOURCE_REQUIREMENTS_KEY = "dagster-dask/resource_requirements"\n\n\n
[docs]@executor(\n name="dask",\n requirements=multiple_process_executor_requirements(),\n config_schema={\n "cluster": Field(\n Selector(\n {\n "existing": Field(\n {"address": StringSource},\n description="Connect to an existing scheduler.",\n ),\n "local": Field(\n Permissive(), is_required=False, description="Local cluster configuration."\n ),\n "yarn": Field(\n Permissive(), is_required=False, description="YARN cluster configuration."\n ),\n "ssh": Field(\n Permissive(), is_required=False, description="SSH cluster configuration."\n ),\n "pbs": Field(\n Permissive(), is_required=False, description="PBS cluster configuration."\n ),\n "moab": Field(\n Permissive(), is_required=False, description="Moab cluster configuration."\n ),\n "sge": Field(\n Permissive(), is_required=False, description="SGE cluster configuration."\n ),\n "lsf": Field(\n Permissive(), is_required=False, description="LSF cluster configuration."\n ),\n "slurm": Field(\n Permissive(), is_required=False, description="SLURM cluster configuration."\n ),\n "oar": Field(\n Permissive(), is_required=False, description="OAR cluster configuration."\n ),\n "kube": Field(\n Permissive(),\n is_required=False,\n description="Kubernetes cluster configuration.",\n ),\n }\n )\n )\n },\n)\ndef dask_executor(init_context):\n """Dask-based executor.\n\n The 'cluster' can be one of the following:\n ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').\n\n If the Dask executor is used without providing executor-specific config, a local Dask cluster\n will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`\n with :py:class:`dask.distributed.LocalCluster() <dask:distributed.LocalCluster>`).\n\n The Dask executor optionally takes the following config:\n\n .. 
code-block:: none\n\n cluster:\n {\n local?: # takes distributed.LocalCluster parameters\n {\n timeout?: 5, # Timeout duration for initial connection to the scheduler\n n_workers?: 4 # Number of workers to start\n threads_per_worker?: 1 # Number of threads per each worker\n }\n }\n\n To use the `dask_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dask import dask_executor\n\n @job(executor_def=dask_executor)\n def dask_enabled_job():\n pass\n\n """\n ((cluster_type, cluster_configuration),) = init_context.executor_config["cluster"].items()\n return DaskExecutor(cluster_type, cluster_configuration)
\n\n\ndef query_on_dask_worker(\n dependencies: Any,\n recon_job: ReconstructableJob,\n dagster_run: DagsterRun,\n run_config: Optional[Mapping[str, object]],\n step_keys: Optional[Sequence[str]],\n instance_ref: InstanceRef,\n known_state: Optional[KnownExecutionState],\n) -> Sequence[DagsterEvent]:\n """Note that we need to pass "dependencies" to ensure Dask sequences futures during task\n scheduling, even though we do not use this argument within the function.\n """\n with DagsterInstance.from_ref(instance_ref) as instance:\n subset_job = recon_job.get_subset(op_selection=dagster_run.resolved_op_selection)\n\n execution_plan = create_execution_plan(\n subset_job,\n run_config=run_config,\n step_keys_to_execute=step_keys,\n known_state=known_state,\n )\n\n return execute_plan(\n execution_plan, subset_job, instance, dagster_run, run_config=run_config\n )\n\n\ndef get_dask_resource_requirements(tags: Mapping[str, str]):\n check.mapping_param(tags, "tags", key_type=str, value_type=str)\n req_str = tags.get(DASK_RESOURCE_REQUIREMENTS_KEY)\n if req_str is not None:\n return _seven.json.loads(req_str)\n\n return {}\n\n\nclass DaskExecutor(Executor):\n def __init__(self, cluster_type, cluster_configuration):\n self.cluster_type = check.opt_str_param(cluster_type, "cluster_type", default="local")\n self.cluster_configuration = check.opt_dict_param(\n cluster_configuration, "cluster_configuration"\n )\n\n @property\n def retries(self):\n return RetryMode.DISABLED\n\n def execute(self, plan_context: PlanOrchestrationContext, execution_plan: ExecutionPlan):\n check.inst_param(plan_context, "plan_context", PlanOrchestrationContext)\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.param_invariant(\n isinstance(plan_context.executor, DaskExecutor),\n "plan_context",\n f"Expected executor to be DaskExecutor got {plan_context.executor}",\n )\n\n check.invariant(\n plan_context.instance.is_persistent,\n "Dask execution requires a persistent 
DagsterInstance",\n )\n\n step_levels = execution_plan.get_steps_to_execute_by_level()\n\n job_name = plan_context.job_name\n\n instance = plan_context.instance\n\n cluster_type = self.cluster_type\n if cluster_type == "existing":\n # address passed directly to Client() below to connect to existing Scheduler\n cluster = self.cluster_configuration["address"]\n elif cluster_type == "local":\n from dask.distributed import LocalCluster\n\n cluster = LocalCluster(**self.build_dict(job_name))\n elif cluster_type == "yarn":\n from dask_yarn import YarnCluster\n\n cluster = YarnCluster(**self.build_dict(job_name))\n elif cluster_type == "ssh":\n from dask.distributed import SSHCluster\n\n cluster = SSHCluster(**self.build_dict(job_name))\n elif cluster_type == "pbs":\n from dask_jobqueue import PBSCluster\n\n cluster = PBSCluster(**self.build_dict(job_name))\n elif cluster_type == "moab":\n from dask_jobqueue import MoabCluster\n\n cluster = MoabCluster(**self.build_dict(job_name))\n elif cluster_type == "sge":\n from dask_jobqueue import SGECluster\n\n cluster = SGECluster(**self.build_dict(job_name))\n elif cluster_type == "lsf":\n from dask_jobqueue import LSFCluster\n\n cluster = LSFCluster(**self.build_dict(job_name))\n elif cluster_type == "slurm":\n from dask_jobqueue import SLURMCluster\n\n cluster = SLURMCluster(**self.build_dict(job_name))\n elif cluster_type == "oar":\n from dask_jobqueue import OARCluster\n\n cluster = OARCluster(**self.build_dict(job_name))\n elif cluster_type == "kube":\n from dask_kubernetes import KubeCluster\n\n cluster = KubeCluster(**self.build_dict(job_name))\n else:\n raise ValueError(\n "Must be providing one of the following ('existing', 'local', 'yarn', 'ssh',"\n f" 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"\n )\n\n with dask.distributed.Client(cluster) as client:\n execution_futures = []\n execution_futures_dict = {}\n\n for step_level in step_levels:\n for step in step_level:\n # We ensure correctness 
in sequencing by letting Dask schedule futures and\n # awaiting dependencies within each step.\n dependencies = []\n for step_input in step.step_inputs:\n for key in step_input.dependency_keys:\n dependencies.append(execution_futures_dict[key])\n\n run_config = plan_context.run_config\n\n dask_task_name = "%s.%s" % (job_name, step.key)\n\n recon_job = plan_context.reconstructable_job\n\n future = client.submit(\n query_on_dask_worker,\n dependencies,\n recon_job,\n plan_context.dagster_run,\n run_config,\n [step.key],\n instance.get_ref(),\n execution_plan.known_state,\n key=dask_task_name,\n resources=get_dask_resource_requirements(step.tags),\n )\n\n execution_futures.append(future)\n execution_futures_dict[step.key] = future\n\n # This tells Dask to awaits the step executions and retrieve their results to the\n # master\n futures = dask.distributed.as_completed(execution_futures, with_results=True)\n\n # Allow interrupts while waiting for the results from Dask\n for future, result in iterate_with_context(raise_execution_interrupts, futures):\n for step_event in result:\n check.inst(step_event, DagsterEvent)\n yield step_event\n\n def build_dict(self, job_name):\n """Returns a dict we can use for kwargs passed to dask client instantiation.\n\n Intended to be used like:\n\n with dask.distributed.Client(**cfg.build_dict()) as client:\n << use client here >>\n\n """\n if self.cluster_type in ["yarn", "pbs", "moab", "sge", "lsf", "slurm", "oar", "kube"]:\n dask_cfg = {"name": job_name}\n else:\n dask_cfg = {}\n\n if self.cluster_configuration:\n for k, v in self.cluster_configuration.items():\n dask_cfg[k] = v\n\n # if address is set, don't add LocalCluster args\n # context: https://github.com/dask/distributed/issues/3313\n if (self.cluster_type == "local") and ("address" not in dask_cfg):\n # We set threads_per_worker because Dagster is not thread-safe. 
Even though\n # environments=True by default, there is a clever piece of machinery\n # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution\n # multithreaded by default when the number of available cores is greater than 4.\n # See: https://github.com/dagster-io/dagster/issues/2181\n # We may want to try to figure out a way to enforce this on remote Dask clusters against\n # which users run Dagster workloads.\n dask_cfg["threads_per_worker"] = 1\n\n return dask_cfg\n
", "current_page_name": "_modules/dagster_dask/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dask.executor"}}, "dagster_databricks": {"databricks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.databricks

\nimport base64\nimport logging\nimport time\nfrom typing import IO, Any, Mapping, Optional, Tuple, Union, cast\n\nimport dagster\nimport dagster._check as check\nimport dagster_pyspark\nimport databricks_api\nimport databricks_cli.sdk\nimport requests.exceptions\nfrom dagster._annotations import deprecated, public\nfrom databricks.sdk import WorkspaceClient\nfrom databricks.sdk.service import compute, jobs\nfrom typing_extensions import Final\n\nimport dagster_databricks\n\nfrom .types import (\n    DatabricksRunState,\n)\nfrom .version import __version__\n\n# wait at most 24 hours by default for run execution\nDEFAULT_RUN_MAX_WAIT_TIME_SEC: Final = 24 * 60 * 60\n\n\n
[docs]class DatabricksError(Exception):\n pass
\n\n\n
[docs]class DatabricksClient:\n """A thin wrapper over the Databricks REST API."""\n\n def __init__(\n self,\n host: str,\n token: Optional[str] = None,\n oauth_client_id: Optional[str] = None,\n oauth_client_secret: Optional[str] = None,\n workspace_id: Optional[str] = None,\n ):\n self.host = host\n self.workspace_id = workspace_id\n\n self._workspace_client = WorkspaceClient(\n host=host,\n token=token,\n client_id=oauth_client_id,\n client_secret=oauth_client_secret,\n product="dagster-databricks",\n product_version=__version__,\n )\n\n # TODO: This is the old shim client that we were previously using. Arguably this is\n # confusing for users to use since this is an unofficial wrapper around the documented\n # Databricks REST API. We should consider removing this in the next minor release.\n if token:\n self._client = databricks_api.DatabricksAPI(host=host, token=token)\n self.__setup_user_agent(self._client.client)\n # TODO: This is the old `databricks_cli` client that was previously recommended by Databricks.\n # It is no longer supported and should be removed in favour of `databricks-sdk` in the next\n # minor release.\n self._api_client = databricks_cli.sdk.ApiClient(host=host, token=token)\n self.__setup_user_agent(self._api_client)\n else:\n self._client = None\n self._api_client = None\n\n def __setup_user_agent(\n self,\n client: Union[WorkspaceClient, databricks_api.DatabricksAPI, databricks_cli.sdk.ApiClient],\n ) -> None:\n """Overrides the user agent for the Databricks API client."""\n client.default_headers["user-agent"] = f"dagster-databricks/{__version__}"\n\n @deprecated(\n breaking_version="0.21.0", additional_warn_text="Use `workspace_client` property instead."\n )\n @public\n @property\n def client(self) -> databricks_api.DatabricksAPI:\n """Retrieve the legacy Databricks API client. 
Note: accessing this property will throw an exception if oauth\n credentials are used to initialize the DatabricksClient, because oauth credentials are not supported by the\n legacy Databricks API client.\n """\n if self._client is None:\n raise ValueError(\n "Legacy Databricks API client from `databricks-api` was not initialized because"\n " oauth credentials were used instead of an access token. This legacy Databricks"\n " API client is not supported when using oauth credentials. Use the"\n " `workspace_client` property instead."\n )\n return self._client\n\n @client.setter\n def client(self, value: Optional[databricks_api.DatabricksAPI]) -> None:\n self._client = value\n\n @deprecated(\n breaking_version="0.21.0", additional_warn_text="Use `workspace_client` property instead."\n )\n @public\n @property\n def api_client(self) -> databricks_cli.sdk.ApiClient:\n """Retrieve a reference to the underlying Databricks API client. For more information,\n see the `Databricks Python API <https://docs.databricks.com/dev-tools/python-api.html>`_.\n Noe: accessing this property will throw an exception if oauth credentials are used to initialize the\n DatabricksClient, because oauth credentials are not supported by the legacy Databricks API client.\n **Examples:**.\n\n .. 
code-block:: python\n\n from dagster import op\n from databricks_cli.jobs.api import JobsApi\n from databricks_cli.runs.api import RunsApi\n from databricks.sdk import WorkspaceClient\n\n @op(required_resource_keys={"databricks_client"})\n def op1(context):\n # Initialize the Databricks Jobs API\n jobs_client = JobsApi(context.resources.databricks_client.api_client)\n runs_client = RunsApi(context.resources.databricks_client.api_client)\n client = context.resources.databricks_client.api_client\n\n # Example 1: Run a Databricks job with some parameters.\n jobs_client.run_now(...)\n client.jobs.run_now(...)\n\n # Example 2: Trigger a one-time run of a Databricks workload.\n runs_client.submit_run(...)\n client.jobs.submit(...)\n\n # Example 3: Get an existing run.\n runs_client.get_run(...)\n client.jobs.get_run(...)\n\n # Example 4: Cancel a run.\n runs_client.cancel_run(...)\n client.jobs.cancel_run(...)\n\n Returns:\n ApiClient: The authenticated Databricks API client.\n """\n if self._api_client is None:\n raise ValueError(\n "Legacy Databricks API client from `databricks-cli` was not initialized because"\n " oauth credentials were used instead of an access token. This legacy Databricks"\n " API client is not supported when using oauth credentials. Use the"\n " `workspace_client` property instead."\n )\n return self._api_client\n\n @public\n @property\n def workspace_client(self) -> WorkspaceClient:\n """Retrieve a reference to the underlying Databricks Workspace client. For more information,\n see the `Databricks SDK for Python <https://docs.databricks.com/dev-tools/sdk-python.html>`_.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import op\n from databricks.sdk import WorkspaceClient\n\n @op(required_resource_keys={"databricks_client"})\n def op1(context):\n # Initialize the Databricks Jobs API\n client = context.resources.databricks_client.api_client\n\n # Example 1: Run a Databricks job with some parameters.\n client.jobs.run_now(...)\n\n # Example 2: Trigger a one-time run of a Databricks workload.\n client.jobs.submit(...)\n\n # Example 3: Get an existing run.\n client.jobs.get_run(...)\n\n # Example 4: Cancel a run.\n client.jobs.cancel_run(...)\n\n Returns:\n WorkspaceClient: The authenticated Databricks SDK Workspace Client.\n """\n return self._workspace_client\n\n def read_file(self, dbfs_path: str, block_size: int = 1024**2) -> bytes:\n """Read a file from DBFS to a **byte string**."""\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n\n data = b""\n bytes_read = 0\n dbfs_service = self.workspace_client.dbfs\n\n jdoc = dbfs_service.read(path=dbfs_path, length=block_size)\n data += base64.b64decode(jdoc.data)\n while jdoc.bytes_read == block_size:\n bytes_read += jdoc.bytes_read\n jdoc = dbfs_service.read(path=dbfs_path, offset=bytes_read, length=block_size)\n data += base64.b64decode(jdoc.data)\n\n return data\n\n def put_file(\n self, file_obj: IO, dbfs_path: str, overwrite: bool = False, block_size: int = 1024**2\n ) -> None:\n """Upload an arbitrary large file to DBFS.\n\n This doesn't use the DBFS `Put` API because that endpoint is limited to 1MB.\n """\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n\n dbfs_service = self.workspace_client.dbfs\n\n create_response = dbfs_service.create(path=dbfs_path, overwrite=overwrite)\n handle = create_response.handle\n\n block = file_obj.read(block_size)\n while block:\n data = base64.b64encode(block).decode("utf-8")\n dbfs_service.add_block(data=data, handle=handle)\n block = file_obj.read(block_size)\n\n dbfs_service.close(handle=handle)\n\n def get_run_state(self, 
databricks_run_id: int) -> "DatabricksRunState":\n """Get the state of a run by Databricks run ID.\n\n Return a `DatabricksRunState` object. Note that the `result_state`\n attribute may be `None` if the run hasn't yet terminated.\n """\n run = self.workspace_client.jobs.get_run(databricks_run_id)\n return DatabricksRunState.from_databricks(run.state)\n\n def poll_run_state(\n self,\n logger: logging.Logger,\n start_poll_time: float,\n databricks_run_id: int,\n max_wait_time_sec: float,\n verbose_logs: bool = True,\n ) -> bool:\n run_state = self.get_run_state(databricks_run_id)\n\n if run_state.has_terminated():\n if run_state.is_successful():\n logger.info(f"Run `{databricks_run_id}` completed successfully.")\n return True\n if run_state.is_skipped():\n logger.info(f"Run `{databricks_run_id}` was skipped.")\n return True\n else:\n error_message = (\n f"Run `{databricks_run_id}` failed with result state:"\n f" `{run_state.result_state}`. Message: {run_state.state_message}."\n )\n logger.error(error_message)\n raise DatabricksError(error_message)\n else:\n if verbose_logs:\n logger.debug(f"Run `{databricks_run_id}` in state {run_state}.")\n if time.time() - start_poll_time > max_wait_time_sec:\n raise DatabricksError(\n f"Run `{databricks_run_id}` took more than {max_wait_time_sec}s to complete."\n " Failing the run."\n )\n return False\n\n def wait_for_run_to_complete(\n self,\n logger: logging.Logger,\n databricks_run_id: int,\n poll_interval_sec: float,\n max_wait_time_sec: int,\n verbose_logs: bool = True,\n ) -> None:\n logger.info(f"Waiting for Databricks run `{databricks_run_id}` to complete...")\n\n start_poll_time = time.time()\n while True:\n if self.poll_run_state(\n logger=logger,\n start_poll_time=start_poll_time,\n databricks_run_id=databricks_run_id,\n max_wait_time_sec=max_wait_time_sec,\n verbose_logs=verbose_logs,\n ):\n return\n\n time.sleep(poll_interval_sec)
\n\n\nclass DatabricksJobRunner:\n """Submits jobs created using Dagster config to Databricks, and monitors their progress.\n\n Attributes:\n host (str): Databricks host, e.g. https://uksouth.azuredatabricks.net.\n token (str): Databricks authentication token.\n poll_interval_sec (float): How often to poll Databricks for run status.\n max_wait_time_sec (int): How long to wait for a run to complete before failing.\n """\n\n def __init__(\n self,\n host: str,\n token: Optional[str] = None,\n oauth_client_id: Optional[str] = None,\n oauth_client_secret: Optional[str] = None,\n poll_interval_sec: float = 5,\n max_wait_time_sec: int = DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n ):\n self.host = check.str_param(host, "host")\n check.invariant(\n token is None or (oauth_client_id is None and oauth_client_secret is None),\n "Must provide either databricks_token or oauth_credentials, but cannot provide both",\n )\n self.token = check.opt_str_param(token, "token")\n self.oauth_client_id = check.opt_str_param(oauth_client_id, "oauth_client_id")\n self.oauth_client_secret = check.opt_str_param(oauth_client_secret, "oauth_client_secret")\n self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")\n self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")\n\n self._client: DatabricksClient = DatabricksClient(\n host=self.host,\n token=self.token,\n oauth_client_id=oauth_client_id,\n oauth_client_secret=oauth_client_secret,\n )\n\n @property\n def client(self) -> DatabricksClient:\n """Return the underlying `DatabricksClient` object."""\n return self._client\n\n def submit_run(self, run_config: Mapping[str, Any], task: Mapping[str, Any]) -> int:\n """Submit a new run using the 'Runs submit' API."""\n existing_cluster_id = run_config["cluster"].get("existing")\n\n new_cluster = run_config["cluster"].get("new")\n\n # The Databricks API needs different keys to be present in API calls depending\n # on new/existing cluster, so we need to process 
the new_cluster\n # config first.\n if new_cluster:\n new_cluster = new_cluster.copy()\n\n nodes = new_cluster.pop("nodes")\n if "instance_pool_id" in nodes:\n new_cluster["instance_pool_id"] = nodes["instance_pool_id"]\n else:\n node_types = nodes["node_types"]\n new_cluster["node_type_id"] = node_types["node_type_id"]\n if "driver_node_type_id" in node_types:\n new_cluster["driver_node_type_id"] = node_types["driver_node_type_id"]\n\n cluster_size = new_cluster.pop("size")\n if "num_workers" in cluster_size:\n new_cluster["num_workers"] = cluster_size["num_workers"]\n else:\n new_cluster["autoscale"] = cluster_size["autoscale"]\n\n tags = new_cluster.get("custom_tags", {})\n if isinstance(tags, list):\n tags = {x["key"]: x["value"] for x in tags}\n tags["__dagster_version"] = dagster.__version__\n new_cluster["custom_tags"] = tags\n\n check.invariant(\n existing_cluster_id is not None or new_cluster is not None,\n "Invalid value for run_config.cluster",\n )\n\n # We'll always need some libraries, namely dagster/dagster_databricks/dagster_pyspark,\n # since they're imported by our scripts.\n # Add them if they're not already added by users in config.\n libraries = list(run_config.get("libraries", []))\n install_default_libraries = run_config.get("install_default_libraries", True)\n if install_default_libraries:\n python_libraries = {\n x["pypi"]["package"].split("==")[0].replace("_", "-")\n for x in libraries\n if "pypi" in x\n }\n\n for library_name, library in [\n ("dagster", dagster),\n ("dagster-databricks", dagster_databricks),\n ("dagster-pyspark", dagster_pyspark),\n ]:\n if library_name not in python_libraries:\n libraries.append(\n {"pypi": {"package": f"{library_name}=={library.__version__}"}}\n )\n\n # Only one task should be able to be chosen really; make sure of that here.\n check.invariant(\n sum(\n task.get(key) is not None\n for key in [\n "notebook_task",\n "spark_python_task",\n "spark_jar_task",\n "spark_submit_task",\n ]\n )\n == 1,\n "Multiple 
tasks specified in Databricks run",\n )\n\n return self.client.workspace_client.jobs.submit(\n run_name=run_config.get("run_name"),\n tasks=[\n jobs.SubmitTask.from_dict(\n {\n "new_cluster": new_cluster,\n "existing_cluster_id": existing_cluster_id,\n # "libraries": [compute.Library.from_dict(lib) for lib in libraries],\n "libraries": libraries,\n **task,\n "task_key": "dagster-task",\n },\n )\n ],\n ).bind()["run_id"]\n\n def retrieve_logs_for_run_id(\n self, log: logging.Logger, databricks_run_id: int\n ) -> Optional[Tuple[Optional[str], Optional[str]]]:\n """Retrieve the stdout and stderr logs for a run."""\n run = self.client.workspace_client.jobs.get_run(databricks_run_id)\n\n # Run.cluster_instance can be None. In that case, fall back to cluster instance on first\n # task. Currently pyspark step launcher runs jobs with singleton tasks.\n cluster_instance = run.cluster_instance or run.tasks[0].cluster_instance\n cluster_id = check.inst(\n cluster_instance.cluster_id,\n str,\n "cluster_id should be string like `1234-123456-abcdefgh` got:"\n f" `{cluster_instance.cluster_id}`",\n )\n cluster = self.client.workspace_client.clusters.get(cluster_id)\n log_config = cluster.cluster_log_conf\n if log_config is None:\n log.warn(\n f"Logs not configured for cluster {cluster_id} used for run {databricks_run_id}"\n )\n return None\n if cast(Optional[compute.S3StorageInfo], log_config.s3) is not None:\n logs_prefix = log_config.s3.destination\n log.warn("Retrieving S3 logs not yet implemented")\n return None\n elif cast(Optional[compute.DbfsStorageInfo], log_config.dbfs) is not None:\n logs_prefix = log_config.dbfs.destination\n stdout = self.wait_for_dbfs_logs(log, logs_prefix, cluster_id, "stdout")\n stderr = self.wait_for_dbfs_logs(log, logs_prefix, cluster_id, "stderr")\n return stdout, stderr\n\n def wait_for_dbfs_logs(\n self,\n log: logging.Logger,\n prefix: str,\n cluster_id: str,\n filename: str,\n waiter_delay: int = 10,\n waiter_max_attempts: int = 10,\n ) -> 
Optional[str]:\n """Attempt up to `waiter_max_attempts` attempts to get logs from DBFS."""\n path = "/".join([prefix, cluster_id, "driver", filename])\n log.info(f"Retrieving logs from {path}")\n num_attempts = 0\n while num_attempts <= waiter_max_attempts:\n try:\n logs = self.client.read_file(path)\n return logs.decode("utf-8")\n except requests.exceptions.HTTPError:\n num_attempts += 1\n time.sleep(waiter_delay)\n log.warn("Could not retrieve cluster logs!")\n
", "current_page_name": "_modules/dagster_databricks/databricks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.databricks"}, "databricks_pyspark_step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.databricks_pyspark_step_launcher

\nimport gzip\nimport io\nimport os.path\nimport pickle\nimport sys\nimport tempfile\nimport time\nimport zlib\nfrom typing import Any, Dict, Iterator, Mapping, Optional, Sequence, cast\n\nfrom dagster import (\n    Bool,\n    Field,\n    IntSource,\n    Noneable,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.definitions.step_launcher import StepLauncher, StepRunRef\nfrom dagster._core.errors import raise_execution_interrupts\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.execution.context.system import StepExecutionContext\nfrom dagster._core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._serdes import deserialize_value\nfrom dagster._utils.backoff import backoff\nfrom dagster_pyspark.utils import build_pyspark_zip\nfrom databricks.sdk.core import DatabricksError\nfrom databricks.sdk.service import jobs\n\nfrom dagster_databricks import databricks_step_main\nfrom dagster_databricks.databricks import (\n    DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n    DatabricksJobRunner,\n)\n\nfrom .configs import (\n    define_databricks_env_variables,\n    define_databricks_permissions,\n    define_databricks_secrets_config,\n    define_databricks_storage_config,\n    define_databricks_submit_run_config,\n    define_oauth_credentials,\n)\n\nCODE_ZIP_NAME = "code.zip"\nPICKLED_CONFIG_FILE_NAME = "config.pkl"\nDAGSTER_SYSTEM_ENV_VARS = {\n    "DAGSTER_CLOUD_DEPLOYMENT_NAME",\n    "DAGSTER_CLOUD_IS_BRANCH_DEPLOYMENT",\n    "DAGSTER_CLOUD_GIT_SHA",\n    "DAGSTER_CLOUD_GIT_TIMESTAMP",\n    "DAGSTER_CLOUD_GIT_AUTHOR_EMAIL",\n    "DAGSTER_CLOUD_GIT_AUTHOR_NAME",\n    
"DAGSTER_CLOUD_GIT_MESSAGE",\n    "DAGSTER_CLOUD_GIT_BRANCH",\n    "DAGSTER_CLOUD_GIT_REPO",\n    "DAGSTER_CLOUD_PULL_REQUEST_ID",\n    "DAGSTER_CLOUD_PULL_REQUEST_STATUS",\n}\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n {\n "run_config": define_databricks_submit_run_config(),\n "permissions": define_databricks_permissions(),\n "databricks_host": Field(\n StringSource,\n is_required=True,\n description="Databricks host, e.g. uksouth.azuredatabricks.com",\n ),\n "databricks_token": Field(\n Noneable(StringSource),\n default_value=None,\n description="Databricks access token",\n ),\n "oauth_credentials": define_oauth_credentials(),\n "env_variables": define_databricks_env_variables(),\n "secrets_to_env_variables": define_databricks_secrets_config(),\n "storage": define_databricks_storage_config(),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to root python package containing your Dagster code. If you set this"\n " value to a directory lower than the root package, and have user relative imports"\n " in your code (e.g. `from .foo import bar`), it's likely you'll encounter an"\n " import error on the remote step. Before every step run, the launcher will zip up"\n " the code in this local path, upload it to DBFS, and unzip it into the Python path"\n " of the remote Spark process. This gives the remote process access to up-to-date"\n " user code."\n ),\n ),\n "local_dagster_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to root python package containing your Dagster code. If you set this"\n " value to a directory lower than the root package, and have user relative imports"\n " in your code (e.g. `from .foo import bar`), it's likely you'll encounter an"\n " import error on the remote step. Before every step run, the launcher will zip up"\n " the code in this local path, upload it to DBFS, and unzip it into the Python path"\n " of the remote Spark process. 
This gives the remote process access to up-to-date"\n " user code."\n ),\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="/dagster_staging",\n description="Directory in DBFS to use for uploaded job code. Must be absolute.",\n ),\n "wait_for_logs": Field(\n Bool,\n is_required=False,\n default_value=False,\n description=(\n "If set, and if the specified cluster is configured to export logs, the system will"\n " wait after job completion for the logs to appear in the configured location. Note"\n " that logs are copied every 5 minutes, so enabling this will add several minutes"\n " to the job runtime. NOTE: this integration will export stdout/stderrfrom the"\n " remote Databricks process automatically, so this option is not generally"\n " necessary."\n ),\n ),\n "max_completion_wait_time_seconds": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n description=(\n "If the Databricks job run takes more than this many seconds, then "\n "consider it failed and terminate the step."\n ),\n ),\n "poll_interval_sec": Field(\n float,\n is_required=False,\n default_value=5.0,\n description=(\n "How frequently Dagster will poll Databricks to determine the state of the job."\n ),\n ),\n "verbose_logs": Field(\n bool,\n default_value=True,\n description=(\n "Determines whether to display debug logs emitted while job is being polled. It can"\n " be helpful for Dagster UI performance to set to False when running long-running"\n " or fan-out Databricks jobs, to avoid forcing the UI to fetch large amounts of"\n " debug logs."\n ),\n ),\n "add_dagster_env_variables": Field(\n bool,\n default_value=True,\n description=(\n "Automatically add Dagster system environment variables. This option is only"\n " applicable when the code being executed is deployed on Dagster Cloud. 
It will be"\n " ignored when the environment variables provided by Dagster Cloud are not present."\n ),\n ),\n }\n)\ndef databricks_pyspark_step_launcher(\n context: InitResourceContext,\n) -> "DatabricksPySparkStepLauncher":\n """Resource for running ops as a Databricks Job.\n\n When this resource is used, the op will be executed in Databricks using the 'Run Submit'\n API. Pipeline code will be zipped up and copied to a directory in DBFS along with the op's\n execution context.\n\n Use the 'run_config' configuration to specify the details of the Databricks cluster used, and\n the 'storage' key to configure persistent storage on that cluster. Storage is accessed by\n setting the credentials in the Spark context, as documented `here for S3`_ and `here for ADLS`_.\n\n .. _`here for S3`: https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context\n .. _`here for ADLS`: https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n """\n return DatabricksPySparkStepLauncher(**context.resource_config)
\n\n\nclass DatabricksPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n run_config: Mapping[str, Any],\n permissions: Mapping[str, Any],\n databricks_host: str,\n secrets_to_env_variables: Sequence[Mapping[str, Any]],\n staging_prefix: str,\n wait_for_logs: bool,\n max_completion_wait_time_seconds: int,\n databricks_token: Optional[str] = None,\n oauth_credentials: Optional[Mapping[str, str]] = None,\n env_variables: Optional[Mapping[str, str]] = None,\n storage: Optional[Mapping[str, Any]] = None,\n poll_interval_sec: int = 5,\n local_pipeline_package_path: Optional[str] = None,\n local_dagster_job_package_path: Optional[str] = None,\n verbose_logs: bool = True,\n add_dagster_env_variables: bool = True,\n ):\n self.run_config = check.mapping_param(run_config, "run_config")\n self.permissions = check.mapping_param(permissions, "permissions")\n self.databricks_host = check.str_param(databricks_host, "databricks_host")\n\n check.invariant(\n databricks_token is not None or oauth_credentials is not None,\n "Must provide either databricks_token or oauth_credentials",\n )\n check.invariant(\n databricks_token is None or oauth_credentials is None,\n "Must provide either databricks_token or oauth_credentials, but cannot provide both",\n )\n self.databricks_token = check.opt_str_param(databricks_token, "databricks_token")\n oauth_credentials = check.opt_mapping_param(\n oauth_credentials,\n "oauth_credentials",\n key_type=str,\n value_type=str,\n )\n\n self.secrets = check.sequence_param(\n secrets_to_env_variables, "secrets_to_env_variables", dict\n )\n self.env_variables = check.opt_mapping_param(env_variables, "env_variables")\n self.storage = check.opt_mapping_param(storage, "storage")\n check.invariant(\n local_dagster_job_package_path is not None or local_pipeline_package_path is not None,\n "Missing config: need to provide either 'local_dagster_job_package_path' or"\n " 'local_pipeline_package_path' config entry",\n )\n check.invariant(\n 
local_dagster_job_package_path is None or local_pipeline_package_path is None,\n "Error in config: Provided both 'local_dagster_job_package_path' and"\n " 'local_pipeline_package_path' entries. Need to specify one or the other.",\n )\n self.local_dagster_job_package_path = check.str_param(\n local_pipeline_package_path or local_dagster_job_package_path,\n "local_dagster_job_package_path",\n )\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n check.invariant(staging_prefix.startswith("/"), "staging_prefix must be an absolute path")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n\n self.databricks_runner = DatabricksJobRunner(\n host=databricks_host,\n token=databricks_token,\n oauth_client_id=oauth_credentials.get("client_id"),\n oauth_client_secret=oauth_credentials.get("client_secret"),\n poll_interval_sec=poll_interval_sec,\n max_wait_time_sec=max_completion_wait_time_seconds,\n )\n self.verbose_logs = check.bool_param(verbose_logs, "verbose_logs")\n self.add_dagster_env_variables = check.bool_param(\n add_dagster_env_variables, "add_dagster_env_variables"\n )\n\n def launch_step(self, step_context: StepExecutionContext) -> Iterator[DagsterEvent]:\n step_run_ref = step_context_to_step_run_ref(\n step_context, self.local_dagster_job_package_path\n )\n run_id = step_context.dagster_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._upload_artifacts(log, step_run_ref, run_id, step_key)\n\n task = self._get_databricks_task(run_id, step_key)\n databricks_run_id = self.databricks_runner.submit_run(self.run_config, task)\n\n if self.permissions:\n self._grant_permissions(log, databricks_run_id)\n\n try:\n # If this is being called within a `capture_interrupts` context, allow interrupts while\n # waiting for the execution to complete, so that we can terminate slow or hanging steps\n with raise_execution_interrupts():\n yield from self.step_events_iterator(step_context, step_key, 
databricks_run_id)\n except:\n # if executon is interrupted before the step is completed, cancel the run\n self.databricks_runner.client.workspace_client.jobs.cancel_run(databricks_run_id)\n raise\n finally:\n self.log_compute_logs(log, run_id, step_key)\n # this is somewhat obsolete\n if self.wait_for_logs:\n self._log_logs_from_cluster(log, databricks_run_id)\n\n def log_compute_logs(self, log: DagsterLogManager, run_id: str, step_key: str) -> None:\n try:\n stdout = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stdout")\n ).decode()\n log.info(f"Captured stdout for step {step_key}:")\n log.info(stdout)\n sys.stdout.write(stdout)\n except Exception as e:\n log.error(\n f"Encountered exception {e} when attempting to load stdout logs for step"\n f" {step_key}. Check the databricks console for more info."\n )\n try:\n stderr = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stderr")\n ).decode()\n log.info(f"Captured stderr for step {step_key}:")\n log.info(stderr)\n sys.stderr.write(stderr)\n except Exception as e:\n log.error(\n f"Encountered exception {e} when attempting to load stderr logs for step"\n f" {step_key}. Check the databricks console for more info."\n )\n\n def step_events_iterator(\n self, step_context: StepExecutionContext, step_key: str, databricks_run_id: int\n ) -> Iterator[DagsterEvent]:\n """The launched Databricks job writes all event records to a specific dbfs file. This iterator\n regularly reads the contents of the file, adds any events that have not yet been seen to\n the instance, and yields any DagsterEvents.\n\n By doing this, we simulate having the remote Databricks process able to directly write to\n the local DagsterInstance. 
Importantly, this means that timestamps (and all other record\n properties) will be sourced from the Databricks process, rather than recording when this\n process happens to log them.\n """\n check.int_param(databricks_run_id, "databricks_run_id")\n processed_events = 0\n start_poll_time = time.time()\n done = False\n step_context.log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n while not done:\n with raise_execution_interrupts():\n if self.verbose_logs:\n step_context.log.debug(\n "Waiting %.1f seconds...", self.databricks_runner.poll_interval_sec\n )\n time.sleep(self.databricks_runner.poll_interval_sec)\n try:\n done = self.databricks_runner.client.poll_run_state(\n logger=step_context.log,\n start_poll_time=start_poll_time,\n databricks_run_id=databricks_run_id,\n max_wait_time_sec=self.databricks_runner.max_wait_time_sec,\n verbose_logs=self.verbose_logs,\n )\n finally:\n all_events = self.get_step_events(\n step_context.run_id, step_key, step_context.previous_attempt_count\n )\n # we get all available records on each poll, but we only want to process the\n # ones we haven't seen before\n for event in all_events[processed_events:]:\n # write each event from the DataBricks instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.get_dagster_event()\n processed_events = len(all_events)\n\n step_context.log.info(f"Databricks run {databricks_run_id} completed.")\n\n def get_step_events(\n self, run_id: str, step_key: str, retry_number: int\n ) -> Sequence[EventLogEntry]:\n path = self._dbfs_path(run_id, step_key, f"{retry_number}_{PICKLED_EVENTS_FILE_NAME}")\n\n def _get_step_records() -> Sequence[EventLogEntry]:\n serialized_records = self.databricks_runner.client.read_file(path)\n if not serialized_records:\n return []\n return cast(\n Sequence[EventLogEntry],\n deserialize_value(pickle.loads(gzip.decompress(serialized_records))),\n )\n\n try:\n # reading from dbfs 
while it writes can be flaky\n # allow for retry if we get malformed data\n return backoff(\n fn=_get_step_records,\n retry_on=(pickle.UnpicklingError, OSError, zlib.error, EOFError),\n max_retries=4,\n )\n # if you poll before the Databricks process has had a chance to create the file,\n # we expect to get this error\n except DatabricksError as e:\n if e.error_code == "RESOURCE_DOES_NOT_EXIST":\n return []\n raise\n\n def _grant_permissions(\n self, log: DagsterLogManager, databricks_run_id: int, request_retries: int = 3\n ) -> None:\n client = self.databricks_runner.client.workspace_client\n # Retrieve run info\n cluster_id = None\n for i in range(1, request_retries + 1):\n run_info = client.jobs.get_run(databricks_run_id)\n # if a new job cluster is created, the cluster_instance key may not be immediately present in the run response\n try:\n cluster_id = run_info.cluster_instance.cluster_id\n break\n except:\n log.warning(\n f"Failed to retrieve cluster info for databricks_run_id {databricks_run_id}. "\n f"Retrying {i} of {request_retries} times."\n )\n time.sleep(5)\n if not cluster_id:\n log.warning(\n f"Failed to retrieve cluster info for databricks_run_id {databricks_run_id} "\n f"{request_retries} times. Skipping permission updates..."\n )\n return\n\n # Update job permissions\n if "job_permissions" in self.permissions:\n job_permissions = self._format_permissions(self.permissions["job_permissions"])\n job_id = run_info.job_id # type: ignore # (??)\n log.debug(f"Updating job permissions with following json: {job_permissions}")\n client.permissions.update("jobs", job_id, access_control_list=job_permissions)\n log.info("Successfully updated cluster permissions")\n\n # Update cluster permissions\n if "cluster_permissions" in self.permissions:\n if "existing" in self.run_config["cluster"]:\n raise ValueError(\n "Attempting to update permissions of an existing cluster. 
"\n "This is dangerous and thus unsupported."\n )\n cluster_permissions = self._format_permissions(self.permissions["cluster_permissions"])\n log.debug(f"Updating cluster permissions with following json: {cluster_permissions}")\n client.permissions.update(\n "clusters", cluster_id, access_control_list=cluster_permissions\n )\n log.info("Successfully updated cluster permissions")\n\n def _format_permissions(\n self, input_permissions: Mapping[str, Sequence[Mapping[str, str]]]\n ) -> Sequence[Mapping[str, str]]:\n access_control_list = []\n for permission, accessors in input_permissions.items():\n access_control_list.extend(\n [\n jobs.JobAccessControlRequest.from_dict(\n {"permission_level": permission, **accessor}\n )\n for accessor in accessors\n ]\n )\n return access_control_list\n\n def _get_databricks_task(self, run_id: str, step_key: str) -> Mapping[str, Any]:\n """Construct the 'task' parameter to be submitted to the Databricks API.\n\n This will create a 'spark_python_task' dict where `python_file` is a path on DBFS\n pointing to the 'databricks_step_main.py' file, and `parameters` is an array with a single\n element, a path on DBFS pointing to the picked `step_run_ref` data.\n\n See https://docs.databricks.com/dev-tools/api/latest/jobs.html#jobssparkpythontask.\n """\n python_file = self._dbfs_path(run_id, step_key, self._main_file_name())\n parameters = [\n self._internal_dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, CODE_ZIP_NAME),\n ]\n return {"spark_python_task": {"python_file": python_file, "parameters": parameters}}\n\n def _upload_artifacts(\n self, log: DagsterLogManager, step_run_ref: StepRunRef, run_id: str, step_key: str\n ) -> None:\n """Upload the step run ref and pyspark code to DBFS to run as a job."""\n log.info("Uploading main file to DBFS")\n main_local_path = self._main_file_local_path()\n with 
open(main_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, self._main_file_name()), overwrite=True\n )\n\n log.info("Uploading dagster job to DBFS")\n with tempfile.TemporaryDirectory() as temp_dir:\n # Zip and upload package containing dagster job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n build_pyspark_zip(zip_local_path, self.local_dagster_job_package_path)\n with open(zip_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME), overwrite=True\n )\n\n log.info("Uploading step run ref file to DBFS")\n step_pickle_file = io.BytesIO()\n\n pickle.dump(step_run_ref, step_pickle_file)\n step_pickle_file.seek(0)\n self.databricks_runner.client.put_file(\n step_pickle_file,\n self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n overwrite=True,\n )\n\n databricks_config = self.create_remote_config()\n log.info("Uploading Databricks configuration to DBFS")\n databricks_config_file = io.BytesIO()\n pickle.dump(databricks_config, databricks_config_file)\n databricks_config_file.seek(0)\n self.databricks_runner.client.put_file(\n databricks_config_file,\n self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n overwrite=True,\n )\n\n def get_dagster_env_variables(self) -> Dict[str, str]:\n out = {}\n if self.add_dagster_env_variables:\n for var in DAGSTER_SYSTEM_ENV_VARS:\n if os.getenv(var):\n out.update({var: os.getenv(var)})\n return out\n\n def create_remote_config(self) -> "DatabricksConfig":\n env_variables = self.get_dagster_env_variables()\n env_variables.update(self.env_variables)\n databricks_config = DatabricksConfig(\n env_variables=env_variables,\n storage=self.storage,\n secrets=self.secrets,\n )\n return databricks_config\n\n def _log_logs_from_cluster(self, log: DagsterLogManager, run_id: int) -> None:\n logs = self.databricks_runner.retrieve_logs_for_run_id(log, run_id)\n if logs is 
None:\n return\n stdout, stderr = logs\n if stderr:\n log.info(stderr)\n if stdout:\n log.info(stdout)\n\n def _main_file_name(self) -> str:\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self) -> str:\n return databricks_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _dbfs_path(self, run_id: str, step_key: str, filename: str) -> str:\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return f"dbfs://{path}"\n\n def _internal_dbfs_path(self, run_id: str, step_key: str, filename: str) -> str:\n """Scripts running on Databricks should access DBFS at /dbfs/."""\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return f"/dbfs/{path}"\n\n\nclass DatabricksConfig:\n """Represents configuration required by Databricks to run jobs.\n\n Instances of this class will be created when a Databricks step is launched and will contain\n all configuration and secrets required to set up storage and environment variables within\n the Databricks environment. The instance will be serialized and uploaded to Databricks\n by the step launcher, then deserialized as part of the 'main' script when the job is running\n in Databricks.\n\n The `setup` method handles the actual setup prior to op execution on the Databricks side.\n\n This config is separated out from the regular Dagster run config system because the setup\n is done by the 'main' script before entering a Dagster context (i.e. 
using `run_step_from_ref`).\n We use a separate class to avoid coupling the setup to the format of the `step_run_ref` object.\n """\n\n def __init__(\n self,\n env_variables: Mapping[str, str],\n storage: Mapping[str, Any],\n secrets: Sequence[Mapping[str, Any]],\n ):\n """Create a new DatabricksConfig object.\n\n `storage` and `secrets` should be of the same shape as the `storage` and\n `secrets_to_env_variables` config passed to `databricks_pyspark_step_launcher`.\n """\n self.env_variables = env_variables\n self.storage = storage\n self.secrets = secrets\n\n def setup(self, dbutils: Any, sc: Any) -> None:\n """Set up storage and environment variables on Databricks.\n\n The `dbutils` and `sc` arguments must be passed in by the 'main' script, as they\n aren't accessible by any other modules.\n """\n self.setup_storage(dbutils, sc)\n self.setup_environment(dbutils)\n\n def setup_storage(self, dbutils: Any, sc: Any) -> None:\n """Set up storage using either S3 or ADLS2."""\n if "s3" in self.storage:\n self.setup_s3_storage(self.storage["s3"], dbutils, sc)\n elif "adls2" in self.storage:\n self.setup_adls2_storage(self.storage["adls2"], dbutils, sc)\n\n def setup_s3_storage(self, s3_storage: Mapping[str, Any], dbutils: Any, sc: Any) -> None:\n """Obtain AWS credentials from Databricks secrets and export so both Spark and boto can use them."""\n scope = s3_storage["secret_scope"]\n\n access_key = dbutils.secrets.get(scope=scope, key=s3_storage["access_key_key"])\n secret_key = dbutils.secrets.get(scope=scope, key=s3_storage["secret_key_key"])\n\n # Spark APIs will use this.\n # See https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context.\n sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", access_key) # noqa: SLF001\n sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", secret_key) # noqa: SLF001\n\n # Boto will use these.\n os.environ["AWS_ACCESS_KEY_ID"] = access_key\n 
os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key\n\n def setup_adls2_storage(self, adls2_storage: Mapping[str, Any], dbutils: Any, sc: Any) -> None:\n """Obtain an Azure Storage Account key from Databricks secrets and export so Spark can use it."""\n storage_account_key = dbutils.secrets.get(\n scope=adls2_storage["secret_scope"], key=adls2_storage["storage_account_key_key"]\n )\n # Spark APIs will use this.\n # See https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n # sc is globally defined in the Databricks runtime and points to the Spark context\n sc._jsc.hadoopConfiguration().set( # noqa: SLF001\n "fs.azure.account.key.{}.dfs.core.windows.net".format(\n adls2_storage["storage_account_name"]\n ),\n storage_account_key,\n )\n\n def setup_environment(self, dbutils: Any) -> None:\n """Setup any environment variables required by the run.\n\n Extract any secrets in the run config and export them as environment variables.\n\n This is important for any `StringSource` config since the environment variables\n won't ordinarily be available in the Databricks execution environment.\n """\n for env_k, env_v in self.env_variables.items():\n os.environ[env_k] = env_v\n\n for secret in self.secrets:\n name = secret["name"]\n key = secret["key"]\n scope = secret["scope"]\n print(f"Exporting {name} from Databricks secret {key}, scope {scope}") # noqa: T201\n val = dbutils.secrets.get(scope=scope, key=key)\n os.environ[name] = val\n
", "current_page_name": "_modules/dagster_databricks/databricks_pyspark_step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.databricks_pyspark_step_launcher"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.ops

\nfrom typing import TYPE_CHECKING, Optional\n\nfrom dagster import (\n    In,\n    Nothing,\n    OpExecutionContext,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom databricks.sdk.service import jobs\nfrom pydantic import Field\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n# wait at most 24 hours by default for run execution\nDEFAULT_MAX_WAIT_TIME_SECONDS = 24 * 60 * 60\nfrom dagster import Config\n\nif TYPE_CHECKING:\n    from .databricks import DatabricksClient\n\n\n
[docs]def create_databricks_run_now_op(\n databricks_job_id: int,\n databricks_job_configuration: Optional[dict] = None,\n poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,\n max_wait_time_seconds: float = DEFAULT_MAX_WAIT_TIME_SECONDS,\n name: Optional[str] = None,\n databricks_resource_key: str = "databricks",\n) -> OpDefinition:\n """Creates an op that launches an existing databricks job.\n\n As config, the op accepts a blob of the form described in Databricks' Job API:\n https://docs.databricks.com/api-explorer/workspace/jobs/runnow. The only required field is\n ``job_id``, which is the ID of the job to be executed. Additional fields can be used to specify\n override parameters for the Databricks Job.\n\n Arguments:\n databricks_job_id (int): The ID of the Databricks Job to be executed.\n databricks_job_configuration (dict): Configuration for triggering a new job run of a\n Databricks Job. See https://docs.databricks.com/api-explorer/workspace/jobs/runnow\n for the full configuration.\n poll_interval_seconds (float): How often to poll the Databricks API to check whether the\n Databricks job has finished running.\n max_wait_time_seconds (float): How long to wait for the Databricks job to finish running\n before raising an error.\n name (Optional[str]): The name of the op. If not provided, the name will be\n _databricks_run_now_op.\n databricks_resource_key (str): The name of the resource key used by this op. If not\n provided, the resource key will be "databricks".\n\n Returns:\n OpDefinition: An op definition to run the Databricks Job.\n\n Example:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_databricks import create_databricks_run_now_op, DatabricksClientResource\n\n DATABRICKS_JOB_ID = 1234\n\n\n run_now_op = create_databricks_run_now_op(\n databricks_job_id=DATABRICKS_JOB_ID,\n databricks_job_configuration={\n "python_params": [\n "--input",\n "schema.db.input_table",\n "--output",\n "schema.db.output_table",\n ],\n },\n )\n\n @job(\n resource_defs={\n "databricks": DatabricksClientResource(\n host=EnvVar("DATABRICKS_HOST"),\n token=EnvVar("DATABRICKS_TOKEN")\n )\n }\n )\n def do_stuff():\n run_now_op()\n """\n _poll_interval_seconds = poll_interval_seconds\n _max_wait_time_seconds = max_wait_time_seconds\n\n class DatabricksRunNowOpConfig(Config):\n poll_interval_seconds: float = Field(\n default=_poll_interval_seconds,\n description="Check whether the Databricks Job is done at this interval, in seconds.",\n )\n max_wait_time_seconds: int = Field(\n default=_max_wait_time_seconds,\n description=(\n "If the Databricks Job is not complete after this length of time, in seconds,"\n " raise an error."\n ),\n )\n\n @op(\n ins={"start_after": In(Nothing)},\n required_resource_keys={databricks_resource_key},\n tags={"kind": "databricks"},\n name=name,\n )\n def _databricks_run_now_op(context: OpExecutionContext, config: DatabricksRunNowOpConfig):\n databricks: DatabricksClient = getattr(context.resources, databricks_resource_key)\n jobs_service = databricks.workspace_client.jobs\n\n run = jobs_service.run_now(\n job_id=databricks_job_id,\n **(databricks_job_configuration or {}),\n )\n run_id = run.bind()["run_id"]\n\n get_run_response = jobs_service.get_run(run_id=run_id)\n\n context.log.info(\n f"Launched databricks job run for '{get_run_response.run_name}' (`{run_id}`). URL:"\n f" {get_run_response.run_page_url}. 
Waiting to run to complete."\n )\n\n databricks.wait_for_run_to_complete(\n logger=context.log,\n databricks_run_id=run_id,\n poll_interval_sec=config.poll_interval_seconds,\n max_wait_time_sec=config.max_wait_time_seconds,\n )\n\n return _databricks_run_now_op
\n\n\n
[docs]def create_databricks_submit_run_op(\n databricks_job_configuration: dict,\n poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,\n max_wait_time_seconds: float = DEFAULT_MAX_WAIT_TIME_SECONDS,\n name: Optional[str] = None,\n databricks_resource_key: str = "databricks",\n) -> OpDefinition:\n """Creates an op that submits a one-time run of a set of tasks on Databricks.\n\n As config, the op accepts a blob of the form described in Databricks' Job API:\n https://docs.databricks.com/api-explorer/workspace/jobs/submit.\n\n Arguments:\n databricks_job_configuration (dict): Configuration for submitting a one-time run of a set\n of tasks on Databricks. See https://docs.databricks.com/api-explorer/workspace/jobs/submit\n for the full configuration.\n poll_interval_seconds (float): How often to poll the Databricks API to check whether the\n Databricks job has finished running.\n max_wait_time_seconds (float): How long to wait for the Databricks job to finish running\n before raising an error.\n name (Optional[str]): The name of the op. If not provided, the name will be\n _databricks_submit_run_op.\n databricks_resource_key (str): The name of the resource key used by this op. If not\n provided, the resource key will be "databricks".\n\n Returns:\n OpDefinition: An op definition to submit a one-time run of a set of tasks on Databricks.\n\n Example:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_databricks import create_databricks_submit_run_op, DatabricksClientResource\n\n\n submit_run_op = create_databricks_submit_run_op(\n databricks_job_configuration={\n "new_cluster": {\n "spark_version": '2.1.0-db3-scala2.11',\n "num_workers": 2\n },\n "notebook_task": {\n "notebook_path": "/Users/dagster@example.com/PrepareData",\n },\n }\n )\n\n @job(\n resource_defs={\n "databricks": DatabricksClientResource(\n host=EnvVar("DATABRICKS_HOST"),\n token=EnvVar("DATABRICKS_TOKEN")\n )\n }\n )\n def do_stuff():\n submit_run_op()\n """\n check.invariant(\n bool(databricks_job_configuration),\n "Configuration for the one-time Databricks Job is required.",\n )\n\n _poll_interval_seconds = poll_interval_seconds\n _max_wait_time_seconds = max_wait_time_seconds\n\n class DatabricksSubmitRunOpConfig(Config):\n poll_interval_seconds: float = Field(\n default=_poll_interval_seconds,\n description="Check whether the Databricks Job is done at this interval, in seconds.",\n )\n max_wait_time_seconds: int = Field(\n default=_max_wait_time_seconds,\n description=(\n "If the Databricks Job is not complete after this length of time, in seconds,"\n " raise an error."\n ),\n )\n\n @op(\n ins={"start_after": In(Nothing)},\n required_resource_keys={databricks_resource_key},\n tags={"kind": "databricks"},\n name=name,\n )\n def _databricks_submit_run_op(\n context: OpExecutionContext, config: DatabricksSubmitRunOpConfig\n ) -> None:\n databricks: DatabricksClient = getattr(context.resources, databricks_resource_key)\n jobs_service = databricks.workspace_client.jobs\n\n run = jobs_service.submit(\n tasks=[jobs.SubmitTask.from_dict(databricks_job_configuration)],\n )\n run_id: int = run.bind()["run_id"]\n\n get_run_response = jobs_service.get_run(run_id=run_id)\n\n context.log.info(\n f"Launched databricks job run for '{get_run_response.run_name}' (`{run_id}`). URL:"\n f" {get_run_response.run_page_url}. 
Waiting to run to complete."\n )\n\n databricks.wait_for_run_to_complete(\n logger=context.log,\n databricks_run_id=run_id,\n poll_interval_sec=config.poll_interval_seconds,\n max_wait_time_sec=config.max_wait_time_seconds,\n )\n\n return _databricks_submit_run_op
\n
", "current_page_name": "_modules/dagster_databricks/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.ops"}, "pipes": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.pipes

\nimport base64\nimport json\nimport os\nimport random\nimport string\nimport sys\nimport time\nfrom contextlib import ExitStack, contextmanager\nfrom typing import Iterator, Literal, Mapping, Optional, TextIO\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.resource_annotation import ResourceParam\nfrom dagster._core.errors import DagsterPipesExecutionError\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dagster._core.pipes.client import (\n    PipesClient,\n    PipesClientCompletedInvocation,\n    PipesContextInjector,\n    PipesMessageReader,\n)\nfrom dagster._core.pipes.utils import (\n    PipesBlobStoreMessageReader,\n    PipesBlobStoreStdioReader,\n    PipesChunkedStdioReader,\n    open_pipes_session,\n)\nfrom dagster_pipes import (\n    DAGSTER_PIPES_MESSAGES_ENV_VAR,\n    PipesContextData,\n    PipesExtras,\n    PipesParams,\n)\nfrom databricks.sdk import WorkspaceClient\nfrom databricks.sdk.service import files, jobs\nfrom pydantic import Field\n\n# Number of seconds between status checks on Databricks jobs launched by the\n# `PipesDatabricksClient`.\n_RUN_POLL_INTERVAL = 5\n\n\n@experimental\nclass _PipesDatabricksClient(PipesClient):\n    """Pipes client for databricks.\n\n    Args:\n        client (WorkspaceClient): A databricks `WorkspaceClient` object.\n        env (Optional[Mapping[str,str]]: An optional dict of environment variables to pass to the\n            databricks job.\n        context_injector (Optional[PipesContextInjector]): A context injector to use to inject\n            context into the k8s container process. Defaults to :py:class:`PipesDbfsContextInjector`.\n        message_reader (Optional[PipesMessageReader]): A message reader to use to read messages\n            from the databricks job. 
Defaults to :py:class:`PipesDbfsMessageReader`.\n    """\n\n    env: Optional[Mapping[str, str]] = Field(\n        default=None,\n        description="An optional dict of environment variables to pass to the subprocess.",\n    )\n\n    def __init__(\n        self,\n        client: WorkspaceClient,\n        env: Optional[Mapping[str, str]] = None,\n        context_injector: Optional[PipesContextInjector] = None,\n        message_reader: Optional[PipesMessageReader] = None,\n    ):\n        self.client = client\n        self.env = env\n        self.context_injector = check.opt_inst_param(\n            context_injector,\n            "context_injector",\n            PipesContextInjector,\n        ) or PipesDbfsContextInjector(client=self.client)\n        self.message_reader = check.opt_inst_param(\n            message_reader,\n            "message_reader",\n            PipesMessageReader,\n        ) or PipesDbfsMessageReader(\n            client=self.client,\n            stdout_reader=PipesDbfsStdioReader(\n                client=self.client, remote_log_name="stdout", target_stream=sys.stdout\n            ),\n            stderr_reader=PipesDbfsStdioReader(\n                client=self.client, remote_log_name="stderr", target_stream=sys.stderr\n            ),\n        )\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    def run(\n        self,\n        *,\n        context: OpExecutionContext,\n        extras: Optional[PipesExtras] = None,\n        task: jobs.SubmitTask,\n        submit_args: Optional[Mapping[str, str]] = None,\n    ) -> PipesClientCompletedInvocation:\n        """Synchronously execute a Databricks job with the pipes protocol.\n\n        Args:\n            task (databricks.sdk.service.jobs.SubmitTask): Specification of the databricks\n                task to run. 
Environment variables used by dagster-pipes will be set under the\n                `spark_env_vars` key of the `new_cluster` field (if there is an existing dictionary\n                here, the EXT environment variables will be merged in). Everything else will be\n                passed unaltered under the `tasks` arg to `WorkspaceClient.jobs.submit`.\n            context (OpExecutionContext): The context from the executing op or asset.\n            extras (Optional[PipesExtras]): An optional dict of extra parameters to pass to the\n                subprocess.\n            submit_args (Optional[Mapping[str, str]]): Additional keyword arguments that will be\n                forwarded as-is to `WorkspaceClient.jobs.submit`.\n\n        Returns:\n            PipesClientCompletedInvocation: Wrapper containing results reported by the external\n                process.\n        """\n        with open_pipes_session(\n            context=context,\n            extras=extras,\n            context_injector=self.context_injector,\n            message_reader=self.message_reader,\n        ) as pipes_session:\n            submit_task_dict = task.as_dict()\n            submit_task_dict["new_cluster"]["spark_env_vars"] = {\n                **submit_task_dict["new_cluster"].get("spark_env_vars", {}),\n                **(self.env or {}),\n                **pipes_session.get_bootstrap_env_vars(),\n            }\n            cluster_log_root = pipes_session.get_bootstrap_params()[\n                DAGSTER_PIPES_MESSAGES_ENV_VAR\n            ].get("cluster_log_root")\n            if cluster_log_root is not None:\n                submit_task_dict["new_cluster"]["cluster_log_conf"] = {\n                    "dbfs": {"destination": f"dbfs:{cluster_log_root}"}\n                }\n            task = jobs.SubmitTask.from_dict(submit_task_dict)\n            run_id = self.client.jobs.submit(\n                tasks=[task],\n                **(submit_args or {}),\n            ).bind()["run_id"]\n\n 
           while True:\n                run = self.client.jobs.get_run(run_id)\n                context.log.info(\n                    f"Databricks run {run_id} current state: {run.state.life_cycle_state}"\n                )\n                if run.state.life_cycle_state in (\n                    jobs.RunLifeCycleState.TERMINATED,\n                    jobs.RunLifeCycleState.SKIPPED,\n                ):\n                    if run.state.result_state == jobs.RunResultState.SUCCESS:\n                        break\n                    else:\n                        raise DagsterPipesExecutionError(\n                            f"Error running Databricks job: {run.state.state_message}"\n                        )\n                elif run.state.life_cycle_state == jobs.RunLifeCycleState.INTERNAL_ERROR:\n                    raise DagsterPipesExecutionError(\n                        f"Error running Databricks job: {run.state.state_message}"\n                    )\n                time.sleep(_RUN_POLL_INTERVAL)\n        return PipesClientCompletedInvocation(tuple(pipes_session.get_results()))\n\n\nPipesDatabricksClient = ResourceParam[_PipesDatabricksClient]\n\n_CONTEXT_FILENAME = "context.json"\n\n\n@contextmanager\ndef dbfs_tempdir(dbfs_client: files.DbfsAPI) -> Iterator[str]:\n    dirname = "".join(random.choices(string.ascii_letters, k=30))\n    tempdir = f"/tmp/{dirname}"\n    dbfs_client.mkdirs(tempdir)\n    try:\n        yield tempdir\n    finally:\n        dbfs_client.delete(tempdir, recursive=True)\n\n\n
[docs]@experimental\nclass PipesDbfsContextInjector(PipesContextInjector):\n """A context injector that injects context into a Databricks job by writing a JSON file to DBFS.\n\n Args:\n client (WorkspaceClient): A databricks `WorkspaceClient` object.\n """\n\n def __init__(self, *, client: WorkspaceClient):\n super().__init__()\n self.dbfs_client = files.DbfsAPI(client.api_client)\n\n @contextmanager\n def inject_context(self, context: "PipesContextData") -> Iterator[PipesParams]:\n """Inject context to external environment by writing it to an automatically-generated\n DBFS temporary file as JSON and exposing the path to the file.\n\n Args:\n context_data (PipesContextData): The context data to inject.\n\n Yields:\n PipesParams: A dict of parameters that can be used by the external process to locate and\n load the injected context data.\n """\n with dbfs_tempdir(self.dbfs_client) as tempdir:\n path = os.path.join(tempdir, _CONTEXT_FILENAME)\n contents = base64.b64encode(json.dumps(context).encode("utf-8")).decode("utf-8")\n self.dbfs_client.put(path, contents=contents, overwrite=True)\n yield {"path": path}\n\n def no_messages_debug_text(self) -> str:\n return (\n "Attempted to inject context via a temporary file in dbfs. Expected"\n " PipesDbfsContextLoader to be explicitly passed to open_dagster_pipes in the external"\n " process."\n )
\n\n\n
[docs]@experimental\nclass PipesDbfsMessageReader(PipesBlobStoreMessageReader):\n """Message reader that reads messages by periodically reading message chunks from an\n automatically-generated temporary directory on DBFS.\n\n If `stdout_reader` or `stderr_reader` are passed, this reader will also start them when\n `read_messages` is called. If they are not passed, then the reader performs no stdout/stderr\n forwarding.\n\n Args:\n interval (float): interval in seconds between attempts to download a chunk\n client (WorkspaceClient): A databricks `WorkspaceClient` object.\n cluster_log_root (Optional[str]): The root path on DBFS where the cluster logs are written.\n If set, this will be used to read stderr/stdout logs.\n stdout_reader (Optional[PipesBlobStoreStdioReader]): A reader for reading stdout logs.\n stderr_reader (Optional[PipesBlobStoreStdioReader]): A reader for reading stderr logs.\n """\n\n def __init__(\n self,\n *,\n interval: float = 10,\n client: WorkspaceClient,\n stdout_reader: Optional[PipesBlobStoreStdioReader] = None,\n stderr_reader: Optional[PipesBlobStoreStdioReader] = None,\n ):\n super().__init__(\n interval=interval, stdout_reader=stdout_reader, stderr_reader=stderr_reader\n )\n self.dbfs_client = files.DbfsAPI(client.api_client)\n\n @contextmanager\n def get_params(self) -> Iterator[PipesParams]:\n with ExitStack() as stack:\n params: PipesParams = {}\n params["path"] = stack.enter_context(dbfs_tempdir(self.dbfs_client))\n if self.stdout_reader or self.stderr_reader:\n params["cluster_log_root"] = stack.enter_context(dbfs_tempdir(self.dbfs_client))\n yield params\n\n def download_messages_chunk(self, index: int, params: PipesParams) -> Optional[str]:\n message_path = os.path.join(params["path"], f"{index}.json")\n try:\n raw_message = self.dbfs_client.read(message_path)\n # Files written to dbfs using the Python IO interface used in PipesDbfsMessageWriter are\n # base64-encoded.\n return 
base64.b64decode(raw_message.data).decode("utf-8")\n # An error here is an expected result, since an IOError will be thrown if the next message\n # chunk doesn't yet exist. Swallowing the error here is equivalent to doing a no-op on a\n # status check showing a non-existent file.\n except IOError:\n return None\n\n def no_messages_debug_text(self) -> str:\n return (\n "Attempted to read messages from a temporary file in dbfs. Expected"\n " PipesDbfsMessageWriter to be explicitly passed to open_dagster_pipes in the external"\n " process."\n )
\n\n\n@experimental\nclass PipesDbfsStdioReader(PipesChunkedStdioReader):\n """Reader that reads stdout/stderr logs from DBFS.\n\n Args:\n interval (float): interval in seconds between attempts to download a log chunk\n remote_log_name (Literal["stdout", "stderr"]): The name of the log file to read.\n target_stream (TextIO): The stream to which to forward log chunk that have been read.\n client (WorkspaceClient): A databricks `WorkspaceClient` object.\n """\n\n def __init__(\n self,\n *,\n interval: float = 10,\n remote_log_name: Literal["stdout", "stderr"],\n target_stream: TextIO,\n client: WorkspaceClient,\n ):\n super().__init__(interval=interval, target_stream=target_stream)\n self.dbfs_client = files.DbfsAPI(client.api_client)\n self.remote_log_name = remote_log_name\n self.log_position = 0\n self.log_path = None\n\n def download_log_chunk(self, params: PipesParams) -> Optional[str]:\n log_path = self._get_log_path(params)\n if log_path is None:\n return None\n else:\n try:\n read_response = self.dbfs_client.read(log_path)\n assert read_response.data\n content = base64.b64decode(read_response.data).decode("utf-8")\n chunk = content[self.log_position :]\n self.log_position = len(content)\n return chunk\n except IOError:\n return None\n\n def is_ready(self, params: PipesParams) -> bool:\n return self._get_log_path(params) is not None\n\n # The directory containing logs will not exist until either 5 minutes have elapsed or the\n # job has finished.\n def _get_log_path(self, params: PipesParams) -> Optional[str]:\n if self.log_path is None:\n log_root_path = os.path.join(params["cluster_log_root"])\n child_dirs = list(self.dbfs_client.list(log_root_path))\n if len(child_dirs) > 0:\n self.log_path = f"dbfs:{child_dirs[0].path}/driver/{self.remote_log_name}"\n return self.log_path\n
", "current_page_name": "_modules/dagster_databricks/pipes", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.pipes"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.resources

\nfrom typing import Any, Optional\n\nfrom dagster import (\n    Config,\n    ConfigurableResource,\n    IAttachDifferentObjectToOpContext,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field, root_validator\n\nfrom .databricks import DatabricksClient\n\n\nclass OauthCredentials(Config):\n    """OAuth credentials for Databricks.\n\n    See https://docs.databricks.com/dev-tools/api/latest/authentication.html#oauth-2-0.\n    """\n\n    client_id: str = Field(description="OAuth client ID")\n    client_secret: str = Field(description="OAuth client secret")\n\n\n
[docs]class DatabricksClientResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource which provides a Python client for interacting with Databricks within an\n op or asset.\n """\n\n host: str = Field(description="Databricks host, e.g. https://uksouth.azuredatabricks.com")\n token: Optional[str] = Field(default=None, description="Databricks access token")\n oauth_credentials: Optional[OauthCredentials] = Field(\n default=None,\n description=(\n "Databricks OAuth credentials for using a service principal. See"\n " https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0"\n ),\n )\n workspace_id: Optional[str] = Field(\n default=None,\n description=(\n "DEPRECATED: The Databricks workspace ID, as described in"\n " https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids."\n " This is no longer used and will be removed in a 0.21."\n ),\n )\n\n @root_validator()\n def has_token_or_oauth_credentials(cls, values):\n token = values.get("token")\n oauth_credentials = values.get("oauth_credentials")\n if not token and not oauth_credentials:\n raise ValueError("Must provide either token or oauth_credentials")\n if token and oauth_credentials:\n raise ValueError("Must provide either token or oauth_credentials, not both")\n return values\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> DatabricksClient:\n if self.oauth_credentials:\n client_id = self.oauth_credentials.client_id\n client_secret = self.oauth_credentials.client_secret\n else:\n client_id = None\n client_secret = None\n\n return DatabricksClient(\n host=self.host,\n token=self.token,\n oauth_client_id=client_id,\n oauth_client_secret=client_secret,\n workspace_id=self.workspace_id,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatabricksClientResource.to_config_schema())\ndef databricks_client(init_context) -> DatabricksClient:\n return DatabricksClientResource.from_resource_context(init_context).get_client()
\n
", "current_page_name": "_modules/dagster_databricks/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.resources"}}, "dagster_datadog": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_datadog.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom datadog import DogStatsd, initialize, statsd\nfrom pydantic import Field\n\n\nclass DatadogClient:\n    # Mirroring levels from the dogstatsd library\n    OK, WARNING, CRITICAL, UNKNOWN = (\n        DogStatsd.OK,\n        DogStatsd.WARNING,\n        DogStatsd.CRITICAL,\n        DogStatsd.UNKNOWN,\n    )\n\n    def __init__(self, api_key: str, app_key: str):\n        self.api_key = api_key\n        self.app_key = app_key\n        initialize(api_key=api_key, app_key=app_key)\n\n        # Pull in methods from the dogstatsd library\n        for method in [\n            "event",\n            "gauge",\n            "increment",\n            "decrement",\n            "histogram",\n            "distribution",\n            "set",\n            "service_check",\n            "timed",\n            "timing",\n        ]:\n            setattr(self, method, getattr(statsd, method))\n\n\n
[docs]class DatadogResource(ConfigurableResource):\n """This resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n .. code-block:: python\n\n @op\n def datadog_op(datadog_client: ResourceParam[DatadogClient]):\n datadog_client.event('Man down!', 'This server needs assistance.')\n datadog_client.gauge('users.online', 1001, tags=["protocol:http"])\n datadog_client.increment('page.views')\n datadog_client.decrement('page.views')\n datadog_client.histogram('album.photo.count', 26, tags=["gender:female"])\n datadog_client.distribution('album.photo.count', 26, tags=["color:blue"])\n datadog_client.set('visitors.uniques', 999, tags=["browser:ie"])\n datadog_client.service_check('svc.check_name', datadog_client.WARNING)\n datadog_client.timing("query.response.time", 1234)\n\n # Use timed decorator\n @datadog_client.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job\n def job_for_datadog_op() -> None:\n datadog_op()\n\n job_for_datadog_op.execute_in_process(\n resources={"datadog_client": DatadogResource(api_key="FOO", app_key="BAR")}\n )\n\n """\n\n api_key: str = Field(\n description=(\n "Datadog API key. See https://docs.datadoghq.com/account_management/api-app-keys/"\n )\n )\n app_key: str = Field(\n description=(\n "Datadog application key. See"\n " https://docs.datadoghq.com/account_management/api-app-keys/."\n )\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> DatadogClient:\n return DatadogClient(self.api_key, self.app_key)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=DatadogResource.to_config_schema(),\n description="This resource is for publishing to DataDog",\n)\ndef datadog_resource(context) -> DatadogClient:\n """This legacy resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n Prefer using :py:class:`DatadogResource`.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n .. code-block:: python\n\n @op(required_resource_keys={'datadog'})\n def datadog_op(context):\n dd = context.resources.datadog\n\n dd.event('Man down!', 'This server needs assistance.')\n dd.gauge('users.online', 1001, tags=["protocol:http"])\n dd.increment('page.views')\n dd.decrement('page.views')\n dd.histogram('album.photo.count', 26, tags=["gender:female"])\n dd.distribution('album.photo.count', 26, tags=["color:blue"])\n dd.set('visitors.uniques', 999, tags=["browser:ie"])\n dd.service_check('svc.check_name', dd.WARNING)\n dd.timing("query.response.time", 1234)\n\n # Use timed decorator\n @dd.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job(resource_defs={'datadog': datadog_resource})\n def dd_job():\n datadog_op()\n\n result = dd_job.execute_in_process(\n run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n )\n\n """\n return DatadogResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_datadog/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_datadog.resources"}}, "dagster_datahub": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_datahub.resources

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import InitResourceContext, resource\nfrom dagster._config.pythonic_config import Config, ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom datahub.emitter.kafka_emitter import (\n    DEFAULT_MCE_KAFKA_TOPIC,\n    DEFAULT_MCP_KAFKA_TOPIC,\n    MCE_KEY,\n    MCP_KEY,\n    DatahubKafkaEmitter,\n    KafkaEmitterConfig,\n)\nfrom datahub.emitter.rest_emitter import DatahubRestEmitter\nfrom pydantic import Field\n\n\n
[docs]class DatahubRESTEmitterResource(ConfigurableResource):\n connection: str = Field(description="Datahub GMS Server")\n token: Optional[str] = Field(default=None, description="Personal Access Token")\n connect_timeout_sec: Optional[float] = None\n read_timeout_sec: Optional[float] = None\n retry_status_codes: Optional[List[int]] = None\n retry_methods: Optional[List[str]] = None\n retry_max_times: Optional[int] = None\n extra_headers: Optional[Dict[str, str]] = None\n ca_certificate_path: Optional[str] = None\n server_telemetry_id: Optional[str] = None # No-op - no longer accepted in DatahubRestEmitter\n disable_ssl_verification: bool = False\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_emitter(self) -> DatahubRestEmitter:\n return DatahubRestEmitter(\n gms_server=self.connection,\n token=self.token,\n connect_timeout_sec=self.connect_timeout_sec,\n read_timeout_sec=self.read_timeout_sec,\n retry_status_codes=self.retry_status_codes,\n retry_methods=self.retry_methods,\n retry_max_times=self.retry_max_times,\n extra_headers=self.extra_headers,\n ca_certificate_path=self.ca_certificate_path,\n disable_ssl_verification=self.disable_ssl_verification,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatahubRESTEmitterResource.to_config_schema())\ndef datahub_rest_emitter(init_context: InitResourceContext) -> DatahubRestEmitter:\n emitter = DatahubRestEmitter(\n gms_server=init_context.resource_config.get("connection"),\n token=init_context.resource_config.get("token"),\n connect_timeout_sec=init_context.resource_config.get("connect_timeout_sec"),\n read_timeout_sec=init_context.resource_config.get("read_timeout_sec"),\n retry_status_codes=init_context.resource_config.get("retry_status_codes"),\n retry_methods=init_context.resource_config.get("retry_methods"),\n retry_max_times=init_context.resource_config.get("retry_max_times"),\n extra_headers=init_context.resource_config.get("extra_headers"),\n ca_certificate_path=init_context.resource_config.get("ca_certificate_path"),\n disable_ssl_verification=init_context.resource_config.get("disable_ssl_verification"),\n )\n # Attempt to hit the server to ensure the resource is properly configured\n emitter.test_connection()\n return emitter
\n\n\nclass DatahubConnection(Config):\n bootstrap: str = Field(description="Kafka Boostrap Servers. Comma delimited")\n schema_registry_url: str = Field(description="Schema Registry Location.")\n schema_registry_config: Dict[str, Any] = Field(\n default={}, description="Extra Schema Registry Config."\n )\n\n\n
[docs]class DatahubKafkaEmitterResource(ConfigurableResource):\n connection: DatahubConnection\n topic: Optional[str] = None\n topic_routes: Dict[str, str] = Field(\n default={\n MCE_KEY: DEFAULT_MCE_KAFKA_TOPIC,\n MCP_KEY: DEFAULT_MCP_KAFKA_TOPIC,\n }\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_emitter(self) -> DatahubKafkaEmitter:\n return DatahubKafkaEmitter(\n KafkaEmitterConfig.parse_obj(\n {k: v for k, v in self._convert_to_config_dictionary().items() if v is not None}\n )\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatahubKafkaEmitterResource.to_config_schema())\ndef datahub_kafka_emitter(init_context: InitResourceContext) -> DatahubKafkaEmitter:\n return DatahubKafkaEmitter(KafkaEmitterConfig.parse_obj(init_context.resource_config))
\n
", "current_page_name": "_modules/dagster_datahub/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_datahub.resources"}}, "dagster_dbt": {"asset_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_decorator

\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    AssetCheckSpec,\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    BackfillPolicy,\n    DagsterInvalidDefinitionError,\n    Nothing,\n    PartitionsDefinition,\n    multi_asset,\n)\n\nfrom .asset_utils import (\n    DAGSTER_DBT_TRANSLATOR_METADATA_KEY,\n    MANIFEST_METADATA_KEY,\n    default_asset_check_fn,\n    default_code_version_fn,\n    get_deps,\n)\nfrom .dagster_dbt_translator import DagsterDbtTranslator, DbtManifestWrapper\nfrom .dbt_manifest import DbtManifestParam, validate_manifest\nfrom .utils import (\n    ASSET_RESOURCE_TYPES,\n    get_dbt_resource_props_by_dbt_unique_id_from_manifest,\n    output_name_fn,\n    select_unique_ids_from_manifest,\n)\n\n\n
[docs]def dbt_assets(\n *,\n manifest: DbtManifestParam,\n select: str = "fqn:*",\n exclude: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n dagster_dbt_translator: DagsterDbtTranslator = DagsterDbtTranslator(),\n backfill_policy: Optional[BackfillPolicy] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n) -> Callable[..., AssetsDefinition]:\n """Create a definition for how to compute a set of dbt resources, described by a manifest.json.\n When invoking dbt commands using :py:class:`~dagster_dbt.DbtCliResource`'s\n :py:meth:`~dagster_dbt.DbtCliResource.cli` method, Dagster events are emitted by calling\n ``yield from`` on the event stream returned by :py:meth:`~dagster_dbt.DbtCliInvocation.stream`.\n\n Args:\n manifest (Union[Mapping[str, Any], str, Path]): The contents of a manifest.json file\n or the path to a manifest.json file. A manifest.json contains a representation of a\n dbt project (models, tests, macros, etc). We use this representation to create\n corresponding Dagster assets.\n select (str): A dbt selection string for the models in a project that you want\n to include. Defaults to ``fqn:*``.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the dbt assets.\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. 
to asset keys and asset metadata.\n backfill_policy (Optional[BackfillPolicy]): If a partitions_def is defined, this determines\n how to execute backfills that target multiple partitions.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the assets.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n\n Examples:\n Running ``dbt build`` for a dbt project:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n Running dbt commands with flags:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build", "--full-refresh"], context=context).stream()\n\n Running dbt commands with ``--vars``:\n\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_vars = {"key": "value"}\n\n yield from dbt.cli(["build", "--vars", json.dumps(dbt_vars)], context=context).stream()\n\n Retrieving dbt artifacts after running a dbt command:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_build_invocation = dbt.cli(["build"], context=context)\n\n yield from dbt_build_invocation.stream()\n\n run_results_json = dbt_build_invocation.get_artifact("run_results.json")\n\n Running multiple dbt commands for a dbt project:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n yield from dbt.cli(["test"], context=context).stream()\n\n Customizing the Dagster asset metadata inferred from a dbt project using :py:class:`~dagster_dbt.DagsterDbtTranslator`:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n ...\n\n\n @dbt_assets(\n manifest=Path("target", "manifest.json"),\n dagster_dbt_translator=CustomDagsterDbtTranslator(),\n )\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n Invoking another Dagster :py:class:`~dagster.ResourceDefinition` alongside dbt:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n from dagster_slack import SlackResource\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, slack: SlackResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n slack_client = slack.get_client()\n slack_client.chat_postMessage(channel="#my-channel", text="dbt build succeeded!")\n\n Defining and accessing Dagster :py:class:`~dagster.Config` alongside dbt:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext, Config\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\n class MyDbtConfig(Config):\n full_refresh: bool\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, config: MyDbtConfig):\n dbt_build_args = ["build"]\n if config.full_refresh:\n dbt_build_args += ["--full-refresh"]\n\n yield from dbt.cli(dbt_build_args, context=context).stream()\n\n Defining Dagster :py:class:`~dagster.PartitionDefinition` alongside dbt:\n\n\n .. 
code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster import AssetExecutionContext, DailyPartitionDefinition\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(\n manifest=Path("target", "manifest.json"),\n partitions_def=DailyPartitionsDefinition(start_date="2023-01-01")\n )\n def partitionshop_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n time_window = context.asset_partitions_time_window_for_output(\n list(context.selected_output_names)[0]\n )\n\n dbt_vars = {\n "min_date": time_window.start.isoformat(),\n "max_date": time_window.end.isoformat()\n }\n dbt_build_args = ["build", "--vars", json.dumps(dbt_vars)]\n\n yield from dbt.cli(dbt_build_args, context=context).stream()\n\n """\n check.inst_param(\n dagster_dbt_translator,\n "dagster_dbt_translator",\n DagsterDbtTranslator,\n additional_message=(\n "Ensure that the argument is an instantiated class that subclasses"\n " DagsterDbtTranslator."\n ),\n )\n manifest = validate_manifest(manifest)\n\n unique_ids = select_unique_ids_from_manifest(\n select=select, exclude=exclude or "", manifest_json=manifest\n )\n node_info_by_dbt_unique_id = get_dbt_resource_props_by_dbt_unique_id_from_manifest(manifest)\n deps = get_deps(\n dbt_nodes=node_info_by_dbt_unique_id,\n selected_unique_ids=unique_ids,\n asset_resource_types=ASSET_RESOURCE_TYPES,\n )\n (\n non_argument_deps,\n outs,\n internal_asset_deps,\n check_specs,\n ) = get_dbt_multi_asset_args(\n dbt_nodes=node_info_by_dbt_unique_id,\n deps=deps,\n io_manager_key=io_manager_key,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n )\n\n if op_tags and "dagster-dbt/select" in op_tags:\n raise DagsterInvalidDefinitionError(\n "To specify a dbt selection, use the 'select' argument, not 'dagster-dbt/select'"\n " with op_tags"\n )\n\n if op_tags and "dagster-dbt/exclude" in op_tags:\n raise DagsterInvalidDefinitionError(\n "To specify a dbt exclusion, use the 'exclude' argument, not 
'dagster-dbt/exclude'"\n " with op_tags"\n )\n\n resolved_op_tags = {\n **({"dagster-dbt/select": select} if select else {}),\n **({"dagster-dbt/exclude": exclude} if exclude else {}),\n **(op_tags if op_tags else {}),\n }\n\n def inner(fn) -> AssetsDefinition:\n asset_definition = multi_asset(\n outs=outs,\n internal_asset_deps=internal_asset_deps,\n deps=non_argument_deps,\n compute_kind="dbt",\n partitions_def=partitions_def,\n can_subset=True,\n op_tags=resolved_op_tags,\n check_specs=check_specs,\n backfill_policy=backfill_policy,\n )(fn)\n\n return asset_definition\n\n return inner
\n\n\ndef get_dbt_multi_asset_args(\n dbt_nodes: Mapping[str, Any],\n deps: Mapping[str, FrozenSet[str]],\n io_manager_key: Optional[str],\n manifest: Mapping[str, Any],\n dagster_dbt_translator: DagsterDbtTranslator,\n) -> Tuple[\n Sequence[AssetKey],\n Dict[str, AssetOut],\n Dict[str, Set[AssetKey]],\n Sequence[AssetCheckSpec],\n]:\n non_argument_deps: Set[AssetKey] = set()\n outs: Dict[str, AssetOut] = {}\n internal_asset_deps: Dict[str, Set[AssetKey]] = {}\n check_specs: Sequence[AssetCheckSpec] = []\n\n for unique_id, parent_unique_ids in deps.items():\n dbt_resource_props = dbt_nodes[unique_id]\n\n output_name = output_name_fn(dbt_resource_props)\n asset_key = dagster_dbt_translator.get_asset_key(dbt_resource_props)\n\n outs[output_name] = AssetOut(\n key=asset_key,\n dagster_type=Nothing,\n io_manager_key=io_manager_key,\n description=dagster_dbt_translator.get_description(dbt_resource_props),\n is_required=False,\n metadata={ # type: ignore\n **dagster_dbt_translator.get_metadata(dbt_resource_props),\n MANIFEST_METADATA_KEY: DbtManifestWrapper(manifest=manifest),\n DAGSTER_DBT_TRANSLATOR_METADATA_KEY: dagster_dbt_translator,\n },\n group_name=dagster_dbt_translator.get_group_name(dbt_resource_props),\n code_version=default_code_version_fn(dbt_resource_props),\n freshness_policy=dagster_dbt_translator.get_freshness_policy(dbt_resource_props),\n auto_materialize_policy=dagster_dbt_translator.get_auto_materialize_policy(\n dbt_resource_props\n ),\n )\n\n test_unique_ids = [\n child_unique_id\n for child_unique_id in manifest["child_map"][unique_id]\n if child_unique_id.startswith("test")\n ]\n for test_unique_id in test_unique_ids:\n test_resource_props = manifest["nodes"][test_unique_id]\n check_spec = default_asset_check_fn(\n asset_key, unique_id, dagster_dbt_translator.settings, test_resource_props\n )\n\n if check_spec:\n check_specs.append(check_spec)\n\n # Translate parent unique ids to internal asset deps and non argument dep\n output_internal_deps = 
internal_asset_deps.setdefault(output_name, set())\n for parent_unique_id in parent_unique_ids:\n parent_resource_props = dbt_nodes[parent_unique_id]\n parent_asset_key = dagster_dbt_translator.get_asset_key(parent_resource_props)\n\n # Add this parent as an internal dependency\n output_internal_deps.add(parent_asset_key)\n\n # Mark this parent as an input if it has no dependencies\n if parent_unique_id not in deps:\n non_argument_deps.add(parent_asset_key)\n\n return list(non_argument_deps), outs, internal_asset_deps, check_specs\n
", "current_page_name": "_modules/dagster_dbt/asset_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_decorator"}, "asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_defs

\nimport hashlib\nimport json\nimport os\nfrom pathlib import Path\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dateutil\nfrom dagster import (\n    AssetCheckResult,\n    AssetKey,\n    AssetsDefinition,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    In,\n    OpExecutionContext,\n    Out,\n    PartitionsDefinition,\n    PermissiveConfig,\n    _check as check,\n    get_dagster_logger,\n    op,\n)\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    CoercibleToAssetKeyPrefix,\n    Output,\n)\nfrom dagster._core.definitions.metadata import MetadataUserInput, RawMetadataValue\nfrom dagster._core.errors import DagsterInvalidSubsetError\nfrom dagster._utils.merger import deep_merge_dicts\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    normalize_renamed_param,\n)\n\nfrom dagster_dbt.asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    default_metadata_from_dbt_resource_props,\n    get_asset_deps,\n    get_deps,\n)\nfrom dagster_dbt.core.resources import DbtCliClient\nfrom dagster_dbt.core.resources_v2 import DbtCliResource\nfrom dagster_dbt.core.types import DbtCliOutput\nfrom dagster_dbt.core.utils import build_command_args_from_flags, execute_cli\nfrom dagster_dbt.dagster_dbt_translator import DagsterDbtTranslator\nfrom dagster_dbt.errors import DagsterDbtError\nfrom dagster_dbt.types import DbtOutput\nfrom dagster_dbt.utils import (\n    ASSET_RESOURCE_TYPES,\n    output_name_fn,\n    result_to_events,\n    select_unique_ids_from_manifest,\n)\n\n\ndef _load_manifest_for_project(\n    project_dir: str,\n    profiles_dir: str,\n    
target_dir: str,\n    select: str,\n    exclude: str,\n) -> Tuple[Mapping[str, Any], DbtCliOutput]:\n    # running "dbt ls" regenerates the manifest.json, which includes a superset of the actual\n    # "dbt ls" output\n    cli_output = execute_cli(\n        executable="dbt",\n        command="ls",\n        log=get_dagster_logger(),\n        flags_dict={\n            "project-dir": project_dir,\n            "profiles-dir": profiles_dir,\n            "select": select,\n            "exclude": exclude,\n            "output": "json",\n        },\n        warn_error=False,\n        ignore_handled_error=False,\n        target_path=target_dir,\n        json_log_format=True,\n        capture_logs=True,\n    )\n    manifest_path = os.path.join(target_dir, "manifest.json")\n    with open(manifest_path, "r", encoding="utf8") as f:\n        return json.load(f), cli_output\n\n\ndef _can_stream_events(dbt_resource: Union[DbtCliClient, DbtCliResource]) -> bool:\n    """Check if the installed dbt version supports streaming events."""\n    import dbt.version\n    from packaging import version\n\n    if version.parse(dbt.version.__version__) >= version.parse("1.4.0"):\n        # The json log format is required for streaming events. 
DbtCliResource always uses this format, but\n        # DbtCliClient has an option to disable it.\n        if isinstance(dbt_resource, DbtCliResource):\n            return True\n        else:\n            return dbt_resource._json_log_format  # noqa: SLF001\n    else:\n        return False\n\n\ndef _batch_event_iterator(\n    context: OpExecutionContext,\n    dbt_resource: DbtCliClient,\n    use_build_command: bool,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    kwargs: Dict[str, Any],\n) -> Iterator[Union[AssetObservation, AssetMaterialization, Output]]:\n    """Yields events for a dbt cli invocation. Waits until the entire command has completed before\n    emitting outputs.\n    """\n    # clean up any run results from the last run\n    dbt_resource.remove_run_results_json()\n\n    dbt_output: Optional[DbtOutput] = None\n    try:\n        if use_build_command:\n            dbt_output = dbt_resource.build(**kwargs)\n        else:\n            dbt_output = dbt_resource.run(**kwargs)\n    finally:\n        # in the case that the project only partially runs successfully, still attempt to generate\n        # events for the parts that were successful\n        if dbt_output is None:\n            dbt_output = DbtOutput(result=check.not_none(dbt_resource.get_run_results_json()))\n\n        manifest_json = check.not_none(dbt_resource.get_manifest_json())\n\n        dbt_output = check.not_none(dbt_output)\n        for result in dbt_output.result["results"]:\n            extra_metadata: Optional[Mapping[str, RawMetadataValue]] = None\n            if runtime_metadata_fn:\n                node_info = manifest_json["nodes"][result["unique_id"]]\n                extra_metadata = runtime_metadata_fn(context, node_info)\n            yield from result_to_events(\n                result=result,\n                
docs_url=dbt_output.docs_url,\n                node_info_to_asset_key=node_info_to_asset_key,\n                manifest_json=manifest_json,\n                extra_metadata=extra_metadata,\n                generate_asset_outputs=True,\n            )\n\n\ndef _events_for_structured_json_line(\n    json_line: Mapping[str, Any],\n    context: OpExecutionContext,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    manifest_json: Mapping[str, Any],\n) -> Iterator[Union[AssetObservation, Output]]:\n    """Parses a json line into a Dagster event. Attempts to replicate the behavior of result_to_events\n    as closely as possible.\n    """\n    runtime_node_info = json_line.get("data", {}).get("node_info", {})\n    if not runtime_node_info:\n        return\n\n    node_resource_type = runtime_node_info.get("resource_type")\n    node_status = runtime_node_info.get("node_status")\n    unique_id = runtime_node_info.get("unique_id")\n\n    if not node_resource_type or not unique_id:\n        return\n\n    compiled_node_info = manifest_json["nodes"][unique_id]\n\n    if node_resource_type in ASSET_RESOURCE_TYPES and node_status == "success":\n        metadata = dict(\n            runtime_metadata_fn(context, compiled_node_info) if runtime_metadata_fn else {}\n        )\n        started_at_str = runtime_node_info.get("node_started_at")\n        finished_at_str = runtime_node_info.get("node_finished_at")\n        if started_at_str is None or finished_at_str is None:\n            return\n\n        started_at = dateutil.parser.isoparse(started_at_str)  # type: ignore\n        completed_at = dateutil.parser.isoparse(finished_at_str)  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                "Execution Started At": started_at.isoformat(timespec="seconds"),\n               
 "Execution Completed At": completed_at.isoformat(timespec="seconds"),\n                "Execution Duration": duration.total_seconds(),\n            }\n        )\n        yield Output(\n            value=None,\n            output_name=output_name_fn(compiled_node_info),\n            metadata=metadata,\n        )\n    elif node_resource_type == "test" and runtime_node_info.get("node_finished_at"):\n        upstream_unique_ids = (\n            manifest_json["nodes"][unique_id].get("depends_on", {}).get("nodes", [])\n        )\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            # the upstream id can reference a node or a source\n            upstream_node_info = manifest_json["nodes"].get(upstream_id) or manifest_json[\n                "sources"\n            ].get(upstream_id)\n            if upstream_node_info is None:\n                continue\n            upstream_asset_key = node_info_to_asset_key(upstream_node_info)\n            yield AssetObservation(\n                asset_key=upstream_asset_key,\n                metadata={\n                    "Test ID": unique_id,\n                    "Test Status": node_status,\n                },\n            )\n\n\ndef _stream_event_iterator(\n    context: OpExecutionContext,\n    dbt_resource: Union[DbtCliResource, DbtCliClient],\n    use_build_command: bool,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    kwargs: Dict[str, Any],\n    manifest_json: Mapping[str, Any],\n) -> Iterator[Union[AssetObservation, Output, AssetCheckResult]]:\n    """Yields events for a dbt cli invocation. 
Emits outputs as soon as the relevant dbt logs are\n    emitted.\n    """\n    if isinstance(dbt_resource, DbtCliClient):\n        for parsed_json_line in dbt_resource.cli_stream_json(\n            command="build" if use_build_command else "run",\n            **kwargs,\n        ):\n            yield from _events_for_structured_json_line(\n                parsed_json_line,\n                context,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                manifest_json,\n            )\n    else:\n        if runtime_metadata_fn is not None:\n            raise DagsterDbtError(\n                "The runtime_metadata_fn argument on the load_assets_from_dbt_manifest and"\n                " load_assets_from_dbt_project functions is not supported when using the"\n                " DbtCliResource resource. Use the @dbt_assets decorator instead if you want"\n                " control over what metadata is yielded at runtime."\n            )\n\n        class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n            @classmethod\n            def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n                return node_info_to_asset_key(dbt_resource_props)\n\n        cli_output = dbt_resource.cli(\n            args=["build" if use_build_command else "run", *build_command_args_from_flags(kwargs)],\n            manifest=manifest_json,\n            dagster_dbt_translator=CustomDagsterDbtTranslator(),\n        )\n        yield from cli_output.stream()\n\n\nclass DbtOpConfig(PermissiveConfig):\n    """Keyword arguments to pass to the underlying dbt command. Additional arguments not listed in the schema will\n    be passed through as well, e.g. 
{'bool_flag': True, 'string_flag': 'hi'} will result in the flags\n    '--bool_flag --string_flag hi' being passed to the dbt command.\n    """\n\n    select: Optional[str] = None\n    exclude: Optional[str] = None\n    vars: Optional[Dict[str, Any]] = None\n    full_refresh: Optional[bool] = None\n\n\ndef _get_dbt_op(\n    op_name: str,\n    ins: Mapping[str, In],\n    outs: Mapping[str, Out],\n    select: str,\n    exclude: str,\n    use_build_command: bool,\n    fqns_by_output_name: Mapping[str, List[str]],\n    dbt_resource_key: str,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    manifest_json: Mapping[str, Any],\n):\n    @op(\n        name=op_name,\n        tags={"kind": "dbt"},\n        ins=ins,\n        out=outs,\n        required_resource_keys={dbt_resource_key},\n    )\n    def _dbt_op(context, config: DbtOpConfig):\n        dbt_resource: Union[DbtCliResource, DbtCliClient] = getattr(\n            context.resources, dbt_resource_key\n        )\n        check.inst(\n            dbt_resource,\n            (DbtCliResource, DbtCliClient),\n            "Resource with key 'dbt_resource_key' must be a DbtCliResource or DbtCliClient"\n            f" object, but is a {type(dbt_resource)}",\n        )\n\n        kwargs: Dict[str, Any] = {}\n        # in the case that we're running everything, opt for the cleaner selection string\n        if len(context.selected_output_names) == len(outs):\n            kwargs["select"] = select\n            kwargs["exclude"] = exclude\n        else:\n            # for each output that we want to emit, translate to a dbt select string by converting\n            # the out to its corresponding fqn\n            kwargs["select"] = [\n                ".".join(fqns_by_output_name[output_name])\n           
     for output_name in context.selected_output_names\n            ]\n        # variables to pass into the command\n        if partition_key_to_vars_fn:\n            kwargs["vars"] = partition_key_to_vars_fn(context.partition_key)\n        # merge in any additional kwargs from the config\n        kwargs = deep_merge_dicts(kwargs, context.op_config)\n\n        if _can_stream_events(dbt_resource):\n            yield from _stream_event_iterator(\n                context,\n                dbt_resource,\n                use_build_command,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                kwargs,\n                manifest_json=manifest_json,\n            )\n        else:\n            if not isinstance(dbt_resource, DbtCliClient):\n                check.failed(\n                    "Chose batch event iterator, but it only works with DbtCliClient, and"\n                    f" resource has type {type(dbt_resource)}"\n                )\n            yield from _batch_event_iterator(\n                context,\n                dbt_resource,\n                use_build_command,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                kwargs,\n            )\n\n    return _dbt_op\n\n\ndef _dbt_nodes_to_assets(\n    dbt_nodes: Mapping[str, Any],\n    select: str,\n    exclude: str,\n    selected_unique_ids: AbstractSet[str],\n    project_id: str,\n    dbt_resource_key: str,\n    manifest_json: Mapping[str, Any],\n    op_name: Optional[str],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    io_manager_key: Optional[str],\n    use_build_command: bool,\n    partitions_def: Optional[PartitionsDefinition],\n    partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n    dagster_dbt_translator: DagsterDbtTranslator,\n) -> AssetsDefinition:\n    if use_build_command:\n        deps = get_deps(\n        
    dbt_nodes,\n            selected_unique_ids,\n            asset_resource_types=["model", "seed", "snapshot"],\n        )\n    else:\n        deps = get_deps(dbt_nodes, selected_unique_ids, asset_resource_types=["model"])\n\n    (\n        asset_deps,\n        asset_ins,\n        asset_outs,\n        group_names_by_key,\n        freshness_policies_by_key,\n        auto_materialize_policies_by_key,\n        check_specs_by_output_name,\n        fqns_by_output_name,\n        _,\n    ) = get_asset_deps(\n        dbt_nodes=dbt_nodes,\n        deps=deps,\n        io_manager_key=io_manager_key,\n        manifest=manifest_json,\n        dagster_dbt_translator=dagster_dbt_translator,\n    )\n\n    # prevent op name collisions between multiple dbt multi-assets\n    if not op_name:\n        op_name = f"run_dbt_{project_id}"\n        if select != "fqn:*" or exclude:\n            op_name += "_" + hashlib.md5(select.encode() + exclude.encode()).hexdigest()[-5:]\n\n    check_outs_by_output_name: Mapping[str, Out] = {}\n    if check_specs_by_output_name:\n        check_outs_by_output_name = {\n            output_name: Out(dagster_type=None, is_required=False)\n            for output_name in check_specs_by_output_name.keys()\n        }\n\n    dbt_op = _get_dbt_op(\n        op_name=op_name,\n        ins=dict(asset_ins.values()),\n        outs={\n            **dict(asset_outs.values()),\n            **check_outs_by_output_name,\n        },\n        select=select,\n        exclude=exclude,\n        use_build_command=use_build_command,\n        fqns_by_output_name=fqns_by_output_name,\n        dbt_resource_key=dbt_resource_key,\n        node_info_to_asset_key=dagster_dbt_translator.get_asset_key,\n        partition_key_to_vars_fn=partition_key_to_vars_fn,\n        runtime_metadata_fn=runtime_metadata_fn,\n        manifest_json=manifest_json,\n    )\n\n    return AssetsDefinition(\n        keys_by_input_name={\n            input_name: asset_key for asset_key, (input_name, _) in 
asset_ins.items()\n        },\n        keys_by_output_name={\n            output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n        },\n        node_def=dbt_op,\n        can_subset=True,\n        asset_deps=asset_deps,\n        group_names_by_key=group_names_by_key,\n        freshness_policies_by_key=freshness_policies_by_key,\n        auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n        check_specs_by_output_name=check_specs_by_output_name,\n        partitions_def=partitions_def,\n    )\n\n\n
[docs]def load_assets_from_dbt_project(\n project_dir: str,\n profiles_dir: Optional[str] = None,\n *,\n select: Optional[str] = None,\n exclude: Optional[str] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n io_manager_key: Optional[str] = None,\n target_dir: Optional[str] = None,\n # All arguments below are deprecated\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n op_name: Optional[str] = None,\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n use_build_command: bool = True,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ] = default_metadata_from_dbt_resource_props,\n display_raw_sql: Optional[bool] = None,\n dbt_resource_key: str = "dbt",\n) -> Sequence[AssetsDefinition]:\n """Loads a set of dbt models from a dbt project into Dagster assets.\n\n Creates one Dagster asset for each dbt model. 
All assets will be re-materialized using a single\n `dbt run` or `dbt build` command.\n\n When searching for more flexibility in defining the computations that materialize your\n dbt assets, we recommend that you use :py:class:`~dagster_dbt.dbt_assets`.\n\n Args:\n project_dir (Optional[str]): The directory containing the dbt project to load.\n profiles_dir (Optional[str]): The profiles directory to use for loading the DBT project.\n Defaults to a directory called "config" inside the project_dir.\n target_dir (Optional[str]): The target directory where dbt will place compiled artifacts.\n Defaults to "target" underneath the project_dir.\n select (Optional[str]): A dbt selection string for the models in a project that you want\n to include. Defaults to `"fqn:*"`.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. to asset keys and asset metadata.\n key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all assets loaded\n from the dbt project. Does not apply to input assets. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=...) instead.\n source_key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all input\n assets for the set of assets loaded from the dbt project. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=...) instead.\n op_name (Optional[str]): [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\n Deprecated: use the `@dbt_assets` decorator if you need to customize the op name.\n dbt_resource_key (Optional[str]): [Deprecated] The resource key that the dbt resource will be specified at.\n Defaults to "dbt". 
Deprecated: use the `@dbt_assets` decorator if you need to customize\n the resource key.\n runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]): [Deprecated]\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n Deprecated: use the @dbt_assets decorator if you need to customize runtime metadata.\n manifest_json (Optional[Mapping[str, Any]]): [Deprecated] Use the manifest argument instead.\n selected_unique_ids (Optional[Set[str]]): [Deprecated] The set of dbt unique_ids that you want to load\n as assets. Deprecated: use the select argument instead.\n node_info_to_asset_key (Mapping[str, Any] -> AssetKey): [Deprecated] A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model. Deprecated: instead,\n provide a custom DagsterDbtTranslator that overrides node_info_to_asset_key.\n use_build_command (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset. Defaults to True. If set to False, then `dbt run` will be used, and\n seeds and snapshots won't be loaded as assets.\n partitions_def (Optional[PartitionsDefinition]): [Deprecated] Defines the set of partition keys that\n compose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\n dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): [Deprecated] A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. {"run_date": "2022-01-01"}). 
Deprecated: use the @dbt_assets decorator\n to define partitioned dbt assets.\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): [Deprecated] A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n Deprecated: instead, configure dagster groups on a dbt resource's meta field or assign\n dbt groups.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): [Deprecated] A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`. Deprecated:\n instead, configure auto-materialize policies on a dbt resource's meta field.\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`. Deprecated: instead, configure auto-materialize\n policies on a dbt resource's meta field.\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. 
This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n Deprecated: instead, provide a custom DagsterDbtTranslator that overrides\n node_info_to_metadata.\n display_raw_sql (Optional[bool]): [Deprecated] A flag to indicate if the raw sql associated\n with each model should be included in the asset description. For large projects, setting\n this flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\n instead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.\n """\n project_dir = check.str_param(project_dir, "project_dir")\n profiles_dir = check.opt_str_param(\n profiles_dir, "profiles_dir", os.path.join(project_dir, "config")\n )\n target_dir = check.opt_str_param(target_dir, "target_dir", os.path.join(project_dir, "target"))\n select = check.opt_str_param(select, "select", "fqn:*")\n exclude = check.opt_str_param(exclude, "exclude", "")\n\n _raise_warnings_for_deprecated_args(\n "load_assets_from_dbt_manifest",\n selected_unique_ids=None,\n dbt_resource_key=dbt_resource_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )\n\n manifest, cli_output = _load_manifest_for_project(\n project_dir, profiles_dir, target_dir, select, exclude\n )\n selected_unique_ids: Set[str] = set(\n filter(None, (line.get("unique_id") for line in cli_output.logs))\n )\n return _load_assets_from_dbt_manifest(\n manifest=manifest,\n select=select,\n exclude=exclude,\n key_prefix=key_prefix,\n source_key_prefix=source_key_prefix,\n 
dagster_dbt_translator=dagster_dbt_translator,\n op_name=op_name,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n selected_unique_ids=selected_unique_ids,\n node_info_to_asset_key=node_info_to_asset_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n display_raw_sql=display_raw_sql,\n dbt_resource_key=dbt_resource_key,\n )
\n\n\n
[docs]@deprecated_param(\n param="manifest_json", breaking_version="0.21", additional_warn_text="Use manifest instead"\n)\n@deprecated_param(\n param="selected_unique_ids",\n breaking_version="0.21",\n additional_warn_text="Use the select parameter instead.",\n)\n@deprecated_param(\n param="dbt_resource_key",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize your resource key."\n ),\n)\n@deprecated_param(\n param="use_build_command",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize the underlying dbt commands."\n ),\n)\n@deprecated_param(\n param="partitions_def",\n breaking_version="0.21",\n additional_warn_text="Use the `@dbt_assets` decorator to define partitioned dbt assets.",\n)\n@deprecated_param(\n param="partition_key_to_vars_fn",\n breaking_version="0.21",\n additional_warn_text="Use the `@dbt_assets` decorator to define partitioned dbt assets.",\n)\n@deprecated_param(\n param="runtime_metadata_fn",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize runtime metadata."\n ),\n)\ndef load_assets_from_dbt_manifest(\n manifest: Optional[Union[Path, Mapping[str, Any]]] = None,\n *,\n select: Optional[str] = None,\n exclude: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n # All arguments below are deprecated\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n selected_unique_ids: Optional[AbstractSet[str]] = None,\n display_raw_sql: Optional[bool] = None,\n dbt_resource_key: str = "dbt",\n op_name: Optional[str] = None,\n manifest_json: Optional[Mapping[str, Any]] = None,\n use_build_command: bool = True,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, 
Any]]] = None,\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ] = default_metadata_from_dbt_resource_props,\n) -> Sequence[AssetsDefinition]:\n """Loads a set of dbt models, described in a manifest.json, into Dagster assets.\n\n Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\n `dbt run` command.\n\n When searching for more flexibility in defining the computations that materialize your\n dbt assets, we recommend that you use :py:class:`~dagster_dbt.dbt_assets`.\n\n Args:\n manifest (Optional[Mapping[str, Any]]): The contents of a DBT manifest.json, which contains\n a set of models to load into assets.\n select (Optional[str]): A dbt selection string for the models in a project that you want\n to include. Defaults to `"fqn:*"`.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. 
to asset keys and asset metadata.\n key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all assets loaded\n from the dbt project. Does not apply to input assets. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=...) instead.\n source_key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all input\n assets for the set of assets loaded from the dbt project. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=...) instead.\n op_name (Optional[str]): [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\n Deprecated: use the `@dbt_assets` decorator if you need to customize the op name.\n dbt_resource_key (Optional[str]): [Deprecated] The resource key that the dbt resource will be specified at.\n Defaults to "dbt". Deprecated: use the `@dbt_assets` decorator if you need to customize\n the resource key.\n runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]): [Deprecated]\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n Deprecated: use the @dbt_assets decorator if you need to customize runtime metadata.\n selected_unique_ids (Optional[Set[str]]): [Deprecated] The set of dbt unique_ids that you want to load\n as assets. Deprecated: use the select argument instead.\n node_info_to_asset_key (Mapping[str, Any] -> AssetKey): [Deprecated] A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model.\n use_build_command (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset. Defaults to True. 
If set to False, then `dbt run` will be used, and\n seeds and snapshots won't be loaded as assets.\n partitions_def (Optional[PartitionsDefinition]): [Deprecated] Defines the set of partition keys that\n compose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\n dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): [Deprecated] A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. {"run_date": "2022-01-01"}). Deprecated: use the @dbt_assets decorator\n to define partitioned dbt assets.\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): [Deprecated] A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n Deprecated: instead, configure dagster groups on a dbt resource's meta field or assign\n dbt groups.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): [Deprecated] A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`. Deprecated:\n instead, configure auto-materialize policies on a dbt resource's meta field.\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`. 
Deprecated: instead, configure auto-materialize\n policies on a dbt resource's meta field.\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n Deprecated: instead, provide a custom DagsterDbtTranslator that overrides\n node_info_to_metadata.\n display_raw_sql (Optional[bool]): [Deprecated] A flag to indicate if the raw sql associated\n with each model should be included in the asset description. For large projects, setting\n this flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\n instead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.\n """\n manifest = normalize_renamed_param(\n manifest,\n "manifest",\n manifest_json,\n "manifest_json",\n )\n manifest = cast(\n Union[Mapping[str, Any], Path], check.inst_param(manifest, "manifest", (Path, dict))\n )\n if isinstance(manifest, Path):\n manifest = cast(Mapping[str, Any], json.loads(manifest.read_bytes()))\n\n _raise_warnings_for_deprecated_args(\n "load_assets_from_dbt_manifest",\n selected_unique_ids=selected_unique_ids,\n dbt_resource_key=dbt_resource_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )\n\n return _load_assets_from_dbt_manifest(\n manifest=manifest,\n select=select,\n 
exclude=exclude,\n io_manager_key=io_manager_key,\n dagster_dbt_translator=dagster_dbt_translator,\n key_prefix=key_prefix,\n source_key_prefix=source_key_prefix,\n selected_unique_ids=selected_unique_ids,\n display_raw_sql=display_raw_sql,\n dbt_resource_key=dbt_resource_key,\n op_name=op_name,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )
\n\n\ndef _load_assets_from_dbt_manifest(\n manifest: Mapping[str, Any],\n select: Optional[str],\n exclude: Optional[str],\n io_manager_key: Optional[str],\n dagster_dbt_translator: Optional[DagsterDbtTranslator],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix],\n selected_unique_ids: Optional[AbstractSet[str]],\n display_raw_sql: Optional[bool],\n dbt_resource_key: str,\n op_name: Optional[str],\n use_build_command: bool,\n partitions_def: Optional[PartitionsDefinition],\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ],\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ],\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ],\n) -> Sequence[AssetsDefinition]:\n if partition_key_to_vars_fn:\n check.invariant(\n partitions_def is not None,\n "Cannot supply a `partition_key_to_vars_fn` without a `partitions_def`.",\n )\n\n dbt_resource_key = check.str_param(dbt_resource_key, "dbt_resource_key")\n\n dbt_nodes = {\n **manifest["nodes"],\n **manifest["sources"],\n **manifest["metrics"],\n **manifest["exposures"],\n }\n\n if selected_unique_ids:\n select = (\n " ".join(".".join(dbt_nodes[uid]["fqn"]) for uid in selected_unique_ids)\n if select is None\n else select\n )\n exclude = "" if exclude is None else exclude\n else:\n select = select if select is not None else "fqn:*"\n exclude = exclude if exclude is not None else ""\n\n selected_unique_ids = select_unique_ids_from_manifest(\n select=select, exclude=exclude, manifest_json=manifest\n 
)\n if len(selected_unique_ids) == 0:\n raise DagsterInvalidSubsetError(f"No dbt models match the selection string '{select}'.")\n\n if dagster_dbt_translator is not None:\n check.invariant(\n node_info_to_asset_key == default_asset_key_fn,\n "Can't specify both dagster_dbt_translator and node_info_to_asset_key",\n )\n check.invariant(\n key_prefix is None,\n "Can't specify both dagster_dbt_translator and key_prefix",\n )\n check.invariant(\n source_key_prefix is None,\n "Can't specify both dagster_dbt_translator and source_key_prefix",\n )\n check.invariant(\n node_info_to_group_fn == default_group_from_dbt_resource_props,\n "Can't specify both dagster_dbt_translator and node_info_to_group_fn",\n )\n check.invariant(\n display_raw_sql is None,\n "Can't specify both dagster_dbt_translator and display_raw_sql",\n )\n check.invariant(\n node_info_to_definition_metadata_fn is default_metadata_from_dbt_resource_props,\n "Can't specify both dagster_dbt_translator and node_info_to_definition_metadata_fn",\n )\n else:\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props):\n base_key = node_info_to_asset_key(dbt_resource_props)\n if dbt_resource_props["resource_type"] == "source":\n return base_key.with_prefix(source_key_prefix or [])\n else:\n return base_key.with_prefix(key_prefix or [])\n\n @classmethod\n def get_metadata(cls, dbt_resource_props):\n return node_info_to_definition_metadata_fn(dbt_resource_props)\n\n @classmethod\n def get_description(cls, dbt_resource_props):\n return default_description_fn(\n dbt_resource_props,\n display_raw_sql=display_raw_sql if display_raw_sql is not None else True,\n )\n\n @classmethod\n def get_group_name(cls, dbt_resource_props):\n return node_info_to_group_fn(dbt_resource_props)\n\n @classmethod\n def get_freshness_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[FreshnessPolicy]:\n return 
node_info_to_freshness_policy_fn(dbt_resource_props)\n\n @classmethod\n def get_auto_materialize_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[AutoMaterializePolicy]:\n return node_info_to_auto_materialize_policy_fn(dbt_resource_props)\n\n dagster_dbt_translator = CustomDagsterDbtTranslator()\n\n dbt_assets_def = _dbt_nodes_to_assets(\n dbt_nodes,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n select=select,\n exclude=exclude,\n selected_unique_ids=selected_unique_ids,\n dbt_resource_key=dbt_resource_key,\n op_name=op_name,\n project_id=manifest["metadata"]["project_id"][:5],\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n dagster_dbt_translator=dagster_dbt_translator,\n manifest_json=manifest,\n )\n\n return [dbt_assets_def]\n\n\ndef _raise_warnings_for_deprecated_args(\n public_fn_name: str,\n selected_unique_ids: Optional[AbstractSet[str]],\n dbt_resource_key: Optional[str],\n use_build_command: Optional[bool],\n partitions_def: Optional[PartitionsDefinition],\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ],\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ],\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ],\n):\n if node_info_to_asset_key != default_asset_key_fn:\n deprecation_warning(\n f"The node_info_to_asset_key_fn arg of {public_fn_name}",\n "0.21",\n "Instead, provide a custom DagsterDbtTranslator that overrides get_asset_key.",\n stacklevel=4,\n 
)\n\n if node_info_to_group_fn != default_group_from_dbt_resource_props:\n deprecation_warning(\n f"The node_info_to_group_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure dagster groups on a dbt resource's meta field or assign dbt"\n " groups or provide a custom DagsterDbtTranslator that overrides get_group_name.",\n stacklevel=4,\n )\n\n if node_info_to_auto_materialize_policy_fn != default_auto_materialize_policy_fn:\n deprecation_warning(\n f"The node_info_to_auto_materialize_policy_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure Dagster auto-materialize policies on a dbt resource's meta field.",\n stacklevel=4,\n )\n\n if node_info_to_freshness_policy_fn != default_freshness_policy_fn:\n deprecation_warning(\n f"The node_info_to_freshness_policy_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure Dagster freshness policies on a dbt resource's meta field.",\n stacklevel=4,\n )\n\n if node_info_to_definition_metadata_fn != default_metadata_from_dbt_resource_props:\n deprecation_warning(\n f"The node_info_to_definition_metadata_fn arg of {public_fn_name}",\n "0.21",\n "Instead, provide a custom DagsterDbtTranslator that overrides get_metadata.",\n stacklevel=4,\n )\n
", "current_page_name": "_modules/dagster_dbt/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_defs"}, "asset_utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_utils

\nimport hashlib\nimport textwrap\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    cast,\n)\n\nfrom dagster import (\n    AssetCheckSpec,\n    AssetKey,\n    AssetsDefinition,\n    AssetSelection,\n    AutoMaterializePolicy,\n    DagsterInvariantViolationError,\n    FreshnessPolicy,\n    In,\n    MetadataValue,\n    Nothing,\n    Out,\n    RunConfig,\n    ScheduleDefinition,\n    TableColumn,\n    TableSchema,\n    _check as check,\n    define_asset_job,\n)\nfrom dagster._core.definitions.decorators.asset_decorator import (\n    _validate_and_assign_output_names_to_check_specs,\n)\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import deprecation_warning\n\nfrom .utils import input_name_fn, output_name_fn\n\nif TYPE_CHECKING:\n    from .dagster_dbt_translator import (\n        DagsterDbtTranslator,\n        DagsterDbtTranslatorSettings,\n        DbtManifestWrapper,\n    )\n\nMANIFEST_METADATA_KEY = "dagster_dbt/manifest"\nDAGSTER_DBT_TRANSLATOR_METADATA_KEY = "dagster_dbt/dagster_dbt_translator"\n\n\n
[docs]def get_asset_key_for_model(dbt_assets: Sequence[AssetsDefinition], model_name: str) -> AssetKey:\n """Return the corresponding Dagster asset key for a dbt model.\n\n Args:\n dbt_assets (AssetsDefinition): An AssetsDefinition object produced by\n load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets.\n model_name (str): The name of the dbt model.\n\n Returns:\n AssetKey: The corresponding Dagster asset key.\n\n Examples:\n .. code-block:: python\n\n from dagster import asset\n from dagster_dbt import dbt_assets, get_asset_key_for_model\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n\n @asset(deps={get_asset_key_for_model([all_dbt_assets], "customers")})\n def cleaned_customers():\n ...\n """\n check.sequence_param(dbt_assets, "dbt_assets", of_type=AssetsDefinition)\n check.str_param(model_name, "model_name")\n\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n\n matching_models = [\n value\n for value in manifest["nodes"].values()\n if value["name"] == model_name and value["resource_type"] == "model"\n ]\n\n if len(matching_models) == 0:\n raise KeyError(f"Could not find a dbt model with name: {model_name}")\n\n return dagster_dbt_translator.get_asset_key(next(iter(matching_models)))
\n\n\n
[docs]def get_asset_keys_by_output_name_for_source(\n dbt_assets: Sequence[AssetsDefinition], source_name: str\n) -> Mapping[str, AssetKey]:\n """Returns the corresponding Dagster asset keys for all tables in a dbt source.\n\n This is a convenience method that makes it easy to define a multi-asset that generates\n all the tables for a given dbt source.\n\n Args:\n source_name (str): The name of the dbt source.\n\n Returns:\n Mapping[str, AssetKey]: A mapping of the table name to corresponding Dagster asset key\n for all tables in the given dbt source.\n\n Examples:\n .. code-block:: python\n\n from dagster import AssetOut, multi_asset\n from dagster_dbt import dbt_assets, get_asset_keys_by_output_name_for_source\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n @multi_asset(\n outs={\n name: AssetOut(key=asset_key)\n for name, asset_key in get_asset_keys_by_output_name_for_source(\n [all_dbt_assets], "raw_data"\n ).items()\n },\n )\n def upstream_python_asset():\n ...\n\n """\n check.sequence_param(dbt_assets, "dbt_assets", of_type=AssetsDefinition)\n check.str_param(source_name, "source_name")\n\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n\n matching_nodes = [\n value for value in manifest["sources"].values() if value["source_name"] == source_name\n ]\n\n if len(matching_nodes) == 0:\n raise KeyError(f"Could not find a dbt source with name: {source_name}")\n\n return {\n output_name_fn(value): dagster_dbt_translator.get_asset_key(value)\n for value in matching_nodes\n }
\n\n\n
[docs]def get_asset_key_for_source(dbt_assets: Sequence[AssetsDefinition], source_name: str) -> AssetKey:\n """Returns the corresponding Dagster asset key for a dbt source with a singular table.\n\n Args:\n source_name (str): The name of the dbt source.\n\n Raises:\n DagsterInvalidInvocationError: If the source has more than one table.\n\n Returns:\n AssetKey: The corresponding Dagster asset key.\n\n Examples:\n .. code-block:: python\n\n from dagster import asset\n from dagster_dbt import dbt_assets, get_asset_key_for_source\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n @asset(key=get_asset_key_for_source([all_dbt_assets], "my_source"))\n def upstream_python_asset():\n ...\n """\n asset_keys_by_output_name = get_asset_keys_by_output_name_for_source(dbt_assets, source_name)\n\n if len(asset_keys_by_output_name) > 1:\n raise KeyError(\n f"Source {source_name} has more than one table:"\n f" {asset_keys_by_output_name.values()}. Use"\n " `get_asset_keys_by_output_name_for_source` instead to get all tables for a"\n " source."\n )\n\n return next(iter(asset_keys_by_output_name.values()))
\n\n\n
[docs]def build_dbt_asset_selection(\n dbt_assets: Sequence[AssetsDefinition],\n dbt_select: str = "fqn:*",\n dbt_exclude: Optional[str] = None,\n) -> AssetSelection:\n """Build an asset selection for a dbt selection string.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\n more information.\n\n Args:\n dbt_select (str): A dbt selection string to specify a set of dbt resources.\n dbt_exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n\n Returns:\n AssetSelection: An asset selection for the selected dbt nodes.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import dbt_assets, build_dbt_asset_selection\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n # Select the dbt assets that have the tag "foo".\n foo_selection = build_dbt_asset_selection([dbt_assets], dbt_select="tag:foo")\n\n # Select the dbt assets that have the tag "foo" and all Dagster assets downstream\n # of them (dbt-related or otherwise)\n foo_and_downstream_selection = foo_selection.downstream()\n\n """\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n from .dbt_manifest_asset_selection import DbtManifestAssetSelection\n\n return DbtManifestAssetSelection(\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n select=dbt_select,\n exclude=dbt_exclude,\n )
\n\n\n
[docs]def build_schedule_from_dbt_selection(\n dbt_assets: Sequence[AssetsDefinition],\n job_name: str,\n cron_schedule: str,\n dbt_select: str = "fqn:*",\n dbt_exclude: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n config: Optional[RunConfig] = None,\n execution_timezone: Optional[str] = None,\n) -> ScheduleDefinition:\n """Build a schedule to materialize a specified set of dbt resources from a dbt selection string.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\n more information.\n\n Args:\n job_name (str): The name of the job to materialize the dbt resources.\n cron_schedule (str): The cron schedule to define the schedule.\n dbt_select (str): A dbt selection string to specify a set of dbt resources.\n dbt_exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n tags (Optional[Mapping[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n config (Optional[RunConfig]): The config that parameterizes the execution of this schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n\n Returns:\n ScheduleDefinition: A definition to materialize the selected dbt resources on a cron schedule.\n\n Examples:\n .. 
code-block:: python\n\n from dagster_dbt import dbt_assets, build_schedule_from_dbt_selection\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n daily_dbt_assets_schedule = build_schedule_from_dbt_selection(\n [all_dbt_assets],\n job_name="all_dbt_assets",\n cron_schedule="0 0 * * *",\n dbt_select="fqn:*",\n )\n """\n return ScheduleDefinition(\n cron_schedule=cron_schedule,\n job=define_asset_job(\n name=job_name,\n selection=build_dbt_asset_selection(\n dbt_assets,\n dbt_select=dbt_select,\n dbt_exclude=dbt_exclude,\n ),\n config=config,\n tags=tags,\n ),\n execution_timezone=execution_timezone,\n )
\n\n\ndef get_manifest_and_translator_from_dbt_assets(\n dbt_assets: Sequence[AssetsDefinition],\n) -> Tuple[Mapping[str, Any], "DagsterDbtTranslator"]:\n check.invariant(len(dbt_assets) == 1, "Exactly one dbt AssetsDefinition is required")\n dbt_assets_def = dbt_assets[0]\n metadata_by_key = dbt_assets_def.metadata_by_key or {}\n first_asset_key = next(iter(dbt_assets_def.metadata_by_key.keys()))\n first_metadata = metadata_by_key.get(first_asset_key, {})\n manifest_wrapper: Optional["DbtManifestWrapper"] = first_metadata.get(MANIFEST_METADATA_KEY)\n if manifest_wrapper is None:\n raise DagsterInvariantViolationError(\n f"Expected to find dbt manifest metadata on asset {first_asset_key.to_user_string()},"\n " but did not. Did you pass in assets that weren't generated by"\n " load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets?"\n )\n\n dagster_dbt_translator = first_metadata.get(DAGSTER_DBT_TRANSLATOR_METADATA_KEY)\n if dagster_dbt_translator is None:\n raise DagsterInvariantViolationError(\n f"Expected to find dbt translator metadata on asset {first_asset_key.to_user_string()},"\n " but did not. 
Did you pass in assets that weren't generated by"\n " load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets?"\n )\n\n return manifest_wrapper.manifest, dagster_dbt_translator\n\n\n###################\n# DEFAULT FUNCTIONS\n###################\n\n\ndef default_asset_key_fn(dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n """Get the asset key for a dbt node.\n\n By default, if the dbt node has a Dagster asset key configured in its metadata, then that is\n parsed and used.\n\n Otherwise:\n dbt sources: a dbt source's key is the union of its source name and its table name\n dbt models: a dbt model's key is the union of its model name and any schema configured on\n the model itself.\n """\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n asset_key_config = dagster_metadata.get("asset_key", [])\n if asset_key_config:\n return AssetKey(asset_key_config)\n\n if dbt_resource_props["resource_type"] == "source":\n components = [dbt_resource_props["source_name"], dbt_resource_props["name"]]\n else:\n configured_schema = dbt_resource_props["config"].get("schema")\n if configured_schema is not None:\n components = [configured_schema, dbt_resource_props["name"]]\n else:\n components = [dbt_resource_props["name"]]\n\n return AssetKey(components)\n\n\n
[docs]def default_metadata_from_dbt_resource_props(\n dbt_resource_props: Mapping[str, Any]\n) -> Mapping[str, Any]:\n metadata: Dict[str, Any] = {}\n columns = dbt_resource_props.get("columns", {})\n if len(columns) > 0:\n metadata["table_schema"] = MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(\n name=column_name,\n type=column_info.get("data_type") or "?",\n description=column_info.get("description"),\n )\n for column_name, column_info in columns.items()\n ]\n )\n )\n return metadata
\n\n\n
[docs]def default_group_from_dbt_resource_props(dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n """Get the group name for a dbt node.\n\n If a Dagster group is configured in the metadata for the node, use that.\n\n Otherwise, if a dbt group is configured for the node, use that.\n """\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n\n dagster_group = dagster_metadata.get("group")\n if dagster_group:\n return dagster_group\n\n dbt_group = dbt_resource_props.get("config", {}).get("group")\n if dbt_group:\n return dbt_group\n\n return None
\n\n\n
[docs]def group_from_dbt_resource_props_fallback_to_directory(\n dbt_resource_props: Mapping[str, Any]\n) -> Optional[str]:\n """Get the group name for a dbt node.\n\n Has the same behavior as the default_group_from_dbt_resource_props, except for that, if no group can be determined\n from config or metadata, falls back to using the subdirectory of the models directory that the\n source file is in.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import group_from_dbt_resource_props_fallback_to_directory\n\n dbt_assets = load_assets_from_dbt_manifest(\n manifest=manifest,\n node_info_to_group_fn=group_from_dbt_resource_props_fallback_to_directory,\n )\n """\n group_name = default_group_from_dbt_resource_props(dbt_resource_props)\n if group_name is not None:\n return group_name\n\n fqn = dbt_resource_props.get("fqn", [])\n # the first component is the package name, and the last component is the model name\n if len(fqn) < 3:\n return None\n return fqn[1]
\n\n\ndef default_freshness_policy_fn(dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n freshness_policy_config = dagster_metadata.get("freshness_policy", {})\n\n freshness_policy = _legacy_freshness_policy_fn(freshness_policy_config)\n if freshness_policy:\n return freshness_policy\n\n legacy_freshness_policy_config = dbt_resource_props["config"].get(\n "dagster_freshness_policy", {}\n )\n legacy_freshness_policy = _legacy_freshness_policy_fn(legacy_freshness_policy_config)\n\n if legacy_freshness_policy:\n deprecation_warning(\n "dagster_freshness_policy",\n "0.21.0",\n "Instead, configure a Dagster freshness policy on a dbt model using"\n " +meta.dagster.freshness_policy.",\n )\n\n return legacy_freshness_policy\n\n\ndef _legacy_freshness_policy_fn(\n freshness_policy_config: Mapping[str, Any]\n) -> Optional[FreshnessPolicy]:\n if freshness_policy_config:\n return FreshnessPolicy(\n maximum_lag_minutes=float(freshness_policy_config["maximum_lag_minutes"]),\n cron_schedule=freshness_policy_config.get("cron_schedule"),\n cron_schedule_timezone=freshness_policy_config.get("cron_schedule_timezone"),\n )\n return None\n\n\ndef default_auto_materialize_policy_fn(\n dbt_resource_props: Mapping[str, Any]\n) -> Optional[AutoMaterializePolicy]:\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n auto_materialize_policy_config = dagster_metadata.get("auto_materialize_policy", {})\n\n auto_materialize_policy = _auto_materialize_policy_fn(auto_materialize_policy_config)\n if auto_materialize_policy:\n return auto_materialize_policy\n\n legacy_auto_materialize_policy_config = dbt_resource_props["config"].get(\n "dagster_auto_materialize_policy", {}\n )\n legacy_auto_materialize_policy = _auto_materialize_policy_fn(\n legacy_auto_materialize_policy_config\n )\n\n if legacy_auto_materialize_policy:\n deprecation_warning(\n "dagster_auto_materialize_policy",\n 
"0.21.0",\n "Instead, configure a Dagster auto-materialize policy on a dbt model using"\n " +meta.dagster.auto_materialize_policy.",\n )\n\n return legacy_auto_materialize_policy\n\n\ndef _auto_materialize_policy_fn(\n auto_materialize_policy_config: Mapping[str, Any]\n) -> Optional[AutoMaterializePolicy]:\n if auto_materialize_policy_config.get("type") == "eager":\n return AutoMaterializePolicy.eager()\n elif auto_materialize_policy_config.get("type") == "lazy":\n return AutoMaterializePolicy.lazy()\n return None\n\n\ndef default_description_fn(dbt_resource_props: Mapping[str, Any], display_raw_sql: bool = True):\n code_block = textwrap.indent(\n dbt_resource_props.get("raw_sql") or dbt_resource_props.get("raw_code", ""), " "\n )\n description_sections = [\n dbt_resource_props["description"]\n or f"dbt {dbt_resource_props['resource_type']} {dbt_resource_props['name']}",\n ]\n if display_raw_sql:\n description_sections.append(f"#### Raw SQL:\\n```\\n{code_block}\\n```")\n return "\\n\\n".join(filter(None, description_sections))\n\n\ndef is_generic_test_on_attached_node_from_dbt_resource_props(\n unique_id: str, dbt_resource_props: Mapping[str, Any]\n) -> bool:\n attached_node_unique_id = dbt_resource_props.get("attached_node")\n is_generic_test = bool(attached_node_unique_id)\n\n return is_generic_test and attached_node_unique_id == unique_id\n\n\ndef default_asset_check_fn(\n asset_key: AssetKey,\n unique_id: str,\n dagster_dbt_translator_settings: "DagsterDbtTranslatorSettings",\n dbt_resource_props: Mapping[str, Any],\n) -> Optional[AssetCheckSpec]:\n is_generic_test_on_attached_node = is_generic_test_on_attached_node_from_dbt_resource_props(\n unique_id, dbt_resource_props\n )\n\n if not all(\n [\n dagster_dbt_translator_settings.enable_asset_checks,\n is_generic_test_on_attached_node,\n ]\n ):\n return None\n\n return AssetCheckSpec(\n name=dbt_resource_props["name"],\n asset=asset_key,\n description=dbt_resource_props["description"],\n )\n\n\ndef 
default_code_version_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n return hashlib.sha1(\n (dbt_resource_props.get("raw_sql") or dbt_resource_props.get("raw_code", "")).encode(\n "utf-8"\n )\n ).hexdigest()\n\n\n###################\n# DEPENDENCIES\n###################\n\n\ndef is_non_asset_node(dbt_resource_props: Mapping[str, Any]):\n # some nodes exist inside the dbt graph but are not assets\n resource_type = dbt_resource_props["resource_type"]\n if resource_type == "metric":\n return True\n if (\n resource_type == "model"\n and dbt_resource_props.get("config", {}).get("materialized") == "ephemeral"\n ):\n return True\n return False\n\n\ndef get_deps(\n dbt_nodes: Mapping[str, Any],\n selected_unique_ids: AbstractSet[str],\n asset_resource_types: List[str],\n) -> Mapping[str, FrozenSet[str]]:\n def _valid_parent_node(dbt_resource_props):\n # sources are valid parents, but not assets\n return dbt_resource_props["resource_type"] in asset_resource_types + ["source"]\n\n asset_deps: Dict[str, Set[str]] = {}\n for unique_id in selected_unique_ids:\n dbt_resource_props = dbt_nodes[unique_id]\n node_resource_type = dbt_resource_props["resource_type"]\n\n # skip non-assets, such as metrics, tests, and ephemeral models\n if is_non_asset_node(dbt_resource_props) or node_resource_type not in asset_resource_types:\n continue\n\n asset_deps[unique_id] = set()\n for parent_unique_id in dbt_resource_props.get("depends_on", {}).get("nodes", []):\n parent_node_info = dbt_nodes[parent_unique_id]\n # for metrics or ephemeral dbt models, BFS to find valid parents\n if is_non_asset_node(parent_node_info):\n visited = set()\n replaced_parent_ids = set()\n # make a copy to avoid mutating the actual dictionary\n queue = list(parent_node_info.get("depends_on", {}).get("nodes", []))\n while queue:\n candidate_parent_id = queue.pop()\n if candidate_parent_id in visited:\n continue\n visited.add(candidate_parent_id)\n\n candidate_parent_info = dbt_nodes[candidate_parent_id]\n if 
is_non_asset_node(candidate_parent_info):\n queue.extend(candidate_parent_info.get("depends_on", {}).get("nodes", []))\n elif _valid_parent_node(candidate_parent_info):\n replaced_parent_ids.add(candidate_parent_id)\n\n asset_deps[unique_id] |= replaced_parent_ids\n # ignore nodes which are not assets / sources\n elif _valid_parent_node(parent_node_info):\n asset_deps[unique_id].add(parent_unique_id)\n\n frozen_asset_deps = {\n unique_id: frozenset(parent_ids) for unique_id, parent_ids in asset_deps.items()\n }\n\n return frozen_asset_deps\n\n\ndef get_asset_deps(\n dbt_nodes,\n deps,\n io_manager_key,\n manifest: Optional[Mapping[str, Any]],\n dagster_dbt_translator: "DagsterDbtTranslator",\n) -> Tuple[\n Dict[AssetKey, Set[AssetKey]],\n Dict[AssetKey, Tuple[str, In]],\n Dict[AssetKey, Tuple[str, Out]],\n Dict[AssetKey, str],\n Dict[AssetKey, FreshnessPolicy],\n Dict[AssetKey, AutoMaterializePolicy],\n Dict[str, AssetCheckSpec],\n Dict[str, List[str]],\n Dict[str, Dict[str, Any]],\n]:\n from .dagster_dbt_translator import DbtManifestWrapper\n\n asset_deps: Dict[AssetKey, Set[AssetKey]] = {}\n asset_ins: Dict[AssetKey, Tuple[str, In]] = {}\n asset_outs: Dict[AssetKey, Tuple[str, Out]] = {}\n\n # These dicts could be refactored as a single dict, mapping from output name to arbitrary\n # metadata that we need to store for reference.\n group_names_by_key: Dict[AssetKey, str] = {}\n freshness_policies_by_key: Dict[AssetKey, FreshnessPolicy] = {}\n auto_materialize_policies_by_key: Dict[AssetKey, AutoMaterializePolicy] = {}\n check_specs: List[AssetCheckSpec] = []\n fqns_by_output_name: Dict[str, List[str]] = {}\n metadata_by_output_name: Dict[str, Dict[str, Any]] = {}\n\n for unique_id, parent_unique_ids in deps.items():\n dbt_resource_props = dbt_nodes[unique_id]\n\n output_name = output_name_fn(dbt_resource_props)\n fqns_by_output_name[output_name] = dbt_resource_props["fqn"]\n\n metadata_by_output_name[output_name] = {\n key: dbt_resource_props[key] for key in 
["unique_id", "resource_type"]\n }\n\n asset_key = dagster_dbt_translator.get_asset_key(dbt_resource_props)\n\n asset_deps[asset_key] = set()\n\n metadata = merge_dicts(\n dagster_dbt_translator.get_metadata(dbt_resource_props),\n {\n MANIFEST_METADATA_KEY: DbtManifestWrapper(manifest=manifest) if manifest else None,\n DAGSTER_DBT_TRANSLATOR_METADATA_KEY: dagster_dbt_translator,\n },\n )\n asset_outs[asset_key] = (\n output_name,\n Out(\n io_manager_key=io_manager_key,\n description=dagster_dbt_translator.get_description(dbt_resource_props),\n metadata=metadata,\n is_required=False,\n dagster_type=Nothing,\n code_version=default_code_version_fn(dbt_resource_props),\n ),\n )\n\n group_name = dagster_dbt_translator.get_group_name(dbt_resource_props)\n if group_name is not None:\n group_names_by_key[asset_key] = group_name\n\n freshness_policy = dagster_dbt_translator.get_freshness_policy(dbt_resource_props)\n if freshness_policy is not None:\n freshness_policies_by_key[asset_key] = freshness_policy\n\n auto_materialize_policy = dagster_dbt_translator.get_auto_materialize_policy(\n dbt_resource_props\n )\n if auto_materialize_policy is not None:\n auto_materialize_policies_by_key[asset_key] = auto_materialize_policy\n\n test_unique_ids = []\n if manifest:\n test_unique_ids = [\n child_unique_id\n for child_unique_id in manifest["child_map"][unique_id]\n if child_unique_id.startswith("test")\n ]\n\n for test_unique_id in test_unique_ids:\n test_resource_props = manifest["nodes"][test_unique_id]\n check_spec = default_asset_check_fn(\n asset_key, unique_id, dagster_dbt_translator.settings, test_resource_props\n )\n\n if check_spec:\n check_specs.append(check_spec)\n\n for parent_unique_id in parent_unique_ids:\n parent_node_info = dbt_nodes[parent_unique_id]\n parent_asset_key = dagster_dbt_translator.get_asset_key(parent_node_info)\n\n asset_deps[asset_key].add(parent_asset_key)\n\n # if this parent is not one of the selected nodes, it's an input\n if parent_unique_id 
not in deps:\n input_name = input_name_fn(parent_node_info)\n asset_ins[parent_asset_key] = (input_name, In(Nothing))\n\n check_specs_by_output_name = cast(\n Dict[str, AssetCheckSpec],\n _validate_and_assign_output_names_to_check_specs(check_specs, list(asset_outs.keys())),\n )\n\n return (\n asset_deps,\n asset_ins,\n asset_outs,\n group_names_by_key,\n freshness_policies_by_key,\n auto_materialize_policies_by_key,\n check_specs_by_output_name,\n fqns_by_output_name,\n metadata_by_output_name,\n )\n
", "current_page_name": "_modules/dagster_dbt/asset_utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_utils"}, "cloud": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.asset_defs

\nimport json\nimport shlex\nfrom argparse import Namespace\nfrom contextlib import suppress\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    AssetExecutionContext,\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    MetadataValue,\n    PartitionsDefinition,\n    ResourceDefinition,\n    multi_asset,\n    with_resources,\n)\nfrom dagster._annotations import experimental, experimental_param\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.execution.context.init import build_init_resource_context\n\nfrom dagster_dbt.asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    get_asset_deps,\n    get_deps,\n)\nfrom dagster_dbt.dagster_dbt_translator import DagsterDbtTranslator\n\nfrom ..errors import DagsterDbtCloudJobInvariantViolationError\nfrom ..utils import ASSET_RESOURCE_TYPES, result_to_events\nfrom .resources import DbtCloudClient, DbtCloudClientResource, DbtCloudRunStatus\n\nDAGSTER_DBT_COMPILE_RUN_ID_ENV_VAR = "DBT_DAGSTER_COMPILE_RUN_ID"\n\n\nclass DbtCloudCacheableAssetsDefinition(CacheableAssetsDefinition):\n    def __init__(\n        self,\n        dbt_cloud_resource_def: Union[DbtCloudClientResource, ResourceDefinition],\n        job_id: int,\n        node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n        node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n        node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n        
node_info_to_auto_materialize_policy_fn: Callable[\n            [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n        ],\n        partitions_def: Optional[PartitionsDefinition] = None,\n        partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n    ):\n        self._dbt_cloud_resource_def: ResourceDefinition = (\n            dbt_cloud_resource_def.get_resource_definition()\n            if isinstance(dbt_cloud_resource_def, DbtCloudClientResource)\n            else dbt_cloud_resource_def\n        )\n\n        self._dbt_cloud: DbtCloudClient = (\n            dbt_cloud_resource_def.process_config_and_initialize().get_dbt_client()\n            if isinstance(dbt_cloud_resource_def, DbtCloudClientResource)\n            else dbt_cloud_resource_def(build_init_resource_context())\n        )\n        self._job_id = job_id\n        self._project_id: int\n        self._has_generate_docs: bool\n        self._job_commands: List[str]\n        self._job_materialization_command_step: int\n        self._node_info_to_asset_key = node_info_to_asset_key\n        self._node_info_to_group_fn = node_info_to_group_fn\n        self._node_info_to_freshness_policy_fn = node_info_to_freshness_policy_fn\n        self._node_info_to_auto_materialize_policy_fn = node_info_to_auto_materialize_policy_fn\n        self._partitions_def = partitions_def\n        self._partition_key_to_vars_fn = partition_key_to_vars_fn\n\n        super().__init__(unique_id=f"dbt-cloud-{job_id}")\n\n    def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:\n        dbt_nodes, dbt_dependencies = self._get_dbt_nodes_and_dependencies()\n        return [self._build_dbt_cloud_assets_cacheable_data(dbt_nodes, dbt_dependencies)]\n\n    def build_definitions(\n        self, data: Sequence[AssetsDefinitionCacheableData]\n    ) -> Sequence[AssetsDefinition]:\n        return with_resources(\n            [\n                
self._build_dbt_cloud_assets_from_cacheable_data(assets_definition_metadata)\n                for assets_definition_metadata in data\n            ],\n            {"dbt_cloud": self._dbt_cloud_resource_def},\n        )\n\n    @staticmethod\n    def parse_dbt_command(dbt_command: str) -> Namespace:\n        args = shlex.split(dbt_command)[1:]\n        try:\n            from dbt.cli.flags import (\n                Flags,\n                args_to_context,\n            )\n\n            # nasty hack to get dbt to parse the args\n            # dbt >= 1.5.0 requires that profiles-dir is set to an existing directory\n            return Namespace(**vars(Flags(args_to_context(args + ["--profiles-dir", "."]))))\n        except ImportError:\n            # dbt < 1.5.0 compat\n            from dbt.main import parse_args  # type: ignore\n\n            return parse_args(args=args)\n\n    @staticmethod\n    def get_job_materialization_command_step(execute_steps: List[str]) -> int:\n        materialization_command_filter = [\n            DbtCloudCacheableAssetsDefinition.parse_dbt_command(command).which in ["run", "build"]\n            for command in execute_steps\n        ]\n\n        if sum(materialization_command_filter) != 1:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                "The dbt Cloud job must have a single `dbt run` or `dbt build` in its commands. 
"\n                f"Received commands: {execute_steps}."\n            )\n\n        return materialization_command_filter.index(True)\n\n    @staticmethod\n    def get_compile_filters(parsed_args: Namespace) -> List[str]:\n        dbt_compile_options: List[str] = []\n\n        selected_models = parsed_args.select or []\n        if selected_models:\n            dbt_compile_options.append(f"--select {' '.join(selected_models)}")\n\n        excluded_models = parsed_args.exclude or []\n        if excluded_models:\n            dbt_compile_options.append(f"--exclude {' '.join(excluded_models)}")\n\n        selector = getattr(parsed_args, "selector_name", None) or getattr(\n            parsed_args, "selector", None\n        )\n        if selector:\n            dbt_compile_options.append(f"--selector {selector}")\n\n        return dbt_compile_options\n\n    def _get_cached_compile_dbt_cloud_job_run(self, compile_run_id: int) -> Tuple[int, int]:\n        # If the compile run is ongoing, allow it a grace period of 10 minutes to finish.\n        with suppress(Exception):\n            self._dbt_cloud.poll_run(run_id=compile_run_id, poll_timeout=600)\n\n        compile_run = self._dbt_cloud.get_run(\n            run_id=compile_run_id, include_related=["trigger", "run_steps"]\n        )\n\n        compile_run_status: str = compile_run["status_humanized"]\n        if compile_run_status != DbtCloudRunStatus.SUCCESS:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The cached dbt Cloud job run `{compile_run_id}` must have a status of"\n                f" `{DbtCloudRunStatus.SUCCESS}`. Received status: `{compile_run_status}. You can"\n                f" view the full status of your dbt Cloud run at {compile_run['href']}. Once it has"\n                " successfully completed, reload your Dagster definitions. 
If your run has failed,"\n                " you must manually refresh the cache using the `dagster-dbt"\n                " cache-compile-references` CLI."\n            )\n\n        compile_run_has_generate_docs = compile_run["trigger"]["generate_docs_override"]\n\n        compile_job_materialization_command_step = len(compile_run["run_steps"])\n        if compile_run_has_generate_docs:\n            compile_job_materialization_command_step -= 1\n\n        return compile_run_id, compile_job_materialization_command_step\n\n    def _compile_dbt_cloud_job(self, dbt_cloud_job: Mapping[str, Any]) -> Tuple[int, int]:\n        # Retrieve the filters options from the dbt Cloud job's materialization command.\n        #\n        # There are three filters: `--select`, `--exclude`, and `--selector`.\n        materialization_command = self._job_commands[self._job_materialization_command_step]\n        parsed_args = DbtCloudCacheableAssetsDefinition.parse_dbt_command(materialization_command)\n        dbt_compile_options = DbtCloudCacheableAssetsDefinition.get_compile_filters(\n            parsed_args=parsed_args\n        )\n\n        # Add the partition variable as a variable to the dbt Cloud job command.\n        #\n        # If existing variables passed through the dbt Cloud job's command, an error will be\n        # raised. Since these are static variables anyways, they can be moved to the\n        # `dbt_project.yml` without loss of functionality.\n        #\n        # Since we're only doing this to generate the dependency structure, just use an arbitrary\n        # partition key (e.g. 
the last one) to retrieve the partition variable.\n        if parsed_args.vars and parsed_args.vars != "{}":\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The dbt Cloud job '{dbt_cloud_job['name']}' ({dbt_cloud_job['id']}) must not have"\n                " variables defined from `--vars` in its `dbt run` or `dbt build` command."\n                " Instead, declare the variables in the `dbt_project.yml` file. Received commands:"\n                f" {self._job_commands}."\n            )\n\n        if self._partitions_def and self._partition_key_to_vars_fn:\n            last_partition_key = self._partitions_def.get_last_partition_key()\n            if last_partition_key is None:\n                check.failed("PartitionsDefinition has no partitions")\n            partition_var = self._partition_key_to_vars_fn(last_partition_key)\n\n            dbt_compile_options.append(f"--vars '{json.dumps(partition_var)}'")\n\n        # We need to retrieve the dependency structure for the assets in the dbt Cloud project.\n        # However, we can't just use the dependency structure from the latest run, because\n        # this historical structure may not be up-to-date with the current state of the project.\n        #\n        # By always doing a compile step, we can always get the latest dependency structure.\n        # This incurs some latency, but at least it doesn't run through the entire materialization\n        # process.\n        dbt_compile_command = f"dbt compile {' '.join(dbt_compile_options)}"\n        compile_run_dbt_output = self._dbt_cloud.run_job_and_poll(\n            job_id=self._job_id,\n            cause="Generating software-defined assets for Dagster.",\n            steps_override=[dbt_compile_command],\n        )\n\n        # Target the compile execution step when retrieving run artifacts, rather than assuming\n        # that the last step is the correct target.\n        #\n        # Here, we ignore the `dbt docs generate` 
step.\n        compile_job_materialization_command_step = len(\n            compile_run_dbt_output.run_details.get("run_steps", [])\n        )\n        if self._has_generate_docs:\n            compile_job_materialization_command_step -= 1\n\n        return compile_run_dbt_output.run_id, compile_job_materialization_command_step\n\n    def _get_dbt_nodes_and_dependencies(\n        self,\n    ) -> Tuple[Mapping[str, Any], Mapping[str, FrozenSet[str]]]:\n        """For a given dbt Cloud job, fetch the latest run's dependency structure of executed nodes."""\n        # Fetch information about the job.\n        job = self._dbt_cloud.get_job(job_id=self._job_id)\n        self._project_id = job["project_id"]\n        self._has_generate_docs = job["generate_docs"]\n\n        # We constraint the kinds of dbt Cloud jobs that we support running.\n        #\n        # A simple constraint is that we only support jobs that run multiple steps,\n        # but it must contain one of either `dbt run` or `dbt build`.\n        #\n        # As a reminder, `dbt deps` is automatically run before the job's configured commands.\n        # And if the settings are enabled, `dbt docs generate` and `dbt source freshness` can\n        # automatically run after the job's configured commands.\n        #\n        # These commands that execute before and after the job's configured commands do not count\n        # towards the single command constraint.\n        self._job_commands = job["execute_steps"]\n        self._job_materialization_command_step = (\n            DbtCloudCacheableAssetsDefinition.get_job_materialization_command_step(\n                execute_steps=self._job_commands\n            )\n        )\n\n        # Determine whether to use a cached compile run. 
This should only be set up if the user is\n        # using a GitHub action along with their dbt project.\n        dbt_cloud_job_env_vars = self._dbt_cloud.get_job_environment_variables(\n            project_id=self._project_id, job_id=self._job_id\n        )\n        compile_run_id = (\n            dbt_cloud_job_env_vars.get(DAGSTER_DBT_COMPILE_RUN_ID_ENV_VAR, {})\n            .get("job", {})\n            .get("value")\n        )\n\n        compile_run_id, compile_job_materialization_command_step = (\n            # If a compile run is cached, then use it.\n            self._get_cached_compile_dbt_cloud_job_run(compile_run_id=int(compile_run_id))\n            if compile_run_id\n            # Otherwise, compile the dbt Cloud project in an ad-hoc manner.\n            else self._compile_dbt_cloud_job(dbt_cloud_job=job)\n        )\n\n        manifest_json = self._dbt_cloud.get_manifest(\n            run_id=compile_run_id, step=compile_job_materialization_command_step\n        )\n        run_results_json = self._dbt_cloud.get_run_results(\n            run_id=compile_run_id, step=compile_job_materialization_command_step\n        )\n\n        # Filter the manifest to only include the nodes that were executed.\n        dbt_nodes: Dict[str, Any] = {\n            **manifest_json.get("nodes", {}),\n            **manifest_json.get("sources", {}),\n            **manifest_json.get("metrics", {}),\n        }\n        executed_node_ids: Set[str] = set(\n            result["unique_id"] for result in run_results_json["results"]\n        )\n\n        # If there are no executed nodes, then there are no assets to generate.\n        # Inform the user to inspect their dbt Cloud job's command.\n        if not executed_node_ids:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The dbt Cloud job '{job['name']}' ({job['id']}) does not generate any "\n                "software-defined assets. 
Ensure that your dbt project has nodes to execute, "\n                "and that your dbt Cloud job's materialization command has the proper filter "\n                f"options applied. Received commands: {self._job_commands}."\n            )\n\n        # Generate the dependency structure for the executed nodes.\n        dbt_dependencies = get_deps(\n            dbt_nodes=dbt_nodes,\n            selected_unique_ids=executed_node_ids,\n            asset_resource_types=ASSET_RESOURCE_TYPES,\n        )\n\n        return dbt_nodes, dbt_dependencies\n\n    def _build_dbt_cloud_assets_cacheable_data(\n        self, dbt_nodes: Mapping[str, Any], dbt_dependencies: Mapping[str, FrozenSet[str]]\n    ) -> AssetsDefinitionCacheableData:\n        """Given all of the nodes and dependencies for a dbt Cloud job, build the cacheable\n        representation that generate the asset definition for the job.\n        """\n\n        class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n            @classmethod\n            def get_asset_key(cls, dbt_resource_props):\n                return self._node_info_to_asset_key(dbt_resource_props)\n\n            @classmethod\n            def get_description(cls, dbt_resource_props):\n                # We shouldn't display the raw sql. 
Instead, inspect if dbt docs were generated,\n                # and attach metadata to link to the docs.\n                return default_description_fn(dbt_resource_props, display_raw_sql=False)\n\n            @classmethod\n            def get_group_name(cls, dbt_resource_props):\n                return self._node_info_to_group_fn(dbt_resource_props)\n\n            @classmethod\n            def get_freshness_policy(cls, dbt_resource_props):\n                return self._node_info_to_freshness_policy_fn(dbt_resource_props)\n\n            @classmethod\n            def get_auto_materialize_policy(cls, dbt_resource_props):\n                return self._node_info_to_auto_materialize_policy_fn(dbt_resource_props)\n\n        (\n            asset_deps,\n            asset_ins,\n            asset_outs,\n            group_names_by_key,\n            freshness_policies_by_key,\n            auto_materialize_policies_by_key,\n            _,\n            fqns_by_output_name,\n            metadata_by_output_name,\n        ) = get_asset_deps(\n            dbt_nodes=dbt_nodes,\n            deps=dbt_dependencies,\n            # TODO: In the future, allow the IO manager to be specified.\n            io_manager_key=None,\n            dagster_dbt_translator=CustomDagsterDbtTranslator(),\n            manifest=None,\n        )\n\n        return AssetsDefinitionCacheableData(\n            # TODO: In the future, we should allow additional upstream assets to be specified.\n            keys_by_input_name={\n                input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n            },\n            keys_by_output_name={\n                output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n            },\n            internal_asset_deps={\n                asset_outs[asset_key][0]: asset_deps for asset_key, asset_deps in asset_deps.items()\n            },\n            # We don't rely on a static group name. 
Instead, we map over the dbt metadata to\n            # determine the group name for each asset.\n            group_name=None,\n            metadata_by_output_name={\n                output_name: self._build_dbt_cloud_assets_metadata(dbt_metadata)\n                for output_name, dbt_metadata in metadata_by_output_name.items()\n            },\n            # TODO: In the future, we should allow the key prefix to be specified.\n            key_prefix=None,\n            can_subset=True,\n            extra_metadata={\n                "job_id": self._job_id,\n                "job_commands": self._job_commands,\n                "job_materialization_command_step": self._job_materialization_command_step,\n                "group_names_by_output_name": {\n                    asset_outs[asset_key][0]: group_name\n                    for asset_key, group_name in group_names_by_key.items()\n                },\n                "fqns_by_output_name": fqns_by_output_name,\n            },\n            freshness_policies_by_output_name={\n                asset_outs[asset_key][0]: freshness_policy\n                for asset_key, freshness_policy in freshness_policies_by_key.items()\n            },\n            auto_materialize_policies_by_output_name={\n                asset_outs[asset_key][0]: auto_materialize_policy\n                for asset_key, auto_materialize_policy in auto_materialize_policies_by_key.items()\n            },\n        )\n\n    def _build_dbt_cloud_assets_metadata(self, dbt_metadata: Dict[str, Any]) -> MetadataUserInput:\n        metadata = {\n            "dbt Cloud Job": MetadataValue.url(\n                self._dbt_cloud.build_url_for_job(\n                    project_id=self._project_id,\n                    job_id=self._job_id,\n                )\n            ),\n        }\n\n        if self._has_generate_docs:\n            metadata["dbt Cloud Documentation"] = MetadataValue.url(\n                self._dbt_cloud.build_url_for_cloud_docs(\n                   
 job_id=self._job_id,\n                    resource_type=dbt_metadata["resource_type"],\n                    unique_id=dbt_metadata["unique_id"],\n                )\n            )\n\n        return metadata\n\n    def _build_dbt_cloud_assets_from_cacheable_data(\n        self, assets_definition_cacheable_data: AssetsDefinitionCacheableData\n    ) -> AssetsDefinition:\n        metadata = cast(Mapping[str, Any], assets_definition_cacheable_data.extra_metadata)\n        job_id = cast(int, metadata["job_id"])\n        job_commands = cast(List[str], list(metadata["job_commands"]))\n        job_materialization_command_step = cast(int, metadata["job_materialization_command_step"])\n        group_names_by_output_name = cast(Mapping[str, str], metadata["group_names_by_output_name"])\n        fqns_by_output_name = cast(Mapping[str, List[str]], metadata["fqns_by_output_name"])\n\n        @multi_asset(\n            name=f"dbt_cloud_job_{job_id}",\n            deps=list((assets_definition_cacheable_data.keys_by_input_name or {}).values()),\n            outs={\n                output_name: AssetOut(\n                    key=asset_key,\n                    group_name=group_names_by_output_name.get(output_name),\n                    freshness_policy=(\n                        assets_definition_cacheable_data.freshness_policies_by_output_name or {}\n                    ).get(\n                        output_name,\n                    ),\n                    auto_materialize_policy=(\n                        assets_definition_cacheable_data.auto_materialize_policies_by_output_name\n                        or {}\n                    ).get(\n                        output_name,\n                    ),\n                    metadata=(assets_definition_cacheable_data.metadata_by_output_name or {}).get(\n                        output_name\n                    ),\n                    is_required=False,\n                )\n                for output_name, asset_key in (\n                   
 assets_definition_cacheable_data.keys_by_output_name or {}\n                ).items()\n            },\n            internal_asset_deps={\n                output_name: set(asset_deps)\n                for output_name, asset_deps in (\n                    assets_definition_cacheable_data.internal_asset_deps or {}\n                ).items()\n            },\n            partitions_def=self._partitions_def,\n            can_subset=assets_definition_cacheable_data.can_subset,\n            required_resource_keys={"dbt_cloud"},\n            compute_kind="dbt",\n        )\n        def _assets(context: AssetExecutionContext):\n            dbt_cloud = cast(DbtCloudClient, context.resources.dbt_cloud)\n\n            # Add the partition variable as a variable to the dbt Cloud job command.\n            dbt_options: List[str] = []\n            if context.has_partition_key and self._partition_key_to_vars_fn:\n                partition_var = self._partition_key_to_vars_fn(context.partition_key)\n\n                dbt_options.append(f"--vars '{json.dumps(partition_var)}'")\n\n            # Prepare the materialization step to be overriden with the selection filter\n            materialization_command = job_commands[job_materialization_command_step]\n\n            # Map the selected outputs to dbt models that should be materialized.\n            #\n            # HACK: This selection filter works even if an existing `--select` is specified in the\n            # dbt Cloud job. 
We take advantage of the fact that the last `--select` will be used.\n            #\n            # This is not ideal, as the triggered run for the dbt Cloud job will still have both\n            # `--select` options when displayed in the UI, but parsing the command line argument\n            # to remove the initial select using argparse.\n            if len(context.selected_output_names) != len(\n                assets_definition_cacheable_data.keys_by_output_name or {}\n            ):\n                selected_models = [\n                    ".".join(fqns_by_output_name[output_name])\n                    for output_name in context.selected_output_names\n                ]\n\n                dbt_options.append(f"--select {' '.join(sorted(selected_models))}")\n\n                # If the `--selector` option is used, we need to remove it from the command, since\n                # it disables other selection options from being functional.\n                #\n                # See https://docs.getdbt.com/reference/node-selection/syntax for details.\n                split_materialization_command = shlex.split(materialization_command)\n                if "--selector" in split_materialization_command:\n                    idx = split_materialization_command.index("--selector")\n\n                    materialization_command = " ".join(\n                        split_materialization_command[:idx]\n                        + split_materialization_command[idx + 2 :]\n                    )\n\n            job_commands[job_materialization_command_step] = (\n                f"{materialization_command} {' '.join(dbt_options)}".strip()\n            )\n\n            # Run the dbt Cloud job to rematerialize the assets.\n            dbt_cloud_output = dbt_cloud.run_job_and_poll(\n                job_id=job_id,\n                cause=f"Materializing software-defined assets in Dagster run {context.run_id[:8]}",\n                steps_override=job_commands,\n            )\n\n            # 
Target the materialization step when retrieving run artifacts, rather than assuming\n            # that the last step is the correct target.\n            #\n            # We ignore the commands in front of the materialization command. And again, we ignore\n            # the `dbt docs generate` step.\n            materialization_command_step = len(dbt_cloud_output.run_details.get("run_steps", []))\n            materialization_command_step -= len(job_commands) - job_materialization_command_step - 1\n            if dbt_cloud_output.run_details.get("job", {}).get("generate_docs"):\n                materialization_command_step -= 1\n\n            # TODO: Assume the run completely fails or completely succeeds.\n            # In the future, we can relax this assumption.\n            manifest_json = dbt_cloud.get_manifest(\n                run_id=dbt_cloud_output.run_id, step=materialization_command_step\n            )\n            run_results_json = self._dbt_cloud.get_run_results(\n                run_id=dbt_cloud_output.run_id, step=materialization_command_step\n            )\n\n            for result in run_results_json.get("results", []):\n                yield from result_to_events(\n                    result=result,\n                    docs_url=dbt_cloud_output.docs_url,\n                    node_info_to_asset_key=self._node_info_to_asset_key,\n                    manifest_json=manifest_json,\n                    # TODO: In the future, allow arbitrary mappings to Dagster output metadata from\n                    # the dbt metadata.\n                    extra_metadata=None,\n                    generate_asset_outputs=True,\n                )\n\n        return _assets\n\n\n
[docs]@experimental\n@experimental_param(param="partitions_def")\n@experimental_param(param="partition_key_to_vars_fn")\ndef load_assets_from_dbt_cloud_job(\n dbt_cloud: ResourceDefinition,\n job_id: int,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n) -> CacheableAssetsDefinition:\n """Loads a set of dbt models, managed by a dbt Cloud job, into Dagster assets. In order to\n determine the set of dbt models, the project is compiled to generate the necessary artifacts\n that define the dbt models and their dependencies.\n\n One Dagster asset is created for each dbt model.\n\n Args:\n dbt_cloud (ResourceDefinition): The dbt Cloud resource to use to connect to the dbt Cloud API.\n job_id (int): The ID of the dbt Cloud job to load assets from.\n node_info_to_asset_key: (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\n of dbt metadata and returns the AssetKey that you want to represent a given model or\n source. 
By default: dbt model -> AssetKey([model_name]) and\n dbt source -> AssetKey([source_name, table_name])\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]):\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]):\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. 
{"run_date": "2022-01-01"})\n\n Returns:\n CacheableAssetsDefinition: A definition for the loaded assets.\n\n Examples:\n .. code-block:: python\n\n from dagster import repository\n from dagster_dbt import dbt_cloud_resource, load_assets_from_dbt_cloud_job\n\n DBT_CLOUD_JOB_ID = 1234\n\n dbt_cloud = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_API_TOKEN"},\n "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n }\n )\n\n dbt_cloud_assets = load_assets_from_dbt_cloud_job(\n dbt_cloud=dbt_cloud, job_id=DBT_CLOUD_JOB_ID\n )\n\n\n @repository\n def dbt_cloud_sandbox():\n return [dbt_cloud_assets]\n """\n if partition_key_to_vars_fn:\n check.invariant(\n partitions_def is not None,\n "Cannot supply a `partition_key_to_vars_fn` without a `partitions_def`.",\n )\n\n return DbtCloudCacheableAssetsDefinition(\n dbt_cloud_resource_def=dbt_cloud,\n job_id=job_id,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.asset_defs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.ops

\nfrom typing import List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom ..utils import generate_materializations\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import DbtCloudOutput\n\n\nclass DbtCloudRunOpConfig(Config):\n    job_id: int = Field(\n        description=(\n            "The integer ID of the relevant dbt Cloud job. You can find this value by going to the"\n            " details page of your job in the dbt Cloud UI. It will be the final number in the url,"\n            " e.g.:    "\n            " https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/"\n        )\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL,\n        description="The time (in seconds) that will be waited between successive polls.",\n    )\n    poll_timeout: Optional[float] = Field(\n        default=None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the dbt operation will "\n            "be yielded when the op executes."\n        ),\n    )\n\n    asset_key_prefix: List[str] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n required_resource_keys={"dbt_cloud"},\n ins={"start_after": In(Nothing)},\n out=Out(DbtCloudOutput, description="Parsed output from running the dbt Cloud job."),\n tags={"kind": "dbt_cloud"},\n)\ndef dbt_cloud_run_op(context, config: DbtCloudRunOpConfig):\n """Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\n fails or is otherwised stopped before succeeding, a `dagster.Failure` exception will be raised,\n and this op will fail.\n\n It requires the use of a 'dbt_cloud' resource, which is used to connect to the dbt Cloud API.\n\n **Config Options:**\n\n job_id (int)\n The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\n page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\n ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n poll_interval (float)\n The time (in seconds) that will be waited between successive polls. Defaults to ``10``.\n poll_timeout (float)\n The maximum time (in seconds) that will waited before this operation is timed out. By\n default, this will never time out.\n yield_materializations (bool)\n If True, materializations corresponding to the results of the dbt operation will be\n yielded when the solid executes. Defaults to ``True``.\n rasset_key_prefix (float)\n If provided and yield_materializations is True, these components will be used to "\n prefix the generated asset keys. Defaults to ["dbt"].\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n )\n run_dbt_nightly_sync = dbt_cloud_run_op.configured(\n {"job_id": 54321}, name="run_dbt_nightly_sync"\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def dbt_cloud():\n run_dbt_nightly_sync()\n\n\n """\n dbt_output = context.resources.dbt_cloud.run_job_and_poll(\n config.job_id, poll_interval=config.poll_interval, poll_timeout=config.poll_timeout\n )\n if config.yield_materializations and "results" in dbt_output.result:\n yield from generate_materializations(dbt_output, asset_key_prefix=config.asset_key_prefix)\n yield Output(\n dbt_output,\n metadata={\n "created_at": dbt_output.run_details["created_at"],\n "started_at": dbt_output.run_details["started_at"],\n "finished_at": dbt_output.run_details["finished_at"],\n "total_duration": dbt_output.run_details["duration"],\n "run_duration": dbt_output.run_details["run_duration"],\n },\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom enum import Enum\nfrom typing import Any, Mapping, Optional, Sequence, cast\nfrom urllib.parse import urlencode, urljoin\n\nimport requests\nfrom dagster import (\n    ConfigurableResource,\n    Failure,\n    IAttachDifferentObjectToOpContext,\n    MetadataValue,\n    __version__,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.merger import deep_merge_dicts\nfrom pydantic import Field\nfrom requests.exceptions import RequestException\n\nfrom .types import DbtCloudOutput\n\nDBT_DEFAULT_HOST = "https://cloud.getdbt.com/"\nDBT_API_V2_PATH = "api/v2/accounts/"\nDBT_API_V3_PATH = "api/v3/accounts/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\nclass DbtCloudRunStatus(str, Enum):\n    QUEUED = "Queued"\n    STARTING = "Starting"\n    RUNNING = "Running"\n    SUCCESS = "Success"\n    ERROR = "Error"\n    CANCELLED = "Cancelled"\n\n\n# TODO: This resource should be a wrapper over an existing client for a accessing dbt Cloud,\n# rather than using requests to the API directly.\nclass DbtCloudClient:\n    """This class exposes methods on top of the dbt Cloud REST API v2.\n\n    For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n    response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n    """\n\n    def __init__(\n        self,\n        auth_token: str,\n        account_id: int,\n        disable_schedule_on_trigger: bool = True,\n        request_max_retries: int = 3,\n        request_retry_delay: float = 0.25,\n        dbt_cloud_host: str = DBT_DEFAULT_HOST,\n        log: logging.Logger = get_dagster_logger(),\n        log_requests: bool = False,\n    ):\n        self._auth_token = auth_token\n        self._account_id = account_id\n        
self._disable_schedule_on_trigger = disable_schedule_on_trigger\n\n        self._request_max_retries = request_max_retries\n        self._request_retry_delay = request_retry_delay\n\n        self._dbt_cloud_host = dbt_cloud_host\n        self._log = log\n        self._log_requests = log_requests\n\n    @property\n    def api_v2_base_url(self) -> str:\n        return urljoin(self._dbt_cloud_host, DBT_API_V2_PATH)\n\n    @property\n    def api_v3_base_url(self) -> str:\n        return urljoin(self._dbt_cloud_host, DBT_API_V3_PATH)\n\n    def build_url_for_job(self, project_id: int, job_id: int) -> str:\n        return urljoin(\n            self._dbt_cloud_host,\n            f"next/deploy/{self._account_id}/projects/{project_id}/jobs/{job_id}/",\n        )\n\n    def build_url_for_cloud_docs(self, job_id: int, resource_type: str, unique_id: str) -> str:\n        return urljoin(\n            self._dbt_cloud_host,\n            f"/accounts/{self._account_id}/jobs/{job_id}/docs/#!/{resource_type}/{unique_id}",\n        )\n\n    def make_request(\n        self,\n        method: str,\n        endpoint: str,\n        data: Optional[Mapping[str, Any]] = None,\n        params: Optional[Mapping[str, Any]] = None,\n        return_text: bool = False,\n        base_url: Optional[str] = None,\n    ) -> Any:\n        """Creates and sends a request to the desired dbt Cloud API endpoint.\n\n        Args:\n            method (str): The http method to use for this request (e.g. 
"POST", "GET", "PATCH").\n            endpoint (str): The dbt Cloud API endpoint to send this request to.\n            data (Optional[Mapping[str, Any]]): JSON-formatable data string to be included in the request.\n            params (Optional[Mapping[str, Any]]): Payload to add to query string of the request.\n            return_text (bool): Override default behavior and return unparsed {"text": response.text}\n                blob instead of json.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        headers = {\n            "User-Agent": f"dagster-dbt/{__version__}",\n            "Content-Type": "application/json",\n            "Authorization": f"Bearer {self._auth_token}",\n        }\n        base_url = base_url or self.api_v2_base_url\n        url = urljoin(base_url, endpoint)\n\n        if self._log_requests:\n            self._log.debug(f"Making Request: method={method} url={url} data={data}")\n\n        num_retries = 0\n        while True:\n            try:\n                response = requests.request(\n                    method=method,\n                    url=url,\n                    headers=headers,\n                    data=json.dumps(data),\n                    params=params,\n                )\n                response.raise_for_status()\n                return {"text": response.text} if return_text else response.json()["data"]\n            except RequestException as e:\n                self._log.error("Request to dbt Cloud API failed: %s", e)\n                if num_retries == self._request_max_retries:\n                    break\n                num_retries += 1\n                time.sleep(self._request_retry_delay)\n\n        raise Failure(f"Max retries ({self._request_max_retries}) exceeded with url: {url}.")\n\n    def list_jobs(\n        self, project_id: int, order_by: Optional[str] = "-id"\n    ) -> Sequence[Mapping[str, Any]]:\n        """List all dbt jobs in a dbt Cloud 
project.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n                results before returning them. Useful when combined with offset and limit to load\n                runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n                the key to filter on.\n\n        Returns:\n            List[Dict[str, Any]]: Parsed json data from the response to this request\n        """\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/jobs",\n            params={"project_id": project_id, "order_by": order_by},\n        )\n\n    def get_job(self, job_id: int) -> Mapping[str, Any]:\n        """Gets details about a given dbt job from the dbt Cloud API.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        return self.make_request("GET", f"{self._account_id}/jobs/{job_id}/")\n\n    def update_job(self, job_id: int, **kwargs) -> Mapping[str, Any]:\n        """Updates specific properties of a dbt job.\n\n        Documentation on the full set of potential parameters can be found here:\n        https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            kwargs: Passed in as the properties to be changed.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n\n        Examples:\n        .. code-block:: python\n\n            # disable schedule for job with id=12345\n            my_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})\n        """\n        # API requires you to supply a bunch of values, so we can just use the current state\n        # as the defaults\n        job_data = self.get_job(job_id)\n        return self.make_request(\n            "POST", f"{self._account_id}/jobs/{job_id}/", data=deep_merge_dicts(job_data, kwargs)\n        )\n\n    def run_job(self, job_id: int, **kwargs) -> Mapping[str, Any]:\n        """Initializes a run for a job.\n\n        Overrides for specific properties can be set by passing in values to the kwargs. 
A full list\n        of overridable properties can be found here:\n        https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            kwargs: Passed in as the properties to be overridden.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        self._log.info(f"Initializing run for job with job_id={job_id}")\n        if "cause" not in kwargs:\n            kwargs["cause"] = "Triggered via Dagster"\n        resp = self.make_request("POST", f"{self._account_id}/jobs/{job_id}/run/", data=kwargs)\n\n        has_schedule: bool = resp.get("job", {}).get("triggers", {}).get("schedule", False)\n        if has_schedule and self._disable_schedule_on_trigger:\n            self._log.info("Disabling dbt Cloud job schedule.")\n            self.update_job(job_id, triggers={"schedule": False})\n\n        self._log.info(\n            f"Run initialized with run_id={resp['id']}. View this run in "\n            f"the dbt Cloud UI: {resp['href']}"\n        )\n        return resp\n\n    def get_runs(\n        self,\n        include_related: Optional[Sequence[str]] = None,\n        job_id: Optional[int] = None,\n        order_by: Optional[str] = "-id",\n        offset: int = 0,\n        limit: int = 100,\n    ) -> Sequence[Mapping[str, object]]:\n        """Returns a list of runs from dbt Cloud. This can be optionally filtered to a specific job\n        using the job_definition_id. 
It supports pagination using offset and limit as well and\n        can be configured to load a variety of related information about the runs.\n\n        Args:\n            include_related (Optional[List[str]]): A list of resources to include in the response\n                from dbt Cloud. This is technically a required field according to the API, but it\n                can be passed with an empty list where it will only load the default run\n                information. Valid values are "trigger", "job", "repository", and "environment".\n            job_definition_id (Optional[int]): This method can be optionally filtered to only\n                load runs for a specific job id if it is included here. If omitted it will pull\n                runs for every job.\n            order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n                results before returning them. Useful when combined with offset and limit to load\n                runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n                the key to filter on.\n            offset (int): An offset to apply when listing runs. Can be used to paginate results\n                when combined with order_by and limit. Defaults to 0.\n            limit (int): Limits the amount of rows returned by the API. 
Defaults to 100.\n\n        Returns:\n            List[Dict[str, Any]]: A list of dictionaries containing the runs and any included\n                related information.\n        """\n        query_dict = {\n            "include_related": include_related or [],\n            "order_by": order_by,\n            "offset": offset,\n            "limit": limit,\n        }\n        if job_id:\n            query_dict["job_definition_id"] = job_id\n        return self.make_request("GET", f"{self._account_id}/runs/?{urlencode(query_dict)}")\n\n    def get_run(\n        self, run_id: int, include_related: Optional[Sequence[str]] = None\n    ) -> Mapping[str, Any]:\n        """Gets details about a specific job run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            include_related (List[str]): List of related fields to pull with the run. Valid values\n                are "trigger", "job", and "debug_logs".\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        query_params = f"?include_related={','.join(include_related)}" if include_related else ""\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/runs/{run_id}/{query_params}",\n        )\n\n    def get_run_steps(self, run_id: int) -> Sequence[str]:\n        """Gets the steps of an initialized dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n        Returns:\n            List[str, Any]: List of commands for each step of the run.\n        """\n        run_details = self.get_run(run_id, include_related=["trigger", "job"])\n        steps = run_details["job"]["execute_steps"]\n        steps_override = run_details["trigger"]["steps_override"]\n        return steps_override or steps\n\n    def cancel_run(self, run_id: int) -> Mapping[str, Any]:\n        """Cancels a dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        self._log.info(f"Cancelling run with id '{run_id}'")\n        return self.make_request("POST", f"{self._account_id}/runs/{run_id}/cancel/")\n\n    def list_run_artifacts(self, run_id: int, step: Optional[int] = None) -> Sequence[str]:\n        """Lists the paths of the available run artifacts from a completed dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. 
If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run\n\n        Returns:\n            List[str]: List of the paths of the available run artifacts\n        """\n        query_params = f"?step={step}" if step else ""\n        return cast(\n            list,\n            self.make_request(\n                "GET",\n                f"{self._account_id}/runs/{run_id}/artifacts/{query_params}",\n                data={"step": step} if step else None,\n            ),\n        )\n\n    def get_run_artifact(self, run_id: int, path: str, step: Optional[int] = None) -> str:\n        """The string contents of a run artifact from a dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            path (str): The path to this run artifact (e.g. 'run/my_new_project/models/example/my_first_dbt_model.sql')\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. 
If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            List[str]: List of the names of the available run artifacts\n        """\n        query_params = f"?step={step}" if step else ""\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/runs/{run_id}/artifacts/{path}{query_params}",\n            data={"step": step} if step else None,\n            return_text=True,\n        )["text"]\n\n    def get_manifest(self, run_id: int, step: Optional[int] = None) -> Mapping[str, Any]:\n        """The parsed contents of a manifest.json file created by a completed run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            Dict[str, Any]: Parsed contents of the manifest.json file\n        """\n        return json.loads(self.get_run_artifact(run_id, "manifest.json", step=step))\n\n    def get_run_results(self, run_id: int, step: Optional[int] = None) -> Mapping[str, Any]:\n        """The parsed contents of a run_results.json file created by a completed run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            Dict[str, Any]: Parsed contents of the run_results.json file\n        """\n        return json.loads(self.get_run_artifact(run_id, "run_results.json", step=step))\n\n    def poll_run(\n        self,\n        run_id: int,\n        poll_interval: float = DEFAULT_POLL_INTERVAL,\n        poll_timeout: Optional[float] = None,\n        href: Optional[str] = None,\n    ) -> Mapping[str, Any]:\n        """Polls a dbt Cloud job run until it completes. Will raise a `dagster.Failure` exception if the\n        run does not complete successfully.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            poll_interval (float): The time (in seconds) that should be waited between successive\n                polls of the dbt Cloud API.\n            poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n                to complete. If this threshold is exceeded, the run will be cancelled and an\n                exception will be thrown. 
By default, this will poll forver.\n            href (str): For internal use, generally should not be set manually.\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        status: Optional[str] = None\n\n        if href is None:\n            href = self.get_run(run_id).get("href")\n        assert isinstance(href, str), "Run must have an href"\n\n        poll_start = datetime.datetime.now()\n        try:\n            while True:\n                run_details = self.get_run(run_id)\n                status = run_details["status_humanized"]\n                self._log.info(f"Polled run {run_id}. Status: [{status}]")\n\n                # completed successfully\n                if status == DbtCloudRunStatus.SUCCESS:\n                    return self.get_run(run_id, include_related=["job", "trigger", "run_steps"])\n                elif status in [DbtCloudRunStatus.ERROR, DbtCloudRunStatus.CANCELLED]:\n                    break\n                elif status not in [\n                    DbtCloudRunStatus.QUEUED,\n                    DbtCloudRunStatus.STARTING,\n                    DbtCloudRunStatus.RUNNING,\n                ]:\n                    check.failed(f"Received unexpected status '{status}'. This should never happen")\n\n                if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n                    seconds=poll_timeout\n                ):\n                    self.cancel_run(run_id)\n                    raise Failure(\n                        f"Run {run_id} timed out after "\n                        f"{datetime.datetime.now() - poll_start}. 
Attempted to cancel.",\n                        metadata={"run_page_url": MetadataValue.url(href)},\n                    )\n\n                # Sleep for the configured time interval before polling again.\n                time.sleep(poll_interval)\n        finally:\n            if status not in (\n                DbtCloudRunStatus.SUCCESS,\n                DbtCloudRunStatus.ERROR,\n                DbtCloudRunStatus.CANCELLED,\n            ):\n                self.cancel_run(run_id)\n\n        run_details = self.get_run(run_id, include_related=["trigger"])\n        raise Failure(\n            f"Run {run_id} failed. Status Message: {run_details['status_message']}",\n            metadata={\n                "run_details": MetadataValue.json(run_details),\n                "run_page_url": MetadataValue.url(href),\n            },\n        )\n\n    def run_job_and_poll(\n        self,\n        job_id: int,\n        poll_interval: float = DEFAULT_POLL_INTERVAL,\n        poll_timeout: Optional[float] = None,\n        **kwargs,\n    ) -> DbtCloudOutput:\n        """Runs a dbt Cloud job and polls until it completes. Will raise a `dagster.Failure` exception\n        if the run does not complete successfully.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            poll_interval (float): The time (in seconds) that should be waited between successive\n                polls of the dbt Cloud API.\n            poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n                to complete. If this threshold is exceeded, the run will be cancelled and an\n                exception will be thrown. 
By default, this will poll forver.\n\n        Returns:\n            :py:class:`~DbtCloudOutput`: Class containing details about the specific job run and the\n                parsed run results.\n        """\n        run_details = self.run_job(job_id, **kwargs)\n        run_id = run_details["id"]\n        href = run_details["href"]\n        final_run_details = self.poll_run(\n            run_id, poll_interval=poll_interval, poll_timeout=poll_timeout, href=href\n        )\n        try:\n            run_results = self.get_run_results(run_id)\n        # if you fail to get run_results for this job, just leave it empty\n        except Failure:\n            self._log.info(\n                "run_results.json not available for this run. Defaulting to empty value."\n            )\n            run_results = {}\n        output = DbtCloudOutput(run_details=final_run_details, result=run_results)\n        if output.docs_url:\n            self._log.info(f"Docs for this run can be viewed here: {output.docs_url}")\n        return output\n\n    def get_job_environment_variables(self, project_id: int, job_id: int) -> Mapping[str, Any]:\n        """Get the dbt Cloud environment variables for a specific job.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n        """\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/projects/{project_id}/environment-variables/job",\n            params={"job_definition_id": job_id},\n            base_url=self.api_v3_base_url,\n        )\n\n    def set_job_environment_variable(\n        self, project_id: int, job_id: int, environment_variable_id: int, name: str, value: str\n    ) -> Mapping[str, Any]:\n        """Set the dbt Cloud environment variables for a specific job.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            name (str): The name of the environment variable to set.\n            value (str): The raw value of the environment variable.\n        """\n        return self.make_request(\n            "POST",\n            f"{self._account_id}/projects/{project_id}/environment-variables/{environment_variable_id}",\n            data={\n                "id": environment_variable_id,\n                "account_id": self._account_id,\n                "project_id": project_id,\n                "job_definition_id": job_id,\n                "type": "job",\n                "name": name,\n                "raw_value": value,\n            },\n            base_url=self.api_v3_base_url,\n        )\n\n\nclass DbtCloudResource(DbtCloudClient):\n    pass\n\n\n
[docs]class DbtCloudClientResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """This resource helps interact with dbt Cloud connectors."""\n\n auth_token: str = Field(\n description=(\n "dbt Cloud API Token. User tokens can be found in the [dbt Cloud"\n " UI](https://cloud.getdbt.com/#/profile/api/), or see the [dbt Cloud"\n " Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) for"\n " instructions on creating a Service Account token."\n ),\n )\n account_id: int = Field(\n description=(\n "dbt Cloud Account ID. This value can be found in the url of a variety of views in"\n " the dbt Cloud UI, e.g."\n " https://cloud.getdbt.com/#/accounts/{account_id}/settings/."\n ),\n )\n disable_schedule_on_trigger: bool = Field(\n default=True,\n description=(\n "Specifies if you would like any job that is triggered using this "\n "resource to automatically disable its schedule."\n ),\n )\n request_max_retries: int = Field(\n default=3,\n description=(\n "The maximum number of times requests to the dbt Cloud API should be retried "\n "before failing."\n ),\n )\n request_retry_delay: float = Field(\n default=0.25,\n description="Time (in seconds) to wait between each request retry.",\n )\n dbt_cloud_host: str = Field(\n default=DBT_DEFAULT_HOST,\n description=(\n "The hostname where dbt cloud is being hosted (e.g. https://my_org.cloud.getdbt.com/)."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_dbt_client(self) -> DbtCloudClient:\n context = self.get_resource_context()\n assert context.log\n\n return DbtCloudClient(\n auth_token=self.auth_token,\n account_id=self.account_id,\n disable_schedule_on_trigger=self.disable_schedule_on_trigger,\n request_max_retries=self.request_max_retries,\n request_retry_delay=self.request_retry_delay,\n log=context.log,\n dbt_cloud_host=self.dbt_cloud_host,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_dbt_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=DbtCloudClientResource.to_config_schema(),\n description="This resource helps interact with dbt Cloud connectors",\n)\ndef dbt_cloud_resource(context) -> DbtCloudResource:\n """This resource allows users to programatically interface with the dbt Cloud Administrative REST\n API (v2) to launch jobs and monitor their progress. This currently implements only a subset of\n the functionality exposed by the API.\n\n For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n }\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def my_dbt_cloud_job():\n ...\n """\n return DbtCloudResource(\n auth_token=context.resource_config["auth_token"],\n account_id=context.resource_config["account_id"],\n disable_schedule_on_trigger=context.resource_config["disable_schedule_on_trigger"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n dbt_cloud_host=context.resource_config["dbt_cloud_host"],\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.resources"}}, "core": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.resources

\nfrom typing import Any, Iterator, Mapping, Optional, Sequence, Set\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster._annotations import deprecated, public\nfrom dagster._config.pythonic_config import ConfigurableResource, IAttachDifferentObjectToOpContext\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\n\nfrom ..dbt_resource import DbtClient\nfrom .types import DbtCliOutput\nfrom .utils import (\n    DEFAULT_DBT_TARGET_PATH,\n    execute_cli,\n    execute_cli_stream,\n    parse_manifest,\n    parse_run_results,\n    remove_run_results,\n)\n\nDEFAULT_DBT_EXECUTABLE = "dbt"\n\n# The set of dbt cli commands that result in the creation of a run_results.json output file\n# https://docs.getdbt.com/reference/artifacts/run-results-json\nDBT_RUN_RESULTS_COMMANDS = ["run", "test", "seed", "snapshot", "docs generate", "build"]\n\n# The following config fields correspond to flags that apply to all dbt CLI commands. For details\n# on dbt CLI flags, see\n# https://github.com/fishtown-analytics/dbt/blob/1f8e29276e910c697588c43f08bc881379fff178/core/dbt/main.py#L260-L329\n\nCOMMON_OPTION_KEYS = {\n    "warn_error",\n    "dbt_executable",\n    "ignore_handled_error",\n    "target_path",\n    "docs_url",\n    "json_log_format",\n    "capture_logs",\n    "debug",\n}\n\n\nclass ConfigurableResourceWithCliFlags(ConfigurableResource):\n    project_dir: str = Field(\n        default=".",\n        description=(\n            "Which directory to look in for the dbt_project.yml file. Default is the current "\n            "working directory and its parents."\n        ),\n    )\n    profiles_dir: Optional[str] = Field(\n        default=None,\n        description=(\n            "Which directory to look in for the profiles.yml file. 
Default = $DBT_PROFILES_DIR or "\n            "$HOME/.dbt"\n        ),\n    )\n    profile: Optional[str] = Field(\n        default=None, description="Which profile to load. Overrides setting in dbt_project.yml."\n    )\n    target: Optional[str] = Field(\n        default=None, description="Which target to load for the given profile."\n    )\n    vars: Optional[Mapping[str, Any]] = Field(\n        default=None,\n        description=(\n            "Supply variables to the project. This argument overrides variables defined in your "\n            "dbt_project.yml file. This argument should be a dictionary, eg. "\n            "{'my_variable': 'my_value'}"\n        ),\n    )\n    bypass_cache: bool = Field(\n        default=False, description="If set, bypass the adapter-level cache of database state"\n    )\n    warn_error: bool = Field(\n        default=False,\n        description=(\n            "If dbt would normally warn, instead raise an exception. Examples include --models "\n            "that selects nothing, deprecations, configurations with no associated models, "\n            "invalid test configurations, and missing sources/refs in tests."\n        ),\n    )\n    dbt_executable: str = Field(\n        default=DEFAULT_DBT_EXECUTABLE,\n        description=f"Path to the dbt executable. Default is {DEFAULT_DBT_EXECUTABLE}",\n    )\n    ignore_handled_error: bool = Field(\n        default=False,\n        description=(\n            "When True, will not raise an exception when the dbt CLI returns error code 1. 
"\n            "Default is False."\n        ),\n    )\n    target_path: str = Field(\n        default=DEFAULT_DBT_TARGET_PATH,\n        description=(\n            "The directory path for target if different from the default `target-path` in "\n            "your dbt project configuration file."\n        ),\n    )\n    docs_url: Optional[str] = Field(\n        default=None, description="The url for where dbt docs are being served for this project."\n    )\n    json_log_format: bool = Field(\n        default=True,\n        description=(\n            "When True, dbt will invoked with the `--log-format json` flag, allowing "\n            "Dagster to parse the log messages and emit simpler log messages to the event log."\n        ),\n    )\n    capture_logs: bool = Field(\n        default=True,\n        description=(\n            "When True, dbt will invoked with the `--capture-output` flag, allowing "\n            "Dagster to capture the logs and emit them to the event log."\n        ),\n    )\n    debug: bool = Field(\n        default=False,\n        description=(\n            "When True, dbt will invoked with the `--debug` flag, which will print "\n            "additional debug information to the console."\n        ),\n    )\n\n\nclass DbtCliClient(DbtClient):\n    """A resource that allows you to execute dbt cli commands.\n\n    For the most up-to-date documentation on the specific parameters available to you for each\n    command, check out the dbt docs:\n\n    https://docs.getdbt.com/reference/commands/run\n\n    To use this as a dagster resource, we recommend using\n    :func:`dbt_cli_resource <dagster_dbt.dbt_cli_resource>`.\n    """\n\n    def __init__(\n        self,\n        executable: str,\n        default_flags: Mapping[str, Any],\n        warn_error: bool,\n        ignore_handled_error: bool,\n        target_path: str,\n        logger: Optional[Any] = None,\n        docs_url: Optional[str] = None,\n        json_log_format: bool = True,\n        
capture_logs: bool = True,\n        debug: bool = False,\n    ):\n        self._default_flags = default_flags\n        self._executable = executable\n        self._warn_error = warn_error\n        self._ignore_handled_error = ignore_handled_error\n        self._target_path = target_path\n        self._docs_url = docs_url\n        self._json_log_format = json_log_format\n        self._capture_logs = capture_logs\n        self._debug = debug\n        super().__init__(logger)\n\n    @property\n    def default_flags(self) -> Mapping[str, Any]:\n        """A set of params populated from resource config that are passed as flags to each dbt CLI command."""\n        return self._format_params(self._default_flags, replace_underscores=True)\n\n    @property\n    def strict_flags(self) -> Set[str]:\n        """A set of flags that should not be auto-populated from the default flags unless they are\n        arguments to the associated function.\n        """\n        return {"models", "exclude", "select"}\n\n    def _get_flags_dict(self, kwargs) -> Mapping[str, Any]:\n        extra_flags = {} if kwargs is None else kwargs\n\n        # remove default flags that are declared as "strict" and not explicitly passed in\n        default_flags = {\n            k: v\n            for k, v in self.default_flags.items()\n            if not (k in self.strict_flags and k not in extra_flags)\n        }\n\n        return merge_dicts(\n            default_flags, self._format_params(extra_flags, replace_underscores=True)\n        )\n\n    @public\n    def cli(self, command: str, **kwargs) -> DbtCliOutput:\n        """Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n            default flags that were configured on resource initialization (if any) overriding the\n            default values if necessary.\n\n        Args:\n            command (str): The command you wish to run (e.g. 
'run', 'test', 'docs generate', etc.)\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        command = check.str_param(command, "command")\n        return execute_cli(\n            executable=self._executable,\n            command=command,\n            flags_dict=self._get_flags_dict(kwargs),\n            log=self.logger,\n            warn_error=self._warn_error,\n            ignore_handled_error=self._ignore_handled_error,\n            target_path=self._target_path,\n            docs_url=self._docs_url,\n            json_log_format=self._json_log_format,\n            capture_logs=self._capture_logs,\n            debug=self._debug,\n        )\n\n    def cli_stream_json(self, command: str, **kwargs) -> Iterator[Mapping[str, Any]]:\n        """Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n            default flags that were configured on resource initialization (if any) overriding the\n            default values if necessary.\n\n        Args:\n            command (str): The command you wish to run (e.g. 
'run', 'test', 'docs generate', etc.)\n        """\n        check.invariant(self._json_log_format, "Cannot stream JSON if json_log_format is False.")\n        for event in execute_cli_stream(\n            executable=self._executable,\n            command=command,\n            flags_dict=self._get_flags_dict(kwargs),\n            log=self.logger,\n            warn_error=self._warn_error,\n            ignore_handled_error=self._ignore_handled_error,\n            json_log_format=self._json_log_format,\n            capture_logs=self._capture_logs,\n            debug=self._debug,\n        ):\n            if event.parsed_json_line is not None:\n                yield event.parsed_json_line\n\n    @public\n    def compile(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``compile`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in compilation.\n            exclude (List[str]), optional): the models to exclude from compilation.\n            select (List[str], optional): the models to include in compilation.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("compile", models=models, exclude=exclude, select=select, **kwargs)\n\n    @public\n    def run(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``run`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in the run.\n            exclude (List[str]), optional): the models to exclude from the run.\n            select (List[str], optional): the models to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("run", models=models, exclude=exclude, select=select, **kwargs)\n\n    @public\n    def snapshot(\n        self,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("snapshot", select=select, exclude=exclude, **kwargs)\n\n    @public\n    def test(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        data: bool = True,\n        schema: bool = True,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``test`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in testing.\n            exclude (List[str], optional): the models to exclude from testing.\n            data (bool, optional): If ``True`` (default), then run data tests.\n            schema (bool, optional): If ``True`` (default), then run schema tests.\n            select (List[str], optional): the models to include in testing.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        if data and schema:\n            # do not include these arguments if both are True, as these are deprecated in later\n            # versions of dbt, and for older versions the functionality is the same regardless of\n            # if both are set or neither are set.\n            return self.cli("test", models=models, exclude=exclude, select=select, **kwargs)\n        return self.cli(\n            "test",\n            models=models,\n            exclude=exclude,\n            data=data,\n            schema=schema,\n            select=select,\n            **kwargs,\n        )\n\n    @public\n    def seed(\n        self,\n        show: bool = False,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            show (bool, optional): If ``True``, then show a sample of the seeded data in the\n                response. 
Defaults to ``False``.\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("seed", show=show, select=select, exclude=exclude, **kwargs)\n\n    @public\n    def ls(\n        self,\n        select: Optional[Sequence[str]] = None,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``ls`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the resources to include in the output.\n            models (List[str], optional): the models to include in the output.\n            exclude (List[str], optional): the resources to exclude from the output.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("ls", select=select, models=models, exclude=exclude, **kwargs)\n\n    @public\n    def build(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtCliOutput:\n        """Run the ``build`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the models/resources to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("build", select=select, **kwargs)\n\n    @public\n    def freshness(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtCliOutput:\n        """Run the ``source snapshot-freshness`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the sources to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("source snapshot-freshness", select=select, **kwargs)\n\n    @public\n    def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtCliOutput:\n        """Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("docs generate", compile=compile_project, **kwargs)\n\n    @public\n    def run_operation(\n        self, macro: str, args: Optional[Mapping[str, Any]] = None, **kwargs\n    ) -> DbtCliOutput:\n        """Run the ``run-operation`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            macro (str): the dbt macro to invoke.\n            args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli(f"run-operation {macro}", args=args, **kwargs)\n\n    @public\n    def get_run_results_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the run_results.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        return parse_run_results(project_dir, target_path)\n\n    @public\n    def remove_run_results_json(self, **kwargs):\n        """Remove the run_results.json file from previous runs (if it exists)."""\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        remove_run_results(project_dir, target_path)\n\n    @public\n    def get_manifest_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the manifest.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        return parse_manifest(project_dir, 
target_path)\n\n\nclass DbtCliClientResource(ConfigurableResourceWithCliFlags, IAttachDifferentObjectToOpContext):\n    """Resource which issues dbt CLI commands against a configured dbt project."""\n\n    class Config:\n        extra = "allow"\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    def get_dbt_client(self) -> DbtCliClient:\n        context = self.get_resource_context()\n        default_flags = {\n            k: v\n            for k, v in self._get_non_none_public_field_values().items()\n            if k not in COMMON_OPTION_KEYS\n        }\n\n        return DbtCliClient(\n            executable=self.dbt_executable,\n            default_flags=default_flags,\n            warn_error=self.warn_error,\n            ignore_handled_error=self.ignore_handled_error,\n            target_path=self.target_path,\n            docs_url=self.docs_url,\n            logger=context.log,\n            json_log_format=self.json_log_format,\n            capture_logs=self.capture_logs,\n            debug=self.debug,\n        )\n\n    def get_object_to_set_on_execution_context(self) -> Any:\n        return self.get_dbt_client()\n\n\n
@deprecated(breaking_version="0.21", additional_warn_text="Use DbtCliResource instead.")
@dagster_maintained_resource
@resource(config_schema=DbtCliClientResource.to_config_schema())
def dbt_cli_resource(context) -> DbtCliClient:
    """Build a :class:`DbtCliClient` from this resource's configuration.

    The resource issues dbt CLI commands against a configured dbt project. It is deprecated
    in favor of :py:class:`~dagster_dbt.DbtCliResource`.

    Args:
        context: The resource initialization context; its ``resource_config`` mapping and
            ``log`` attribute are read.

    Returns:
        DbtCliClient: A client configured from ``context.resource_config``.
    """
    config = context.resource_config

    # Every config entry that is not one of the common resource options is handed to the
    # client as a default flag for dbt commands.
    flags = {}
    for option, value in config.items():
        if option in COMMON_OPTION_KEYS:
            continue
        flags[option] = value

    return DbtCliClient(
        executable=config["dbt_executable"],
        default_flags=flags,
        warn_error=config["warn_error"],
        ignore_handled_error=config["ignore_handled_error"],
        target_path=config["target_path"],
        logger=context.log,
        # docs_url is the only option read with .get(), so it may be absent from the config.
        docs_url=config.get("docs_url"),
        capture_logs=config["capture_logs"],
        json_log_format=config["json_log_format"],
        debug=config["debug"],
    )
\n
", "current_page_name": "_modules/dagster_dbt/core/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.resources"}, "resources_v2": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.resources_v2

\nimport atexit\nimport contextlib\nimport os\nimport shutil\nimport subprocess\nimport sys\nimport uuid\nfrom contextlib import suppress\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import (\n    Any,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Union,\n)\n\nimport dateutil.parser\nimport orjson\nfrom dagster import (\n    AssetCheckResult,\n    AssetCheckSeverity,\n    AssetObservation,\n    AssetsDefinition,\n    ConfigurableResource,\n    Output,\n    get_dagster_logger,\n)\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidPropertyError\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dbt.contracts.results import NodeStatus, TestStatus\nfrom dbt.node_types import NodeType\nfrom dbt.version import __version__ as dbt_version\nfrom packaging import version\nfrom pydantic import Field, root_validator, validator\nfrom typing_extensions import Literal\n\nfrom ..asset_utils import (\n    get_manifest_and_translator_from_dbt_assets,\n    output_name_fn,\n)\nfrom ..dagster_dbt_translator import DagsterDbtTranslator\nfrom ..dbt_manifest import DbtManifestParam, validate_manifest\nfrom ..errors import DagsterDbtCliRuntimeError\nfrom ..utils import ASSET_RESOURCE_TYPES, get_dbt_resource_props_by_dbt_unique_id_from_manifest\n\nlogger = get_dagster_logger()\n\n\nDBT_PROJECT_YML_NAME = "dbt_project.yml"\nDBT_PROFILES_YML_NAME = "profiles.yml"\nPARTIAL_PARSE_FILE_NAME = "partial_parse.msgpack"\n\n\ndef _get_dbt_target_path() -> Path:\n    return Path(os.getenv("DBT_TARGET_PATH", "target"))\n\n\n
@dataclass
class DbtCliEventMessage:
    """A single structured log event emitted by the dbt CLI.

    Args:
        raw_event (Dict[str, Any]): The raw event dictionary.
            See https://docs.getdbt.com/reference/events-logging#structured-logging for more
            information.
    """

    raw_event: Dict[str, Any]

    @classmethod
    def from_log(cls, log: str) -> "DbtCliEventMessage":
        """Build an event from one line of dbt log output.

        The line is assumed to be JSON, in the structured logging format described at
        https://docs.getdbt.com/reference/events-logging#structured-logging.
        """
        parsed: Dict[str, Any] = orjson.loads(log)
        return cls(raw_event=parsed)

    def __str__(self) -> str:
        # The printable form of an event is the message stored under info -> msg.
        info = self.raw_event["info"]
        return info["msg"]

    @public
    def to_default_asset_events(
        self,
        manifest: DbtManifestParam,
        dagster_dbt_translator: DagsterDbtTranslator = DagsterDbtTranslator(),
    ) -> Iterator[Union[Output, AssetObservation, AssetCheckResult]]:
        """Translate this dbt CLI event into the Dagster events it corresponds to.

        Args:
            manifest (Union[Mapping[str, Any], str, Path]): The dbt manifest blob.
            dagster_dbt_translator (DagsterDbtTranslator): Optionally, a custom translator for
                linking dbt nodes to Dagster assets.

        Returns:
            Iterator[Union[Output, AssetObservation, AssetCheckResult]]: A set of corresponding Dagster events.
                - Output for refables (e.g. models, seeds, snapshots.)
                - AssetObservation for dbt test results that are not enabled as asset checks.
                - AssetCheckResult for dbt test results that are enabled as asset checks.
        """
        info = self.raw_event["info"]

        # Debug-level events never produce Dagster events.
        if info["level"] == "debug":
            return

        # Neither do events that carry no node information.
        node_info: Dict[str, Any] = self.raw_event["data"].get("node_info")
        if not node_info:
            return

        manifest = validate_manifest(manifest)
        if not manifest:
            logger.info(
                "No dbt manifest was provided. Dagster events for dbt tests will not be created."
            )

        invocation_id: str = info["invocation_id"]
        unique_id: str = node_info["unique_id"]
        resource_type: str = node_info["resource_type"]
        status: str = node_info["node_status"]

        succeeded = status == NodeStatus.Success
        finished = bool(node_info.get("node_finished_at"))

        # Successful refable nodes become an Output carrying the execution duration.
        if resource_type in NodeType.refable() and succeeded:
            begin = dateutil.parser.isoparse(node_info["node_started_at"])
            end = dateutil.parser.isoparse(node_info["node_finished_at"])
            elapsed = (end - begin).total_seconds()

            yield Output(
                value=None,
                output_name=output_name_fn(node_info),
                metadata={
                    "unique_id": unique_id,
                    "invocation_id": invocation_id,
                    "Execution Duration": elapsed,
                },
            )
            return

        # Everything below handles finished dbt tests, which requires a manifest.
        if not (manifest and resource_type == NodeType.Test and finished):
            return

        parent_ids: List[str] = manifest["parent_map"][unique_id]
        test_props = manifest["nodes"][unique_id]
        metadata = {
            "unique_id": unique_id,
            "invocation_id": invocation_id,
            "status": status,
        }

        def _props_for(node_id: Any) -> Dict[str, Any]:
            # Look the id up among the manifest's nodes first, then among its sources.
            return manifest["nodes"].get(node_id) or manifest["sources"].get(node_id)

        checks_enabled = dagster_dbt_translator.settings.enable_asset_checks
        attached_id = test_props.get("attached_node")

        if checks_enabled and bool(attached_id):
            # A test with an attached node is reported as a check on that node's asset.
            severity = AssetCheckSeverity(test_props["config"]["severity"].upper())
            attached_key = dagster_dbt_translator.get_asset_key(_props_for(attached_id))

            yield AssetCheckResult(
                passed=status == TestStatus.Pass,
                asset_key=attached_key,
                check_name=node_info["node_name"],
                metadata=metadata,
                severity=severity,
            )
            return

        # Otherwise emit one observation per parent of the test, all sharing the same metadata.
        for parent_id in parent_ids:
            parent_key = dagster_dbt_translator.get_asset_key(_props_for(parent_id))

            yield AssetObservation(
                asset_key=parent_key,
                metadata=metadata,
            )
\n\n\n
@dataclass
class DbtCliInvocation:
    """The representation of an invoked dbt command.

    Args:
        process (subprocess.Popen): The process running the dbt command.
        manifest (Mapping[str, Any]): The dbt manifest blob.
        dagster_dbt_translator (DagsterDbtTranslator): The translator used to link dbt nodes to
            Dagster assets when converting events in :py:meth:`stream`.
        project_dir (Path): The path to the dbt project.
        target_path (Path): The path to the dbt target folder.
        raise_on_error (bool): Whether to raise an exception if the dbt command fails.
    """

    process: subprocess.Popen
    manifest: Mapping[str, Any]
    dagster_dbt_translator: DagsterDbtTranslator
    project_dir: Path
    target_path: Path
    raise_on_error: bool

    @classmethod
    def run(
        cls,
        args: List[str],
        env: Dict[str, str],
        manifest: Mapping[str, Any],
        dagster_dbt_translator: DagsterDbtTranslator,
        project_dir: Path,
        target_path: Path,
        raise_on_error: bool,
    ) -> "DbtCliInvocation":
        """Start the dbt CLI command as a subprocess and wrap it in an invocation.

        Args:
            args (List[str]): The full command line to execute.
            env (Dict[str, str]): The environment for the subprocess.
            manifest (Mapping[str, Any]): The dbt manifest blob.
            dagster_dbt_translator (DagsterDbtTranslator): The translator to link dbt nodes to
                Dagster assets.
            project_dir (Path): The dbt project directory, used as the working directory.
            target_path (Path): The target folder for this invocation.
            raise_on_error (bool): Whether to raise an exception if the dbt command fails.

        Returns:
            DbtCliInvocation: The invocation wrapping the started process.
        """
        # Attempt to take advantage of partial parsing. If there is a `partial_parse.msgpack` in
        # in the target folder, then copy it to the dynamic target path.
        #
        # This effectively allows us to skip the parsing of the manifest, which can be expensive.
        # See https://docs.getdbt.com/reference/programmatic-invocations#reusing-objects for more
        # details.
        current_target_path = _get_dbt_target_path()
        partial_parse_file_path = (
            current_target_path.joinpath(PARTIAL_PARSE_FILE_NAME)
            if current_target_path.is_absolute()
            else project_dir.joinpath(current_target_path, PARTIAL_PARSE_FILE_NAME)
        )
        partial_parse_destination_target_path = target_path.joinpath(PARTIAL_PARSE_FILE_NAME)

        if partial_parse_file_path.exists():
            logger.info(
                f"Copying `{partial_parse_file_path}` to `{partial_parse_destination_target_path}`"
                " to take advantage of partial parsing."
            )

            partial_parse_destination_target_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(partial_parse_file_path, partial_parse_destination_target_path)

        # Create a subprocess that runs the dbt CLI command.
        logger.info(f"Running dbt command: `{' '.join(args)}`.")
        process = subprocess.Popen(
            args=args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            env=env,
            cwd=project_dir,
        )

        # Add handler to terminate child process if running.
        # See https://stackoverflow.com/a/18258391 for more details.
        def cleanup_dbt_subprocess(process: subprocess.Popen) -> None:
            if process.returncode is None:
                logger.info(
                    "The main process is being terminated, but the dbt command has not yet"
                    " completed. Terminating the execution of dbt command."
                )
                process.terminate()
                process.wait()

        atexit.register(cleanup_dbt_subprocess, process)

        return cls(
            process=process,
            manifest=manifest,
            dagster_dbt_translator=dagster_dbt_translator,
            project_dir=project_dir,
            target_path=target_path,
            raise_on_error=raise_on_error,
        )

    @public
    def wait(self) -> "DbtCliInvocation":
        """Wait for the dbt CLI process to complete.

        Returns:
            DbtCliInvocation: The current representation of the dbt CLI invocation.

        Examples:
            .. code-block:: python

                from dagster_dbt import DbtCliResource

                dbt = DbtCliResource(project_dir="/path/to/dbt/project")

                dbt_cli_invocation = dbt.cli(["run"]).wait()
        """
        # Draining the raw event stream consumes the process output until it ends.
        list(self.stream_raw_events())

        return self

    @public
    def is_successful(self) -> bool:
        """Return whether the dbt CLI process completed successfully.

        Returns:
            bool: True, if the dbt CLI process returns with a zero exit code, and False otherwise.

        Examples:
            .. code-block:: python

                from dagster_dbt import DbtCliResource

                dbt = DbtCliResource(project_dir="/path/to/dbt/project")

                dbt_cli_invocation = dbt.cli(["run"], raise_on_error=False)

                if dbt_cli_invocation.is_successful():
                    ...
        """
        return self.process.wait() == 0

    @public
    def stream(self) -> Iterator[Union[Output, AssetObservation, AssetCheckResult]]:
        """Stream the events from the dbt CLI process and convert them to Dagster events.

        Returns:
            Iterator[Union[Output, AssetObservation, AssetCheckResult]]: A set of corresponding Dagster events.
                - Output for refables (e.g. models, seeds, snapshots.)
                - AssetObservation for dbt test results that are not enabled as asset checks.
                - AssetCheckResult for dbt test results that are enabled as asset checks.

        Examples:
            .. code-block:: python

                from pathlib import Path
                from dagster_dbt import DbtCliResource, dbt_assets

                @dbt_assets(manifest=Path("target", "manifest.json"))
                def my_dbt_assets(context, dbt: DbtCliResource):
                    yield from dbt.cli(["run"], context=context).stream()
        """
        for event in self.stream_raw_events():
            yield from event.to_default_asset_events(
                manifest=self.manifest, dagster_dbt_translator=self.dagster_dbt_translator
            )

    @public
    def stream_raw_events(self) -> Iterator[DbtCliEventMessage]:
        """Stream the events from the dbt CLI process.

        Every line of process output is re-emitted on stdout: as the event's message when the
        line parses as a dbt event, and verbatim otherwise. Only parsed events are yielded.

        Returns:
            Iterator[DbtCliEventMessage]: An iterator of events from the dbt CLI process.
        """
        with self.process.stdout or contextlib.nullcontext():
            for raw_line in self.process.stdout or []:
                log: str = raw_line.decode().strip()

                # Only parsing and rendering are guarded. The `yield` stays outside the `try`
                # so that GeneratorExit (early close of this generator), KeyboardInterrupt, and
                # exceptions thrown in by the consumer propagate instead of being mistaken for
                # an unparseable log line.
                try:
                    event = DbtCliEventMessage.from_log(log=log)
                    message = str(event)
                except Exception:
                    # If we can't parse the log, then just emit it as a raw log.
                    sys.stdout.write(log + "\n")
                    sys.stdout.flush()
                    continue

                # Re-emit the logs from dbt CLI process into stdout.
                sys.stdout.write(message + "\n")
                sys.stdout.flush()

                yield event

        # Ensure that the dbt CLI process has completed.
        self._raise_on_error()

    @public
    def get_artifact(
        self,
        artifact: Union[
            Literal["manifest.json"],
            Literal["catalog.json"],
            Literal["run_results.json"],
            Literal["sources.json"],
        ],
    ) -> Dict[str, Any]:
        """Retrieve a dbt artifact from the target path.

        See https://docs.getdbt.com/reference/artifacts/dbt-artifacts for more information.

        Args:
            artifact (Union[Literal["manifest.json"], Literal["catalog.json"], Literal["run_results.json"], Literal["sources.json"]]): The name of the artifact to retrieve.

        Returns:
            Dict[str, Any]: The artifact as a dictionary.

        Examples:
            .. code-block:: python

                from dagster_dbt import DbtCliResource

                dbt = DbtCliResource(project_dir="/path/to/dbt/project")

                dbt_cli_invocation = dbt.cli(["run"]).wait()

                # Retrieve the run_results.json artifact.
                run_results = dbt_cli_invocation.get_artifact("run_results.json")
        """
        artifact_path = self.target_path.joinpath(artifact)

        return orjson.loads(artifact_path.read_bytes())

    def _raise_on_error(self) -> None:
        """Ensure that the dbt CLI process has completed. If the process has not successfully
        completed, then optionally raise an error.

        Raises:
            DagsterDbtCliRuntimeError: If the process exited with a non-zero code and
                ``raise_on_error`` is set.
        """
        if not self.is_successful() and self.raise_on_error:
            raise DagsterDbtCliRuntimeError(
                description=(
                    f"The dbt CLI process failed with exit code {self.process.returncode}. Check"
                    " the Dagster compute logs for the full information about the error, or view"
                    f" the dbt debug log file: {self.target_path.joinpath('dbt.log')}."
                )
            )
\n\n\n
[docs]class DbtCliResource(ConfigurableResource):\n """A resource used to execute dbt CLI commands.\n\n Attributes:\n project_dir (str): The path to the dbt project directory. This directory should contain a\n `dbt_project.yml`. See https://docs.getdbt.com/reference/dbt_project.yml for more\n information.\n global_config_flags (List[str]): A list of global flags configuration to pass to the dbt CLI\n invocation. See https://docs.getdbt.com/reference/global-configs for a full list of\n configuration.\n profiles_dir (Optional[str]): The path to the directory containing your dbt `profiles.yml`.\n By default, the current working directory is used, which is the dbt project directory.\n See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n profile (Optional[str]): The profile from your dbt `profiles.yml` to use for execution. See\n https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n target (Optional[str]): The target from your dbt `profiles.yml` to use for execution. See\n https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n\n Examples:\n Creating a dbt resource with only a reference to ``project_dir``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n Creating a dbt resource with a custom ``profiles_dir``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n profiles_dir="/path/to/dbt/project/profiles",\n )\n\n Creating a dbt resource with a custom ``profile`` and ``target``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n profiles_dir="/path/to/dbt/project/profiles",\n profile="jaffle_shop",\n target="dev",\n )\n\n Creating a dbt resource with global configs, e.g. 
disabling colored logs with ``--no-use-color``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n global_config_flags=["--no-use-color"],\n )\n """\n\n project_dir: str = Field(\n ...,\n description=(\n "The path to your dbt project directory. This directory should contain a"\n " `dbt_project.yml`. See https://docs.getdbt.com/reference/dbt_project.yml for more"\n " information."\n ),\n )\n global_config_flags: List[str] = Field(\n default=[],\n description=(\n "A list of global flags configuration to pass to the dbt CLI invocation. See"\n " https://docs.getdbt.com/reference/global-configs for a full list of configuration."\n ),\n )\n profiles_dir: Optional[str] = Field(\n default=None,\n description=(\n "The path to the directory containing your dbt `profiles.yml`. By default, the current"\n " working directory is used, which is the dbt project directory."\n " See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for "\n " more information."\n ),\n )\n profile: Optional[str] = Field(\n default=None,\n description=(\n "The profile from your dbt `profiles.yml` to use for execution. See"\n " https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more"\n " information."\n ),\n )\n target: Optional[str] = Field(\n default=None,\n description=(\n "The target from your dbt `profiles.yml` to use for execution. 
See"\n " https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more"\n " information."\n ),\n )\n\n @classmethod\n def _validate_absolute_path_exists(cls, path: Union[str, Path]) -> Path:\n absolute_path = Path(path).absolute()\n try:\n resolved_path = absolute_path.resolve(strict=True)\n except FileNotFoundError:\n raise ValueError(f"The absolute path of '{path}' ('{absolute_path}') does not exist")\n\n return resolved_path\n\n @classmethod\n def _validate_path_contains_file(cls, path: Path, file_name: str, error_message: str):\n if not path.joinpath(file_name).exists():\n raise ValueError(error_message)\n\n @validator("project_dir", "profiles_dir", pre=True)\n def convert_path_to_str(cls, v: Any) -> Any:\n """Validate that the path is converted to a string."""\n if isinstance(v, Path):\n resolved_path = cls._validate_absolute_path_exists(v)\n\n absolute_path = Path(v).absolute()\n try:\n resolved_path = absolute_path.resolve(strict=True)\n except FileNotFoundError:\n raise ValueError(f"The absolute path of '{v}' ('{absolute_path}') does not exist")\n return os.fspath(resolved_path)\n\n return v\n\n @validator("project_dir")\n def validate_project_dir(cls, project_dir: str) -> str:\n resolved_project_dir = cls._validate_absolute_path_exists(project_dir)\n\n cls._validate_path_contains_file(\n path=resolved_project_dir,\n file_name=DBT_PROJECT_YML_NAME,\n error_message=(\n f"{resolved_project_dir} does not contain a {DBT_PROJECT_YML_NAME} file. Please"\n " specify a valid path to a dbt project."\n ),\n )\n\n return os.fspath(resolved_project_dir)\n\n @validator("profiles_dir")\n def validate_profiles_dir(cls, profiles_dir: str) -> str:\n resolved_project_dir = cls._validate_absolute_path_exists(profiles_dir)\n\n cls._validate_path_contains_file(\n path=resolved_project_dir,\n file_name=DBT_PROFILES_YML_NAME,\n error_message=(\n f"{resolved_project_dir} does not contain a {DBT_PROFILES_YML_NAME} file. 
Please"\n " specify a valid path to a dbt profile directory."\n ),\n )\n\n return os.fspath(resolved_project_dir)\n\n @root_validator(pre=True)\n def validate_dbt_version(cls, values: Dict[str, Any]) -> Dict[str, Any]:\n """Validate that the dbt version is supported."""\n if version.parse(dbt_version) < version.parse("1.4.0"):\n raise ValueError(\n "To use `dagster_dbt.DbtCliResource`, you must use `dbt-core>=1.4.0`. Currently,"\n f" you are using `dbt-core=={dbt_version}`. Please install a compatible dbt-core"\n " version."\n )\n\n return values\n\n def _get_unique_target_path(self, *, context: Optional[OpExecutionContext]) -> Path:\n """Get a unique target path for the dbt CLI invocation.\n\n Args:\n context (Optional[OpExecutionContext]): The execution context.\n\n Returns:\n str: A unique target path for the dbt CLI invocation.\n """\n unique_id = str(uuid.uuid4())[:7]\n path = unique_id\n if context:\n path = f"{context.op.name}-{context.run_id[:7]}-{unique_id}"\n\n current_target_path = _get_dbt_target_path()\n\n return current_target_path.joinpath(path)\n\n
[docs] @public\n def cli(\n self,\n args: List[str],\n *,\n raise_on_error: bool = True,\n manifest: Optional[DbtManifestParam] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n context: Optional[OpExecutionContext] = None,\n target_path: Optional[Path] = None,\n ) -> DbtCliInvocation:\n """Create a subprocess to execute a dbt CLI command.\n\n Args:\n args (List[str]): The dbt CLI command to execute.\n raise_on_error (bool): Whether to raise an exception if the dbt CLI command fails.\n manifest (Optional[Union[Mapping[str, Any], str, Path]]): The dbt manifest blob. If an\n execution context from within `@dbt_assets` is provided to the context argument,\n then the manifest provided to `@dbt_assets` will be used.\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): The translator to link dbt\n nodes to Dagster assets. If an execution context from within `@dbt_assets` is\n provided to the context argument, then the dagster_dbt_translator provided to\n `@dbt_assets` will be used.\n context (Optional[OpExecutionContext]): The execution context from within `@dbt_assets`.\n target_path (Optional[Path]): An explicit path to a target folder to use to store and\n retrieve dbt artifacts when running a dbt CLI command. If not provided, a unique\n target path will be generated.\n\n Returns:\n DbtCliInvocation: A invocation instance that can be used to retrieve the output of the\n dbt CLI command.\n\n Examples:\n Streaming Dagster events for dbt asset materializations and observations:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n\n Retrieving a dbt artifact after streaming the Dagster events:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_run_invocation = dbt.cli(["run"], context=context)\n\n yield from dbt_run_invocation.stream()\n\n # Retrieve the `run_results.json` dbt artifact as a dictionary:\n run_results_json = dbt_run_invocation.get_artifact("run_results.json")\n\n # Retrieve the `run_results.json` dbt artifact as a file path:\n run_results_path = dbt_run_invocation.target_path.joinpath("run_results.json")\n\n Customizing the asset materialization metadata when streaming the Dagster events:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_cli_invocation = dbt.cli(["run"], context=context)\n\n for dbt_event in dbt_cli_invocation.stream_raw_events():\n for dagster_event in dbt_event.to_default_asset_events(manifest=dbt_cli_invocation.manifest):\n if isinstance(dagster_event, Output):\n context.add_output_metadata(\n metadata={\n "my_custom_metadata": "my_custom_metadata_value",\n },\n output_name=dagster_event.output_name,\n )\n\n yield dagster_event\n\n Suppressing exceptions from a dbt CLI command when a non-zero exit code is returned:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_run_invocation = dbt.cli(["run"], context=context, raise_on_error=False)\n\n if dbt_run_invocation.is_successful():\n yield from dbt_run_invocation.stream()\n else:\n ...\n\n Invoking a dbt CLI command in a custom asset or op:\n\n .. code-block:: python\n\n import json\n\n from dagster import asset, op\n from dagster_dbt import DbtCliResource\n\n\n @asset\n def my_dbt_asset(dbt: DbtCliResource):\n dbt_macro_args = {"key": "value"}\n dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n\n\n @op\n def my_dbt_op(dbt: DbtCliResource):\n dbt_macro_args = {"key": "value"}\n dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n """\n target_path = target_path or self._get_unique_target_path(context=context)\n env = {\n **os.environ.copy(),\n # Run dbt with unbuffered output.\n "PYTHONUNBUFFERED": "1",\n # Disable anonymous usage statistics for performance.\n "DBT_SEND_ANONYMOUS_USAGE_STATS": "false",\n # The DBT_LOG_FORMAT environment variable must be set to `json`. 
We use this\n # environment variable to ensure that the dbt CLI outputs structured logs.\n "DBT_LOG_FORMAT": "json",\n # The DBT_TARGET_PATH environment variable is set to a unique value for each dbt\n # invocation so that artifact paths are separated.\n # See https://discourse.getdbt.com/t/multiple-run-results-json-and-manifest-json-files/7555\n # for more information.\n "DBT_TARGET_PATH": os.fspath(target_path),\n # The DBT_LOG_PATH environment variable is set to the same value as DBT_TARGET_PATH\n # so that logs for each dbt invocation has separate log files.\n "DBT_LOG_PATH": os.fspath(target_path),\n # The DBT_PROFILES_DIR environment variable is set to the path containing the dbt\n # profiles.yml file.\n # See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles#advanced-customizing-a-profile-directory\n # for more information.\n **({"DBT_PROFILES_DIR": self.profiles_dir} if self.profiles_dir else {}),\n }\n\n assets_def: Optional[AssetsDefinition] = None\n with suppress(DagsterInvalidPropertyError):\n assets_def = context.assets_def if context else None\n\n selection_args: List[str] = []\n dagster_dbt_translator = dagster_dbt_translator or DagsterDbtTranslator()\n if context and assets_def is not None:\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(\n [assets_def]\n )\n\n # When dbt is enabled with asset checks, we turn off any indirection with dbt selection.\n # This way, the Dagster context completely determines what is executed in a dbt\n # invocation with a subsetted selection.\n if (\n version.parse(dbt_version) >= version.parse("1.5.0")\n and dagster_dbt_translator.settings.enable_asset_checks\n ):\n env["DBT_INDIRECT_SELECTION"] = "empty"\n\n selection_args = get_subset_selection_for_context(\n context=context,\n manifest=manifest,\n select=context.op.tags.get("dagster-dbt/select"),\n exclude=context.op.tags.get("dagster-dbt/exclude"),\n )\n else:\n manifest = validate_manifest(manifest) if 
manifest else {}\n\n # TODO: verify that args does not have any selection flags if the context and manifest\n # are passed to this function.\n profile_args: List[str] = []\n if self.profile:\n profile_args = ["--profile", self.profile]\n\n if self.target:\n profile_args += ["--target", self.target]\n\n args = ["dbt"] + self.global_config_flags + args + profile_args + selection_args\n project_dir = Path(self.project_dir)\n\n if not target_path.is_absolute():\n target_path = project_dir.joinpath(target_path)\n\n return DbtCliInvocation.run(\n args=args,\n env=env,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n project_dir=project_dir,\n target_path=target_path,\n raise_on_error=raise_on_error,\n )
\n\n\ndef get_subset_selection_for_context(\n context: OpExecutionContext,\n manifest: Mapping[str, Any],\n select: Optional[str],\n exclude: Optional[str],\n) -> List[str]:\n """Generate a dbt selection string to materialize the selected resources in a subsetted execution context.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work.\n\n Args:\n context (OpExecutionContext): The execution context for the current execution step.\n select (Optional[str]): A dbt selection string to select resources to materialize.\n exclude (Optional[str]): A dbt selection string to exclude resources from materializing.\n\n Returns:\n List[str]: dbt CLI arguments to materialize the selected resources in a\n subsetted execution context.\n\n If the current execution context is not performing a subsetted execution,\n return CLI arguments composed of the inputed selection and exclusion arguments.\n """\n default_dbt_selection = []\n if select:\n default_dbt_selection += ["--select", select]\n if exclude:\n default_dbt_selection += ["--exclude", exclude]\n\n dbt_resource_props_by_output_name = get_dbt_resource_props_by_output_name(manifest)\n dbt_resource_props_by_test_name = get_dbt_resource_props_by_test_name(manifest)\n\n # TODO: this should be a property on the context if this is a permanent indicator for\n # determining whether the current execution context is performing a subsetted execution.\n is_subsetted_execution = len(context.selected_output_names) != len(\n context.assets_def.node_keys_by_output_name\n )\n if not is_subsetted_execution:\n logger.info(\n "A dbt subsetted execution is not being performed. 
Using the default dbt selection"\n f" arguments `{default_dbt_selection}`."\n )\n return default_dbt_selection\n\n selected_dbt_resources = []\n for output_name in context.selected_output_names:\n dbt_resource_props = dbt_resource_props_by_output_name[output_name]\n\n # Explicitly select a dbt resource by its fully qualified name (FQN).\n # https://docs.getdbt.com/reference/node-selection/methods#the-file-or-fqn-method\n fqn_selector = f"fqn:{'.'.join(dbt_resource_props['fqn'])}"\n\n selected_dbt_resources.append(fqn_selector)\n\n for _, check_name in context.selected_asset_check_keys:\n test_resource_props = dbt_resource_props_by_test_name[check_name]\n\n # Explicitly select a dbt resource by its fully qualified name (FQN).\n # https://docs.getdbt.com/reference/node-selection/methods#the-file-or-fqn-method\n fqn_selector = f"fqn:{'.'.join(test_resource_props['fqn'])}"\n\n selected_dbt_resources.append(fqn_selector)\n\n # Take the union of all the selected resources.\n # https://docs.getdbt.com/reference/node-selection/set-operators#unions\n union_selected_dbt_resources = ["--select"] + [" ".join(selected_dbt_resources)]\n\n logger.info(\n "A dbt subsetted execution is being performed. 
Overriding default dbt selection"\n f" arguments `{default_dbt_selection}` with arguments: `{union_selected_dbt_resources}`"\n )\n\n return union_selected_dbt_resources\n\n\ndef get_dbt_resource_props_by_output_name(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n node_info_by_dbt_unique_id = get_dbt_resource_props_by_dbt_unique_id_from_manifest(manifest)\n\n return {\n output_name_fn(node): node\n for node in node_info_by_dbt_unique_id.values()\n if node["resource_type"] in ASSET_RESOURCE_TYPES\n }\n\n\ndef get_dbt_resource_props_by_test_name(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n return {\n dbt_resource_props["name"]: dbt_resource_props\n for unique_id, dbt_resource_props in manifest["nodes"].items()\n if unique_id.startswith("test")\n }\n
", "current_page_name": "_modules/dagster_dbt/core/resources_v2", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.resources_v2"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.types

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._check as check\n\nfrom ..types import DbtOutput\n\n\n
[docs]class DbtCliOutput(DbtOutput):\n """The results of executing a dbt command, along with additional metadata about the dbt CLI\n process that was run.\n\n This class is deprecated, because it's only produced by methods of the DbtCliClientResource class,\n which is deprecated in favor of DbtCliResource.\n\n Note that users should not construct instances of this class directly. This class is intended\n to be constructed from the JSON output of dbt commands.\n\n Attributes:\n command (str): The full shell command that was executed.\n return_code (int): The return code of the dbt CLI process.\n raw_output (str): The raw output (``stdout``) of the dbt CLI process.\n logs (List[Dict[str, Any]]): List of parsed JSON logs produced by the dbt command.\n result (Optional[Dict[str, Any]]): Dictionary containing dbt-reported result information\n contained in run_results.json. Some dbt commands do not produce results, and will\n therefore have result = None.\n docs_url (Optional[str]): Hostname where dbt docs are being served for this project.\n """\n\n def __init__(\n self,\n command: str,\n return_code: int,\n raw_output: str,\n logs: Sequence[Mapping[str, Any]],\n result: Mapping[str, Any],\n docs_url: Optional[str] = None,\n ):\n self._command = check.str_param(command, "command")\n self._return_code = check.int_param(return_code, "return_code")\n self._raw_output = check.str_param(raw_output, "raw_output")\n self._logs = check.sequence_param(logs, "logs", of_type=dict)\n self._docs_url = check.opt_str_param(docs_url, "docs_url")\n super().__init__(result)\n\n @property\n def command(self) -> str:\n return self._command\n\n @property\n def return_code(self) -> int:\n return self._return_code\n\n @property\n def raw_output(self) -> str:\n return self._raw_output\n\n @property\n def logs(self) -> Sequence[Mapping[str, Any]]:\n return self._logs\n\n @property\n def docs_url(self) -> Optional[str]:\n return self._docs_url
\n
", "current_page_name": "_modules/dagster_dbt/core/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.types"}}, "dagster_dbt_translator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dagster_dbt_translator

\nfrom dataclasses import dataclass\nfrom typing import Any, Mapping, Optional\n\nfrom dagster import AssetKey, AutoMaterializePolicy, FreshnessPolicy\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import (\n    CoercibleToAssetKeyPrefix,\n    check_opt_coercible_to_asset_key_prefix_param,\n)\n\nfrom .asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    default_metadata_from_dbt_resource_props,\n)\n\n\n
[docs]@dataclass(frozen=True)\nclass DagsterDbtTranslatorSettings:\n """Settings to enable Dagster features for your dbt project.\n\n Args:\n enable_asset_checks (bool): Whether to load dbt tests as Dagster asset checks.\n Defaults to False.\n """\n\n enable_asset_checks: bool = False
\n\n\n
[docs]class DagsterDbtTranslator:\n """Holds a set of methods that derive Dagster asset definition metadata given a representation\n of a dbt resource (models, tests, sources, etc).\n\n This class is exposed so that methods can be overriden to customize how Dagster asset metadata\n is derived.\n """\n\n def __init__(self, settings: Optional[DagsterDbtTranslatorSettings] = None):\n """Initialize the translator.\n\n Args:\n settings (Optional[DagsterDbtTranslatorSettings]): Settings for the translator.\n """\n self._settings = settings or DagsterDbtTranslatorSettings()\n\n @property\n def settings(self) -> DagsterDbtTranslatorSettings:\n if not hasattr(self, "_settings"):\n self._settings = DagsterDbtTranslatorSettings()\n\n return self._settings\n\n
[docs] @classmethod\n @public\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster asset key that represents that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom asset key for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n AssetKey: The Dagster asset key for the dbt resource.\n\n Examples:\n Adding a prefix to the default asset key generated for each dbt resource:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster import AssetKey\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n return super().get_asset_key(dbt_resource_props).with_prefix("prefix")\n\n Adding a prefix to the default asset key generated for each dbt resource, but only for dbt sources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster import AssetKey\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n asset_key = super().get_asset_key(dbt_resource_props)\n\n if dbt_resource_props["resource_type"] == "source":\n asset_key = asset_key.with_prefix("my_prefix")\n\n return asset_key\n """\n return default_asset_key_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster description for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom description for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n str: The description for the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n return "custom description"\n """\n return default_description_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster metadata for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom metadata for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Mapping[str, Any]: A dictionary representing the Dagster metadata for the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n return {"custom": "metadata"}\n """\n return default_metadata_from_dbt_resource_props(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster group name for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom group name for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[str]: A Dagster group name.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n return "custom_group_prefix" + dbt_resource_props.get("config", {}).get("group")\n """\n return default_group_from_dbt_resource_props(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_freshness_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[FreshnessPolicy]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster :py:class:`dagster.FreshnessPolicy` for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom freshness policy for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[FreshnessPolicy]: A Dagster freshness policy.\n\n Examples:\n Set a custom freshness policy for all dbt resources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n return FreshnessPolicy(maximum_lag_minutes=60)\n\n Set a custom freshness policy for dbt resources with a specific tag:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n freshness_policy = None\n if "my_custom_tag" in dbt_resource_props.get("tags", []):\n freshness_policy = FreshnessPolicy(maximum_lag_minutes=60)\n\n return freshness_policy\n """\n return default_freshness_policy_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_auto_materialize_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[AutoMaterializePolicy]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster :py:class:`dagster.AutoMaterializePolicy` for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom auto-materialize policy for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[AutoMaterializePolicy]: A Dagster auto-materialize policy.\n\n Examples:\n Set a custom auto-materialize policy for all dbt resources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n return AutoMaterializePolicy.eager()\n\n Set a custom auto-materialize policy for dbt resources with a specific tag:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n auto_materialize_policy = None\n if "my_custom_tag" in dbt_resource_props.get("tags", []):\n auto_materialize_policy = AutoMaterializePolicy.eager()\n\n return auto_materialize_policy\n\n """\n return default_auto_materialize_policy_fn(dbt_resource_props)
\n\n\nclass KeyPrefixDagsterDbtTranslator(DagsterDbtTranslator):\n """A DagsterDbtTranslator that applies prefixes to the asset keys generated from dbt resources.\n\n Attributes:\n asset_key_prefix (Optional[Union[str, Sequence[str]]]): A prefix to apply to all dbt models,\n seeds, snapshots, etc. This will *not* apply to dbt sources.\n source_asset_key_prefix (Optional[Union[str, Sequence[str]]]): A prefix to apply to all dbt\n sources.\n """\n\n def __init__(\n self,\n asset_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_asset_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *args,\n **kwargs,\n ):\n self._asset_key_prefix = (\n check_opt_coercible_to_asset_key_prefix_param(asset_key_prefix, "asset_key_prefix")\n or []\n )\n self._source_asset_key_prefix = (\n check_opt_coercible_to_asset_key_prefix_param(\n source_asset_key_prefix, "source_asset_key_prefix"\n )\n or []\n )\n\n super().__init__(*args, **kwargs)\n\n @public\n def get_asset_key(self, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n base_key = default_asset_key_fn(dbt_resource_props)\n if dbt_resource_props["resource_type"] == "source":\n return base_key.with_prefix(self._source_asset_key_prefix)\n else:\n return base_key.with_prefix(self._asset_key_prefix)\n\n\n@dataclass\nclass DbtManifestWrapper:\n manifest: Mapping[str, Any]\n
", "current_page_name": "_modules/dagster_dbt/dagster_dbt_translator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dagster_dbt_translator"}, "dbt_manifest_asset_selection": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dbt_manifest_asset_selection

\nfrom typing import AbstractSet, Optional\n\nfrom dagster import (\n    AssetKey,\n    AssetSelection,\n    _check as check,\n)\nfrom dagster._core.definitions.asset_graph import AssetGraph\n\nfrom .asset_utils import is_non_asset_node\nfrom .dagster_dbt_translator import DagsterDbtTranslator\nfrom .dbt_manifest import DbtManifestParam, validate_manifest\nfrom .utils import (\n    ASSET_RESOURCE_TYPES,\n    get_dbt_resource_props_by_dbt_unique_id_from_manifest,\n    select_unique_ids_from_manifest,\n)\n\n\n
[docs]class DbtManifestAssetSelection(AssetSelection):\n """Defines a selection of assets from a dbt manifest wrapper and a dbt selection string.\n\n Args:\n manifest (Mapping[str, Any]): The dbt manifest blob.\n select (str): A dbt selection string to specify a set of dbt resources.\n exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n\n Examples:\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster_dbt import DbtManifestAssetSelection\n\n manifest = json.loads(Path("path/to/manifest.json").read_text())\n\n # select the dbt assets that have the tag "foo".\n my_selection = DbtManifestAssetSelection(manifest=manifest, select="tag:foo")\n """\n\n def __init__(\n self,\n manifest: DbtManifestParam,\n select: str = "fqn:*",\n *,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n exclude: Optional[str] = None,\n ) -> None:\n self.manifest = validate_manifest(manifest)\n self.select = check.str_param(select, "select")\n self.exclude = check.opt_str_param(exclude, "exclude", default="")\n self.dagster_dbt_translator = check.opt_inst_param(\n dagster_dbt_translator,\n "dagster_dbt_translator",\n DagsterDbtTranslator,\n DagsterDbtTranslator(),\n )\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n dbt_nodes = get_dbt_resource_props_by_dbt_unique_id_from_manifest(self.manifest)\n\n keys = set()\n for unique_id in select_unique_ids_from_manifest(\n select=self.select,\n exclude=self.exclude,\n manifest_json=self.manifest,\n ):\n dbt_resource_props = dbt_nodes[unique_id]\n is_dbt_asset = dbt_resource_props["resource_type"] in ASSET_RESOURCE_TYPES\n if is_dbt_asset and not is_non_asset_node(dbt_resource_props):\n asset_key = self.dagster_dbt_translator.get_asset_key(dbt_resource_props)\n keys.add(asset_key)\n\n return keys
\n
", "current_page_name": "_modules/dagster_dbt/dbt_manifest_asset_selection", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dbt_manifest_asset_selection"}, "dbt_resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dbt_resource

\nimport logging\nfrom abc import abstractmethod\nfrom typing import Any, Mapping, Optional, Sequence\n\nfrom dagster import get_dagster_logger\n\nfrom .types import DbtOutput\n\n\nclass DbtClient:\n    """Base class for a client allowing users to interface with dbt."""\n\n    def __init__(\n        self,\n        logger: Optional[logging.Logger] = None,\n    ):\n        """Constructor.\n\n        Args:\n            logger (Optional[Any]): A property for injecting a logger dependency.\n                Default is ``None``.\n        """\n        self._logger = logger or get_dagster_logger()\n\n    def _format_params(\n        self, flags: Mapping[str, Any], replace_underscores: bool = False\n    ) -> Mapping[str, Any]:\n        """Reformats arguments that are easier to express as a list into the format that dbt expects,\n        and deletes and keys with no value.\n        """\n        # remove any keys with a value of None\n        if replace_underscores:\n            flags = {k.replace("_", "-"): v for k, v in flags.items() if v is not None}\n        else:\n            flags = {k: v for k, v in flags.items() if v is not None}\n\n        for param in ["select", "exclude", "models"]:\n            if param in flags:\n                if isinstance(flags[param], list):\n                    # if it's a list, format as space-separated\n                    flags[param] = " ".join(set(flags[param]))\n\n        return flags\n\n    @property\n    def logger(self) -> logging.Logger:\n        """logging.Logger: A property for injecting a logger dependency."""\n        return self._logger\n\n    @abstractmethod\n    def compile(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``compile`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in compilation.\n            exclude (List[str]), optional): the models to exclude from compilation.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def run(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``run`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in the run.\n            exclude (List[str]), optional): the models to exclude from the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def snapshot(\n        self,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def test(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        data: bool = True,\n        schema: bool = True,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``test`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in testing.\n            exclude (List[str], optional): the models to exclude from testing.\n            data (bool, optional): If ``True`` (default), then run data tests.\n            schema (bool, optional): If ``True`` (default), then run schema tests.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def seed(\n        self,\n        show: bool = False,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            show (bool, optional): If ``True``, then show a sample of the seeded data in the\n                response. Defaults to ``False``.\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def ls(\n        self,\n        select: Optional[Sequence[str]] = None,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``ls`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the resources to include in the output.\n            models (List[str], optional): the models to include in the output.\n            exclude (List[str], optional): the resources to exclude from the output.\n\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def build(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtOutput:\n        """Run the ``build`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the models/resources to include in the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n        raise NotImplementedError()\n\n    @abstractmethod\n    def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtOutput:\n        """Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def run_operation(\n        self, macro: str, args: Optional[Mapping[str, Any]] = None, **kwargs\n    ) -> DbtOutput:\n        """Run the ``run-operation`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            macro (str): the dbt macro to invoke.\n            args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def get_run_results_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the run_results.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the run_results json file\n                for this dbt project.\n        """\n\n    @abstractmethod\n    def get_manifest_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the manifest.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n\n\n
[docs]class DbtResource(DbtClient):\n pass
\n
", "current_page_name": "_modules/dagster_dbt/dbt_resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dbt_resource"}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.errors

\nimport warnings\nfrom abc import ABC\nfrom typing import Any, Mapping, Optional, Sequence\n\nfrom dagster import (\n    DagsterInvariantViolationError,\n    Failure,\n    MetadataValue,\n    _check as check,\n)\n\n\n
[docs]class DagsterDbtError(Failure, ABC):\n """The base exception of the ``dagster-dbt`` library."""
\n\n\n
[docs]class DagsterDbtCliUnexpectedOutputError(DagsterDbtError):\n """Represents an error when parsing the output of a dbt CLI command."""\n\n invalid_line_nos: Sequence[int]\n\n def __init__(self, invalid_line_nos: Sequence[int]):\n check.sequence_param(invalid_line_nos, "invalid_line_nos", int)\n line_nos_str = ", ".join(map(str, invalid_line_nos))\n description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"\n metadata = {\n "Invalid CLI Output Line Numbers": MetadataValue.json({"line_nos": invalid_line_nos})\n }\n super().__init__(description, metadata=metadata)\n self.invalid_line_nos = invalid_line_nos
\n\n\n
[docs]class DagsterDbtCliRuntimeError(DagsterDbtError, ABC):\n """Represents an error while executing a dbt CLI command."""\n\n def __init__(\n self,\n description: str,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n if logs is not None:\n warnings.warn(\n "`logs` is a deprecated argument to DagsterDbtCliRuntimeError and will be discarded"\n )\n if raw_output is not None:\n warnings.warn(\n "`raw_output` is a deprecated argument to DagsterDbtCliRuntimeError and will be"\n " discarded"\n )\n metadata = {"Parsed CLI Messages": "\\n".join(messages or [])}\n super().__init__(description, metadata=metadata)
\n\n\n
[docs]class DagsterDbtCliHandledRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a model error reported by the dbt CLI at runtime (return code 1)."""\n\n def __init__(\n self,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n super().__init__("Handled error in the dbt CLI (return code 1)", logs, raw_output, messages)
\n\n\n
[docs]class DagsterDbtCliFatalRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a fatal error in the dbt CLI (return code 2)."""\n\n def __init__(\n self,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n super().__init__(\n "Fatal error in the dbt CLI (return code 2): " + " ".join(messages or []),\n logs,\n raw_output,\n messages,\n )
\n\n\n
[docs]class DagsterDbtCliOutputsNotFoundError(DagsterDbtError):\n """Represents a problem in finding the ``target/run_results.json`` artifact when executing a dbt\n CLI command.\n\n For more details on ``target/run_results.json``, see\n https://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.\n """\n\n def __init__(self, path: str):\n super().__init__(f"Expected to find file at path {path}")
\n\n\nclass DagsterDbtCloudJobInvariantViolationError(DagsterDbtError, DagsterInvariantViolationError):\n """Represents an error when a dbt Cloud job is not supported by the ``dagster-dbt`` library."""\n
", "current_page_name": "_modules/dagster_dbt/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.errors"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.ops

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom .types import DbtOutput\nfrom .utils import generate_events, generate_materializations\n\n_DEFAULT_OP_PROPS: Dict[str, Any] = dict(\n    required_resource_keys={"dbt"},\n    ins={"start_after": In(Nothing)},\n    out=Out(DbtOutput, description="Parsed output from running the dbt command."),\n    tags={"kind": "dbt"},\n)\n\n\ndef _get_doc(op_name: str, dbt_command: str) -> str:\n    return f"""\nThis op executes a ``dbt {dbt_command}`` command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the :py:class:`~dagster_dbt.dbt_cli_resource`).\n\nExamples:\n\n.. code-block:: python\n\n    from dagster import job\n    from dagster_dbt import {op_name}, dbt_cli_resource\n\n    @job(resource_defs={{"dbt":dbt_cli_resource}})\n    def my_dbt_cli_job():\n        {op_name}()\n    """\n\n\n# NOTE: mypy fails to properly track the type of `_DEFAULT_OP_PROPS` items when they are\n# double-splatted, so we type-ignore the below op declarations.\n\n\nclass DbtBuildOpConfig(Config):\n    yield_asset_events: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations and asset observations corresponding to the results of "\n            "the dbt operation will be yielded when the op executes. 
Default: True"\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n@op(**_DEFAULT_OP_PROPS)\ndef dbt_build_op(context, config: DbtBuildOpConfig) -> Any:\n    dbt_output = context.resources.dbt.build()\n    if config.yield_asset_events and "results" in dbt_output.result:\n        yield from generate_events(\n            dbt_output,\n            node_info_to_asset_key=lambda info: config.asset_key_prefix\n            + info["unique_id"].split("."),\n            manifest_json=context.resources.dbt.get_manifest_json(),\n        )\n    yield Output(dbt_output)\n\n\nclass DbtRunOpConfig(Config):\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the dbt operation will "\n            "be yielded when the op executes. Default: True"\n        ),\n    )\n    asset_key_prefix: Optional[List[str]] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_run_op(context, config: DbtRunOpConfig):\n dbt_output = context.resources.dbt.run()\n if config.yield_materializations and "results" in dbt_output.result:\n yield from generate_materializations(dbt_output, asset_key_prefix=config.asset_key_prefix)\n yield Output(dbt_output)
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_compile_op(context):\n return context.resources.dbt.compile()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_ls_op(context):\n return context.resources.dbt.ls()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_test_op(context):\n return context.resources.dbt.test()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_snapshot_op(context):\n return context.resources.dbt.snapshot()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_seed_op(context):\n return context.resources.dbt.seed()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_docs_generate_op(context):\n return context.resources.dbt.generate_docs()
\n\n\nfor dbt_op, cmd in [\n (dbt_build_op, "build"),\n (dbt_run_op, "run"),\n (dbt_compile_op, "compile"),\n (dbt_ls_op, "ls"),\n (dbt_test_op, "test"),\n (dbt_snapshot_op, "snapshot"),\n (dbt_seed_op, "seed"),\n (dbt_docs_generate_op, "docs generate"),\n]:\n dbt_op.__doc__ = _get_doc(dbt_op.name, cmd)\n
", "current_page_name": "_modules/dagster_dbt/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.ops"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.types

\nfrom typing import Any, Mapping, Optional\n\nimport dagster._check as check\n\n\n
[docs]class DbtOutput:\n """Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, `result`, which\n represents the dbt-formatted result of the command that was run (if any).\n\n Used internally, should not be instantiated directly by the user.\n """\n\n def __init__(self, result: Mapping[str, Any]):\n self._result = check.mapping_param(result, "result", key_type=str)\n\n @property\n def result(self) -> Mapping[str, Any]:\n return self._result\n\n @property\n def docs_url(self) -> Optional[str]:\n return None
\n
", "current_page_name": "_modules/dagster_dbt/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.types"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.utils

\nfrom pathlib import Path\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dateutil\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    MetadataValue,\n    Output,\n    _check as check,\n)\nfrom dagster._core.definitions.metadata import RawMetadataValue\n\nfrom .types import DbtOutput\n\n# dbt resource types that may be considered assets\nASSET_RESOURCE_TYPES = ["model", "seed", "snapshot"]\n\n\ndef default_node_info_to_asset_key(node_info: Mapping[str, Any]) -> AssetKey:\n    return AssetKey(node_info["unique_id"].split("."))\n\n\ndef _resource_type(unique_id: str) -> str:\n    # returns the type of the node (e.g. model, test, snapshot)\n    return unique_id.split(".")[0]\n\n\ndef input_name_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n    # * can be present when sources are sharded tables\n    return dbt_resource_props["unique_id"].replace(".", "_").replace("*", "_star")\n\n\ndef output_name_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n    # hyphens are valid in dbt model names, but not in output names\n    return dbt_resource_props["unique_id"].split(".")[-1].replace("-", "_")\n\n\ndef _node_result_to_metadata(node_result: Mapping[str, Any]) -> Mapping[str, RawMetadataValue]:\n    return {\n        "Materialization Strategy": node_result["config"]["materialized"],\n        "Database": node_result["database"],\n        "Schema": node_result["schema"],\n        "Alias": node_result["alias"],\n        "Description": node_result["description"],\n    }\n\n\ndef _timing_to_metadata(timings: Sequence[Mapping[str, Any]]) -> Mapping[str, RawMetadataValue]:\n    metadata: Dict[str, RawMetadataValue] = {}\n    for timing in timings:\n        if timing["name"] == "execute":\n            desc = "Execution"\n        elif timing["name"] == "compile":\n            desc = "Compilation"\n        else:\n     
       continue\n\n        # dateutil does not properly expose its modules to static checkers\n        started_at = dateutil.parser.isoparse(timing["started_at"])  # type: ignore\n        completed_at = dateutil.parser.isoparse(timing["completed_at"])  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                f"{desc} Started At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Completed At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Duration": duration.total_seconds(),\n            }\n        )\n    return metadata\n\n\ndef result_to_events(\n    result: Mapping[str, Any],\n    docs_url: Optional[str] = None,\n    node_info_to_asset_key: Optional[Callable[[Mapping[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Mapping[str, Any]] = None,\n    extra_metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n    generate_asset_outputs: bool = False,\n) -> Iterator[Union[AssetMaterialization, AssetObservation, Output]]:\n    """This is a hacky solution that attempts to consolidate parsing many of the potential formats\n    that dbt can provide its results in. 
This is known to work for CLI Outputs for dbt versions 0.18+,\n    as well as RPC responses for a similar time period, but as the RPC response schema is not documented\n    nor enforced, this can become out of date easily.\n    """\n    node_info_to_asset_key = check.opt_callable_param(\n        node_info_to_asset_key, "node_info_to_asset_key", default=default_node_info_to_asset_key\n    )\n\n    # status comes from set of fields rather than "status"\n    if "fail" in result:\n        status = (\n            "fail"\n            if result.get("fail")\n            else "skip" if result.get("skip") else "error" if result.get("error") else "success"\n        )\n    else:\n        status = result["status"]\n\n    # all versions represent timing the same way\n    metadata = {"Status": status, "Execution Time (seconds)": result["execution_time"]}\n    metadata.update(_timing_to_metadata(result["timing"]))\n\n    # working with a response that contains the node block (RPC and CLI 0.18.x)\n    if "node" in result:\n        unique_id = result["node"]["unique_id"]\n        metadata.update(_node_result_to_metadata(result["node"]))\n    else:\n        unique_id = result["unique_id"]\n\n    if docs_url:\n        metadata["docs_url"] = MetadataValue.url(f"{docs_url}#!/model/{unique_id}")\n\n    if extra_metadata:\n        metadata.update(extra_metadata)\n\n    # if you have a manifest available, get the full node info, otherwise just populate unique_id\n    dbt_resource_props = (\n        manifest_json["nodes"][unique_id] if manifest_json else {"unique_id": unique_id}\n    )\n\n    node_resource_type = _resource_type(unique_id)\n\n    if node_resource_type in ASSET_RESOURCE_TYPES and status == "success":\n        if generate_asset_outputs:\n            yield Output(\n                value=None,\n                output_name=output_name_fn(dbt_resource_props),\n                metadata=metadata,\n            )\n        else:\n            yield AssetMaterialization(\n                
asset_key=node_info_to_asset_key(dbt_resource_props),\n                description=f"dbt node: {unique_id}",\n                metadata=metadata,\n            )\n    # can only associate tests with assets if we have manifest_json available\n    elif node_resource_type == "test" and manifest_json and status != "skipped":\n        upstream_unique_ids = manifest_json["nodes"][unique_id]["depends_on"]["nodes"]\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            # the upstream id can reference a node or a source\n            dbt_resource_props = manifest_json["nodes"].get(upstream_id) or manifest_json[\n                "sources"\n            ].get(upstream_id)\n            if dbt_resource_props is None:\n                continue\n            upstream_asset_key = node_info_to_asset_key(dbt_resource_props)\n            yield AssetObservation(\n                asset_key=upstream_asset_key,\n                metadata={\n                    "Test ID": result["unique_id"],\n                    "Test Status": status,\n                    "Test Message": result.get("message") or "",\n                },\n            )\n\n\ndef generate_events(\n    dbt_output: DbtOutput,\n    node_info_to_asset_key: Optional[Callable[[Mapping[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Mapping[str, Any]] = None,\n) -> Iterator[Union[AssetMaterialization, AssetObservation]]:\n    """This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n    a dbt command, and :py:class:`dagster.AssetObservation` events for each test run.\n\n    Information parsed from a :py:class:`~DbtOutput` object.\n    """\n    for result in dbt_output.result["results"]:\n        for event in result_to_events(\n            result,\n            docs_url=dbt_output.docs_url,\n            node_info_to_asset_key=node_info_to_asset_key,\n            manifest_json=manifest_json,\n        ):\n            yield 
check.inst(\n                cast(Union[AssetMaterialization, AssetObservation], event),\n                (AssetMaterialization, AssetObservation),\n            )\n\n\n
[docs]def generate_materializations(\n dbt_output: DbtOutput,\n asset_key_prefix: Optional[Sequence[str]] = None,\n) -> Iterator[AssetMaterialization]:\n """This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n a dbt command.\n\n Information parsed from a :py:class:`~DbtOutput` object.\n\n Examples:\n .. code-block:: python\n\n from dagster import op, Output\n from dagster_dbt.utils import generate_materializations\n from dagster_dbt import dbt_cli_resource\n\n @op(required_resource_keys={"dbt"})\n def my_custom_dbt_run(context):\n dbt_output = context.resources.dbt.run()\n for materialization in generate_materializations(dbt_output):\n # you can modify the materialization object to add extra metadata, if desired\n yield materialization\n yield Output(my_dbt_output)\n\n @job(resource_defs={{"dbt":dbt_cli_resource}})\n def my_dbt_cli_job():\n my_custom_dbt_run()\n """\n asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n for event in generate_events(\n dbt_output,\n node_info_to_asset_key=lambda info: AssetKey(\n asset_key_prefix + info["unique_id"].split(".")\n ),\n ):\n yield check.inst(cast(AssetMaterialization, event), AssetMaterialization)
\n\n\ndef select_unique_ids_from_manifest(\n select: str,\n exclude: str,\n state_path: Optional[str] = None,\n manifest_json_path: Optional[str] = None,\n manifest_json: Optional[Mapping[str, Any]] = None,\n manifest_parsed: Optional[Any] = None,\n) -> AbstractSet[str]:\n """Method to apply a selection string to an existing manifest.json file."""\n import dbt.graph.cli as graph_cli\n import dbt.graph.selector as graph_selector\n from dbt.contracts.graph.manifest import Manifest, WritableManifest\n from dbt.contracts.state import PreviousState\n from dbt.graph.selector_spec import IndirectSelection, SelectionSpec\n from networkx import DiGraph\n\n if state_path is not None:\n previous_state = PreviousState(\n path=Path(state_path), # type: ignore # (unused path, slated for deletion)\n current_path=( # type: ignore # (unused path, slated for deletion)\n Path("/tmp/null") if manifest_json_path is None else Path(manifest_json_path)\n ),\n )\n else:\n previous_state = None\n\n if manifest_json_path is not None:\n manifest = WritableManifest.read_and_check_versions(manifest_json_path)\n child_map = manifest.child_map\n elif manifest_json is not None:\n\n class _DictShim(dict):\n """Shim to enable hydrating a dictionary into a dot-accessible object."""\n\n def __getattr__(self, item):\n ret = super().get(item)\n # allow recursive access e.g. 
foo.bar.baz\n return _DictShim(ret) if isinstance(ret, dict) else ret\n\n manifest = Manifest(\n # dbt expects dataclasses that can be accessed with dot notation, not bare dictionaries\n nodes={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["nodes"].items() # type: ignore\n },\n sources={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["sources"].items() # type: ignore\n },\n metrics={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["metrics"].items() # type: ignore\n },\n exposures={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["exposures"].items() # type: ignore\n },\n )\n child_map = manifest_json["child_map"]\n elif manifest_parsed is not None:\n manifest = manifest_parsed\n child_map = manifest.child_map\n else:\n check.failed("Must provide either a manifest_json_path, manifest_json, or manifest_parsed.")\n graph = graph_selector.Graph(DiGraph(incoming_graph_data=child_map))\n\n # create a parsed selection from the select string\n try:\n from dbt.flags import GLOBAL_FLAGS\n except ImportError:\n # dbt < 1.5.0 compat\n import dbt.flags as GLOBAL_FLAGS\n setattr(GLOBAL_FLAGS, "INDIRECT_SELECTION", IndirectSelection.Eager)\n setattr(GLOBAL_FLAGS, "WARN_ERROR", True)\n parsed_spec: SelectionSpec = graph_cli.parse_union([select], True)\n\n if exclude:\n parsed_spec = graph_cli.SelectionDifference(\n components=[parsed_spec, graph_cli.parse_union([exclude], True)]\n )\n\n # execute this selection against the graph\n selector = graph_selector.NodeSelector(graph, manifest, previous_state=previous_state)\n selected, _ = selector.select_nodes(parsed_spec)\n return selected\n\n\ndef get_dbt_resource_props_by_dbt_unique_id_from_manifest(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n """A mapping of a dbt node's unique id to the node's dictionary representation in the manifest."""\n return {\n **manifest["nodes"],\n **manifest["sources"],\n **manifest["exposures"],\n 
**manifest["metrics"],\n }\n
", "current_page_name": "_modules/dagster_dbt/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.utils"}}, "dagster_docker": {"docker_executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.docker_executor

\nfrom typing import Iterator, Optional, cast\n\nimport dagster._check as check\nimport docker\nimport docker.errors\nfrom dagster import Field, IntSource, executor\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.executor.init import InitExecutorContext\nfrom dagster._core.executor.step_delegating import StepDelegatingExecutor\nfrom dagster._core.executor.step_delegating.step_handler.base import (\n    CheckStepHealthResult,\n    StepHandler,\n    StepHandlerContext,\n)\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.utils import parse_env_var\nfrom dagster._grpc.types import ExecuteStepArgs\nfrom dagster._serdes.utils import hash_str\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom .container_context import DockerContainerContext\n\n\n
[docs]@executor(\n name="docker",\n config_schema=merge_dicts(\n DOCKER_CONFIG_SCHEMA,\n {\n "retries": get_retries_config(),\n "max_concurrent": Field(\n IntSource,\n is_required=False,\n description=(\n "Limit on the number of containers that will run concurrently within the scope "\n "of a Dagster run. Note that this limit is per run, not global."\n ),\n ),\n "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n },\n ),\n requirements=multiple_process_executor_requirements(),\n)\n@experimental\ndef docker_executor(init_context: InitExecutorContext) -> Executor:\n """Executor which launches steps as Docker containers.\n\n To use the `docker_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_executor.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n registry: ...\n network: ...\n networks: ...\n container_kwargs: ...\n\n If you're using the DockerRunLauncher, configuration set on the containers created by the run\n launcher will also be set on the containers that are created for each step.\n """\n config = init_context.executor_config\n image = check.opt_str_elem(config, "image")\n registry = check.opt_dict_elem(config, "registry", key_type=str)\n env_vars = check.opt_list_elem(config, "env_vars", of_type=str)\n network = check.opt_str_elem(config, "network")\n networks = check.opt_list_elem(config, "networks", of_type=str)\n container_kwargs = check.opt_dict_elem(config, "container_kwargs", key_type=str)\n retries = check.dict_elem(config, "retries", key_type=str)\n max_concurrent = check.opt_int_elem(config, "max_concurrent")\n tag_concurrency_limits = check.opt_list_elem(config, "tag_concurrency_limits")\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network 
and not networks:\n networks = [network]\n\n container_context = DockerContainerContext(\n registry=registry,\n env_vars=env_vars or [],\n networks=networks or [],\n container_kwargs=container_kwargs,\n )\n\n return StepDelegatingExecutor(\n DockerStepHandler(image, container_context),\n retries=check.not_none(RetryMode.from_config(retries)),\n max_concurrent=max_concurrent,\n tag_concurrency_limits=tag_concurrency_limits,\n )
\n\n\nclass DockerStepHandler(StepHandler):\n def __init__(\n self,\n image: Optional[str],\n container_context: DockerContainerContext,\n ):\n super().__init__()\n\n self._image = check.opt_str_param(image, "image")\n self._container_context = check.inst_param(\n container_context, "container_context", DockerContainerContext\n )\n\n def _get_image(self, step_handler_context: StepHandlerContext):\n from . import DockerRunLauncher\n\n image = cast(\n JobPythonOrigin, step_handler_context.dagster_run.job_code_origin\n ).repository_origin.container_image\n if not image:\n image = self._image\n\n run_launcher = step_handler_context.instance.run_launcher\n\n if not image and isinstance(run_launcher, DockerRunLauncher):\n image = run_launcher.image\n\n if not image:\n raise Exception("No docker image specified by the executor config or repository")\n\n return image\n\n def _get_docker_container_context(self, step_handler_context: StepHandlerContext):\n # This doesn't vary per step: would be good to have a hook where it can be set once\n # for the whole StepHandler but we need access to the DagsterRun for that\n\n from .docker_run_launcher import DockerRunLauncher\n\n run_launcher = step_handler_context.instance.run_launcher\n run_target = DockerContainerContext.create_for_run(\n step_handler_context.dagster_run,\n run_launcher if isinstance(run_launcher, DockerRunLauncher) else None,\n )\n\n merged_container_context = run_target.merge(self._container_context)\n\n validate_docker_config(\n network=None,\n networks=merged_container_context.networks,\n container_kwargs=merged_container_context.container_kwargs,\n )\n\n return merged_container_context\n\n @property\n def name(self) -> str:\n return "DockerStepHandler"\n\n def _get_client(self, docker_container_context: DockerContainerContext):\n client = docker.client.from_env()\n if docker_container_context.registry:\n client.login(\n registry=docker_container_context.registry["url"],\n 
username=docker_container_context.registry["username"],\n password=docker_container_context.registry["password"],\n )\n return client\n\n def _get_container_name(self, execute_step_args: ExecuteStepArgs):\n run_id = execute_step_args.run_id\n step_keys_to_execute = check.not_none(execute_step_args.step_keys_to_execute)\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n step_name = f"dagster-step-{hash_str(run_id + step_key)}"\n\n if execute_step_args.known_state:\n retry_state = execute_step_args.known_state.get_retry_state()\n retry_number = retry_state.get_attempt_count(step_key)\n if retry_number:\n step_name = f"{step_name}-{retry_number}"\n\n return step_name\n\n def _create_step_container(\n self,\n client,\n container_context,\n step_image,\n step_handler_context: StepHandlerContext,\n ):\n execute_step_args = step_handler_context.execute_step_args\n step_keys_to_execute = check.not_none(execute_step_args.step_keys_to_execute)\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n env_vars["DAGSTER_RUN_JOB_NAME"] = step_handler_context.dagster_run.job_name\n env_vars["DAGSTER_RUN_STEP_KEY"] = step_key\n return client.containers.create(\n step_image,\n name=self._get_container_name(execute_step_args),\n detach=True,\n network=container_context.networks[0] if len(container_context.networks) else None,\n command=execute_step_args.get_command_args(),\n environment=env_vars,\n **container_context.container_kwargs,\n )\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n step_image = self._get_image(step_handler_context)\n 
validate_docker_image(step_image)\n\n try:\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context\n )\n except docker.errors.ImageNotFound:\n client.images.pull(step_image)\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(step_container)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n yield DagsterEvent.step_worker_starting(\n step_handler_context.get_step_context(step_key),\n message="Launching step in Docker container.",\n metadata={\n "Docker container id": step_container.id,\n },\n )\n step_container.start()\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> CheckStepHealthResult:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n container_name = self._get_container_name(step_handler_context.execute_step_args)\n\n container = client.containers.get(container_name)\n\n if container.status == "running":\n return CheckStepHealthResult.healthy()\n\n try:\n container_info = container.wait(timeout=0.1)\n except Exception as e:\n raise Exception(\n f"Container status is {container.status}. Raised exception attempting to get its"\n " return code."\n ) from e\n\n ret_code = container_info.get("StatusCode")\n if ret_code == 0:\n return CheckStepHealthResult.healthy()\n\n return CheckStepHealthResult.unhealthy(\n reason=f"Container status is {container.status}. 
Return code is {ret_code}."\n )\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert (\n len(step_keys_to_execute) == 1\n ), "Terminating multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n container_name = self._get_container_name(step_handler_context.execute_step_args)\n\n yield DagsterEvent.engine_event(\n step_handler_context.get_step_context(step_key),\n message=f"Stopping Docker container {container_name} for step.",\n event_specific_data=EngineEventData(),\n )\n\n client = self._get_client(container_context)\n\n container = client.containers.get(container_name)\n\n container.stop()\n
", "current_page_name": "_modules/dagster_docker/docker_executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.docker_executor"}, "docker_run_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.docker_run_launcher

\nfrom typing import Any, Mapping, Optional\n\nimport dagster._check as check\nimport docker\nfrom dagster._core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    ResumeRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._core.utils import parse_env_var\nfrom dagster._grpc.types import ExecuteRunArgs, ResumeRunArgs\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom typing_extensions import Self\n\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom .container_context import DockerContainerContext\n\nDOCKER_CONTAINER_ID_TAG = "docker/container_id"\n\n\n
[docs]class DockerRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs in a Docker container."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n image=None,\n registry=None,\n env_vars=None,\n network=None,\n networks=None,\n container_kwargs=None,\n ):\n self._inst_data = inst_data\n self.image = image\n self.registry = registry\n self.env_vars = env_vars\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network:\n self.networks = [network]\n elif networks:\n self.networks = networks\n else:\n self.networks = []\n\n self.container_kwargs = check.opt_dict_param(\n container_kwargs, "container_kwargs", key_type=str\n )\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return DOCKER_CONFIG_SCHEMA\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DockerRunLauncher(inst_data=inst_data, **config_value)\n\n def get_container_context(self, dagster_run: DagsterRun) -> DockerContainerContext:\n return DockerContainerContext.create_for_run(dagster_run, self)\n\n def _get_client(self, container_context: DockerContainerContext):\n client = docker.client.from_env()\n if container_context.registry:\n client.login(\n registry=container_context.registry["url"],\n username=container_context.registry["username"],\n password=container_context.registry["password"],\n )\n return client\n\n def _get_docker_image(self, job_code_origin):\n docker_image = job_code_origin.repository_origin.container_image\n\n if not docker_image:\n docker_image = self.image\n\n if not docker_image:\n raise Exception("No docker image specified by the instance config or repository")\n\n validate_docker_image(docker_image)\n return docker_image\n\n def _launch_container_with_command(self, run, docker_image, command):\n container_context = self.get_container_context(run)\n docker_env = 
dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n docker_env["DAGSTER_RUN_JOB_NAME"] = run.job_name\n\n client = self._get_client(container_context)\n\n try:\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n except docker.errors.ImageNotFound:\n client.images.pull(docker_image)\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n self._instance.report_engine_event(\n message=f"Launching run in a new container {container.id} with image {docker_image}",\n dagster_run=run,\n cls=self.__class__,\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_CONTAINER_ID_TAG: container.id, DOCKER_IMAGE_TAG: docker_image},\n )\n\n container.start()\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n job_code_origin = check.not_none(context.job_code_origin)\n docker_image = self._get_docker_image(job_code_origin)\n\n command = ExecuteRunArgs(\n job_origin=job_code_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.dagster_run\n job_code_origin = check.not_none(context.job_code_origin)\n docker_image = self._get_docker_image(job_code_origin)\n\n command = ResumeRunArgs(\n 
job_origin=job_code_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n def _get_container(self, run):\n if not run or run.is_finished:\n return None\n\n container_id = run.tags.get(DOCKER_CONTAINER_ID_TAG)\n\n if not container_id:\n return None\n\n container_context = self.get_container_context(run)\n\n try:\n return self._get_client(container_context).containers.get(container_id)\n except Exception:\n return None\n\n def terminate(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n container = self._get_container(run)\n\n if not container:\n self._instance.report_engine_event(\n message="Unable to get docker container to send termination request to.",\n dagster_run=run,\n cls=self.__class__,\n )\n return False\n\n container.stop()\n\n return True\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: DagsterRun):\n container = self._get_container(run)\n if container is None:\n return CheckRunHealthResult(WorkerStatus.NOT_FOUND)\n if container.status == "running":\n return CheckRunHealthResult(WorkerStatus.RUNNING)\n return CheckRunHealthResult(\n WorkerStatus.FAILED, msg=f"Container status is {container.status}"\n )
\n
", "current_page_name": "_modules/dagster_docker/docker_run_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.docker_run_launcher"}, "ops": {"docker_container_op": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.ops.docker_container_op

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport docker\nfrom dagster import Field, In, Nothing, OpExecutionContext, StringSource, op\nfrom dagster._annotations import experimental\nfrom dagster._core.utils import parse_env_var\nfrom dagster._serdes.utils import hash_str\n\nfrom ..container_context import DockerContainerContext\nfrom ..docker_run_launcher import DockerRunLauncher\nfrom ..utils import DOCKER_CONFIG_SCHEMA, validate_docker_image\n\nDOCKER_CONTAINER_OP_CONFIG = {\n    **DOCKER_CONFIG_SCHEMA,\n    "image": Field(\n        StringSource,\n        is_required=True,\n        description="The image in which to run the Docker container.",\n    ),\n    "entrypoint": Field(\n        [str],\n        is_required=False,\n        description="The ENTRYPOINT for the Docker container",\n    ),\n    "command": Field(\n        [str],\n        is_required=False,\n        description="The command to run in the container within the launched Docker container.",\n    ),\n}\n\n\ndef _get_client(docker_container_context: DockerContainerContext):\n    client = docker.client.from_env()\n    if docker_container_context.registry:\n        client.login(\n            registry=docker_container_context.registry["url"],\n            username=docker_container_context.registry["username"],\n            password=docker_container_context.registry["password"],\n        )\n    return client\n\n\ndef _get_container_name(run_id, op_name, retry_number):\n    container_name = hash_str(run_id + op_name)\n\n    if retry_number > 0:\n        container_name = f"{container_name}-{retry_number}"\n\n    return container_name\n\n\ndef _create_container(\n    op_context: OpExecutionContext,\n    client,\n    container_context: DockerContainerContext,\n    image: str,\n    entrypoint: Optional[Sequence[str]],\n    command: Optional[Sequence[str]],\n):\n    env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n    return client.containers.create(\n        
image,\n        name=_get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),\n        detach=True,\n        network=container_context.networks[0] if len(container_context.networks) else None,\n        entrypoint=entrypoint,\n        command=command,\n        environment=env_vars,\n        **container_context.container_kwargs,\n    )\n\n\n
[docs]@experimental\ndef execute_docker_container(\n context: OpExecutionContext,\n image: str,\n entrypoint: Optional[Sequence[str]] = None,\n command: Optional[Sequence[str]] = None,\n networks: Optional[Sequence[str]] = None,\n registry: Optional[Mapping[str, str]] = None,\n env_vars: Optional[Sequence[str]] = None,\n container_kwargs: Optional[Mapping[str, Any]] = None,\n):\n """This function is a utility for executing a Docker container from within a Dagster op.\n\n Args:\n image (str): The image to use for the launched Docker container.\n entrypoint (Optional[Sequence[str]]): The ENTRYPOINT to run in the launched Docker\n container. Default: None.\n command (Optional[Sequence[str]]): The CMD to run in the launched Docker container.\n Default: None.\n networks (Optional[Sequence[str]]): Names of the Docker networks to which to connect the\n launched container. Default: None.\n registry: (Optional[Mapping[str, str]]): Information for using a non local/public Docker\n registry. Can have "url", "username", or "password" keys.\n env_vars (Optional[Sequence[str]]): List of environemnt variables to include in the launched\n container. ach can be of the form KEY=VALUE or just KEY (in which case the value will be\n pulled from the calling environment.\n container_kwargs (Optional[Dict[str[Any]]]): key-value pairs that can be passed into\n containers.create in the Docker Python API. 
See\n https://docker-py.readthedocs.io/en/stable/containers.html for the full list\n of available options.\n """\n run_container_context = DockerContainerContext.create_for_run(\n context.dagster_run,\n (\n context.instance.run_launcher\n if isinstance(context.instance.run_launcher, DockerRunLauncher)\n else None\n ),\n )\n\n validate_docker_image(image)\n\n op_container_context = DockerContainerContext(\n registry=registry, env_vars=env_vars, networks=networks, container_kwargs=container_kwargs\n )\n\n container_context = run_container_context.merge(op_container_context)\n\n client = _get_client(container_context)\n\n try:\n container = _create_container(\n context, client, container_context, image, entrypoint, command\n )\n except docker.errors.ImageNotFound:\n client.images.pull(image)\n container = _create_container(\n context, client, container_context, image, entrypoint, command\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n container.start()\n\n for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):\n print(line) # noqa: T201\n\n exit_status = container.wait()["StatusCode"]\n\n if exit_status != 0:\n raise Exception(f"Docker container returned exit code {exit_status}")
\n\n\n
[docs]@op(ins={"start_after": In(Nothing)}, config_schema=DOCKER_CONTAINER_OP_CONFIG)\n@experimental\ndef docker_container_op(context):\n """An op that runs a Docker container using the docker Python API.\n\n Contrast with the `docker_executor`, which runs each Dagster op in a Dagster job in its\n own Docker container.\n\n This op may be useful when:\n - You need to orchestrate a command that isn't a Dagster op (or isn't written in Python)\n - You want to run the rest of a Dagster job using a specific executor, and only a single\n op in docker.\n\n For example:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_docker_container_op.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n You can create your own op with the same implementation by calling the `execute_docker_container` function\n inside your own op.\n """\n execute_docker_container(context, **context.op_config)
\n
", "current_page_name": "_modules/dagster_docker/ops/docker_container_op", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.ops.docker_container_op"}}, "pipes": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.pipes

\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Mapping, Optional, Sequence, Union\n\nimport docker\nfrom dagster import (\n    OpExecutionContext,\n    ResourceParam,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom dagster._core.pipes.client import (\n    PipesClient,\n    PipesClientCompletedInvocation,\n    PipesContextInjector,\n    PipesMessageReader,\n)\nfrom dagster._core.pipes.context import (\n    PipesMessageHandler,\n)\nfrom dagster._core.pipes.utils import (\n    PipesEnvContextInjector,\n    extract_message_or_forward_to_stdout,\n    open_pipes_session,\n)\nfrom dagster_pipes import (\n    DagsterPipesError,\n    PipesDefaultMessageWriter,\n    PipesExtras,\n    PipesParams,\n)\n\n\n
[docs]@experimental\nclass PipesDockerLogsMessageReader(PipesMessageReader):\n @contextmanager\n def read_messages(\n self,\n handler: PipesMessageHandler,\n ) -> Iterator[PipesParams]:\n self._handler = handler\n try:\n yield {PipesDefaultMessageWriter.STDIO_KEY: PipesDefaultMessageWriter.STDERR}\n finally:\n self._handler = None\n\n def consume_docker_logs(self, container) -> None:\n handler = check.not_none(\n self._handler, "Can only consume logs within context manager scope."\n )\n for log_line in container.logs(stdout=True, stderr=True, stream=True, follow=True):\n if isinstance(log_line, bytes):\n log_entry = log_line.decode("utf-8")\n elif isinstance(log_line, str):\n log_entry = log_line\n else:\n continue\n\n extract_message_or_forward_to_stdout(handler, log_entry)\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to read messages by extracting them from docker logs directly."
\n\n\n@experimental\nclass _PipesDockerClient(PipesClient):\n """A pipes client that runs external processes in docker containers.\n\n By default context is injected via environment variables and messages are parsed out of the\n log stream, with other logs forwarded to stdout of the orchestration process.\n\n Args:\n env (Optional[Mapping[str, str]]): An optional dict of environment variables to pass to the\n container.\n register (Optional[Mapping[str, str]]): An optional dict of registry credentials to login to\n the docker client.\n context_injector (Optional[PipesContextInjector]): A context injector to use to inject\n context into the docker container process. Defaults to :py:class:`PipesEnvContextInjector`.\n message_reader (Optional[PipesContextInjector]): A message reader to use to read messages\n from the docker container process. Defaults to :py:class:`DockerLogsMessageReader`.\n """\n\n def __init__(\n self,\n env: Optional[Mapping[str, str]] = None,\n registry: Optional[Mapping[str, str]] = None,\n context_injector: Optional[PipesContextInjector] = None,\n message_reader: Optional[PipesMessageReader] = None,\n ):\n self.env = check.opt_mapping_param(env, "env", key_type=str, value_type=str)\n self.registry = check.opt_mapping_param(registry, "registry", key_type=str, value_type=str)\n self.context_injector = (\n check.opt_inst_param(\n context_injector,\n "context_injector",\n PipesContextInjector,\n )\n or PipesEnvContextInjector()\n )\n\n self.message_reader = (\n check.opt_inst_param(message_reader, "message_reader", PipesMessageReader)\n or PipesDockerLogsMessageReader()\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def run(\n self,\n *,\n context: OpExecutionContext,\n image: str,\n extras: Optional[PipesExtras] = None,\n command: Optional[Union[str, Sequence[str]]] = None,\n env: Optional[Mapping[str, str]] = None,\n registry: Optional[Mapping[str, str]] = None,\n container_kwargs: Optional[Mapping[str, Any]] = 
None,\n ) -> PipesClientCompletedInvocation:\n """Create a docker container and run it to completion, enriched with the pipes protocol.\n\n Args:\n image (str):\n The image for the container to use.\n command (Optional[Union[str, Sequence[str]]]):\n The command for the container use.\n env (Optional[Mapping[str,str]]):\n A mapping of environment variable names to values to set on the first\n container in the pod spec, on top of those configured on resource.\n registry (Optional[Mapping[str, str]]:\n A mapping containing url, username, and password to be used\n with docker client login.\n container_kwargs (Optional[Mapping[str, Any]]:\n Arguments to be forwarded to docker client containers.create.\n extras (Optional[PipesExtras]):\n Extra values to pass along as part of the ext protocol.\n context_injector (Optional[PipesContextInjector]):\n Override the default ext protocol context injection.\n message_reader (Optional[PipesMessageReader]):\n Override the default ext protocol message reader.\n\n Returns:\n PipesClientCompletedInvocation: Wrapper containing results reported by the external\n process.\n """\n with open_pipes_session(\n context=context,\n context_injector=self.context_injector,\n message_reader=self.message_reader,\n extras=extras,\n ) as pipes_session:\n client = docker.client.from_env()\n registry = registry or self.registry\n if registry:\n client.login(\n registry=registry["url"],\n username=registry["username"],\n password=registry["password"],\n )\n\n try:\n container = self._create_container(\n client=client,\n image=image,\n command=command,\n env=env,\n open_pipes_session_env=pipes_session.get_bootstrap_env_vars(),\n container_kwargs=container_kwargs,\n )\n except docker.errors.ImageNotFound:\n client.images.pull(image)\n container = self._create_container(\n client=client,\n image=image,\n command=command,\n env=env,\n open_pipes_session_env=pipes_session.get_bootstrap_env_vars(),\n container_kwargs=container_kwargs,\n )\n\n result = 
container.start()\n try:\n if isinstance(self.message_reader, PipesDockerLogsMessageReader):\n self.message_reader.consume_docker_logs(container)\n\n result = container.wait()\n if result["StatusCode"] != 0:\n raise DagsterPipesError(f"Container exited with non-zero status code: {result}")\n finally:\n container.stop()\n return PipesClientCompletedInvocation(tuple(pipes_session.get_results()))\n\n def _create_container(\n self,\n client,\n image: str,\n command: Optional[Union[str, Sequence[str]]],\n env: Optional[Mapping[str, str]],\n container_kwargs: Optional[Mapping[str, Any]],\n open_pipes_session_env: Mapping[str, str],\n ):\n kwargs = dict(container_kwargs or {})\n kwargs_env = kwargs.pop("environment", {})\n return client.containers.create(\n image=image,\n command=command,\n detach=True,\n environment={\n **open_pipes_session_env,\n **(self.env or {}),\n **(env or {}),\n **kwargs_env,\n },\n **kwargs,\n )\n\n\nPipesDockerClient = ResourceParam[_PipesDockerClient]\n
", "current_page_name": "_modules/dagster_docker/pipes", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.pipes"}}, "dagster_duckdb": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb.io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Optional, Sequence, Type, cast\n\nimport duckdb\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._config.pythonic_config import ConfigurableIOManagerFactory\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._utils.backoff import backoff\nfrom pydantic import Field\n\nDUCKDB_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_duckdb_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an IO manager definition that reads inputs from and writes outputs to DuckDB.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n DuckDB tables and an in-memory type - e.g. a Pandas DataFrame. If only\n one DbTypeHandler is provided, it will be used as teh default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb import build_duckdb_io_manager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n duckdb_io_manager = build_duckdb_io_manager([DuckDBPandasTypeHandler()])\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key. For ops, the schema can be\n specified by including a "schema" entry in output metadata. If none of these is provided, the schema will\n default to "public".\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame):\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=DuckDBIOManager.to_config_schema())\n def duckdb_io_manager(init_context):\n """IO Manager for storing outputs in a DuckDB database.\n\n Assets will be stored in the schema and table name specified by their AssetKey.\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n Op outputs will be stored in the schema specified by output metadata (defaults to public) in a\n table of the name of the output.\n """\n return DbIOManager(\n type_handlers=type_handlers,\n db_client=DuckDbClient(),\n io_manager_name="DuckDBIOManager",\n database=init_context.resource_config["database"],\n schema=init_context.resource_config.get("schema"),\n default_load_type=default_load_type,\n )\n\n return duckdb_io_manager
\n\n\n
[docs]class DuckDBIOManager(ConfigurableIOManagerFactory):\n """Base class for an IO manager definition that reads inputs from and writes outputs to DuckDB.\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If none\n of these is provided, the schema will default to "public".\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame):\n # my_table will just contain the data from column "a"\n ...\n\n Set DuckDB configuration options using the config field. See\n https://duckdb.org/docs/sql/configuration.html for all available settings.\n\n .. 
code-block:: python\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb",\n config={"arrow_large_buffer_size": True})}\n )\n\n """\n\n database: str = Field(description="Path to the DuckDB database.")\n config: Dict[str, Any] = Field(description="DuckDB configuration options.", default={})\n schema_: Optional[str] = Field(\n default=None, alias="schema", description="Name of the schema to use."\n ) # schema is a reserved word for pydantic\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]: ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return None\n\n def create_io_manager(self, context) -> DbIOManager:\n return DbIOManager(\n db_client=DuckDbClient(),\n database=self.database,\n schema=self.schema_,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n io_manager_name="DuckDBIOManager",\n )
\n\n\nclass DuckDbClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.execute(_get_cleanup_statement(table_slice))\n except duckdb.CatalogException:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n connection.execute(f"create schema if not exists {table_slice.schema};")\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = f"SELECT {col_str} FROM {table_slice.schema}.{table_slice.table} WHERE\\n"\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM {table_slice.schema}.{table_slice.table}"""\n\n @staticmethod\n @contextmanager\n def connect(context, _):\n conn = backoff(\n fn=duckdb.connect,\n retry_on=(RuntimeError, duckdb.IOException),\n kwargs={\n "database": context.resource_config["database"],\n "read_only": False,\n "config": context.resource_config["config"],\n },\n max_retries=10,\n )\n\n yield conn\n\n conn.close()\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = f"DELETE FROM {table_slice.schema}.{table_slice.table} WHERE\\n"\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"DELETE FROM {table_slice.schema}.{table_slice.table}"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if 
isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(DUCKDB_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(DUCKDB_DATETIME_FORMAT)\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_duckdb/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb.io_manager"}, "resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb.resource

\nfrom contextlib import contextmanager\nfrom typing import Any, Dict\n\nimport duckdb\nfrom dagster import ConfigurableResource\nfrom dagster._utils.backoff import backoff\nfrom pydantic import Field\n\n\n
[docs]class DuckDBResource(ConfigurableResource):\n """Resource for interacting with a DuckDB database.\n\n Examples:\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_duckdb import DuckDBResource\n\n @asset\n def my_table(duckdb: DuckDBResource):\n with duckdb.get_connection() as conn:\n conn.execute("SELECT * from MY_SCHEMA.MY_TABLE")\n\n defs = Definitions(\n assets=[my_table],\n resources={"duckdb": DuckDBResource(database="path/to/db.duckdb")}\n )\n\n """\n\n database: str = Field(\n description=(\n "Path to the DuckDB database. Setting database=':memory:' will use an in-memory"\n " database "\n )\n )\n config: Dict[str, Any] = Field(description="DuckDB configuration options.", default={})\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def get_connection(self):\n conn = backoff(\n fn=duckdb.connect,\n retry_on=(RuntimeError, duckdb.IOException),\n kwargs={"database": self.database, "read_only": False, "config": self.config},\n max_retries=10,\n )\n\n yield conn\n\n conn.close()
\n
", "current_page_name": "_modules/dagster_duckdb/resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb.resource"}}, "dagster_duckdb_pandas": {"duckdb_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_pandas.duckdb_pandas_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pandas as pd\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import (\n    DuckDbClient,\n    DuckDBIOManager,\n    build_duckdb_io_manager,\n)\n\n\n
[docs]class DuckDBPandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Stores and loads Pandas DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ):\n """Stores the pandas DataFrame in duckdb."""\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " obj;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from obj"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) # type: ignore # (bad stubs)\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n """Loads the input as a Pandas DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n return connection.execute(DuckDbClient.get_select_statement(table_slice)).fetchdf()\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nduckdb_pandas_io_manager = build_duckdb_io_manager(\n [DuckDBPandasTypeHandler()], default_load_type=pd.DataFrame\n)\nduckdb_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\nusing the duckdb_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_pandas import duckdb_pandas_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_pandas_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPandasIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\n using the DuckDBPandasIOManager, any inputs and outputs without type annotations will be loaded\n as Pandas DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_pandas import DuckDBPandasIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPandasIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_pandas/duckdb_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_pandas.duckdb_pandas_type_handler"}}, "dagster_duckdb_polars": {"duckdb_polars_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_polars.duckdb_polars_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport polars as pl\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import DuckDbClient, DuckDBIOManager, build_duckdb_io_manager\n\n\n
[docs]class DuckDBPolarsTypeHandler(DbTypeHandler[pl.DataFrame]):\n """Stores and loads Polars DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_polars import DuckDBPolarsTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPolarsTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pl.DataFrame, connection\n ):\n """Stores the polars DataFrame in duckdb."""\n obj_arrow = obj.to_arrow() # noqa: F841 # need obj_arrow symbol to exist for duckdb query\n connection.execute(f"create schema if not exists {table_slice.schema};")\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " obj_arrow;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from obj_arrow"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype))\n for name, dtype in zip(obj.columns, obj.dtypes)\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pl.DataFrame:\n """Loads the input as a Polars DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pl.DataFrame()\n select_statement = connection.execute(\n DuckDbClient.get_select_statement(table_slice=table_slice)\n )\n duckdb_to_arrow = select_statement.arrow()\n return pl.DataFrame(duckdb_to_arrow)\n\n @property\n def supported_types(self):\n return [pl.DataFrame]
\n\n\nduckdb_polars_io_manager = build_duckdb_io_manager(\n [DuckDBPolarsTypeHandler()], default_load_type=pl.DataFrame\n)\nduckdb_polars_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes polars dataframes to DuckDB. When\nusing the duckdb_polars_io_manager, any inputs and outputs without type annotations will be loaded\nas Polars DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_polars import duckdb_polars_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_polars_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pl.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPolarsIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes Polars DataFrames to DuckDB. When\n using the DuckDBPolarsIOManager, any inputs and outputs without type annotations will be loaded\n as Polars DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_polars import DuckDBPolarsIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPolarsIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pl.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPolarsTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pl.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_polars/duckdb_polars_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_polars.duckdb_polars_type_handler"}}, "dagster_duckdb_pyspark": {"duckdb_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_pyspark.duckdb_pyspark_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pyarrow as pa\nimport pyspark\nimport pyspark.sql\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import (\n    DuckDbClient,\n    DuckDBIOManager,\n    build_duckdb_io_manager,\n)\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.types import StructType\n\n\ndef pyspark_df_to_arrow_table(df: pyspark.sql.DataFrame) -> pa.Table:\n    """Converts a PySpark DataFrame to a PyArrow Table."""\n    # `_collect_as_arrow` API call sourced from:\n    #   https://stackoverflow.com/questions/73203318/how-to-transform-spark-dataframe-to-polars-dataframe\n    return pa.Table.from_batches(df._collect_as_arrow())  # noqa: SLF001\n\n\n
[docs]class DuckDBPySparkTypeHandler(DbTypeHandler[pyspark.sql.DataFrame]):\n """Stores PySpark DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pyspark import DuckDBPySparkTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n """\n\n def handle_output(\n self,\n context: OutputContext,\n table_slice: TableSlice,\n obj: pyspark.sql.DataFrame,\n connection,\n ):\n """Stores the given object at the provided filepath."""\n pa_df = pyspark_df_to_arrow_table(obj) # noqa: F841\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " pa_df;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from pa_df;"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.count(),\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) for name, dtype in obj.dtypes\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pyspark.sql.DataFrame:\n """Loads the return of the query as the correct type."""\n spark = SparkSession.builder.getOrCreate() # type: ignore\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n pd_df = connection.execute(DuckDbClient.get_select_statement(table_slice)).fetchdf()\n return spark.createDataFrame(pd_df)\n\n @property\n def supported_types(self):\n return [pyspark.sql.DataFrame]
\n\n\nduckdb_pyspark_io_manager = build_duckdb_io_manager(\n [DuckDBPySparkTypeHandler()], default_load_type=pyspark.sql.DataFrame\n)\nduckdb_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\nusing the duckdb_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_pyspark import duckdb_pyspark_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pyspark.sql.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPySparkIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\n using the DuckDBPySparkIOManager, any inputs and outputs without type annotations will be loaded\n as PySpark DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_pyspark import DuckDBPySparkIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pyspark.sql.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pyspark.sql.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_pyspark/duckdb_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_pyspark.duckdb_pyspark_type_handler"}}, "dagster_embedded_elt": {"sling": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_embedded_elt.sling.asset_defs

\nimport re\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom dagster import (\n    AssetExecutionContext,\n    AssetsDefinition,\n    AssetSpec,\n    MaterializeResult,\n    multi_asset,\n)\nfrom dagster._annotations import experimental\n\nfrom dagster_embedded_elt.sling.resources import SlingMode, SlingResource\n\n\n
[docs]@experimental\ndef build_sling_asset(\n asset_spec: AssetSpec,\n source_stream: str,\n target_object: str,\n mode: SlingMode = SlingMode.FULL_REFRESH,\n primary_key: Optional[Union[str, List[str]]] = None,\n update_key: Optional[Union[str, List[str]]] = None,\n source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n sling_resource_key: str = "sling",\n) -> AssetsDefinition:\n """Asset Factory for using Sling to sync data from a source stream to a target object.\n\n Args:\n asset_spec (AssetSpec): The AssetSpec to use to materialize this asset.\n source_stream (str): The source stream to sync from. This can be a table, a query, or a path.\n target_object (str): The target object to sync to. This can be a table, or a path.\n mode (SlingMode, optional): The sync mode to use when syncing. Defaults to SlingMode.FULL_REFRESH.\n primary_key (Optional[Union[str, List[str]]], optional): The optional primary key to use when syncing.\n update_key (Optional[Union[str, List[str]]], optional): The optional update key to use when syncing.\n source_options (Optional[Dict[str, Any]], optional): Any optional Sling source options to use when syncing.\n target_options (Optional[Dict[str, Any]], optional): Any optional target options to use when syncing.\n sling_resource_key (str, optional): The resource key for the SlingResource. Defaults to "sling".\n\n Examples:\n Creating a Sling asset that syncs from a file to a table:\n\n .. code-block:: python\n\n asset_spec = AssetSpec(key=["main", "dest_tbl"])\n asset_def = build_sling_asset(\n asset_spec=asset_spec,\n source_stream="file:///tmp/test.csv",\n target_object="main.dest_table",\n mode=SlingMode.INCREMENTAL,\n primary_key="id"\n )\n\n Creating a Sling asset that syncs from a table to a file with a full refresh:\n\n .. 
code-block:: python\n\n asset_spec = AssetSpec(key="test.csv")\n asset_def = build_sling_asset(\n asset_spec=asset_spec,\n source_stream="main.dest_table",\n target_object="file:///tmp/test.csv",\n mode=SlingMode.FULL_REFRESH\n )\n\n\n """\n if primary_key is not None and not isinstance(primary_key, list):\n primary_key = [primary_key]\n\n if update_key is not None and not isinstance(update_key, list):\n update_key = [update_key]\n\n @multi_asset(\n compute_kind="sling", specs=[asset_spec], required_resource_keys={sling_resource_key}\n )\n def sync(context: AssetExecutionContext) -> MaterializeResult:\n sling: SlingResource = getattr(context.resources, sling_resource_key)\n last_row_count_observed = None\n for stdout_line in sling.sync(\n source_stream=source_stream,\n target_object=target_object,\n mode=mode,\n primary_key=primary_key,\n update_key=update_key,\n source_options=source_options,\n target_options=target_options,\n ):\n match = re.search(r"(\\d+) rows", stdout_line)\n if match:\n last_row_count_observed = int(match.group(1))\n context.log.info(stdout_line)\n\n return MaterializeResult(\n metadata=(\n {} if last_row_count_observed is None else {"row_count": last_row_count_observed}\n )\n )\n\n return sync
\n
", "current_page_name": "_modules/dagster_embedded_elt/sling/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_embedded_elt.sling.asset_defs"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_embedded_elt.sling.resources

\nimport contextlib\nimport json\nimport re\nfrom enum import Enum\nfrom subprocess import PIPE, STDOUT, Popen\nfrom typing import Any, Dict, Generator, List, Optional\n\nfrom dagster import ConfigurableResource, PermissiveConfig, get_dagster_logger\nfrom dagster._annotations import experimental\nfrom dagster._utils.env import environ\nfrom pydantic import Field\nfrom sling import Sling\n\nlogger = get_dagster_logger()\n\n\nclass SlingMode(str, Enum):\n    """The mode to use when syncing.\n\n    See the Sling docs for more information: https://docs.slingdata.io/sling-cli/running-tasks#modes.\n    """\n\n    INCREMENTAL = "incremental"\n    TRUNCATE = "truncate"\n    FULL_REFRESH = "full-refresh"\n    SNAPSHOT = "snapshot"\n\n\n
[docs]class SlingSourceConnection(PermissiveConfig):\n """A Sling Source Connection defines the source connection used by :py:class:`~dagster_elt.sling.SlingResource`.\n\n Examples:\n Creating a Sling Source for a file, such as CSV or JSON:\n\n .. code-block:: python\n\n source = SlingSourceConnection(type="file")\n\n Create a Sling Source for a Postgres database, using a connection string:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n source = SlingSourceConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema")\n\n Create a Sling Source for a Postgres database, using keyword arguments, as described here:\n https://docs.slingdata.io/connections/database-connections/postgres\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n\n """\n\n type: str = Field(description="Type of the source connection. Use 'file' for local storage.")\n connection_string: Optional[str] = Field(\n description="The connection string for the source database."\n )
\n\n\n
[docs]class SlingTargetConnection(PermissiveConfig):\n """A Sling Target Connection defines the target connection used by :py:class:`~dagster_elt.sling.SlingResource`.\n\n Examples:\n Creating a Sling Target for a file, such as CSV or JSON:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="file")\n\n Create a Sling Source for a Postgres database, using a connection string:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema"\n source = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n\n Create a Sling Source for a Postgres database, using keyword arguments, as described here:\n https://docs.slingdata.io/connections/database-connections/postgres\n\n .. code-block::python\n\n source = SlingTargetConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n\n\n """\n\n type: str = Field(\n description="Type of the destination connection. Use 'file' for local storage."\n )\n connection_string: Optional[str] = Field(\n description="The connection string for the target database."\n )
\n\n\n
[docs]@experimental\nclass SlingResource(ConfigurableResource):\n """Resource for interacting with the Sling package.\n\n Examples:\n .. code-block:: python\n\n from dagster_etl.sling import SlingResource\n sling_resource = SlingResource(\n source_connection=SlingSourceConnection(\n type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING")\n ),\n target_connection=SlingTargetConnection(\n type="snowflake",\n host="host",\n user="user",\n database="database",\n password="password",\n role="role",\n ),\n )\n\n """\n\n source_connection: SlingSourceConnection\n target_connection: SlingTargetConnection\n\n @contextlib.contextmanager\n def _setup_config(self) -> Generator[None, None, None]:\n """Uses environment variables to set the Sling source and target connections."""\n sling_source = self.source_connection.dict()\n sling_target = self.target_connection.dict()\n if self.source_connection.connection_string:\n sling_source["url"] = self.source_connection.connection_string\n if self.target_connection.connection_string:\n sling_target["url"] = self.target_connection.connection_string\n with environ(\n {\n "SLING_SOURCE": json.dumps(sling_source),\n "SLING_TARGET": json.dumps(sling_target),\n }\n ):\n yield\n\n @staticmethod\n def _exec_sling_cmd(cmd, stdin=None, stdout=PIPE, stderr=STDOUT) -> Generator[str, None, None]:\n ansi_escape = re.compile(r"\\x1B(?:[@-Z\\\\-_]|\\[[0-?]*[ -/]*[@-~])")\n with Popen(cmd, shell=True, stdin=stdin, stdout=stdout, stderr=stderr) as proc:\n assert proc.stdout\n\n for line in proc.stdout:\n fmt_line = str(line, "utf-8")\n clean_line = ansi_escape.sub("", fmt_line).replace("INF", "")\n yield clean_line\n\n proc.wait()\n if proc.returncode != 0:\n raise Exception("Sling command failed with error code %s", proc.returncode)\n\n def _sync(\n self,\n source_stream: str,\n target_object: str,\n mode: SlingMode = SlingMode.FULL_REFRESH,\n primary_key: Optional[List[str]] = None,\n update_key: Optional[List[str]] = None,\n 
source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n ) -> Generator[str, None, None]:\n """Runs a Sling sync from the given source table to the given destination table. Generates\n output lines from the Sling CLI.\n """\n if self.source_connection.type == "file" and not source_stream.startswith("file://"):\n source_stream = "file://" + source_stream\n\n if self.target_connection.type == "file" and not target_object.startswith("file://"):\n target_object = "file://" + target_object\n\n with self._setup_config():\n config = {\n "source": {\n "conn": "SLING_SOURCE",\n "stream": source_stream,\n "primary_key": primary_key,\n "update_key": update_key,\n "options": source_options,\n },\n "target": {\n "conn": "SLING_TARGET",\n "object": target_object,\n "options": target_options,\n },\n }\n config["source"] = {k: v for k, v in config["source"].items() if v is not None}\n config["target"] = {k: v for k, v in config["target"].items() if v is not None}\n\n sling_cli = Sling(**config)\n logger.info("Starting Sling sync with mode: %s", mode)\n cmd = sling_cli._prep_cmd() # noqa: SLF001\n\n yield from self._exec_sling_cmd(cmd)\n\n def sync(\n self,\n source_stream: str,\n target_object: str,\n mode: SlingMode,\n primary_key: Optional[List[str]] = None,\n update_key: Optional[List[str]] = None,\n source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n ) -> Generator[str, None, None]:\n """Initiate a Sling Sync between a source stream and a target object.\n\n Args:\n source_stream (str): The source stream to read from. For database sources, the source stream can be either\n a table name, a SQL statement or a path to a SQL file e.g. `TABLE1` or `SCHEMA1.TABLE2` or\n `SELECT * FROM TABLE`. For file sources, the source stream is a path or an url to a file.\n For file targets, the target object is a path or a url to a file, e.g. 
file:///tmp/file.csv or\n s3://my_bucket/my_folder/file.csv\n target_object (str): The target object to write into. For database targets, the target object is a table\n name, e.g. TABLE1, SCHEMA1.TABLE2. For file targets, the target object is a path or an url to a file.\n mode (SlingMode): The Sling mode to use when syncing, i.e. incremental, full-refresh\n See the Sling docs for more information: https://docs.slingdata.io/sling-cli/running-tasks#modes.\n primary_key (str): For incremental syncs, a primary key is used during merge statements to update\n existing rows.\n update_key (str): For incremental syncs, an update key is used to stream records after max(update_key)\n source_options (Dict[str, Any]): Other source options to pass to Sling,\n see https://docs.slingdata.io/sling-cli/running-tasks#source-options-src-options-flag-source.options-key\n for details\n target_options (Dict[str, Any[): Other target options to pass to Sling,\n see https://docs.slingdata.io/sling-cli/running-tasks#target-options-tgt-options-flag-target.options-key\n for details\n\n Examples:\n Sync from a source file to a sqlite database:\n\n .. 
code-block:: python\n\n sqllite_path = "/path/to/sqlite.db"\n csv_path = "/path/to/file.csv"\n\n @asset\n def run_sync(context, sling: SlingResource):\n res = sling.sync(\n source_stream=csv_path,\n target_object="events",\n mode=SlingMode.FULL_REFRESH,\n )\n for stdout in res:\n context.log.debug(stdout)\n counts = sqlite3.connect(sqllitepath).execute("SELECT count(1) FROM events").fetchone()\n assert counts[0] == 3\n\n source = SlingSourceConnection(\n type="file",\n )\n target = SlingTargetConnection(type="sqlite", instance=sqllitepath)\n\n materialize(\n [run_sync],\n resources={\n "sling": SlingResource(\n source_connection=source,\n target_connection=target,\n mode=SlingMode.TRUNCATE,\n )\n },\n )\n\n """\n yield from self._sync(\n source_stream=source_stream,\n target_object=target_object,\n mode=mode,\n primary_key=primary_key,\n update_key=update_key,\n source_options=source_options,\n target_options=target_options,\n )
\n
", "current_page_name": "_modules/dagster_embedded_elt/sling/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_embedded_elt.sling.resources"}}}, "dagster_fivetran": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.asset_defs

\nimport hashlib\nimport inspect\nimport re\nfrom functools import partial\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nfrom dagster import (\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    Nothing,\n    OpExecutionContext,\n    Output,\n    _check as check,\n    multi_asset,\n)\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.events import CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.errors import DagsterStepOutputNotFoundError\nfrom dagster._core.execution.context.init import build_init_resource_context\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL, FivetranResource\nfrom dagster_fivetran.utils import (\n    generate_materializations,\n    get_fivetran_connector_url,\n    metadata_for_table,\n)\n\n\ndef _build_fivetran_assets(\n    connector_id: str,\n    destination_tables: Sequence[str],\n    poll_interval: float = DEFAULT_POLL_INTERVAL,\n    poll_timeout: Optional[float] = None,\n    io_manager_key: Optional[str] = None,\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    metadata_by_table_name: Optional[Mapping[str, MetadataUserInput]] = None,\n    table_to_asset_key_map: Optional[Mapping[str, AssetKey]] = None,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n    group_name: Optional[str] = None,\n    infer_missing_tables: bool = False,\n    op_tags: Optional[Mapping[str, Any]] = None,\n) -> Sequence[AssetsDefinition]:\n    asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    tracked_asset_keys = {\n        table: AssetKey([*asset_key_prefix, *table.split(".")]) for table in 
destination_tables\n    }\n    user_facing_asset_keys = table_to_asset_key_map or tracked_asset_keys\n\n    _metadata_by_table_name = check.opt_mapping_param(\n        metadata_by_table_name, "metadata_by_table_name", key_type=str\n    )\n\n    @multi_asset(\n        name=f"fivetran_sync_{connector_id}",\n        outs={\n            "_".join(key.path): AssetOut(\n                io_manager_key=io_manager_key,\n                key=user_facing_asset_keys[table],\n                metadata=_metadata_by_table_name.get(table),\n                dagster_type=Nothing,\n            )\n            for table, key in tracked_asset_keys.items()\n        },\n        compute_kind="fivetran",\n        resource_defs=resource_defs,\n        group_name=group_name,\n        op_tags=op_tags,\n    )\n    def _assets(context: OpExecutionContext, fivetran: FivetranResource) -> Any:\n        fivetran_output = fivetran.sync_and_poll(\n            connector_id=connector_id,\n            poll_interval=poll_interval,\n            poll_timeout=poll_timeout,\n        )\n\n        materialized_asset_keys = set()\n        for materialization in generate_materializations(\n            fivetran_output, asset_key_prefix=asset_key_prefix\n        ):\n            # scan through all tables actually created, if it was expected then emit an Output.\n            # otherwise, emit a runtime AssetMaterialization\n            if materialization.asset_key in tracked_asset_keys.values():\n                yield Output(\n                    value=None,\n                    output_name="_".join(materialization.asset_key.path),\n                    metadata=materialization.metadata,\n                )\n                materialized_asset_keys.add(materialization.asset_key)\n\n            else:\n                yield materialization\n\n        unmaterialized_asset_keys = set(tracked_asset_keys.values()) - materialized_asset_keys\n        if infer_missing_tables:\n            for asset_key in 
unmaterialized_asset_keys:\n                yield Output(\n                    value=None,\n                    output_name="_".join(asset_key.path),\n                )\n\n        else:\n            if unmaterialized_asset_keys:\n                asset_key = next(iter(unmaterialized_asset_keys))\n                output_name = "_".join(asset_key.path)\n                raise DagsterStepOutputNotFoundError(\n                    f"Core compute for {context.op_def.name} did not return an output for"\n                    f' non-optional output "{output_name}".',\n                    step_key=context.get_step_execution_context().step.key,\n                    output_name=output_name,\n                )\n\n    return [_assets]\n\n\n
[docs]def build_fivetran_assets(\n connector_id: str,\n destination_tables: Sequence[str],\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n io_manager_key: Optional[str] = None,\n asset_key_prefix: Optional[Sequence[str]] = None,\n metadata_by_table_name: Optional[Mapping[str, MetadataUserInput]] = None,\n group_name: Optional[str] = None,\n infer_missing_tables: bool = False,\n op_tags: Optional[Mapping[str, Any]] = None,\n) -> Sequence[AssetsDefinition]:\n """Build a set of assets for a given Fivetran connector.\n\n Returns an AssetsDefinition which connects the specified ``asset_keys`` to the computation that\n will update them. Internally, executes a Fivetran sync for a given ``connector_id``, and\n polls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\n :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to communicate with the\n Fivetran API.\n\n Args:\n connector_id (str): The Fivetran Connector ID that this op will sync. You can retrieve this\n value from the "Setup" tab of a given connector in the Fivetran UI.\n destination_tables (List[str]): `schema_name.table_name` for each table that you want to be\n represented in the Dagster asset graph for this connection.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. 
By default, this will never time out.\n io_manager_key (Optional[str]): The io_manager to be used to handle each of these assets.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([schema_name, table_name])`.\n metadata_by_table_name (Optional[Mapping[str, MetadataUserInput]]): A mapping from destination\n table name to user-supplied metadata that should be associated with the asset for that table.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n infer_missing_tables (bool): If True, will create asset materializations for tables specified\n in destination_tables even if they are not present in the Fivetran sync output. This is useful\n in cases where Fivetran does not sync any data for a table and therefore does not include it\n in the sync output API response.\n op_tags (Optional[Dict[str, Any]]):\n A dictionary of tags for the op that computes the asset. Frameworks may expect and\n require certain metadata to be attached to a op. Values that are not strings will be\n json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.\n\n **Examples:**\n\n Basic example:\n\n .. code-block:: python\n\n from dagster import AssetKey, repository, with_resources\n\n from dagster_fivetran import fivetran_resource\n from dagster_fivetran.assets import build_fivetran_assets\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n Attaching metadata:\n\n .. 
code-block:: python\n\n fivetran_assets = build_fivetran_assets(\n connector_id="foobar",\n table_names=["schema1.table1", "schema2.table2"],\n metadata_by_table_name={\n "schema1.table1": {\n "description": "This is a table that contains foo and bar",\n },\n "schema2.table2": {\n "description": "This is a table that contains baz and quux",\n },\n },\n )\n """\n return _build_fivetran_assets(\n connector_id=connector_id,\n destination_tables=destination_tables,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n io_manager_key=io_manager_key,\n asset_key_prefix=asset_key_prefix,\n metadata_by_table_name=metadata_by_table_name,\n group_name=group_name,\n infer_missing_tables=infer_missing_tables,\n op_tags=op_tags,\n )
\n\n\nclass FivetranConnectionMetadata(\n NamedTuple(\n "_FivetranConnectionMetadata",\n [\n ("name", str),\n ("connector_id", str),\n ("connector_url", str),\n ("schemas", Mapping[str, Any]),\n ],\n )\n):\n def build_asset_defn_metadata(\n self,\n key_prefix: Sequence[str],\n group_name: Optional[str],\n table_to_asset_key_fn: Callable[[str], AssetKey],\n io_manager_key: Optional[str] = None,\n ) -> AssetsDefinitionCacheableData:\n schema_table_meta: Dict[str, MetadataUserInput] = {}\n if "schemas" in self.schemas:\n schemas_inner = cast(Dict[str, Any], self.schemas["schemas"])\n for schema in schemas_inner.values():\n if schema["enabled"]:\n schema_name = schema["name_in_destination"]\n schema_tables = cast(Dict[str, Dict[str, Any]], schema["tables"])\n for table in schema_tables.values():\n if table["enabled"]:\n table_name = table["name_in_destination"]\n schema_table_meta[f"{schema_name}.{table_name}"] = metadata_for_table(\n table, self.connector_url\n )\n else:\n schema_table_meta[self.name] = {}\n\n outputs = {\n table: AssetKey([*key_prefix, *list(table_to_asset_key_fn(table).path)])\n for table in schema_table_meta.keys()\n }\n\n internal_deps: Dict[str, Set[AssetKey]] = {}\n\n return AssetsDefinitionCacheableData(\n keys_by_input_name={},\n keys_by_output_name=outputs,\n internal_asset_deps=internal_deps,\n group_name=group_name,\n key_prefix=key_prefix,\n can_subset=False,\n metadata_by_output_name=schema_table_meta,\n extra_metadata={\n "connector_id": self.connector_id,\n "io_manager_key": io_manager_key,\n },\n )\n\n\ndef _build_fivetran_assets_from_metadata(\n assets_defn_meta: AssetsDefinitionCacheableData,\n resource_defs: Mapping[str, ResourceDefinition],\n poll_interval: float,\n poll_timeout: Optional[float] = None,\n) -> AssetsDefinition:\n metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)\n connector_id = cast(str, metadata["connector_id"])\n io_manager_key = cast(Optional[str], metadata["io_manager_key"])\n\n return 
_build_fivetran_assets(\n connector_id=connector_id,\n destination_tables=list(\n assets_defn_meta.keys_by_output_name.keys()\n if assets_defn_meta.keys_by_output_name\n else []\n ),\n asset_key_prefix=list(assets_defn_meta.key_prefix or []),\n metadata_by_table_name=cast(\n Dict[str, MetadataUserInput], assets_defn_meta.metadata_by_output_name\n ),\n io_manager_key=io_manager_key,\n table_to_asset_key_map=assets_defn_meta.keys_by_output_name,\n resource_defs=resource_defs,\n group_name=assets_defn_meta.group_name,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )[0]\n\n\nclass FivetranInstanceCacheableAssetsDefinition(CacheableAssetsDefinition):\n def __init__(\n self,\n fivetran_resource_def: Union[FivetranResource, ResourceDefinition],\n key_prefix: Sequence[str],\n connector_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connector_filter: Optional[Callable[[FivetranConnectionMetadata], bool]],\n connector_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connector_to_asset_key_fn: Optional[Callable[[FivetranConnectionMetadata, str], AssetKey]],\n poll_interval: float,\n poll_timeout: Optional[float],\n ):\n self._fivetran_resource_def = fivetran_resource_def\n self._fivetran_instance: FivetranResource = (\n fivetran_resource_def.process_config_and_initialize()\n if isinstance(fivetran_resource_def, FivetranResource)\n else fivetran_resource_def(build_init_resource_context())\n )\n\n self._key_prefix = key_prefix\n self._connector_to_group_fn = connector_to_group_fn\n self._connection_filter = connector_filter\n self._connector_to_io_manager_key_fn = connector_to_io_manager_key_fn\n self._connector_to_asset_key_fn: Callable[[FivetranConnectionMetadata, str], AssetKey] = (\n connector_to_asset_key_fn or (lambda _, table: AssetKey(path=table.split(".")))\n )\n self._poll_interval = poll_interval\n self._poll_timeout = poll_timeout\n\n contents = hashlib.sha1()\n contents.update(",".join(key_prefix).encode("utf-8"))\n if 
connector_filter:\n contents.update(inspect.getsource(connector_filter).encode("utf-8"))\n\n super().__init__(unique_id=f"fivetran-{contents.hexdigest()}")\n\n def _get_connectors(self) -> Sequence[FivetranConnectionMetadata]:\n output_connectors: List[FivetranConnectionMetadata] = []\n\n groups = self._fivetran_instance.make_request("GET", "groups")["items"]\n\n for group in groups:\n group_id = group["id"]\n\n connectors = self._fivetran_instance.make_request(\n "GET", f"groups/{group_id}/connectors"\n )["items"]\n for connector in connectors:\n connector_id = connector["id"]\n\n connector_name = connector["schema"]\n\n setup_state = connector.get("status", {}).get("setup_state")\n if setup_state and setup_state in ("incomplete", "broken"):\n continue\n\n connector_url = get_fivetran_connector_url(connector)\n\n schemas = self._fivetran_instance.make_request(\n "GET", f"connectors/{connector_id}/schemas"\n )\n\n output_connectors.append(\n FivetranConnectionMetadata(\n name=connector_name,\n connector_id=connector_id,\n connector_url=connector_url,\n schemas=schemas,\n )\n )\n\n return output_connectors\n\n def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:\n asset_defn_data: List[AssetsDefinitionCacheableData] = []\n for connector in self._get_connectors():\n if not self._connection_filter or self._connection_filter(connector):\n table_to_asset_key = partial(self._connector_to_asset_key_fn, connector)\n asset_defn_data.append(\n connector.build_asset_defn_metadata(\n key_prefix=self._key_prefix,\n group_name=(\n self._connector_to_group_fn(connector.name)\n if self._connector_to_group_fn\n else None\n ),\n io_manager_key=(\n self._connector_to_io_manager_key_fn(connector.name)\n if self._connector_to_io_manager_key_fn\n else None\n ),\n table_to_asset_key_fn=table_to_asset_key,\n )\n )\n\n return asset_defn_data\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return 
[\n _build_fivetran_assets_from_metadata(\n meta,\n {"fivetran": self._fivetran_instance.get_resource_definition()},\n poll_interval=self._poll_interval,\n poll_timeout=self._poll_timeout,\n )\n for meta in data\n ]\n\n\ndef _clean_name(name: str) -> str:\n """Cleans an input to be a valid Dagster asset name."""\n return re.sub(r"[^a-z0-9]+", "_", name.lower())\n\n\n
[docs]def load_assets_from_fivetran_instance(\n fivetran: Union[FivetranResource, ResourceDefinition],\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n connector_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connector_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connector_filter: Optional[Callable[[FivetranConnectionMetadata], bool]] = None,\n connector_to_asset_key_fn: Optional[\n Callable[[FivetranConnectionMetadata, str], AssetKey]\n ] = None,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n) -> CacheableAssetsDefinition:\n """Loads Fivetran connector assets from a configured FivetranResource instance. This fetches information\n about defined connectors at initialization time, and will error on workspace load if the Fivetran\n instance is not reachable.\n\n Args:\n fivetran (ResourceDefinition): A FivetranResource configured with the appropriate connection\n details.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n connector_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Fivetran connector name. If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The IO manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connector_to_io_manager_key_fn.\n connector_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n IO manager key for a given Fivetran connector name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. 
Defaults to "io_manager".\n connector_filter (Optional[Callable[[FivetranConnectorMetadata], bool]]): Optional function which takes\n in connector metadata and returns False if the connector should be excluded from the output assets.\n connector_to_asset_key_fn (Optional[Callable[[FivetranConnectorMetadata, str], AssetKey]]): Optional function\n which takes in connector metadata and a table name and returns an AssetKey for that table. Defaults to\n a function that generates an AssetKey matching the table name, split by ".".\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. By default, this will never time out.\n\n **Examples:**\n\n Loading all Fivetran connectors as assets:\n\n .. code-block:: python\n\n from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\n fivetran_instance = fivetran_resource.configured(\n {\n "api_key": "some_key",\n "api_secret": "some_secret",\n }\n )\n fivetran_assets = load_assets_from_fivetran_instance(fivetran_instance)\n\n Filtering the set of loaded connectors:\n\n .. 
code-block:: python\n\n from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\n fivetran_instance = fivetran_resource.configured(\n {\n "api_key": "some_key",\n "api_secret": "some_secret",\n }\n )\n fivetran_assets = load_assets_from_fivetran_instance(\n fivetran_instance,\n connector_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connector_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connector_to_io_manager_key_fn",\n )\n if not connector_to_io_manager_key_fn:\n connector_to_io_manager_key_fn = lambda _: io_manager_key\n\n return FivetranInstanceCacheableAssetsDefinition(\n fivetran_resource_def=fivetran,\n key_prefix=key_prefix,\n connector_to_group_fn=connector_to_group_fn,\n connector_to_io_manager_key_fn=connector_to_io_manager_key_fn,\n connector_filter=connector_filter,\n connector_to_asset_key_fn=connector_to_asset_key_fn,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )
\n
", "current_page_name": "_modules/dagster_fivetran/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.asset_defs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.ops

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import (\n    AssetKey,\n    Config,\n    In,\n    Nothing,\n    Out,\n    Output,\n    op,\n)\nfrom pydantic import Field\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL, FivetranResource\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import generate_materializations\n\n\nclass SyncConfig(Config):\n    connector_id: str = Field(\n        description=(\n            "The Fivetran Connector ID that this op will sync. You can retrieve this "\n            'value from the "Setup" tab of a given connector in the Fivetran UI.'\n        ),\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL,\n        description="The time (in seconds) that will be waited between successive polls.",\n    )\n    poll_timeout: Optional[float] = Field(\n        default=None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the Fivetran sync will "\n            "be yielded when the op executes."\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        default=["fivetran"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description=(\n "Parsed json dictionary representing the details of the Fivetran connector after the"\n " sync successfully completes. See the [Fivetran API"\n " Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) to see"\n " detailed information on this response."\n ),\n ),\n tags={"kind": "fivetran"},\n)\ndef fivetran_sync_op(config: SyncConfig, fivetran: FivetranResource) -> Any:\n """Executes a Fivetran sync for a given ``connector_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the sync successfully completes, as well as details\n about which tables the sync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_sync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n fivetran_output = fivetran.sync_and_poll(\n connector_id=config.connector_id,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n yield from generate_materializations(\n fivetran_output, asset_key_prefix=config.asset_key_prefix\n )\n yield Output(fivetran_output)
\n\n\nclass FivetranResyncConfig(SyncConfig):\n resync_parameters: Optional[Dict[str, Any]] = Field(\n None,\n description=(\n "Optional resync parameters to send in the payload to the Fivetran API. You can"\n " find an example resync payload here:"\n " https://fivetran.com/docs/rest-api/connectors#request_7"\n ),\n )\n\n\n@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description=(\n "Parsed json dictionary representing the details of the Fivetran connector after the"\n " resync successfully completes. See the [Fivetran API"\n " Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) to see"\n " detailed information on this response."\n ),\n ),\n tags={"kind": "fivetran"},\n)\ndef fivetran_resync_op(\n config: FivetranResyncConfig,\n fivetran: FivetranResource,\n) -> Any:\n """Executes a Fivetran historical resync for a given ``connector_id``, and polls until that resync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the resync successfully completes, as well as details\n about which tables the resync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_resync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_resync_op.configured(\n {\n "connector_id": "foobar",\n "resync_parameters": {\n "schema_a": ["table_a", "table_b"],\n "schema_b": ["table_c"]\n }\n },\n name="sync_foobar"\n )\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n fivetran_output = fivetran.resync_and_poll(\n connector_id=config.connector_id,\n resync_parameters=config.resync_parameters,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n asset_key_filter = (\n [\n AssetKey(config.asset_key_prefix + [schema, table])\n for schema, tables in config.resync_parameters.items()\n for table in tables\n ]\n if config.resync_parameters is not None\n else None\n )\n for mat in generate_materializations(\n fivetran_output, asset_key_prefix=config.asset_key_prefix\n ):\n if asset_key_filter is None or mat.asset_key in asset_key_filter:\n yield mat\n\n yield Output(fivetran_output)\n
", "current_page_name": "_modules/dagster_fivetran/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Mapping, Optional, Sequence, Tuple\nfrom urllib.parse import urljoin\n\nimport requests\nfrom dagster import (\n    Failure,\n    InitResourceContext,\n    MetadataValue,\n    __version__,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._config.pythonic_config import ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dateutil import parser\nfrom pydantic import Field\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import get_fivetran_connector_url, get_fivetran_logs_url\n\nFIVETRAN_API_BASE = "https://api.fivetran.com"\nFIVETRAN_API_VERSION_PATH = "v1/"\nFIVETRAN_CONNECTOR_PATH = "connectors/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\n
[docs]class FivetranResource(ConfigurableResource):\n """This class exposes methods on top of the Fivetran REST API."""\n\n api_key: str = Field(description="The Fivetran API key to use for this resource.")\n api_secret: str = Field(description="The Fivetran API secret to use for this resource.")\n disable_schedule_on_trigger: bool = Field(\n default=True,\n description=(\n "Specifies if you would like any connector that is sync'd using this "\n "resource to be automatically taken off its Fivetran schedule."\n ),\n )\n request_max_retries: int = Field(\n default=3,\n description=(\n "The maximum number of times requests to the Fivetran API should be retried "\n "before failing."\n ),\n )\n request_retry_delay: float = Field(\n default=0.25,\n description="Time (in seconds) to wait between each request retry.",\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n def _auth(self) -> HTTPBasicAuth:\n return HTTPBasicAuth(self.api_key, self.api_secret)\n\n @property\n @cached_method\n def _log(self) -> logging.Logger:\n return get_dagster_logger()\n\n @property\n def api_base_url(self) -> str:\n return urljoin(FIVETRAN_API_BASE, FIVETRAN_API_VERSION_PATH)\n\n @property\n def api_connector_url(self) -> str:\n return urljoin(self.api_base_url, FIVETRAN_CONNECTOR_PATH)\n\n def make_connector_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n return self.make_request(method, urljoin(FIVETRAN_CONNECTOR_PATH, endpoint), data)\n\n def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Creates and sends a request to the desired Fivetran Connector API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. 
"POST", "GET", "PATCH").\n endpoint (str): The Fivetran API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n url = urljoin(self.api_base_url, endpoint)\n headers = {\n "User-Agent": f"dagster-fivetran/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n auth=self._auth,\n data=data,\n )\n response.raise_for_status()\n resp_dict = response.json()\n return resp_dict["data"] if "data" in resp_dict else resp_dict\n except RequestException as e:\n self._log.error("Request to Fivetran API failed: %s", e)\n if num_retries == self.request_max_retries:\n break\n num_retries += 1\n time.sleep(self.request_retry_delay)\n\n raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n def get_connector_details(self, connector_id: str) -> Mapping[str, Any]:\n """Gets details about a given connector from the Fivetran Connector API.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n return self.make_connector_request(method="GET", endpoint=connector_id)\n\n def _assert_syncable_connector(self, connector_id: str):\n """Confirms that a given connector is eligible to sync. Will raise a Failure in the event that\n the connector is either paused or not fully setup.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n """\n connector_details = self.get_connector_details(connector_id)\n if connector_details["paused"]:\n raise Failure(f"Connector '{connector_id}' cannot be synced as it is currently paused.")\n if connector_details["status"]["setup_state"] != "connected":\n raise Failure(f"Connector '{connector_id}' cannot be synced as it has not been setup")\n\n def get_connector_sync_status(self, connector_id: str) -> Tuple[datetime.datetime, bool, str]:\n """Gets details about the status of the most recent Fivetran sync operation for a given\n connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Tuple[datetime.datetime, bool, str]:\n Tuple representing the timestamp of the last completeded sync, if it succeeded, and\n the currently reported sync status.\n """\n connector_details = self.get_connector_details(connector_id)\n\n min_time_str = "0001-01-01 00:00:00+00"\n succeeded_at = parser.parse(connector_details["succeeded_at"] or min_time_str)\n failed_at = parser.parse(connector_details["failed_at"] or min_time_str)\n\n return (\n max(succeeded_at, failed_at),\n succeeded_at > failed_at,\n connector_details["status"]["sync_state"],\n )\n\n def update_connector(\n self, connector_id: str, properties: Optional[Mapping[str, Any]] = None\n ) -> Mapping[str, Any]:\n """Updates properties of a Fivetran Connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n properties (Dict[str, Any]): The properties to be updated. 
For a comprehensive list of\n properties, see the [Fivetran docs](https://fivetran.com/docs/rest-api/connectors#modifyaconnector).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n return self.make_connector_request(\n method="PATCH", endpoint=connector_id, data=json.dumps(properties)\n )\n\n def update_schedule_type(\n self, connector_id: str, schedule_type: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Updates the schedule type property of the connector to either "auto" or "manual".\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n schedule_type (Optional[str]): Either "auto" (to turn the schedule on) or "manual" (to\n turn it off).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n if schedule_type not in ["auto", "manual"]:\n check.failed(f"schedule_type must be either 'auto' or 'manual': got '{schedule_type}'")\n return self.update_connector(connector_id, properties={"schedule_type": schedule_type})\n\n def get_connector_schema_config(self, connector_id: str) -> Mapping[str, Any]:\n return self.make_connector_request("GET", endpoint=f"{connector_id}/schemas")\n\n def start_sync(self, connector_id: str) -> Mapping[str, Any]:\n """Initiates a sync of a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the sync is started.\n """\n if self.disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_connector_request(method="POST", endpoint=f"{connector_id}/force")\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this sync in the Fivetran UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details\n\n def start_resync(\n self, connector_id: str, resync_parameters: Optional[Mapping[str, Sequence[str]]] = None\n ) -> Mapping[str, Any]:\n """Initiates a historical sync of all data for multiple schema tables within a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Optional[Dict[str, List[str]]]): Optional resync parameters to send to the Fivetran API.\n An example payload can be found here: https://fivetran.com/docs/rest-api/connectors#request_7\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the resync is started.\n """\n if self.disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_connector_request(\n method="POST",\n endpoint=(\n f"{connector_id}/schemas/tables/resync"\n if resync_parameters is not None\n else f"{connector_id}/resync"\n ),\n data=json.dumps(resync_parameters) if resync_parameters is not None else None,\n )\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this resync in the Fivetran"\n " UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details\n\n def poll_sync(\n self,\n connector_id: str,\n initial_last_sync_completion: datetime.datetime,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Mapping[str, Any]:\n """Given a Fivetran connector and the timestamp at which the previous sync completed, poll\n until the next sync completes.\n\n The previous sync completion time is necessary because the only way to tell when a sync\n completes is when this value changes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n initial_last_sync_completion (datetime.datetime): The timestamp of the last completed sync\n (successful or otherwise) for this connector, prior to running this method.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n poll_start = datetime.datetime.now()\n while True:\n (\n curr_last_sync_completion,\n curr_last_sync_succeeded,\n curr_sync_state,\n ) = self.get_connector_sync_status(connector_id)\n self._log.info(f"Polled '{connector_id}'. Status: [{curr_sync_state}]")\n\n if curr_last_sync_completion > initial_last_sync_completion:\n break\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for connector '{connector_id}' timed out after "\n f"{datetime.datetime.now() - poll_start}."\n )\n\n # Sleep for the configured time interval before polling again.\n time.sleep(poll_interval)\n\n connector_details = self.get_connector_details(connector_id)\n if not curr_last_sync_succeeded:\n raise Failure(\n f"Sync for connector '{connector_id}' failed!",\n metadata={\n "connector_details": MetadataValue.json(connector_details),\n "log_url": MetadataValue.url(get_fivetran_logs_url(connector_details)),\n },\n )\n return connector_details\n\n def sync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> FivetranOutput:\n """Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_sync(connector_id)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)\n\n def resync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n resync_parameters: Optional[Mapping[str, Sequence[str]]] = None,\n ) -> FivetranOutput:\n """Initializes a historical resync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Dict[str, List[str]]): The payload to send to the Fivetran API.\n This should be a dictionary with schema names as the keys and a list of tables\n to resync as the values.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. 
By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_resync(connector_id, resync_parameters)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=FivetranResource.to_config_schema())\ndef fivetran_resource(context: InitResourceContext) -> FivetranResource:\n """This resource allows users to programatically interface with the Fivetran REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Fivetran REST API, including expected response JSON\n schemae, see the `Fivetran API Docs <https://fivetran.com/docs/rest-api/connectors>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n @job(resource_defs={"fivetran":my_fivetran_resource})\n def my_fivetran_job():\n ...\n\n """\n return FivetranResource.from_resource_context(context)
\n
", "current_page_name": "_modules/dagster_fivetran/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.resources"}}, "dagster_gcp": {"bigquery": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Generator, Optional, Sequence, Type, cast\n\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._annotations import experimental\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n    TimeWindow,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom google.api_core.exceptions import NotFound\nfrom google.cloud import bigquery\nfrom pydantic import Field\n\nfrom .utils import setup_gcp_creds\n\nBIGQUERY_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]@experimental\ndef build_bigquery_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an I/O manager definition that reads inputs from and writes outputs to BigQuery.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of BigQuery tables and an in-memory type - e.g. a Pandas DataFrame.\n If only one DbTypeHandler is provided, it will be used as the default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import build_bigquery_io_manager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n bigquery_io_manager = build_bigquery_io_manager([BigQueryPandasTypeHandler()])\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the ``dataset`` configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset ``my_table`` had the key prefix ``["gcp", "bigquery", "my_dataset"]``, the dataset ``my_dataset`` will be\n used. For ops, the dataset can be specified by including a `schema` entry in output metadata. 
If ``schema`` is\n not provided via config or on the asset/op, ``public`` will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n :py:class:`~dagster.In` or :py:class:`~dagster.AssetIn`.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the ``gcp_credentials`` configuration.\n Dagster willstore this key in a temporary file and set ``GOOGLE_APPLICATION_CREDENTIALS`` to point to the file.\n After the run completes, the file will be deleted, and ``GOOGLE_APPLICATION_CREDENTIALS`` will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. 
You can retrieve\n the base64 encoded with this shell command: ``cat $GOOGLE_APPLICATION_CREDENTIALS | base64``\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=BigQueryIOManager.to_config_schema())\n def bigquery_io_manager(init_context):\n """I/O Manager for storing outputs in a BigQuery database.\n\n Assets will be stored in the dataset and table name specified by their AssetKey.\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n Op outputs will be stored in the dataset specified by output metadata (defaults to public) in a\n table of the name of the output.\n\n Note that the BigQuery config is mapped to the DB IO manager table hierarchy as follows:\n BigQuery DB IO\n * project -> database\n * dataset -> schema\n * table -> table\n """\n mgr = DbIOManager(\n type_handlers=type_handlers,\n db_client=BigQueryClient(),\n io_manager_name="BigQueryIOManager",\n database=init_context.resource_config["project"],\n schema=init_context.resource_config.get("dataset"),\n default_load_type=default_load_type,\n )\n if init_context.resource_config.get("gcp_credentials"):\n with setup_gcp_creds(init_context.resource_config.get("gcp_credentials")):\n yield mgr\n else:\n yield mgr\n\n return bigquery_io_manager
\n\n\n
[docs]class BigQueryIOManager(ConfigurableIOManagerFactory):\n """Base class for an I/O manager definition that reads inputs from and writes outputs to BigQuery.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the ``dataset`` configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset ``my_table`` had the key prefix ``["gcp", "bigquery", "my_dataset"]``, the dataset ``my_dataset`` will be\n used. For ops, the dataset can be specified by including a ``schema`` entry in output metadata. If ``schema`` is\n not provided via config or on the asset/op, ``public`` will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n :py:class:`~dagster.In` or :py:class:`~dagster.AssetIn`.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the ``gcp_credentials`` configuration.\n Dagster will store this key in a temporary file and set ``GOOGLE_APPLICATION_CREDENTIALS`` to point to the file.\n After the run completes, the file will be deleted, and ``GOOGLE_APPLICATION_CREDENTIALS`` will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded with this shell command: ``cat $GOOGLE_APPLICATION_CREDENTIALS | base64``\n """\n\n project: str = Field(description="The GCP project to use.")\n dataset: Optional[str] = Field(\n default=None,\n description=(\n "Name of the BigQuery dataset to use. If not provided, the last prefix before"\n " the asset name will be used."\n ),\n )\n location: Optional[str] = Field(\n default=None,\n description=(\n "The GCP location. Note: When using PySpark DataFrames, the default"\n " location of the project will be used. A custom location can be specified in"\n " your SparkSession configuration."\n ),\n )\n gcp_credentials: Optional[str] = Field(\n default=None,\n description=(\n "GCP authentication credentials. If provided, a temporary file will be created"\n " with the credentials and ``GOOGLE_APPLICATION_CREDENTIALS`` will be set to the"\n " temporary file. To avoid issues with newlines in the keys, you must base64"\n " encode the key. 
You can retrieve the base64 encoded key with this shell"\n " command: ``cat $GOOGLE_AUTH_CREDENTIALS | base64``"\n ),\n )\n temporary_gcs_bucket: Optional[str] = Field(\n default=None,\n description=(\n "When using PySpark DataFrames, optionally specify a temporary GCS bucket to"\n " store data. If not provided, data will be directly written to BigQuery."\n ),\n )\n timeout: Optional[float] = Field(\n default=None,\n description=(\n "When using Pandas DataFrames, optionally specify a timeout for the BigQuery"\n " queries (loading and reading from tables)."\n ),\n )\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]: ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return None\n\n def create_io_manager(self, context) -> Generator:\n mgr = DbIOManager(\n db_client=BigQueryClient(),\n io_manager_name="BigQueryIOManager",\n database=self.project,\n schema=self.dataset,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n )\n if self.gcp_credentials:\n with setup_gcp_creds(self.gcp_credentials):\n yield mgr\n else:\n yield mgr
\n\n\nclass BigQueryClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.query(_get_cleanup_statement(table_slice)).result()\n except NotFound:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"SELECT {col_str} FROM"\n f" `{table_slice.database}.{table_slice.schema}.{table_slice.table}` WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM `{table_slice.database}.{table_slice.schema}.{table_slice.table}`"""\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n connection.query(f"CREATE SCHEMA IF NOT EXISTS {table_slice.schema}").result()\n\n @staticmethod\n @contextmanager\n def connect(context, _):\n conn = bigquery.Client(\n project=context.resource_config.get("project"),\n location=context.resource_config.get("location"),\n )\n\n yield conn\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"DELETE FROM `{table_slice.database}.{table_slice.schema}.{table_slice.table}` WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"TRUNCATE TABLE `{table_slice.database}.{table_slice.schema}.{table_slice.table}`"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if 
isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(BIGQUERY_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(BIGQUERY_DATETIME_FORMAT)\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_gcp/bigquery/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.io_manager"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.ops

\nimport hashlib\n\nfrom dagster import (\n    In,\n    List,\n    Nothing,\n    Out,\n    _check as check,\n    op,\n)\nfrom dagster_pandas import DataFrame\nfrom google.cloud.bigquery.encryption_configuration import EncryptionConfiguration\nfrom google.cloud.bigquery.job import LoadJobConfig, QueryJobConfig\nfrom google.cloud.bigquery.table import TimePartitioning\n\nfrom .configs import (\n    define_bigquery_create_dataset_config,\n    define_bigquery_delete_dataset_config,\n    define_bigquery_load_config,\n    define_bigquery_query_config,\n)\nfrom .types import BigQueryLoadSource\n\n_START = "start"\n\n\ndef _preprocess_config(cfg):\n    destination_encryption_configuration = cfg.get("destination_encryption_configuration")\n    time_partitioning = cfg.get("time_partitioning")\n\n    if destination_encryption_configuration is not None:\n        cfg["destination_encryption_configuration"] = EncryptionConfiguration(\n            kms_key_name=destination_encryption_configuration\n        )\n\n    if time_partitioning is not None:\n        cfg["time_partitioning"] = TimePartitioning(**time_partitioning)\n\n    return cfg\n\n\n
[docs]def bq_op_for_queries(sql_queries):\n """Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n sql_queries = check.list_param(sql_queries, "sql queries", of_type=str)\n m = hashlib.sha1()\n for query in sql_queries:\n m.update(query.encode("utf-8"))\n hash_str = m.hexdigest()[:10]\n name = f"bq_op_{hash_str}"\n\n @op(\n name=name,\n ins={_START: In(Nothing)},\n out=Out(List[DataFrame]),\n config_schema=define_bigquery_query_config(),\n required_resource_keys={"bigquery"},\n tags={"kind": "sql", "sql": "\\n".join(sql_queries)},\n )\n def _bq_fn(context):\n query_job_config = _preprocess_config(context.op_config.get("query_job_config", {}))\n\n # Retrieve results as pandas DataFrames\n results = []\n for sql_query in sql_queries:\n # We need to construct a new QueryJobConfig for each query.\n # See: https://bit.ly/2VjD6sl\n cfg = QueryJobConfig(**query_job_config) if query_job_config else None\n context.log.info(\n "executing query %s with config: %s"\n % (sql_query, cfg.to_api_repr() if cfg else "(no config provided)")\n )\n results.append(\n context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()\n )\n\n return results\n\n return _bq_fn
\n\n\nBIGQUERY_LOAD_CONFIG = define_bigquery_load_config()\n\n\n
[docs]@op(\n ins={"paths": In(List[str])},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_gcs_paths_to_bq(context, paths):\n return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)
\n\n\n
[docs]@op(\n ins={"df": In(DataFrame)},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_df_to_bq(context, df):\n return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)
\n\n\n
[docs]@op(\n ins={"path": In(str)},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_file_to_bq(context, path):\n return _execute_load_in_source(context, path, BigQueryLoadSource.File)
\n\n\ndef _execute_load_in_source(context, source, source_name):\n destination = context.op_config.get("destination")\n load_job_config = _preprocess_config(context.op_config.get("load_job_config", {}))\n cfg = LoadJobConfig(**load_job_config) if load_job_config else None\n\n context.log.info(\n "executing BQ load with config: %s for source %s"\n % (cfg.to_api_repr() if cfg else "(no config provided)", source)\n )\n\n if source_name == BigQueryLoadSource.DataFrame:\n context.resources.bigquery.load_table_from_dataframe(\n source, destination, job_config=cfg\n ).result()\n\n # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local\n elif source_name == BigQueryLoadSource.File:\n with open(source, "rb") as file_obj:\n context.resources.bigquery.load_table_from_file(\n file_obj, destination, job_config=cfg\n ).result()\n\n # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage\n elif source_name == BigQueryLoadSource.GCS:\n context.resources.bigquery.load_table_from_uri(source, destination, job_config=cfg).result()\n\n\n
[docs]@op(\n ins={_START: In(Nothing)},\n config_schema=define_bigquery_create_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_create_dataset(context):\n """BigQuery Create Dataset.\n\n This op encapsulates creating a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, exists_ok) = [context.op_config.get(k) for k in ("dataset", "exists_ok")]\n context.log.info("executing BQ create_dataset for dataset %s" % (dataset))\n context.resources.bigquery.create_dataset(dataset, exists_ok)
\n\n\n
[docs]@op(\n ins={_START: In(Nothing)},\n config_schema=define_bigquery_delete_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_delete_dataset(context):\n """BigQuery Delete Dataset.\n\n This op encapsulates deleting a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, delete_contents, not_found_ok) = [\n context.op_config.get(k) for k in ("dataset", "delete_contents", "not_found_ok")\n ]\n\n context.log.info("executing BQ delete_dataset for dataset %s" % dataset)\n\n context.resources.bigquery.delete_dataset(\n dataset, delete_contents=delete_contents, not_found_ok=not_found_ok\n )
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Optional\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom google.cloud import bigquery\nfrom pydantic import Field\n\nfrom .utils import setup_gcp_creds\n\n\n
[docs]class BigQueryResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for interacting with Google BigQuery.\n\n Examples:\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_gcp import BigQueryResource\n\n @asset\n def my_table(bigquery: BigQueryResource):\n with bigquery.get_client() as client:\n client.query("SELECT * FROM my_dataset.my_table")\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "bigquery": BigQueryResource(project="my-project")\n }\n )\n """\n\n project: Optional[str] = Field(\n default=None,\n description=(\n "Project ID for the project which the client acts on behalf of. Will be passed when"\n " creating a dataset / job. If not passed, falls back to the default inferred from the"\n " environment."\n ),\n )\n\n location: Optional[str] = Field(\n default=None,\n description="Default location for jobs / datasets / tables.",\n )\n\n gcp_credentials: Optional[str] = Field(\n default=None,\n description=(\n "GCP authentication credentials. If provided, a temporary file will be created"\n " with the credentials and ``GOOGLE_APPLICATION_CREDENTIALS`` will be set to the"\n " temporary file. To avoid issues with newlines in the keys, you must base64"\n " encode the key. You can retrieve the base64 encoded key with this shell"\n " command: ``cat $GOOGLE_AUTH_CREDENTIALS | base64``"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def get_client(self) -> Iterator[bigquery.Client]:\n """Context manager to create a BigQuery Client.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import asset\n from dagster_gcp import BigQueryResource\n\n @asset\n def my_table(bigquery: BigQueryResource):\n with bigquery.get_client() as client:\n client.query("SELECT * FROM my_dataset.my_table")\n """\n if self.gcp_credentials:\n with setup_gcp_creds(self.gcp_credentials):\n yield bigquery.Client(project=self.project, location=self.location)\n\n else:\n yield bigquery.Client(project=self.project, location=self.location)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n with self.get_client() as client:\n yield client
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=BigQueryResource.to_config_schema(),\n description="Dagster resource for connecting to BigQuery",\n)\ndef bigquery_resource(context):\n bq_resource = BigQueryResource.from_resource_context(context)\n with bq_resource.get_client() as client:\n yield client
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.types

\nimport re\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\nfrom dagster._config import ConfigScalar, ConfigScalarKind, PostProcessingError\nfrom google.cloud.bigquery.job import (\n    CreateDisposition,\n    Encoding,\n    QueryPriority,\n    SchemaUpdateOption,\n    SourceFormat,\n    WriteDisposition,\n)\n\n\nclass BigQueryLoadSource(PyEnum):\n    DataFrame = "DATA_FRAME"\n    GCS = "GCS"\n    File = "FILE"\n\n\nBQCreateDisposition = Enum(\n    name="BQCreateDisposition",\n    enum_values=[\n        EnumValue(CreateDisposition.CREATE_IF_NEEDED),\n        EnumValue(CreateDisposition.CREATE_NEVER),\n    ],\n)\n\nBQPriority = Enum(\n    name="BQPriority",\n    enum_values=[EnumValue(QueryPriority.BATCH), EnumValue(QueryPriority.INTERACTIVE)],\n)\n\nBQSchemaUpdateOption = Enum(\n    name="BQSchemaUpdateOption",\n    enum_values=[\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_ADDITION,\n            description="Allow adding a nullable field to the schema.",\n        ),\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_RELAXATION,\n            description="Allow relaxing a required field in the original schema to nullable.",\n        ),\n    ],\n)\n\nBQWriteDisposition = Enum(\n    name="BQWriteDisposition",\n    enum_values=[\n        EnumValue(WriteDisposition.WRITE_APPEND),\n        EnumValue(WriteDisposition.WRITE_EMPTY),\n        EnumValue(WriteDisposition.WRITE_TRUNCATE),\n    ],\n)\n\nBQEncoding = Enum(\n    name="BQEncoding", enum_values=[EnumValue(Encoding.ISO_8859_1), EnumValue(Encoding.UTF_8)]\n)\n\nBQSourceFormat = Enum(\n    name="BQSourceFormat",\n    enum_values=[\n        EnumValue(SourceFormat.AVRO),\n        EnumValue(SourceFormat.CSV),\n        EnumValue(SourceFormat.DATASTORE_BACKUP),\n        EnumValue(SourceFormat.NEWLINE_DELIMITED_JSON),\n        EnumValue(SourceFormat.ORC),\n        EnumValue(SourceFormat.PARQUET),\n    ],\n)\n\n\n# Project names are permitted to have alphanumeric, 
dashes and underscores, up to 1024 characters.\nRE_PROJECT = r"[\\w\\d\\-\\_]{1,1024}"\n\n# Datasets and tables are permitted to have alphanumeric or underscores, no dashes allowed, up to\n# 1024 characters\nRE_DS_TABLE = r"[\\w\\d\\_]{1,1024}"\n\n# BigQuery supports writes directly to date partitions with the syntax foo.bar$20190101\nRE_PARTITION_SUFFIX = r"(\\$\\d{8})?"\n\n\ndef _is_valid_dataset(config_value):\n    """Datasets must be of form "project.dataset" or "dataset"."""\n    return re.match(\n        # regex matches: project.dataset -- OR -- dataset\n        r"^" + RE_PROJECT + r"\\." + RE_DS_TABLE + r"$|^" + RE_DS_TABLE + r"$",\n        config_value,\n    )\n\n\ndef _is_valid_table(config_value):\n    """Tables must be of form "project.dataset.table" or "dataset.table" with optional\n    date-partition suffix.\n    """\n    return re.match(\n        r"^"\n        + RE_PROJECT  #          project\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$|^"  #              -- OR --\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  
#               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$",\n        config_value,\n    )\n\n\nclass _Dataset(ConfigScalar):\n    def __init__(self):\n        super(_Dataset, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_dataset(value):\n            raise PostProcessingError('Datasets must be of the form "project.dataset" or "dataset"')\n        return value\n\n\nclass _Table(ConfigScalar):\n    def __init__(self):\n        super(_Table, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_table(value):\n            raise PostProcessingError(\n                'Tables must be of the form "project.dataset.table" or "dataset.table" '\n                "with optional date-partition suffix"\n            )\n\n        return value\n\n\n# https://github.com/dagster-io/dagster/issues/1971\nTable = _Table()\nDataset = _Dataset()\n\n\n
[docs]class BigQueryError(Exception):\n pass
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.types"}}, "dataproc": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.dataproc.ops

\nfrom typing import Any, Dict\n\nfrom dagster import (\n    Bool,\n    Config,\n    Field as DagsterField,\n    Int,\n    op,\n)\nfrom dagster._seven import json\nfrom pydantic import Field\n\nfrom .configs import define_dataproc_submit_job_config\nfrom .resources import TWENTY_MINUTES, DataprocResource\n\n# maintain the old config schema because of the nested job_config schema\nDATAPROC_CONFIG_SCHEMA = {\n    "job_timeout_in_seconds": DagsterField(\n        Int,\n        description="""Optional. Maximum time in seconds to wait for the job being\n                    completed. Default is set to 1200 seconds (20 minutes).\n                    """,\n        is_required=False,\n        default_value=TWENTY_MINUTES,\n    ),\n    "job_config": define_dataproc_submit_job_config(),\n    "job_scoped_cluster": DagsterField(\n        Bool,\n        description="whether to create a cluster or use an existing cluster",\n        is_required=False,\n        default_value=True,\n    ),\n}\n\n\nclass DataprocOpConfig(Config):\n    job_timeout_in_seconds: int = Field(\n        default=TWENTY_MINUTES,\n        description=(\n            "Maximum time in seconds to wait for the job being completed. Default is set to 1200"\n            " seconds (20 minutes)."\n        ),\n    )\n    job_scoped_cluster: bool = Field(\n        default=True,\n        description="Whether to create a cluster or use an existing cluster. Defaults to True.",\n    )\n    project_id: str = Field(\n        description=(\n            "Required. Project ID for the project which the client acts on behalf of. 
Will be"\n            " passed when creating a dataset/job."\n        )\n    )\n    region: str = Field(description="The GCP region.")\n    job_config: Dict[str, Any] = Field(\n        description="Python dictionary containing configuration for the Dataproc Job."\n    )\n\n\ndef _dataproc_compute(context):\n    job_config = context.op_config["job_config"]\n    job_timeout = context.op_config["job_timeout_in_seconds"]\n\n    context.log.info(\n        "submitting job with config: %s and timeout of: %d seconds"\n        % (str(json.dumps(job_config)), job_timeout)\n    )\n\n    if context.op_config["job_scoped_cluster"]:\n        # Cluster context manager, creates and then deletes cluster\n        with context.resources.dataproc.cluster_context_manager() as cluster:\n            # Submit the job specified by this solid to the cluster defined by the associated resource\n            result = cluster.submit_job(job_config)\n\n            job_id = result["reference"]["jobId"]\n            context.log.info(f"Submitted job ID {job_id}")\n            cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n    else:\n        # Submit to an existing cluster\n        # Submit the job specified by this solid to the cluster defined by the associated resource\n        result = context.resources.dataproc.submit_job(job_config)\n\n        job_id = result["reference"]["jobId"]\n        context.log.info(f"Submitted job ID {job_id}")\n        context.resources.dataproc.wait_for_job(job_id, wait_timeout=job_timeout)\n\n\n@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_solid(context):\n    return _dataproc_compute(context)\n\n\n
[docs]@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_op(context):\n return _dataproc_compute(context)
\n\n\n@op\ndef configurable_dataproc_op(context, dataproc: DataprocResource, config: DataprocOpConfig):\n job_config = {"projectId": config.project_id, "region": config.region, "job": config.job_config}\n job_timeout = config.job_timeout_in_seconds\n\n context.log.info(\n "submitting job with config: %s and timeout of: %d seconds"\n % (str(json.dumps(job_config)), job_timeout)\n )\n\n dataproc_client = dataproc.get_client()\n\n if config.job_scoped_cluster:\n # Cluster context manager, creates and then deletes cluster\n with dataproc_client.cluster_context_manager() as cluster:\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = cluster.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info(f"Submitted job ID {job_id}")\n cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n else:\n # Submit to an existing cluster\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = dataproc_client.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info(f"Submitted job ID {job_id}")\n dataproc_client.wait_for_job(job_id, wait_timeout=job_timeout)\n
", "current_page_name": "_modules/dagster_gcp/dataproc/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.dataproc.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.dataproc.resources

\nimport json\nimport time\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Mapping, Optional\n\nimport dagster._check as check\nimport yaml\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom googleapiclient.discovery import build\nfrom oauth2client.client import GoogleCredentials\nfrom pydantic import Field\n\nfrom .configs import define_dataproc_create_cluster_config\nfrom .types import DataprocError\n\nTWENTY_MINUTES = 20 * 60\nDEFAULT_ITER_TIME_SEC = 5\n\n\nclass DataprocClient:\n    """Builds a client to the dataproc API."""\n\n    def __init__(self, config):\n        # Use Application Default Credentials to check the\n        # GOOGLE_APPLICATION_CREDENTIALS environment variable\n        # for the location of the service account key file.\n        credentials = GoogleCredentials.get_application_default()\n\n        # See https://github.com/googleapis/google-api-python-client/issues/299 for the\n        # cache_discovery=False configuration below\n        self.dataproc = build("dataproc", "v1", credentials=credentials, cache_discovery=False)\n\n        self.config = config\n\n        (self.project_id, self.region, self.cluster_name, self.cluster_config) = (\n            self.config.get(k) for k in ("projectId", "region", "clusterName", "cluster_config")\n        )\n\n    @property\n    def dataproc_clusters(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            self.dataproc.projects()\n            .regions()\n            .clusters()\n        )\n\n    @property\n    def dataproc_jobs(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            self.dataproc.projects()\n            .regions()\n            .jobs()\n        )\n\n    def create_cluster(self):\n        (\n            self.dataproc_clusters.create(\n                
projectId=self.project_id,\n                region=self.region,\n                body={\n                    "projectId": self.project_id,\n                    "clusterName": self.cluster_name,\n                    "config": self.cluster_config,\n                },\n            ).execute()\n        )\n\n        def iter_fn():\n            # TODO: Add logging\n            # See: https://bit.ly/2UW5JaN\n            cluster = self.get_cluster()\n            return cluster["status"]["state"] in {"RUNNING", "UPDATING"}\n\n        done = DataprocClient._iter_and_sleep_until_ready(iter_fn)\n        if not done:\n            cluster = self.get_cluster()\n            raise DataprocError(\n                "Could not provision cluster -- status: %s" % str(cluster["status"])\n            )\n\n    def get_cluster(self):\n        return self.dataproc_clusters.get(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def delete_cluster(self):\n        return self.dataproc_clusters.delete(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def submit_job(self, job_details):\n        return self.dataproc_jobs.submit(\n            projectId=self.project_id, region=self.region, body=job_details\n        ).execute()\n\n    def get_job(self, job_id):\n        return self.dataproc_jobs.get(\n            projectId=self.project_id, region=self.region, jobId=job_id\n        ).execute()\n\n    def wait_for_job(self, job_id, wait_timeout=TWENTY_MINUTES):\n        """This method polls job status every 5 seconds."""\n\n        # TODO: Add logging here print('Waiting for job ID {} to finish...'.format(job_id))\n        def iter_fn():\n            # See: https://bit.ly/2Lg2tHr\n            result = self.get_job(job_id)\n\n            # Handle exceptions\n            if result["status"]["state"] in {"CANCELLED", "ERROR"}:\n                raise DataprocError("Job 
error: %s" % str(result["status"]))\n\n            if result["status"]["state"] == "DONE":\n                return True\n\n            return False\n\n        done = DataprocClient._iter_and_sleep_until_ready(iter_fn, max_wait_time_sec=wait_timeout)\n        if not done:\n            job = self.get_job(job_id)\n            raise DataprocError("Job run timed out: %s" % str(job["status"]))\n\n    @staticmethod\n    def _iter_and_sleep_until_ready(\n        callable_fn, max_wait_time_sec=TWENTY_MINUTES, iter_time=DEFAULT_ITER_TIME_SEC\n    ):\n        """Iterates and sleeps until callable_fn returns true."""\n        # Wait for cluster ready state\n        ready, curr_iter = False, 0\n        max_iter = max_wait_time_sec / iter_time\n        while not ready and curr_iter < max_iter:\n            ready = callable_fn()\n            time.sleep(iter_time)\n            curr_iter += 1\n\n        # Will return false if ran up to max_iter without success\n        return ready\n\n    @contextmanager\n    def cluster_context_manager(self):\n        """Context manager allowing execution with a dataproc cluster.\n\n        Example:\n        .. code-block::\n            with context.resources.dataproc.cluster as cluster:\n                # do stuff...\n        """\n        self.create_cluster()\n        try:\n            yield self\n        finally:\n            self.delete_cluster()\n\n\n
[docs]class DataprocResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for connecting to a Dataproc cluster.\n\n Example:\n .. code-block::\n\n @asset\n def my_asset(dataproc: DataprocResource):\n with dataproc.get_client() as client:\n # client is a dagster_gcp.DataprocClient\n ...\n """\n\n project_id: str = Field(\n description=(\n "Required. Project ID for the project which the client acts on behalf of. Will be"\n " passed when creating a dataset/job."\n )\n )\n region: str = Field(description="The GCP region.")\n cluster_name: str = Field(\n description=(\n "Required. The cluster name. Cluster names within a project must be unique. Names of"\n " deleted clusters can be reused."\n )\n )\n cluster_config_yaml_path: Optional[str] = Field(\n default=None,\n description=(\n "Full path to a YAML file containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n cluster_config_json_path: Optional[str] = Field(\n default=None,\n description=(\n "Full path to a JSON file containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n cluster_config_dict: Optional[Dict[str, Any]] = Field(\n default=None,\n description=(\n "Python dictionary containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. 
Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def _read_yaml_config(self, path: str) -> Mapping[str, Any]:\n with open(path, "r", encoding="utf8") as f:\n return yaml.safe_load(f)\n\n def _read_json_config(self, path: str) -> Mapping[str, Any]:\n with open(path, "r", encoding="utf8") as f:\n return json.load(f)\n\n def _get_cluster_config(self) -> Optional[Mapping[str, Any]]:\n methods = 0\n methods += 1 if self.cluster_config_dict is not None else 0\n methods += 1 if self.cluster_config_json_path is not None else 0\n methods += 1 if self.cluster_config_yaml_path is not None else 0\n\n # ensure that at most 1 method is provided\n check.invariant(\n methods <= 1,\n "Dataproc Resource: Incorrect config: Cannot provide cluster config multiple ways."\n " Choose one of cluster_config_dict, cluster_config_json_path, or"\n " cluster_config_yaml_path",\n )\n\n cluster_config = None\n if self.cluster_config_json_path:\n cluster_config = self._read_json_config(self.cluster_config_json_path)\n elif self.cluster_config_yaml_path:\n cluster_config = self._read_yaml_config(self.cluster_config_yaml_path)\n elif self.cluster_config_dict:\n cluster_config = self.cluster_config_dict\n\n return cluster_config\n\n def get_client(self) -> DataprocClient:\n cluster_config = self._get_cluster_config()\n\n client_config_dict = {\n "projectId": self.project_id,\n "region": self.region,\n "clusterName": self.cluster_name,\n "cluster_config": cluster_config,\n }\n\n return DataprocClient(config=client_config_dict)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=define_dataproc_create_cluster_config(),\n description="Manage a Dataproc cluster resource",\n)\ndef dataproc_resource(context):\n return DataprocClient(context.resource_config)
\n
", "current_page_name": "_modules/dagster_gcp/dataproc/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.dataproc.resources"}}, "gcs": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.compute_log_manager

\nimport datetime\nimport json\nimport os\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._seven as seven\nfrom dagster import (\n    Field,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_type import Noneable\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom google.cloud import storage\nfrom typing_extensions import Self\n\n\n
[docs]class GCSComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to GCS.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_gcp.gcs.compute_log_manager\n class: GCSComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n upload_interval: 30\n\n There are more configuration examples in the instance documentation guide: https://docs.dagster.io/deployment/dagster-instance#compute-log-storage\n\n Args:\n bucket (str): The name of the GCS bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n json_credentials_envvar (Optional[str]): Environment variable that contains the JSON with a private key\n and other credentials information. If this is set, ``GOOGLE_APPLICATION_CREDENTIALS`` will be ignored.\n Can be used when the private key cannot be used as a file.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files to GCS. 
By default, will only upload when the capture is complete.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when instantiated from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n json_credentials_envvar=None,\n upload_interval=None,\n ):\n self._bucket_name = check.str_param(bucket, "bucket")\n self._prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n\n if json_credentials_envvar:\n json_info_str = os.environ.get(json_credentials_envvar)\n credentials_info = json.loads(json_info_str) # type: ignore # (possible none)\n self._bucket = (\n storage.Client()\n .from_service_account_info(credentials_info)\n .bucket(self._bucket_name)\n )\n else:\n self._bucket = storage.Client().bucket(self._bucket_name)\n\n # Check if the bucket exists\n check.invariant(self._bucket.exists())\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "json_credentials_envvar": Field(StringSource, is_required=False),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return GCSComputeLogManager(inst_data=inst_data, 
**config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _gcs_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._prefix, "storage", *namespace, filename]\n return "/".join(paths)\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self._local_manager.delete_logs(log_key, prefix)\n if log_key:\n gcs_keys_to_remove = [\n self._gcs_key(log_key, ComputeIOType.STDOUT),\n self._gcs_key(log_key, ComputeIOType.STDERR),\n self._gcs_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._gcs_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n # if the blob doesn't exist, do nothing instead of raising a not found exception\n self._bucket.delete_blobs(gcs_keys_to_remove, on_error=lambda _: None)\n elif prefix:\n # add the trailing '/' to make sure that ['a'] does not match ['apple']\n delete_prefix = "/".join([self._prefix, "storage", *prefix, ""])\n to_delete = self._bucket.list_blobs(prefix=delete_prefix)\n self._bucket.delete_blobs(list(to_delete))\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n gcs_key = self._gcs_key(log_key, io_type)\n try:\n return self._bucket.blob(gcs_key).generate_signed_url(\n expiration=datetime.timedelta(minutes=60)\n )\n except:\n # fallback to the local download url 
if the current credentials are insufficient to create\n # signed urls\n return self.local_manager.get_captured_log_download_url(log_key, io_type)\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n gcs_key = self._gcs_key(log_key, io_type)\n return f"gs://{self._bucket_name}/{gcs_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n gcs_key = self._gcs_key(log_key, io_type, partial)\n return self._bucket.blob(gcs_key).exists()\n\n def upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n\n if partial and os.stat(path).st_size == 0:\n return\n\n gcs_key = self._gcs_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n self._bucket.blob(gcs_key).upload_from_file(data)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n\n gcs_key = self._gcs_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n self._bucket.blob(gcs_key).download_to_file(fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n self._local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_gcp/gcs/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.compute_log_manager"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import Optional\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\nfrom google.cloud import storage\n\n\n
[docs]class GCSFileHandle(FileHandle):\n """A reference to a file on GCS."""\n\n def __init__(self, gcs_bucket: str, gcs_key: str):\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_key = check.str_param(gcs_key, "gcs_key")\n\n @property\n def gcs_bucket(self) -> str:\n """str: The name of the GCS bucket."""\n return self._gcs_bucket\n\n @property\n def gcs_key(self) -> str:\n """str: The GCS key."""\n return self._gcs_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's GCS URL."""\n return self.gcs_path\n\n @property\n def gcs_path(self) -> str:\n """str: The file's GCS URL."""\n return f"gs://{self.gcs_bucket}/{self.gcs_key}"
\n\n\nclass GCSFileManager(FileManager):\n def __init__(self, client, gcs_bucket, gcs_base_key):\n self._client = check.inst_param(client, "client", storage.client.Client)\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_base_key = check.str_param(gcs_base_key, "gcs_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n bucket_obj = self._client.bucket(file_handle.gcs_bucket)\n bucket_obj.blob(file_handle.gcs_key).download_to_file(temp_file_obj)\n self._local_handle_cache[file_handle.gcs_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", GCSFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.gcs_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.gcs_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None, key: Optional[str] = None):\n key = check.opt_str_param(key, "key", default=str(uuid.uuid4()))\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", key=key, ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None, key: Optional[str] = None):\n 
key = check.opt_str_param(key, "key", default=str(uuid.uuid4()))\n check_file_like_obj(file_obj)\n gcs_key = self.get_full_key(key + (("." + ext) if ext is not None else ""))\n bucket_obj = self._client.bucket(self._gcs_bucket)\n bucket_obj.blob(gcs_key).upload_from_file(file_obj)\n return GCSFileHandle(self._gcs_bucket, gcs_key)\n\n def get_full_key(self, file_key):\n return f"{self._gcs_base_key}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_gcp/gcs/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.io_manager

\nimport pickle\nfrom typing import Any, Optional, Union\n\nfrom dagster import (\n    ConfigurableIOManager,\n    InputContext,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.backoff import backoff\nfrom dagster._utils.cached_method import cached_method\nfrom google.api_core.exceptions import Forbidden, ServiceUnavailable, TooManyRequests\nfrom google.cloud import storage\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom .resources import GCSResource\n\nDEFAULT_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectGCSIOManager(UPathIOManager):\n    def __init__(self, bucket: str, client: Optional[Any] = None, prefix: str = "dagster"):\n        self.bucket = check.str_param(bucket, "bucket")\n        self.client = client or storage.Client()\n        self.bucket_obj = self.client.bucket(bucket)\n        check.invariant(self.bucket_obj.exists())\n        self.prefix = check.str_param(prefix, "prefix")\n        super().__init__(base_path=UPath(self.prefix))\n\n    def unlink(self, path: UPath) -> None:\n        key = str(path)\n        if self.bucket_obj.blob(key).exists():\n            self.bucket_obj.blob(key).delete()\n\n    def path_exists(self, path: UPath) -> bool:\n        key = str(path)\n        blobs = self.client.list_blobs(self.bucket, prefix=key)\n        return len(list(blobs)) > 0\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return UPath("storage", run_id, "files", *output_parts)\n\n    def get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading GCS object 
from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing GCS object at: {self._uri_for_path(path)}"\n\n    def _uri_for_path(self, path: UPath) -> str:\n        return f"gs://{self.bucket}/{path}"\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in GCP\n        return None\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        bytes_obj = self.bucket_obj.blob(str(path)).download_as_bytes()\n        return pickle.loads(bytes_obj)\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing GCS key: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n        backoff(\n            self.bucket_obj.blob(str(path)).upload_from_string,\n            args=[pickled_obj],\n            retry_on=(TooManyRequests, Forbidden, ServiceUnavailable),\n        )\n\n\n
[docs]class GCSPickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at ``<base_dir>/<asset_key>``. If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of ``/my/base/path``, an asset with key\n ``AssetKey(["one", "two", "three"])`` would be stored in a file called ``three`` in a directory\n with path ``/my/base/path/one/two/``.\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import asset, Definitions\n from dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": GCSPickleIOManager(\n gcs_bucket="my-cool-bucket",\n gcs_prefix="my-cool-prefix"\n ),\n "gcs": GCSResource(project="my-cool-project")\n }\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n @job(\n resource_defs={\n "io_manager": GCSPickleIOManager(\n gcs=GCSResource(project="my-cool-project")\n gcs_bucket="my-cool-bucket",\n gcs_prefix="my-cool-prefix"\n ),\n }\n )\n def my_job():\n ...\n """\n\n gcs: ResourceDependency[GCSResource]\n gcs_bucket: str = Field(description="GCS bucket to store files")\n gcs_prefix: str = Field(default="dagster", description="Prefix to add to all file paths")\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _internal_io_manager(self) -> PickledObjectGCSIOManager:\n return PickledObjectGCSIOManager(\n bucket=self.gcs_bucket, client=self.gcs.get_client(), prefix=self.gcs_prefix\n )\n\n def load_input(self, context: InputContext) -> Any:\n return self._internal_io_manager.load_input(context)\n\n def handle_output(self, context: OutputContext, obj: Any) -> None:\n self._internal_io_manager.handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use GCSPickleIOManager instead.",\n)\nclass ConfigurablePickledObjectGCSIOManager(GCSPickleIOManager):\n """Renamed to GCSPickleIOManager. See GCSPickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=GCSPickleIOManager.to_config_schema(),\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_io_manager(init_context):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at ``<base_dir>/<asset_key>``. If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of ``/my/base/path``, an asset with key\n ``AssetKey(["one", "two", "three"])`` would be stored in a file called ``three`` in a directory\n with path ``/my/base/path/one/two/``.\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": gcs_pickle_io_manager.configured(\n {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n ),\n "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n @job(\n resource_defs={\n "io_manager": gcs_pickle_io_manager.configured(\n {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n ),\n "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n },\n )\n def my_job():\n ...\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSIOManager(\n bucket=init_context.resource_config["gcs_bucket"],\n client=client,\n prefix=init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_gcp/gcs/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.io_manager"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.resources

\nfrom typing import Any, Optional\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom google.cloud import storage\nfrom pydantic import Field\n\nfrom .file_manager import GCSFileManager\n\n\n
[docs]class GCSResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for interacting with Google Cloud Storage.\n\n Example:\n .. code-block::\n\n @asset\n def my_asset(gcs: GCSResource):\n with gcs.get_client() as client:\n # client is a google.cloud.storage.Client\n ...\n """\n\n project: Optional[str] = Field(default=None, description="Project name")\n\n def get_client(self) -> storage.Client:\n """Creates a GCS Client.\n\n Returns: google.cloud.storage.Client\n """\n return _gcs_client_from_config(project=self.project)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=GCSResource.to_config_schema(),\n description="This resource provides a GCS client",\n)\ndef gcs_resource(init_context) -> storage.Client:\n return GCSResource.from_resource_context(init_context).get_client()
\n\n\n
[docs]class GCSFileManagerResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """FileManager that provides abstract access to GCS."""\n\n project: Optional[str] = Field(default=None, description="Project name")\n gcs_bucket: str = Field(description="GCS bucket to store files")\n gcs_prefix: str = Field(default="dagster", description="Prefix to add to all file paths")\n\n def get_client(self) -> GCSFileManager:\n """Creates a :py:class:`~dagster_gcp.GCSFileManager` object that implements the\n :py:class:`~dagster._core.storage.file_manager.FileManager` API .\n\n Returns: GCSFileManager\n """\n gcs_client = _gcs_client_from_config(project=self.project)\n return GCSFileManager(\n client=gcs_client,\n gcs_bucket=self.gcs_bucket,\n gcs_base_key=self.gcs_prefix,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=GCSFileManagerResource.to_config_schema())\ndef gcs_file_manager(context):\n """FileManager that provides abstract access to GCS.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n return GCSFileManagerResource.from_resource_context(context).get_client()
\n\n\ndef _gcs_client_from_config(project: Optional[str]) -> storage.Client:\n """Creates a GCS Client.\n\n Args:\n project: The GCP project\n\n Returns: A GCS client.\n """\n return storage.client.Client(project=project)\n
", "current_page_name": "_modules/dagster_gcp/gcs/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.resources"}}}, "dagster_gcp_pandas": {"bigquery": {"bigquery_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp_pandas.bigquery.bigquery_pandas_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pandas as pd\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_gcp.bigquery.io_manager import (\n    BigQueryClient,\n    BigQueryIOManager,\n    build_bigquery_io_manager,\n)\n\n\n
[docs]class BigQueryPandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Plugin for the BigQuery I/O Manager that can store and load Pandas DataFrames as BigQuery tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ):\n """Stores the pandas DataFrame in BigQuery."""\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n\n job = connection.load_table_from_dataframe(\n dataframe=with_uppercase_cols,\n destination=f"{table_slice.schema}.{table_slice.table}",\n project=table_slice.database,\n location=context.resource_config.get("location") if context.resource_config else None,\n timeout=context.resource_config.get("timeout") if context.resource_config else None,\n )\n job.result()\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) # type: ignore # (bad stubs)\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n """Loads the input as a Pandas DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n result = 
connection.query(\n query=BigQueryClient.get_select_statement(table_slice),\n project=table_slice.database,\n location=context.resource_config.get("location") if context.resource_config else None,\n timeout=context.resource_config.get("timeout") if context.resource_config else None,\n ).to_dataframe()\n\n result.columns = map(str.lower, result.columns)\n return result\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nbigquery_pandas_io_manager = build_bigquery_io_manager(\n [BigQueryPandasTypeHandler()], default_load_type=pd.DataFrame\n)\nbigquery_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_gcp_pandas import bigquery_pandas_io_manager\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_pandas_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n"""\n\n\n
[docs]class BigQueryPandasIOManager(BigQueryIOManager):\n """An I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp_pandas import BigQueryPandasIOManager\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": BigQueryPandasIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_gcp_pandas/bigquery/bigquery_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp_pandas.bigquery.bigquery_pandas_type_handler"}}}, "dagster_gcp_pyspark": {"bigquery": {"bigquery_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp_pyspark.bigquery.bigquery_pyspark_type_handler

\nfrom typing import Any, Mapping, Optional, Sequence, Type\n\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_gcp import BigQueryIOManager, build_bigquery_io_manager\nfrom dagster_gcp.bigquery.io_manager import BigQueryClient\nfrom pyspark.sql import DataFrame, SparkSession\nfrom pyspark.sql.types import StructType\n\n\ndef _get_bigquery_write_options(\n    config: Optional[Mapping[str, Any]], table_slice: TableSlice\n) -> Mapping[str, str]:\n    conf = {\n        "table": f"{table_slice.database}.{table_slice.schema}.{table_slice.table}",\n    }\n    if config and config.get("temporary_gcs_bucket") is not None:\n        conf["temporaryGcsBucket"] = config["temporary_gcs_bucket"]\n    else:\n        conf["writeMethod"] = "direct"\n    return conf\n\n\ndef _get_bigquery_read_options(table_slice: TableSlice) -> Mapping[str, str]:\n    conf = {"viewsEnabled": "true", "materializationDataset": table_slice.schema}\n    return conf\n\n\n
[docs]class BigQueryPySparkTypeHandler(DbTypeHandler[DataFrame]):\n """Plugin for the BigQuery I/O Manager that can store and load PySpark DataFrames as BigQuery tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _\n ) -> Mapping[str, RawMetadataValue]:\n options = _get_bigquery_write_options(context.resource_config, table_slice)\n\n with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])\n\n with_uppercase_cols.write.format("bigquery").options(**options).mode("append").save()\n\n return {\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=field.name, type=field.dataType.typeName())\n for field in obj.schema.fields\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame:\n options = _get_bigquery_read_options(table_slice)\n spark = SparkSession.builder.getOrCreate() # type: ignore\n\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n df = (\n spark.read.format("bigquery")\n .options(**options)\n .load(BigQueryClient.get_select_statement(table_slice))\n )\n\n return df.toDF(*[c.lower() for c in df.columns])\n\n @property\n def supported_types(self):\n return 
[DataFrame]
\n\n\nbigquery_pyspark_io_manager = build_bigquery_io_manager(\n [BigQueryPySparkTypeHandler()], default_load_type=DataFrame\n)\nbigquery_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_gcp_pyspark import bigquery_pyspark_io_manager\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_pyspark_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n"""\n\n\n
[docs]class BigQueryPySparkIOManager(BigQueryIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp_pyspark import BigQueryPySparkIOManager\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": BigQueryPySparkIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return DataFrame
\n
", "current_page_name": "_modules/dagster_gcp_pyspark/bigquery/bigquery_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp_pyspark.bigquery.bigquery_pyspark_type_handler"}}}, "dagster_ge": {"factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_ge.factory

\nimport datetime\nfrom typing import Any, Dict\n\nimport great_expectations as ge\nfrom dagster import (\n    ConfigurableResource,\n    ExpectationResult,\n    IAttachDifferentObjectToOpContext,\n    In,\n    MetadataValue,\n    OpExecutionContext,\n    Out,\n    Output,\n    _check as check,\n    op,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster_pandas import DataFrame\nfrom great_expectations.render.renderer import ValidationResultsPageRenderer\nfrom great_expectations.render.view import DefaultMarkdownPageView\nfrom pydantic import Field\n\ntry:\n    # ge < v0.13.0\n    from great_expectations.core import convert_to_json_serializable\nexcept ImportError:\n    # ge >= v0.13.0\n    from great_expectations.core.util import convert_to_json_serializable\n\n\nclass GEContextResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n    ge_root_dir: str = Field(\n        default=None,\n        description="The root directory for your Great Expectations project.",\n    )\n\n    def get_data_context(self):\n        if self.ge_root_dir is None:\n            return ge.data_context.DataContext()\n        return ge.data_context.DataContext(context_root_dir=self.ge_root_dir)\n\n    def get_object_to_set_on_execution_context(self):\n        return self.get_data_context()\n\n\n@dagster_maintained_resource\n@resource(config_schema=GEContextResource.to_config_schema())\ndef ge_data_context(context):\n    return GEContextResource.from_resource_context(context).get_data_context()\n\n\n
[docs]def ge_validation_op_factory(\n name,\n datasource_name,\n suite_name,\n validation_operator_name=None,\n input_dagster_type=DataFrame,\n batch_kwargs=None,\n):\n """Generates ops for interacting with GE.\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n validation_operator_name (Optional[str]): what validation operator to run -- defaults to\n None, which generates an ephemeral validator. If you want to save data docs, use\n 'action_list_operator'.\n See https://legacy.docs.greatexpectations.io/en/0.12.1/reference/core_concepts/validation_operators_and_actions.html#\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n `ge_data_context`'s `get_batch` method. 
Defaults to `{"dataset": dataset}`, where\n `dataset` is the input to the generated op.\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n """\n check.str_param(datasource_name, "datasource_name")\n check.str_param(suite_name, "suite_name")\n check.opt_str_param(validation_operator_name, "validation_operator_name")\n batch_kwargs = check.opt_dict_param(batch_kwargs, "batch_kwargs")\n\n @op(\n name=name,\n ins={"dataset": In(input_dagster_type)},\n out=Out(\n dict,\n description="""\n This op yields an expectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n ),\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context: OpExecutionContext, dataset):\n data_context = context.resources.ge_data_context\n\n if validation_operator_name is not None:\n validation_operator = validation_operator_name\n else:\n data_context.add_validation_operator(\n "ephemeral_validation",\n {"class_name": "ActionListValidationOperator", "action_list": []},\n )\n validation_operator = "ephemeral_validation"\n suite = data_context.get_expectation_suite(suite_name)\n final_batch_kwargs = batch_kwargs or {"dataset": dataset}\n if "datasource" in final_batch_kwargs:\n context.log.warning(\n "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "\n "parameter of the op factory instead."\n )\n final_batch_kwargs["datasource"] = datasource_name\n batch = data_context.get_batch(final_batch_kwargs, suite)\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = data_context.run_validation_operator(\n validation_operator, 
assets_to_validate=[batch], run_id=run_id\n )\n res = convert_to_json_serializable(results.list_validation_results())[0]\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = (\n validation_results_page_renderer.render_validation_operator_result(results)\n )\n md_str = " ".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n yield ExpectationResult(\n success=res["success"],\n metadata={"Expectation Results": MetadataValue.md(md_str)},\n )\n yield Output(res)\n\n return _ge_validation_fn
\n\n\ndef ge_validation_op_factory_v3(\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n """Generates ops for interacting with GE (v3 API).\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n data_connector_name (str): the name of the data connector for this datasource. This should\n point to a RuntimeDataConnector. For information on how to set this up, see:\n https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/how_to_create_a_batch_of_data_from_an_in_memory_spark_or_pandas_dataframe\n data_asset_name (str): the name of the data asset that this op will be validating.\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n batch_identifier_fn (dict): A dicitonary of batch identifiers to uniquely identify this\n batch of data. To learn more about batch identifiers, see:\n https://docs.greatexpectations.io/docs/reference/datasources#batches.\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n runtime_method_type (str): how GE should interperet the op input. One of ("batch_data",\n "path", "query"). Defaults to "batch_data", which will interperet the input as an\n in-memory object.\n extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s\n `get_validator` method. 
If not set, input will be:\n {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": data_asset_name,\n "runtime_parameters": {\n "<runtime_method_type>": <op input>\n },\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n }\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata and\n an output with all the metadata (for user processing)\n\n """\n check.str_param(datasource_name, "datasource_name")\n check.str_param(data_connector_name, "data_connector_name")\n check.str_param(suite_name, "suite_name")\n\n _extra_kwargs: Dict[Any, Any] = check.opt_dict_param(extra_kwargs, "extra_kwargs")\n\n @op(\n name=name,\n ins={"dataset": In(input_dagster_type)},\n out=Out(\n dict,\n description="""\n This op yields an ExpectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n ),\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context: OpExecutionContext, dataset):\n data_context = context.resources.ge_data_context\n\n validator_kwargs = {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": datasource_name or data_asset_name,\n "runtime_parameters": {runtime_method_type: dataset},\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n **_extra_kwargs,\n }\n validator = data_context.get_validator(**validator_kwargs)\n\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = validator.validate(run_id=run_id)\n\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = 
validation_results_page_renderer.render(\n validation_results=results\n )\n md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n yield ExpectationResult(\n success=bool(results["success"]),\n metadata={"Expectation Results": MetadataValue.md(md_str)},\n )\n yield Output(results.to_json_dict())\n\n return _ge_validation_fn\n
", "current_page_name": "_modules/dagster_ge/factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_ge.factory"}}, "dagster_github": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_github.resources

\nimport time\nfrom datetime import datetime\nfrom typing import Optional\n\nimport jwt\nimport requests\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\n\ndef to_seconds(dt):\n    return (dt - datetime(1970, 1, 1)).total_seconds()\n\n\nclass GithubClient:\n    def __init__(\n        self, client, app_id, app_private_rsa_key, default_installation_id, hostname=None\n    ) -> None:\n        self.client = client\n        self.app_private_rsa_key = app_private_rsa_key\n        self.app_id = app_id\n        self.default_installation_id = default_installation_id\n        self.installation_tokens = {}\n        self.app_token = {}\n        self.hostname = hostname\n\n    def __set_app_token(self):\n        # from https://developer.github.com/apps/building-github-apps/authenticating-with-github-apps/\n        # needing to self-sign a JWT\n        now = int(time.time())\n        # JWT expiration time (10 minute maximum)\n        expires = now + (10 * 60)\n        encoded_token = jwt.encode(\n            {\n                # issued at time\n                "iat": now,\n                # JWT expiration time\n                "exp": expires,\n                # GitHub App's identifier\n                "iss": self.app_id,\n            },\n            self.app_private_rsa_key,\n            algorithm="RS256",\n        )\n        self.app_token = {\n            "value": encoded_token,\n            "expires": expires,\n        }\n\n    def __check_app_token(self):\n        if ("expires" not in self.app_token) or (\n            self.app_token["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_app_token()\n\n    def get_installations(self, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] 
= "application/vnd.github.machine-man-preview+json"\n        request = self.client.get(\n            (\n                "https://api.github.com/app/installations"\n                if self.hostname is None\n                else f"https://{self.hostname}/api/v3/app/installations"\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def __set_installation_token(self, installation_id, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n        request = requests.post(\n            (\n                f"https://api.github.com/app/installations/{installation_id}/access_tokens"\n                if self.hostname is None\n                else "https://{}/api/v3/app/installations/{}/access_tokens".format(\n                    self.hostname, installation_id\n                )\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        auth = request.json()\n        self.installation_tokens[installation_id] = {\n            "value": auth["token"],\n            "expires": to_seconds(datetime.strptime(auth["expires_at"], "%Y-%m-%dT%H:%M:%SZ")),\n        }\n\n    def __check_installation_tokens(self, installation_id):\n        if (installation_id not in self.installation_tokens) or (\n            self.installation_tokens[installation_id]["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_installation_token(installation_id)\n\n    def execute(self, query, variables, headers=None, installation_id=None):\n        if headers is None:\n            headers = {}\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        self.__check_installation_tokens(installation_id)\n        
headers["Authorization"] = "token {}".format(\n            self.installation_tokens[installation_id]["value"]\n        )\n        request = requests.post(\n            (\n                "https://api.github.com/graphql"\n                if self.hostname is None\n                else f"https://{self.hostname}/api/graphql"\n            ),\n            json={"query": query, "variables": variables},\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def create_issue(self, repo_name, repo_owner, title, body, installation_id=None):\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        res = self.execute(\n            query="""\n            query get_repo_id($repo_name: String!, $repo_owner: String!) {\n                repository(name: $repo_name, owner: $repo_owner) {\n                    id\n                }\n            }\n            """,\n            variables={"repo_name": repo_name, "repo_owner": repo_owner},\n            installation_id=installation_id,\n        )\n\n        return self.execute(\n            query="""\n                mutation CreateIssue($id: ID!, $title: String!, $body: String!) {\n                createIssue(input: {\n                    repositoryId: $id,\n                    title: $title,\n                    body: $body\n                }) {\n                    clientMutationId,\n                    issue {\n                        body\n                        title\n                        url\n                    }\n                }\n                }\n            """,\n            variables={\n                "id": res["data"]["repository"]["id"],\n                "title": title,\n                "body": body,\n            },\n            installation_id=installation_id,\n        )\n\n\n
[docs]class GithubResource(ConfigurableResource):\n github_app_id: int = Field(\n description="Github Application ID, for more info see https://developer.github.com/apps/",\n )\n github_app_private_rsa_key: str = Field(\n description=(\n "Github Application Private RSA key text, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n github_installation_id: Optional[int] = Field(\n default=None,\n description=(\n "Github Application Installation ID, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n github_hostname: Optional[str] = Field(\n default=None,\n description=(\n "Github hostname. Defaults to `api.github.com`, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> GithubClient:\n return GithubClient(\n client=requests.Session(),\n app_id=self.github_app_id,\n app_private_rsa_key=self.github_app_private_rsa_key,\n default_installation_id=self.github_installation_id,\n hostname=self.github_hostname,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=GithubResource.to_config_schema(),\n description="This resource is for connecting to Github",\n)\ndef github_resource(context) -> GithubClient:\n return GithubResource(**context.resource_config).get_client()
\n
", "current_page_name": "_modules/dagster_github/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_github.resources"}}, "dagster_graphql": {"client": {"client": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_graphql.client.client

\nfrom itertools import chain\nfrom typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Union\n\nimport dagster._check as check\nimport requests.exceptions\nfrom dagster import DagsterRunStatus\nfrom dagster._annotations import deprecated, public\nfrom dagster._core.definitions.run_config import RunConfig, convert_config_input\nfrom dagster._core.definitions.utils import validate_tags\nfrom gql import Client, gql\nfrom gql.transport import Transport\nfrom gql.transport.requests import RequestsHTTPTransport\n\nfrom .client_queries import (\n    CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY,\n    CLIENT_SUBMIT_PIPELINE_RUN_MUTATION,\n    GET_PIPELINE_RUN_STATUS_QUERY,\n    RELOAD_REPOSITORY_LOCATION_MUTATION,\n    SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n    TERMINATE_RUN_JOB_MUTATION,\n)\nfrom .utils import (\n    DagsterGraphQLClientError,\n    InvalidOutputErrorInfo,\n    JobInfo,\n    ReloadRepositoryLocationInfo,\n    ReloadRepositoryLocationStatus,\n    ShutdownRepositoryLocationInfo,\n    ShutdownRepositoryLocationStatus,\n)\n\n\n
[docs]class DagsterGraphQLClient:\n """Official Dagster Python Client for GraphQL.\n\n Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server\n\n As of now, all operations on this client are synchronous.\n\n Intended usage:\n\n .. code-block:: python\n\n client = DagsterGraphQLClient("localhost", port_number=3000)\n status = client.get_run_status(**SOME_RUN_ID**)\n\n Args:\n hostname (str): Hostname for the Dagster GraphQL API, like `localhost` or\n `dagster.YOUR_ORG_HERE`.\n port_number (Optional[int]): Port number to connect to on the host.\n Defaults to None.\n transport (Optional[Transport], optional): A custom transport to use to connect to the\n GraphQL API with (e.g. for custom auth). Defaults to None.\n use_https (bool, optional): Whether to use https in the URL connection string for the\n GraphQL API. Defaults to False.\n timeout (int): Number of seconds before requests should time out. Defaults to 60.\n headers (Optional[Dict[str, str]]): Additional headers to include in the request. 
To use\n this client in Dagster Cloud, set the "Dagster-Cloud-Api-Token" header to a user token\n generated in the Dagster Cloud UI.\n\n Raises:\n :py:class:`~requests.exceptions.ConnectionError`: if the client cannot connect to the host.\n """\n\n def __init__(\n self,\n hostname: str,\n port_number: Optional[int] = None,\n transport: Optional[Transport] = None,\n use_https: bool = False,\n timeout: int = 300,\n headers: Optional[Dict[str, str]] = None,\n ):\n self._hostname = check.str_param(hostname, "hostname")\n self._port_number = check.opt_int_param(port_number, "port_number")\n self._use_https = check.bool_param(use_https, "use_https")\n\n self._url = (\n ("https://" if self._use_https else "http://")\n + (f"{self._hostname}:{self._port_number}" if self._port_number else self._hostname)\n + "/graphql"\n )\n\n self._transport = check.opt_inst_param(\n transport,\n "transport",\n Transport,\n default=RequestsHTTPTransport(\n url=self._url, use_json=True, timeout=timeout, headers=headers\n ),\n )\n try:\n self._client = Client(transport=self._transport, fetch_schema_from_transport=True)\n except requests.exceptions.ConnectionError as exc:\n raise DagsterGraphQLClientError(\n f"Error when connecting to url {self._url}. 
"\n + f"Did you specify hostname: {self._hostname} "\n + (f"and port_number: {self._port_number} " if self._port_number else "")\n + "correctly?"\n ) from exc\n\n def _execute(self, query: str, variables: Optional[Dict[str, Any]] = None):\n try:\n return self._client.execute(gql(query), variable_values=variables)\n except Exception as exc: # catch generic Exception from the gql client\n raise DagsterGraphQLClientError(\n f"Exception occured during execution of query \\n{query}\\n with variables"\n f" \\n{variables}\\n"\n ) from exc\n\n def _get_repo_locations_and_names_with_pipeline(self, job_name: str) -> List[JobInfo]:\n res_data = self._execute(CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY)\n query_res = res_data["repositoriesOrError"]\n repo_connection_status = query_res["__typename"]\n if repo_connection_status == "RepositoryConnection":\n valid_nodes: Iterable[JobInfo] = chain(*map(JobInfo.from_node, query_res["nodes"]))\n return [info for info in valid_nodes if info.job_name == job_name]\n else:\n raise DagsterGraphQLClientError(repo_connection_status, query_res["message"])\n\n def _core_submit_execution(\n self,\n pipeline_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Union[RunConfig, Mapping[str, Any]]] = None,\n mode: str = "default",\n preset: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n op_selection: Optional[Sequence[str]] = None,\n is_using_job_op_graph_apis: Optional[bool] = False,\n ):\n check.opt_str_param(repository_location_name, "repository_location_name")\n check.opt_str_param(repository_name, "repository_name")\n check.str_param(pipeline_name, "pipeline_name")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n run_config = check.opt_mapping_param(convert_config_input(run_config), "run_config")\n\n # The following invariant will never fail when a job is executed\n check.invariant(\n (mode is not None and 
run_config is not None) or preset is not None,\n "Either a mode and run_config or a preset must be specified in order to "\n f"submit the pipeline {pipeline_name} for execution",\n )\n tags = validate_tags(tags)\n\n pipeline_or_job = "Job" if is_using_job_op_graph_apis else "Pipeline"\n\n if not repository_location_name or not repository_name:\n job_info_lst = self._get_repo_locations_and_names_with_pipeline(pipeline_name)\n if len(job_info_lst) == 0:\n raise DagsterGraphQLClientError(\n f"{pipeline_or_job}NotFoundError",\n f"No {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name"\n f" `{pipeline_name}` exist",\n )\n elif len(job_info_lst) == 1:\n job_info = job_info_lst[0]\n repository_location_name = job_info.repository_location_name\n repository_name = job_info.repository_name\n else:\n raise DagsterGraphQLClientError(\n "Must specify repository_location_name and repository_name since there are"\n f" multiple {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the"\n f" name {pipeline_name}.\\n\\tchoose one of: {job_info_lst}"\n )\n\n variables: Dict[str, Any] = {\n "executionParams": {\n "selector": {\n "repositoryLocationName": repository_location_name,\n "repositoryName": repository_name,\n "pipelineName": pipeline_name,\n "solidSelection": op_selection,\n }\n }\n }\n if preset is not None:\n variables["executionParams"]["preset"] = preset\n if mode is not None and run_config is not None:\n variables["executionParams"] = {\n **variables["executionParams"],\n "runConfigData": run_config,\n "mode": mode,\n "executionMetadata": (\n {"tags": [{"key": k, "value": v} for k, v in tags.items()]} if tags else {}\n ),\n }\n\n res_data: Dict[str, Any] = self._execute(CLIENT_SUBMIT_PIPELINE_RUN_MUTATION, variables)\n query_result = res_data["launchPipelineExecution"]\n query_result_type = query_result["__typename"]\n if (\n query_result_type == "LaunchRunSuccess"\n or query_result_type == "LaunchPipelineRunSuccess"\n ):\n return 
query_result["run"]["runId"]\n elif query_result_type == "InvalidStepError":\n raise DagsterGraphQLClientError(query_result_type, query_result["invalidStepKey"])\n elif query_result_type == "InvalidOutputError":\n error_info = InvalidOutputErrorInfo(\n step_key=query_result["stepKey"],\n invalid_output_name=query_result["invalidOutputName"],\n )\n raise DagsterGraphQLClientError(query_result_type, body=error_info)\n elif (\n query_result_type == "RunConfigValidationInvalid"\n or query_result_type == "PipelineConfigValidationInvalid"\n ):\n raise DagsterGraphQLClientError(query_result_type, query_result["errors"])\n else:\n # query_result_type is a ConflictingExecutionParamsError, a PresetNotFoundError\n # a PipelineNotFoundError, a RunConflict, or a PythonError\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])\n\n
[docs] @public\n def submit_job_execution(\n self,\n job_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Dict[str, Any]] = None,\n tags: Optional[Dict[str, Any]] = None,\n op_selection: Optional[Sequence[str]] = None,\n ) -> str:\n """Submits a job with attached configuration for execution.\n\n Args:\n job_name (str): The job's name\n repository_location_name (Optional[str]): The name of the repository location where\n the job is located. If omitted, the client will try to infer the repository location\n from the available options on the Dagster deployment. Defaults to None.\n repository_name (Optional[str]): The name of the repository where the job is located.\n If omitted, the client will try to infer the repository from the available options\n on the Dagster deployment. Defaults to None.\n run_config (Optional[Dict[str, Any]]): This is the run config to execute the job with.\n Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\n an arbitrary object for run config. However, it must conform to the constraints of the config\n schema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\n JobConfigValidationInvalid. 
Defaults to None.\n tags (Optional[Dict[str, Any]]): A set of tags to add to the job execution.\n\n Raises:\n DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the job has an invalid step\n DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.\n DagsterGraphQLClientError("RunConflict", message): a `DagsterRunConflict` occured during execution.\n This indicates that a conflicting job run already exists in run storage.\n DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format\n for the job\n DagsterGraphQLClientError("JobNotFoundError", message): the requested job does not exist\n DagsterGraphQLClientError("PythonError", message): an internal framework error occurred\n\n Returns:\n str: run id of the submitted pipeline run\n """\n return self._core_submit_execution(\n pipeline_name=job_name,\n repository_location_name=repository_location_name,\n repository_name=repository_name,\n run_config=run_config,\n mode="default",\n preset=None,\n tags=tags,\n op_selection=op_selection,\n is_using_job_op_graph_apis=True,\n )
\n\n
[docs] @public\n def get_run_status(self, run_id: str) -> DagsterRunStatus:\n """Get the status of a given Pipeline Run.\n\n Args:\n run_id (str): run id of the requested pipeline run.\n\n Raises:\n DagsterGraphQLClientError("PipelineNotFoundError", message): if the requested run id is not found\n DagsterGraphQLClientError("PythonError", message): on internal framework errors\n\n Returns:\n DagsterRunStatus: returns a status Enum describing the state of the requested pipeline run\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n GET_PIPELINE_RUN_STATUS_QUERY, {"runId": run_id}\n )\n query_result: Dict[str, Any] = res_data["pipelineRunOrError"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "PipelineRun" or query_result_type == "Run":\n return DagsterRunStatus(query_result["status"])\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n\n
[docs] @public\n def reload_repository_location(\n self, repository_location_name: str\n ) -> ReloadRepositoryLocationInfo:\n """Reloads a Dagster Repository Location, which reloads all repositories in that repository location.\n\n This is useful in a variety of contexts, including refreshing the Dagster UI without restarting\n the server.\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ReloadRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n RELOAD_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["reloadRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "WorkspaceLocationEntry":\n location_or_error_type = query_result["locationOrLoadError"]["__typename"]\n if location_or_error_type == "RepositoryLocation":\n return ReloadRepositoryLocationInfo(status=ReloadRepositoryLocationStatus.SUCCESS)\n else:\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type="PythonError",\n message=query_result["locationOrLoadError"]["message"],\n )\n else:\n # query_result_type is either ReloadNotSupported or RepositoryLocationNotFound\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type=query_result_type,\n message=query_result["message"],\n )
\n\n
[docs] @deprecated(breaking_version="2.0")\n @public\n def shutdown_repository_location(\n self, repository_location_name: str\n ) -> ShutdownRepositoryLocationInfo:\n """Shuts down the server that is serving metadata for the provided repository location.\n\n This is primarily useful when you want the server to be restarted by the compute environment\n in which it is running (for example, in Kubernetes, the pod in which the server is running\n will automatically restart when the server is shut down, and the repository metadata will\n be reloaded)\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ShutdownRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["shutdownRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "ShutdownRepositoryLocationSuccess":\n return ShutdownRepositoryLocationInfo(status=ShutdownRepositoryLocationStatus.SUCCESS)\n elif (\n query_result_type == "RepositoryLocationNotFound" or query_result_type == "PythonError"\n ):\n return ShutdownRepositoryLocationInfo(\n status=ShutdownRepositoryLocationStatus.FAILURE,\n message=query_result["message"],\n )\n else:\n raise Exception(f"Unexpected query result type {query_result_type}")
\n\n def terminate_run(self, run_id: str):\n """Terminates a pipeline run. This method it is useful when you would like to stop a pipeline run\n based on a external event.\n\n Args:\n run_id (str): The run id of the pipeline run to terminate\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n TERMINATE_RUN_JOB_MUTATION, {"runId": run_id}\n )\n\n query_result: Dict[str, Any] = res_data["terminateRun"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "TerminateRunSuccess":\n return\n\n elif query_result_type == "RunNotFoundError":\n raise DagsterGraphQLClientError("RunNotFoundError", f"Run Id {run_id} not found")\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n
", "current_page_name": "_modules/dagster_graphql/client/client", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_graphql.client.client"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_graphql.client.utils

\nfrom enum import Enum\nfrom typing import Any, Dict, List, NamedTuple, Optional\n\n\n
[docs]class DagsterGraphQLClientError(Exception):\n def __init__(self, *args, body=None):\n super().__init__(*args)\n self.body = body
\n\n\n
[docs]class ReloadRepositoryLocationStatus(Enum):\n """This enum describes the status of a GraphQL mutation to reload a Dagster repository location.\n\n Args:\n Enum (str): can be either `ReloadRepositoryLocationStatus.SUCCESS`\n or `ReloadRepositoryLocationStatus.FAILURE`.\n """\n\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"
\n\n\nclass ShutdownRepositoryLocationStatus(Enum):\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n\n\n
[docs]class ReloadRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of reloading\n a Dagster repository location with a GraphQL mutation.\n\n Args:\n status (ReloadRepositoryLocationStatus): The status of the reload repository location mutation\n failure_type: (Optional[str], optional): the failure type if `status == ReloadRepositoryLocationStatus.FAILURE`.\n Can be one of `ReloadNotSupported`, `RepositoryLocationNotFound`, or `RepositoryLocationLoadFailure`. Defaults to None.\n message (Optional[str], optional): the failure message/reason if\n `status == ReloadRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ReloadRepositoryLocationStatus\n failure_type: Optional[str] = None\n message: Optional[str] = None
\n\n\nclass ShutdownRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of shutting down the server for\n a Dagster repository location using a GraphQL mutation.\n\n Args:\n status (ShutdownRepositoryLocationStatus) Whether the shutdown succeeded or failed.\n message (Optional[str], optional): the failure message/reason if\n `status == ShutdownRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ShutdownRepositoryLocationStatus\n message: Optional[str] = None\n\n\nclass JobInfo(NamedTuple):\n repository_location_name: str\n repository_name: str\n job_name: str\n\n @staticmethod\n def from_node(node: Dict[str, Any]) -> List["JobInfo"]:\n repo_name = node["name"]\n repo_location_name = node["location"]["name"]\n return [\n JobInfo(\n repository_location_name=repo_location_name,\n repository_name=repo_name,\n job_name=job["name"],\n )\n for job in node["pipelines"]\n ]\n\n\n
[docs]class InvalidOutputErrorInfo(NamedTuple):\n """This class gives information about an InvalidOutputError from submitting a pipeline for execution\n from GraphQL.\n\n Args:\n step_key (str): key of the step that failed\n invalid_output_name (str): the name of the invalid output from the given step\n """\n\n step_key: str\n invalid_output_name: str
\n
", "current_page_name": "_modules/dagster_graphql/client/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_graphql.client.utils"}}}, "dagster_k8s": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.executor

\nfrom typing import Iterator, List, Optional, cast\n\nimport kubernetes.config\nfrom dagster import (\n    Field,\n    IntSource,\n    Noneable,\n    StringSource,\n    _check as check,\n    executor,\n)\nfrom dagster._core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.executor.init import InitExecutorContext\nfrom dagster._core.executor.step_delegating import (\n    CheckStepHealthResult,\n    StepDelegatingExecutor,\n    StepHandler,\n    StepHandlerContext,\n)\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_k8s.launcher import K8sRunLauncher\n\nfrom .client import DagsterKubernetesClient\nfrom .container_context import K8sContainerContext\nfrom .job import (\n    USER_DEFINED_K8S_CONFIG_SCHEMA,\n    DagsterK8sJobConfig,\n    UserDefinedDagsterK8sConfig,\n    construct_dagster_k8s_job,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\n\n_K8S_EXECUTOR_CONFIG_SCHEMA = merge_dicts(\n    DagsterK8sJobConfig.config_type_job(),\n    {\n        "load_incluster_config": Field(\n            bool,\n            is_required=False,\n            description="""Whether or not the executor is running within a k8s cluster already. If\n            the job is using the `K8sRunLauncher`, the default value of this parameter will be\n            the same as the corresponding value on the run launcher.\n            If ``True``, we assume the executor is running within the target cluster and load config\n            using ``kubernetes.config.load_incluster_config``. 
Otherwise, we will use the k8s config\n            specified in ``kubeconfig_file`` (using ``kubernetes.config.load_kube_config``) or fall\n            back to the default kubeconfig.""",\n        ),\n        "kubeconfig_file": Field(\n            Noneable(str),\n            is_required=False,\n            description="""Path to a kubeconfig file to use, if not using default kubeconfig. If\n            the job is using the `K8sRunLauncher`, the default value of this parameter will be\n            the same as the corresponding value on the run launcher.""",\n        ),\n        "job_namespace": Field(StringSource, is_required=False),\n        "retries": get_retries_config(),\n        "max_concurrent": Field(\n            IntSource,\n            is_required=False,\n            description=(\n                "Limit on the number of pods that will run concurrently within the scope "\n                "of a Dagster run. Note that this limit is per run, not global."\n            ),\n        ),\n        "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n        "step_k8s_config": Field(\n            USER_DEFINED_K8S_CONFIG_SCHEMA,\n            is_required=False,\n            description="Raw Kubernetes configuration for each step launched by the executor.",\n        ),\n    },\n)\n\n\n
[docs]@executor(\n name="k8s",\n config_schema=_K8S_EXECUTOR_CONFIG_SCHEMA,\n requirements=multiple_process_executor_requirements(),\n)\ndef k8s_job_executor(init_context: InitExecutorContext) -> Executor:\n """Executor which launches steps as Kubernetes Jobs.\n\n To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_namespace: 'some-namespace'\n image_pull_policy: ...\n image_pull_secrets: ...\n service_account_name: ...\n env_config_maps: ...\n env_secrets: ...\n env_vars: ...\n job_image: ... # leave out if using userDeployments\n max_concurrent: ...\n\n `max_concurrent` limits the number of pods that will execute concurrently for one run. By default\n there is no limit- it will maximally parallel as allowed by the DAG. Note that this is not a\n global limit.\n\n Configuration set on the Kubernetes Jobs and Pods created by the `K8sRunLauncher` will also be\n set on Kubernetes Jobs and Pods created by the `k8s_job_executor`.\n\n Configuration set using `tags` on a `@job` will only apply to the `run` level. 
For configuration\n to apply at each `step` it must be set using `tags` for each `@op`.\n """\n run_launcher = (\n init_context.instance.run_launcher\n if isinstance(init_context.instance.run_launcher, K8sRunLauncher)\n else None\n )\n\n exc_cfg = init_context.executor_config\n\n k8s_container_context = K8sContainerContext(\n image_pull_policy=exc_cfg.get("image_pull_policy"), # type: ignore\n image_pull_secrets=exc_cfg.get("image_pull_secrets"), # type: ignore\n service_account_name=exc_cfg.get("service_account_name"), # type: ignore\n env_config_maps=exc_cfg.get("env_config_maps"), # type: ignore\n env_secrets=exc_cfg.get("env_secrets"), # type: ignore\n env_vars=exc_cfg.get("env_vars"), # type: ignore\n volume_mounts=exc_cfg.get("volume_mounts"), # type: ignore\n volumes=exc_cfg.get("volumes"), # type: ignore\n labels=exc_cfg.get("labels"), # type: ignore\n namespace=exc_cfg.get("job_namespace"), # type: ignore\n resources=exc_cfg.get("resources"), # type: ignore\n scheduler_name=exc_cfg.get("scheduler_name"), # type: ignore\n # step_k8s_config feeds into the run_k8s_config field because it is merged\n # with any configuration for the run that was set on the run launcher or code location\n run_k8s_config=UserDefinedDagsterK8sConfig.from_dict(exc_cfg.get("step_k8s_config", {})),\n )\n\n if "load_incluster_config" in exc_cfg:\n load_incluster_config = cast(bool, exc_cfg["load_incluster_config"])\n else:\n load_incluster_config = run_launcher.load_incluster_config if run_launcher else True\n\n if "kubeconfig_file" in exc_cfg:\n kubeconfig_file = cast(Optional[str], exc_cfg["kubeconfig_file"])\n else:\n kubeconfig_file = run_launcher.kubeconfig_file if run_launcher else None\n\n return StepDelegatingExecutor(\n K8sStepHandler(\n image=exc_cfg.get("job_image"), # type: ignore\n container_context=k8s_container_context,\n load_incluster_config=load_incluster_config,\n kubeconfig_file=kubeconfig_file,\n ),\n retries=RetryMode.from_config(exc_cfg["retries"]), # type: 
ignore\n max_concurrent=check.opt_int_elem(exc_cfg, "max_concurrent"),\n tag_concurrency_limits=check.opt_list_elem(exc_cfg, "tag_concurrency_limits"),\n should_verify_step=True,\n )
\n\n\nclass K8sStepHandler(StepHandler):\n @property\n def name(self):\n return "K8sStepHandler"\n\n def __init__(\n self,\n image: Optional[str],\n container_context: K8sContainerContext,\n load_incluster_config: bool,\n kubeconfig_file: Optional[str],\n k8s_client_batch_api=None,\n ):\n super().__init__()\n\n self._executor_image = check.opt_str_param(image, "image")\n self._executor_container_context = check.inst_param(\n container_context, "container_context", K8sContainerContext\n )\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n def _get_step_key(self, step_handler_context: StepHandlerContext) -> str:\n step_keys_to_execute = cast(\n List[str], step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n return step_keys_to_execute[0]\n\n def _get_container_context(\n self, step_handler_context: StepHandlerContext\n ) -> K8sContainerContext:\n step_key = self._get_step_key(step_handler_context)\n\n context = K8sContainerContext.create_for_run(\n step_handler_context.dagster_run,\n cast(K8sRunLauncher, step_handler_context.instance.run_launcher),\n include_run_tags=False, # For now don't include job-level dagster-k8s/config tags in step pods\n )\n context = context.merge(self._executor_container_context)\n\n user_defined_k8s_config = get_user_defined_k8s_config(\n step_handler_context.step_tags[step_key]\n )\n return context.merge(K8sContainerContext(run_k8s_config=user_defined_k8s_config))\n\n def _get_k8s_step_job_name(self, step_handler_context: StepHandlerContext):\n step_key = 
self._get_step_key(step_handler_context)\n\n name_key = get_k8s_job_name(\n step_handler_context.execute_step_args.run_id,\n step_key,\n )\n\n if step_handler_context.execute_step_args.known_state:\n retry_state = step_handler_context.execute_step_args.known_state.get_retry_state()\n if retry_state.get_attempt_count(step_key):\n return "dagster-step-%s-%d" % (name_key, retry_state.get_attempt_count(step_key))\n\n return "dagster-step-%s" % (name_key)\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n pod_name = job_name\n\n container_context = self._get_container_context(step_handler_context)\n\n job_config = container_context.get_k8s_job_config(\n self._executor_image, step_handler_context.instance.run_launcher\n )\n\n args = step_handler_context.execute_step_args.get_command_args(\n skip_serialized_namedtuple=True\n )\n\n if not job_config.job_image:\n job_config = job_config.with_image(\n step_handler_context.execute_step_args.job_origin.repository_origin.container_image\n )\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the job")\n\n run = step_handler_context.dagster_run\n labels = {\n "dagster/job": run.job_name,\n "dagster/op": step_key,\n "dagster/run-id": step_handler_context.execute_step_args.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="step_worker",\n user_defined_k8s_config=container_context.run_k8s_config,\n labels=labels,\n env_vars=[\n *step_handler_context.execute_step_args.get_command_env(),\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": run.job_name,\n },\n {"name": 
"DAGSTER_RUN_STEP_KEY", "value": step_key},\n *container_context.env,\n ],\n )\n\n yield DagsterEvent.step_worker_starting(\n step_handler_context.get_step_context(step_key),\n message=f'Executing step "{step_key}" in Kubernetes job {job_name}.',\n metadata={\n "Kubernetes Job name": MetadataValue.text(job_name),\n },\n )\n\n namespace = check.not_none(container_context.namespace)\n self._api_client.create_namespaced_job_with_retries(body=job, namespace=namespace)\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> CheckStepHealthResult:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n\n container_context = self._get_container_context(step_handler_context)\n\n status = self._api_client.get_job_status(\n namespace=container_context.namespace,\n job_name=job_name,\n )\n if status.failed:\n return CheckStepHealthResult.unhealthy(\n reason=f"Discovered failed Kubernetes job {job_name} for step {step_key}.",\n )\n\n return CheckStepHealthResult.healthy()\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n container_context = self._get_container_context(step_handler_context)\n\n yield DagsterEvent.engine_event(\n step_handler_context.get_step_context(step_key),\n message=f"Deleting Kubernetes job {job_name} for step",\n event_specific_data=EngineEventData(),\n )\n\n self._api_client.delete_job(job_name=job_name, namespace=container_context.namespace)\n
", "current_page_name": "_modules/dagster_k8s/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.executor"}, "launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.launcher

\nimport logging\nimport sys\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport kubernetes\nfrom dagster import (\n    _check as check,\n)\nfrom dagster._cli.api import ExecuteRunArgs\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.launcher import LaunchRunContext, ResumeRunContext, RunLauncher\nfrom dagster._core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._grpc.types import ResumeRunArgs\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .client import DagsterKubernetesClient\nfrom .container_context import K8sContainerContext\nfrom .job import DagsterK8sJobConfig, construct_dagster_k8s_job, get_job_name_from_run_id\n\n\n
[docs]class K8sRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a Kubernetes Job for each Dagster job run.\n\n Encapsulates each run in a separate, isolated invocation of ``dagster-graphql``.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n service_account_name: your_service_account\n job_image: my_project/dagster_image:latest\n instance_config_map: dagster-instance\n postgres_password_secret: dagster-postgresql-secret\n\n """\n\n def __init__(\n self,\n service_account_name,\n instance_config_map,\n postgres_password_secret=None,\n dagster_home=None,\n job_image=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n load_incluster_config=True,\n kubeconfig_file=None,\n inst_data: Optional[ConfigurableClassData] = None,\n job_namespace="default",\n env_config_maps=None,\n env_secrets=None,\n env_vars=None,\n k8s_client_batch_api=None,\n volume_mounts=None,\n volumes=None,\n labels=None,\n fail_pod_on_run_failure=None,\n resources=None,\n scheduler_name=None,\n security_context=None,\n run_k8s_config=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.job_namespace = check.str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = load_incluster_config\n self.kubeconfig_file = kubeconfig_file\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n self._job_config = None\n self._job_image = 
check.opt_str_param(job_image, "job_image")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._service_account_name = check.str_param(service_account_name, "service_account_name")\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.postgres_password_secret = check.opt_str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n self._env_vars = check.opt_list_param(env_vars, "env_vars", of_type=str)\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n self._labels: Mapping[str, str] = check.opt_mapping_param(\n labels, "labels", key_type=str, value_type=str\n )\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self._resources: Mapping[str, Any] = check.opt_mapping_param(resources, "resources")\n self._scheduler_name = check.opt_str_param(scheduler_name, "scheduler_name")\n self._security_context = check.opt_dict_param(security_context, "security_context")\n self._run_k8s_config = check.opt_dict_param(run_k8s_config, "run_k8s_config")\n super().__init__()\n\n @property\n def job_image(self):\n return self._job_image\n\n @property\n def image_pull_policy(self) -> str:\n return self._image_pull_policy\n\n @property\n def image_pull_secrets(self) -> Sequence[Mapping]:\n return self._image_pull_secrets\n\n @property\n def service_account_name(self) -> str:\n return self._service_account_name\n\n @property\n def env_config_maps(self) -> 
Sequence[str]:\n return self._env_config_maps\n\n @property\n def env_secrets(self) -> Sequence[str]:\n return self._env_secrets\n\n @property\n def volume_mounts(self) -> Sequence:\n return self._volume_mounts\n\n @property\n def volumes(self) -> Sequence:\n return self._volumes\n\n @property\n def resources(self) -> Mapping:\n return self._resources\n\n @property\n def scheduler_name(self) -> Optional[str]:\n return self._scheduler_name\n\n @property\n def security_context(self) -> Mapping[str, Any]:\n return self._security_context\n\n @property\n def env_vars(self) -> Sequence[str]:\n return self._env_vars\n\n @property\n def labels(self) -> Mapping[str, str]:\n return self._labels\n\n @property\n def run_k8s_config(self) -> Mapping[str, str]:\n return self._run_k8s_config\n\n @property\n def fail_pod_on_run_failure(self) -> Optional[bool]:\n return self._fail_pod_on_run_failure\n\n @classmethod\n def config_type(cls):\n """Include all arguments required for DagsterK8sJobConfig along with additional arguments\n needed for the RunLauncher itself.\n """\n return DagsterK8sJobConfig.config_type_run_launcher()\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n def get_container_context_for_run(self, dagster_run: DagsterRun) -> K8sContainerContext:\n return K8sContainerContext.create_for_run(dagster_run, self, include_run_tags=True)\n\n def _launch_k8s_job_with_args(\n self, job_name: str, args: Optional[Sequence[str]], run: DagsterRun\n ) -> None:\n container_context = self.get_container_context_for_run(run)\n\n pod_name = job_name\n\n job_origin = check.not_none(run.job_code_origin)\n user_defined_k8s_config = container_context.run_k8s_config\n repository_origin = job_origin.repository_origin\n\n job_config = container_context.get_k8s_job_config(\n job_image=repository_origin.container_image, 
run_launcher=self\n )\n job_image = job_config.job_image\n if job_image: # expected to be set\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_image},\n )\n\n labels = {\n "dagster/job": job_origin.job_name,\n "dagster/run-id": run.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n env_vars=[\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": job_origin.job_name,\n },\n *container_context.env,\n ],\n )\n\n namespace = check.not_none(container_context.namespace)\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n self._api_client.create_namespaced_job_with_retries(body=job, namespace=namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n cls=self.__class__,\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n job_name = get_job_name_from_run_id(run.run_id)\n job_origin = check.not_none(run.job_code_origin)\n\n args = ExecuteRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.dagster_run\n job_name = get_job_name_from_run_id(\n run.run_id, resume_attempt_number=context.resume_attempt_number\n )\n job_origin = 
check.not_none(run.job_code_origin)\n\n args = ResumeRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n container_context = self.get_container_context_for_run(run)\n\n job_name = get_job_name_from_run_id(\n run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)\n )\n\n try:\n termination_result = self._api_client.delete_job(\n job_name=job_name, namespace=container_context.namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Run was terminated successfully.",\n dagster_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n dagster_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; encountered error in delete_job",\n dagster_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n @property\n def supports_run_worker_crash_recovery(self):\n return True\n\n def get_run_worker_debug_info(self, run: DagsterRun) -> Optional[str]:\n container_context = self.get_container_context_for_run(run)\n if self.supports_run_worker_crash_recovery:\n resume_attempt_number = self._instance.count_resume_run_attempts(run.run_id)\n else:\n resume_attempt_number = None\n\n job_name = get_job_name_from_run_id(run.run_id, 
resume_attempt_number=resume_attempt_number)\n namespace = container_context.namespace\n user_defined_k8s_config = container_context.run_k8s_config\n container_name = user_defined_k8s_config.container_config.get("name", "dagster")\n pod_names = self._api_client.get_pod_names_in_job(job_name, namespace=namespace)\n full_msg = ""\n try:\n pod_debug_info = [\n self._api_client.get_pod_debug_info(\n pod_name, namespace, container_name=container_name\n )\n for pod_name in pod_names\n ]\n full_msg = "\\n".join(pod_debug_info)\n except Exception:\n logging.exception(\n f"Error trying to get debug information for failed k8s job {job_name}"\n )\n if pod_names:\n full_msg = (\n full_msg\n + "\\nFor more information about the failure, try running `kubectl describe pod"\n f" {pod_names[0]}`, `kubectl logs {pod_names[0]}`, or `kubectl describe job"\n f" {job_name}` in your cluster."\n )\n\n else:\n full_msg = (\n full_msg\n + "\\nFor more information about the failure, try running `kubectl describe job"\n f" {job_name}` in your cluster."\n )\n\n return full_msg\n\n def check_run_worker_health(self, run: DagsterRun):\n container_context = self.get_container_context_for_run(run)\n\n if self.supports_run_worker_crash_recovery:\n resume_attempt_number = self._instance.count_resume_run_attempts(run.run_id)\n else:\n resume_attempt_number = None\n\n job_name = get_job_name_from_run_id(run.run_id, resume_attempt_number=resume_attempt_number)\n try:\n status = self._api_client.get_job_status(\n namespace=container_context.namespace,\n job_name=job_name,\n )\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n\n inactive_job_with_finished_pods = bool(\n (not status.active) and (status.failed or status.succeeded)\n )\n\n # If the run is in a non-terminal (and non-STARTING) state but the k8s job is not active,\n # something went wrong\n if (\n run.status in (DagsterRunStatus.STARTED, 
DagsterRunStatus.CANCELING)\n and inactive_job_with_finished_pods\n ):\n return CheckRunHealthResult(\n WorkerStatus.FAILED, "Run has not completed but K8s job has no active pods"\n )\n\n if status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n if status.succeeded:\n return CheckRunHealthResult(WorkerStatus.SUCCESS)\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n
", "current_page_name": "_modules/dagster_k8s/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.launcher"}, "ops": {"k8s_job_op": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.ops.k8s_job_op

\nimport time\nfrom typing import Any, Dict, List, Optional\n\nimport kubernetes.config\nimport kubernetes.watch\nfrom dagster import (\n    Enum as DagsterEnum,\n    Field,\n    In,\n    Noneable,\n    Nothing,\n    OpExecutionContext,\n    Permissive,\n    StringSource,\n    op,\n)\nfrom dagster._annotations import experimental\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..client import DEFAULT_JOB_POD_COUNT, DagsterKubernetesClient\nfrom ..container_context import K8sContainerContext\nfrom ..job import (\n    DagsterK8sJobConfig,\n    K8sConfigMergeBehavior,\n    UserDefinedDagsterK8sConfig,\n    construct_dagster_k8s_job,\n    get_k8s_job_name,\n)\nfrom ..launcher import K8sRunLauncher\n\nK8S_JOB_OP_CONFIG = merge_dicts(\n    DagsterK8sJobConfig.config_type_container(),\n    {\n        "image": Field(\n            StringSource,\n            is_required=True,\n            description="The image in which to launch the k8s job.",\n        ),\n        "command": Field(\n            [str],\n            is_required=False,\n            description="The command to run in the container within the launched k8s job.",\n        ),\n        "args": Field(\n            [str],\n            is_required=False,\n            description="The args for the command for the container.",\n        ),\n        "namespace": Field(StringSource, is_required=False),\n        "load_incluster_config": Field(\n            bool,\n            is_required=False,\n            default_value=True,\n            description="""Set this value if you are running the launcher\n            within a k8s cluster. If ``True``, we assume the launcher is running within the target\n            cluster and load config using ``kubernetes.config.load_incluster_config``. 
Otherwise,\n            we will use the k8s config specified in ``kubeconfig_file`` (using\n            ``kubernetes.config.load_kube_config``) or fall back to the default kubeconfig.""",\n        ),\n        "kubeconfig_file": Field(\n            Noneable(str),\n            is_required=False,\n            default_value=None,\n            description=(\n                "The kubeconfig file from which to load config. Defaults to using the default"\n                " kubeconfig."\n            ),\n        ),\n        "timeout": Field(\n            int,\n            is_required=False,\n            description="How long to wait for the job to succeed before raising an exception",\n        ),\n        "container_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's main container"\n                " (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "pod_template_spec_metadata": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's metadata"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "pod_spec_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's pod spec"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "job_metadata": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                
"Raw k8s config for the k8s job's metadata"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "job_spec_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s job's job spec"\n                " (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "merge_behavior": Field(\n            DagsterEnum.from_python_enum(K8sConfigMergeBehavior),\n            is_required=False,\n            default_value=K8sConfigMergeBehavior.SHALLOW.value,\n            description=(\n                "How raw k8s config set on this op should be merged with any raw k8s config set on"\n                " the code location that launched the op. By default, the value is SHALLOW, meaning"\n                " that the two dictionaries are shallowly merged - any shared values in the "\n                " dictionaries will be replaced by the values set on this op. Setting it to DEEP"\n                " will recursively merge the two dictionaries, appending list fields together and"\n                " merging dictionary fields."\n            ),\n        ),\n    },\n)\n\n\n
[docs]@experimental\ndef execute_k8s_job(\n context: OpExecutionContext,\n image: str,\n command: Optional[List[str]] = None,\n args: Optional[List[str]] = None,\n namespace: Optional[str] = None,\n image_pull_policy: Optional[str] = None,\n image_pull_secrets: Optional[List[Dict[str, str]]] = None,\n service_account_name: Optional[str] = None,\n env_config_maps: Optional[List[str]] = None,\n env_secrets: Optional[List[str]] = None,\n env_vars: Optional[List[str]] = None,\n volume_mounts: Optional[List[Dict[str, Any]]] = None,\n volumes: Optional[List[Dict[str, Any]]] = None,\n labels: Optional[Dict[str, str]] = None,\n resources: Optional[Dict[str, Any]] = None,\n scheduler_name: Optional[str] = None,\n load_incluster_config: bool = True,\n kubeconfig_file: Optional[str] = None,\n timeout: Optional[int] = None,\n container_config: Optional[Dict[str, Any]] = None,\n pod_template_spec_metadata: Optional[Dict[str, Any]] = None,\n pod_spec_config: Optional[Dict[str, Any]] = None,\n job_metadata: Optional[Dict[str, Any]] = None,\n job_spec_config: Optional[Dict[str, Any]] = None,\n k8s_job_name: Optional[str] = None,\n merge_behavior: K8sConfigMergeBehavior = K8sConfigMergeBehavior.SHALLOW,\n):\n """This function is a utility for executing a Kubernetes job from within a Dagster op.\n\n Args:\n image (str): The image in which to launch the k8s job.\n command (Optional[List[str]]): The command to run in the container within the launched\n k8s job. Default: None.\n args (Optional[List[str]]): The args for the command for the container. Default: None.\n namespace (Optional[str]): Override the kubernetes namespace in which to run the k8s job.\n Default: None.\n image_pull_policy (Optional[str]): Allows the image pull policy to be overridden, e.g. to\n facilitate local testing with `kind <https://kind.sigs.k8s.io/>`_. Default:\n ``"Always"``. 
See:\n https://kubernetes.io/docs/concepts/containers/images/#updating-images.\n image_pull_secrets (Optional[List[Dict[str, str]]]): Optionally, a list of dicts, each of\n which corresponds to a Kubernetes ``LocalObjectReference`` (e.g.,\n ``{'name': 'myRegistryName'}``). This allows you to specify the ```imagePullSecrets`` on\n a pod basis. Typically, these will be provided through the service account, when needed,\n and you will not need to pass this argument. See:\n https://kubernetes.io/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod\n and https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#podspec-v1-core\n service_account_name (Optional[str]): The name of the Kubernetes service account under which\n to run the Job. Defaults to "default" env_config_maps (Optional[List[str]]): A list of custom ConfigMapEnvSource names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container\n env_secrets (Optional[List[str]]): A list of custom Secret names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n env_vars (Optional[List[str]]): A list of environment variables to inject into the Job.\n Default: ``[]``. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n volume_mounts (Optional[List[Permissive]]): A list of volume mounts to include in the job's\n container. Default: ``[]``. 
See:\n https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core\n volumes (Optional[List[Permissive]]): A list of volumes to include in the Job's Pod. Default: ``[]``. See:\n https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core\n labels (Optional[Dict[str, str]]): Additional labels that should be included in the Job's Pod. See:\n https://kubernetes.io/docs/concepts/overview/working-with-objects/labels\n resources (Optional[Dict[str, Any]]) Compute resource requirements for the container. See:\n https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/\n scheduler_name (Optional[str]): Use a custom Kubernetes scheduler for launched Pods. See:\n https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/\n load_incluster_config (bool): Whether the op is running within a k8s cluster. If ``True``,\n we assume the launcher is running within the target cluster and load config using\n ``kubernetes.config.load_incluster_config``. Otherwise, we will use the k8s config\n specified in ``kubeconfig_file`` (using ``kubernetes.config.load_kube_config``) or fall\n back to the default kubeconfig. Default: True,\n kubeconfig_file (Optional[str]): The kubeconfig file from which to load config. Defaults to\n using the default kubeconfig. Default: None.\n timeout (Optional[int]): Raise an exception if the op takes longer than this timeout in\n seconds to execute. Default: None.\n container_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's main container\n (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core).\n Keys can either snake_case or camelCase.Default: None.\n pod_template_spec_metadata (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's\n metadata (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta).\n Keys can either snake_case or camelCase. 
Default: None.\n pod_spec_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's pod spec\n (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec).\n Keys can either snake_case or camelCase. Default: None.\n job_metadata (Optional[Dict[str, Any]]): Raw k8s config for the k8s job's metadata\n (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta).\n Keys can either snake_case or camelCase. Default: None.\n job_spec_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s job's job spec\n (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch).\n Keys can either snake_case or camelCase.Default: None.\n k8s_job_name (Optional[str]): Overrides the name of the the k8s job. If not set, will be set\n to a unique name based on the current run ID and the name of the calling op. If set,\n make sure that the passed in name is a valid Kubernetes job name that does not\n already exist in the cluster.\n merge_behavior (Optional[K8sConfigMergeBehavior]): How raw k8s config set on this op should\n be merged with any raw k8s config set on the code location that launched the op. By\n default, the value is K8sConfigMergeBehavior.SHALLOW, meaning that the two dictionaries\n are shallowly merged - any shared values in the dictionaries will be replaced by the\n values set on this op. 
Setting it to DEEP will recursively merge the two dictionaries,\n appending list fields together andmerging dictionary fields.\n """\n run_container_context = K8sContainerContext.create_for_run(\n context.dagster_run,\n (\n context.instance.run_launcher\n if isinstance(context.instance.run_launcher, K8sRunLauncher)\n else None\n ),\n include_run_tags=False,\n )\n\n container_config = container_config.copy() if container_config else {}\n if command:\n container_config["command"] = command\n\n op_container_context = K8sContainerContext(\n image_pull_policy=image_pull_policy,\n image_pull_secrets=image_pull_secrets,\n service_account_name=service_account_name,\n env_config_maps=env_config_maps,\n env_secrets=env_secrets,\n env_vars=env_vars,\n volume_mounts=volume_mounts,\n volumes=volumes,\n labels=labels,\n namespace=namespace,\n resources=resources,\n scheduler_name=scheduler_name,\n run_k8s_config=UserDefinedDagsterK8sConfig.from_dict(\n {\n "container_config": container_config,\n "pod_template_spec_metadata": pod_template_spec_metadata,\n "pod_spec_config": pod_spec_config,\n "job_metadata": job_metadata,\n "job_spec_config": job_spec_config,\n "merge_behavior": merge_behavior.value,\n }\n ),\n )\n\n container_context = run_container_context.merge(op_container_context)\n\n namespace = container_context.namespace\n\n user_defined_k8s_config = container_context.run_k8s_config\n\n k8s_job_config = DagsterK8sJobConfig(\n job_image=image,\n dagster_home=None,\n image_pull_policy=container_context.image_pull_policy,\n image_pull_secrets=container_context.image_pull_secrets,\n service_account_name=container_context.service_account_name,\n instance_config_map=None,\n postgres_password_secret=None,\n env_config_maps=container_context.env_config_maps,\n env_secrets=container_context.env_secrets,\n env_vars=container_context.env_vars,\n volume_mounts=container_context.volume_mounts,\n volumes=container_context.volumes,\n labels=container_context.labels,\n 
resources=container_context.resources,\n )\n\n job_name = k8s_job_name or get_k8s_job_name(\n context.run_id, context.get_step_execution_context().step.key\n )\n\n retry_number = context.retry_number\n if retry_number > 0:\n job_name = f"{job_name}-{retry_number}"\n\n labels = {\n "dagster/job": context.dagster_run.job_name,\n "dagster/op": context.op.name,\n "dagster/run-id": context.dagster_run.run_id,\n }\n if context.dagster_run.external_job_origin:\n labels["dagster/code-location"] = (\n context.dagster_run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config=k8s_job_config,\n args=args,\n job_name=job_name,\n pod_name=job_name,\n component="k8s_job_op",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n )\n\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n # changing this to be able to be passed in will allow for unit testing\n api_client = DagsterKubernetesClient.production_client()\n\n context.log.info(f"Creating Kubernetes job {job_name} in namespace {namespace}...")\n\n start_time = time.time()\n\n api_client.batch_api.create_namespaced_job(namespace, job)\n\n context.log.info("Waiting for Kubernetes job to finish...")\n\n timeout = timeout or 0\n\n api_client.wait_for_job(\n job_name=job_name,\n namespace=namespace,\n wait_timeout=timeout,\n start_time=start_time,\n )\n\n restart_policy = user_defined_k8s_config.pod_spec_config.get("restart_policy", "Never")\n\n if restart_policy == "Never":\n container_name = container_config.get("name", "dagster")\n\n pods = api_client.wait_for_job_to_have_pods(\n job_name,\n namespace,\n wait_timeout=timeout,\n start_time=start_time,\n )\n\n pod_names = [p.metadata.name for p in pods]\n\n if not pod_names:\n raise Exception("No pod names in job after it started")\n\n pod_to_watch = pod_names[0]\n watch = kubernetes.watch.Watch() # 
consider moving in to api_client\n\n api_client.wait_for_pod(\n pod_to_watch, namespace, wait_timeout=timeout, start_time=start_time\n )\n\n log_stream = watch.stream(\n api_client.core_api.read_namespaced_pod_log,\n name=pod_to_watch,\n namespace=namespace,\n container=container_name,\n )\n\n while True:\n if timeout and time.time() - start_time > timeout:\n watch.stop()\n raise Exception("Timed out waiting for pod to finish")\n\n try:\n log_entry = next(log_stream)\n print(log_entry) # noqa: T201\n except StopIteration:\n break\n else:\n context.log.info("Pod logs are disabled, because restart_policy is not Never")\n\n if job_spec_config and job_spec_config.get("parallelism"):\n num_pods_to_wait_for = job_spec_config["parallelism"]\n else:\n num_pods_to_wait_for = DEFAULT_JOB_POD_COUNT\n api_client.wait_for_running_job_to_succeed(\n job_name=job_name,\n namespace=namespace,\n wait_timeout=timeout,\n start_time=start_time,\n num_pods_to_wait_for=num_pods_to_wait_for,\n )
\n\n\n
[docs]@op(ins={"start_after": In(Nothing)}, config_schema=K8S_JOB_OP_CONFIG)\n@experimental\ndef k8s_job_op(context):\n """An op that runs a Kubernetes job using the k8s API.\n\n Contrast with the `k8s_job_executor`, which runs each Dagster op in a Dagster job in its\n own k8s job.\n\n This op may be useful when:\n - You need to orchestrate a command that isn't a Dagster op (or isn't written in Python)\n - You want to run the rest of a Dagster job using a specific executor, and only a single\n op in k8s.\n\n For example:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_k8s_job_op.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n You can create your own op with the same implementation by calling the `execute_k8s_job` function\n inside your own op.\n\n The service account that is used to run this job should have the following RBAC permissions:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/kubernetes/k8s_job_op_rbac.yaml\n :language: YAML\n """\n if "merge_behavior" in context.op_config:\n merge_behavior = K8sConfigMergeBehavior(context.op_config.pop("merge_behavior"))\n else:\n merge_behavior = K8sConfigMergeBehavior.SHALLOW\n\n execute_k8s_job(context, merge_behavior=merge_behavior, **context.op_config)
\n
", "current_page_name": "_modules/dagster_k8s/ops/k8s_job_op", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.ops.k8s_job_op"}}, "pipes": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.pipes

\nimport random\nimport string\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Mapping, Optional, Sequence, Union\n\nimport kubernetes\nfrom dagster import (\n    OpExecutionContext,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.resource_annotation import ResourceParam\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.pipes.client import (\n    PipesClient,\n    PipesClientCompletedInvocation,\n    PipesContextInjector,\n    PipesMessageReader,\n    PipesParams,\n)\nfrom dagster._core.pipes.context import (\n    PipesMessageHandler,\n)\nfrom dagster._core.pipes.utils import (\n    PipesEnvContextInjector,\n    extract_message_or_forward_to_stdout,\n    open_pipes_session,\n)\nfrom dagster_pipes import (\n    PipesDefaultMessageWriter,\n    PipesExtras,\n)\n\nfrom dagster_k8s.utils import get_common_labels\n\nfrom .client import DagsterKubernetesClient, WaitForPodState\nfrom .models import k8s_model_from_dict, k8s_snake_case_dict\n\n\ndef get_pod_name(run_id: str, op_name: str):\n    clean_op_name = op_name.replace("_", "-")\n    suffix = "".join(random.choice(string.digits) for i in range(10))\n    return f"dagster-{run_id[:18]}-{clean_op_name[:20]}-{suffix}"\n\n\nDEFAULT_CONTAINER_NAME = "dagster-pipes-execution"\n\n\n
[docs]@experimental\nclass PipesK8sPodLogsMessageReader(PipesMessageReader):\n """Message reader that reads messages from kubernetes pod logs."""\n\n @contextmanager\n def read_messages(\n self,\n handler: PipesMessageHandler,\n ) -> Iterator[PipesParams]:\n self._handler = handler\n try:\n yield {PipesDefaultMessageWriter.STDIO_KEY: PipesDefaultMessageWriter.STDERR}\n finally:\n self._handler = None\n\n def consume_pod_logs(\n self,\n core_api: kubernetes.client.CoreV1Api,\n pod_name: str,\n namespace: str,\n ):\n handler = check.not_none(\n self._handler, "can only consume logs within scope of context manager"\n )\n for line in core_api.read_namespaced_pod_log(\n pod_name,\n namespace,\n follow=True,\n _preload_content=False, # avoid JSON processing\n ).stream():\n log_chunk = line.decode("utf-8")\n for log_line in log_chunk.split("\\n"):\n extract_message_or_forward_to_stdout(handler, log_line)\n\n def no_messages_debug_text(self) -> str:\n return "Attempted to read messages by extracting them from kubernetes pod logs directly."
\n\n\n@experimental\nclass _PipesK8sClient(PipesClient):\n """A pipes client for launching kubernetes pods.\n\n By default context is injected via environment variables and messages are parsed out of\n the pod logs, with other logs forwarded to stdout of the orchestration process.\n\n The first container within the containers list of the pod spec is expected (or set) to be\n the container prepared for pipes protocol communication.\n\n Args:\n env (Optional[Mapping[str, str]]): An optional dict of environment variables to pass to the\n subprocess.\n context_injector (Optional[PipesContextInjector]): A context injector to use to inject\n context into the k8s container process. Defaults to :py:class:`PipesEnvContextInjector`.\n message_reader (Optional[PipesMessageReader]): A message reader to use to read messages\n from the k8s container process. Defaults to :py:class:`PipesK8sPodLogsMessageReader`.\n """\n\n def __init__(\n self,\n env: Optional[Mapping[str, str]] = None,\n context_injector: Optional[PipesContextInjector] = None,\n message_reader: Optional[PipesMessageReader] = None,\n ):\n self.env = check.opt_mapping_param(env, "env", key_type=str, value_type=str)\n self.context_injector = (\n check.opt_inst_param(\n context_injector,\n "context_injector",\n PipesContextInjector,\n )\n or PipesEnvContextInjector()\n )\n\n self.message_reader = (\n check.opt_inst_param(message_reader, "message_reader", PipesMessageReader)\n or PipesK8sPodLogsMessageReader()\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def run(\n self,\n *,\n context: OpExecutionContext,\n extras: Optional[PipesExtras] = None,\n image: Optional[str] = None,\n command: Optional[Union[str, Sequence[str]]] = None,\n namespace: Optional[str] = None,\n env: Optional[Mapping[str, str]] = None,\n base_pod_meta: Optional[Mapping[str, Any]] = None,\n base_pod_spec: Optional[Mapping[str, Any]] = None,\n ) -> PipesClientCompletedInvocation:\n """Publish a kubernetes pod and 
wait for it to complete, enriched with the pipes protocol.\n\n Args:\n image (Optional[str]):\n The image to set the first container in the pod spec to use.\n command (Optional[Union[str, Sequence[str]]]):\n The command to set the first container in the pod spec to use.\n namespace (Optional[str]):\n Which kubernetes namespace to use, defaults to "default"\n env (Optional[Mapping[str,str]]):\n A mapping of environment variable names to values to set on the first\n container in the pod spec, on top of those configured on resource.\n base_pod_meta (Optional[Mapping[str, Any]]:\n Raw k8s config for the k8s pod's metadata\n (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)\n Keys can either snake_case or camelCase. The name value will be overridden.\n base_pod_spec (Optional[Mapping[str, Any]]:\n Raw k8s config for the k8s pod's pod spec\n (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec).\n Keys can either snake_case or camelCase.\n extras (Optional[PipesExtras]):\n Extra values to pass along as part of the ext protocol.\n context_injector (Optional[PipesContextInjector]):\n Override the default ext protocol context injection.\n message_reader (Optional[PipesMessageReader]):\n Override the default ext protocol message reader.\n\n Returns:\n PipesClientCompletedInvocation: Wrapper containing results reported by the external\n process.\n """\n client = DagsterKubernetesClient.production_client()\n\n with open_pipes_session(\n context=context,\n extras=extras,\n context_injector=self.context_injector,\n message_reader=self.message_reader,\n ) as pipes_session:\n namespace = namespace or "default"\n pod_name = get_pod_name(context.run_id, context.op.name)\n pod_body = build_pod_body(\n pod_name=pod_name,\n image=image,\n command=command,\n env_vars={\n **pipes_session.get_bootstrap_env_vars(),\n **(self.env or {}),\n **(env or {}),\n },\n base_pod_meta=base_pod_meta,\n 
base_pod_spec=base_pod_spec,\n )\n client.core_api.create_namespaced_pod(namespace, pod_body)\n try:\n # if were doing direct pod reading, wait for pod to start and then stream logs out\n if isinstance(self.message_reader, PipesK8sPodLogsMessageReader):\n client.wait_for_pod(\n pod_name,\n namespace,\n wait_for_state=WaitForPodState.Ready,\n )\n self.message_reader.consume_pod_logs(\n core_api=client.core_api,\n pod_name=pod_name,\n namespace=namespace,\n )\n else:\n # if were not doing direct log reading, just wait for pod to finish\n client.wait_for_pod(\n pod_name,\n namespace,\n wait_for_state=WaitForPodState.Terminated,\n )\n finally:\n client.core_api.delete_namespaced_pod(pod_name, namespace)\n return PipesClientCompletedInvocation(tuple(pipes_session.get_results()))\n\n\ndef build_pod_body(\n pod_name: str,\n image: Optional[str],\n command: Optional[Union[str, Sequence[str]]],\n env_vars: Mapping[str, str],\n base_pod_meta: Optional[Mapping[str, Any]],\n base_pod_spec: Optional[Mapping[str, Any]],\n):\n meta = {\n **(k8s_snake_case_dict(kubernetes.client.V1ObjectMeta, base_pod_meta or {})),\n "name": pod_name,\n }\n if "labels" in meta:\n meta["labels"] = {**get_common_labels(), **meta["labels"]}\n else:\n meta["labels"] = get_common_labels()\n\n spec = {**k8s_snake_case_dict(kubernetes.client.V1PodSpec, base_pod_spec or {})}\n if "containers" not in spec:\n spec["containers"] = [{}]\n\n if "restart_policy" not in spec:\n spec["restart_policy"] = "Never"\n elif spec["restart_policy"] == "Always":\n raise DagsterInvariantViolationError(\n "A restart policy of Always is not allowed, computations are expected to complete."\n )\n\n if "image" not in spec["containers"][0] and not image:\n raise DagsterInvariantViolationError(\n "Must specify image property or provide base_pod_spec with one set."\n )\n\n if "name" not in spec["containers"][0]:\n spec["containers"][0]["name"] = DEFAULT_CONTAINER_NAME\n\n if image:\n spec["containers"][0]["image"] = image\n\n if 
command:\n spec["containers"][0]["command"] = command\n\n if "env" not in spec["containers"][0]:\n spec["containers"][0]["env"] = []\n\n spec["containers"][0]["env"].extend({"name": k, "value": v} for k, v in env_vars.items())\n\n return k8s_model_from_dict(\n kubernetes.client.V1Pod,\n {\n "metadata": meta,\n "spec": spec,\n },\n )\n\n\nPipesK8sClient = ResourceParam[_PipesK8sClient]\n
", "current_page_name": "_modules/dagster_k8s/pipes", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.pipes"}}, "dagster_mlflow": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mlflow.hooks

\nfrom dagster._core.definitions.decorators.hook_decorator import event_list_hook\nfrom dagster._core.definitions.events import HookExecutionResult\nfrom mlflow.entities.run_status import RunStatus\n\n\ndef _create_mlflow_run_hook(name):\n    @event_list_hook(name=name, required_resource_keys={"mlflow"})\n    def _hook(context, event_list):\n        for event in event_list:\n            if event.is_step_success:\n                _cleanup_on_success(context)\n            elif event.is_step_failure:\n                mlf = context.resources.mlflow\n                mlf.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n        return HookExecutionResult(hook_name=name, is_skipped=False)\n\n    return _hook\n\n\ndef _cleanup_on_success(context):\n    """Checks if the current solid in the context is the last solid in the job\n    and ends the mlflow run with a successful status when this is the case.\n    """\n    last_solid_name = context._step_execution_context.job_def.nodes_in_topological_order[  # noqa: SLF001  # fmt: skip\n        -1\n    ].name\n\n    if context.op.name == last_solid_name:\n        context.resources.mlflow.end_run()\n\n\nend_mlflow_on_run_finished = _create_mlflow_run_hook("end_mlflow_on_run_finished")\n
", "current_page_name": "_modules/dagster_mlflow/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mlflow.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mlflow.resources

\n"""This module contains the mlflow resource provided by the MlFlow\nclass. This resource provides an easy way to configure mlflow for logging various\nthings from dagster runs.\n"""\nimport atexit\nimport sys\nfrom itertools import islice\nfrom os import environ\nfrom typing import Any, Optional\n\nimport mlflow\nfrom dagster import Field, Noneable, Permissive, StringSource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom mlflow.entities.run_status import RunStatus\n\nCONFIG_SCHEMA = {\n    "experiment_name": Field(StringSource, is_required=True, description="MlFlow experiment name."),\n    "mlflow_tracking_uri": Field(\n        Noneable(StringSource),\n        default_value=None,\n        is_required=False,\n        description="MlFlow tracking server uri.",\n    ),\n    "parent_run_id": Field(\n        Noneable(str),\n        default_value=None,\n        is_required=False,\n        description="Mlflow run ID of parent run if this is a nested run.",\n    ),\n    "env": Field(Permissive(), description="Environment variables for mlflow setup."),\n    "env_to_tag": Field(\n        Noneable(list),\n        default_value=None,\n        is_required=False,\n        description="List of environment variables to log as tags in mlflow.",\n    ),\n    "extra_tags": Field(Permissive(), description="Any extra key-value tags to log to mlflow."),\n}\n\n\nclass MlflowMeta(type):\n    """Mlflow Metaclass to create methods that "inherit" all of Mlflow's\n    methods. 
If the class has a method defined it is excluded from the\n    attribute setting from mlflow.\n    """\n\n    def __new__(cls, name, bases, attrs):\n        class_cls = super(MlflowMeta, cls).__new__(cls, name, bases, attrs)\n        for attr in (attr for attr in dir(mlflow) if attr not in dir(class_cls)):\n            mlflow_attribute = getattr(mlflow, attr)\n            if callable(mlflow_attribute):\n                setattr(class_cls, attr, staticmethod(mlflow_attribute))\n            else:\n                setattr(class_cls, attr, mlflow_attribute)\n        return class_cls\n\n\nclass MlFlow(metaclass=MlflowMeta):\n    """Class for setting up an mlflow resource for dagster runs.\n    This takes care of all the configuration required to use mlflow tracking and the complexities of\n    mlflow tracking dagster parallel runs.\n    """\n\n    def __init__(self, context):\n        # Context associated attributes\n        self.log = context.log\n        self.run_name = context.dagster_run.job_name\n        self.dagster_run_id = context.run_id\n\n        # resource config attributes\n        resource_config = context.resource_config\n        self.tracking_uri = resource_config.get("mlflow_tracking_uri")\n        if self.tracking_uri:\n            mlflow.set_tracking_uri(self.tracking_uri)\n        self.parent_run_id = resource_config.get("parent_run_id")\n        self.experiment_name = resource_config["experiment_name"]\n        self.env_tags_to_log = resource_config.get("env_to_tag") or []\n        self.extra_tags = resource_config.get("extra_tags")\n\n        # Update env variables if any are given\n        self.env_vars = resource_config.get("env", {})\n        if self.env_vars:\n            environ.update(self.env_vars)\n\n        # If the experiment exists then the set won't do anything\n        mlflow.set_experiment(self.experiment_name)\n        self.experiment = mlflow.get_experiment_by_name(self.experiment_name)\n\n        # Get the client object\n        
self.tracking_client = mlflow.tracking.MlflowClient()\n\n        # Set up the active run and tags\n        self._setup()\n\n    def _setup(self):\n        """Sets the active run and tags. If an Mlflow run_id exists then the\n        active run is set to it. This way a single Dagster run outputs data\n        to the same Mlflow run, even when multiprocess executors are used.\n        """\n        # Get the run id\n        run_id = self._get_current_run_id()\n        self._set_active_run(run_id=run_id)\n        self._set_all_tags()\n\n        # hack needed to stop mlflow from marking run as finished when\n        # a process exits in parallel runs\n        atexit.unregister(mlflow.end_run)\n\n    def _get_current_run_id(\n        self, experiment: Optional[Any] = None, dagster_run_id: Optional[str] = None\n    ):\n        """Gets the run id of a specific dagster run and experiment id.\n        If it doesn't exist then it returns a None.\n\n        Args:\n            experiment (optional): Mlflow experiment.\n            When none is passed it fetches the experiment object set in\n            the constructor.  Defaults to None.\n            dagster_run_id (optional): The Dagster run id.\n            When none is passed it fetches the dagster_run_id object set in\n            the constructor.  
Defaults to None.\n\n        Returns:\n            run_id (str or None): run_id if it is found else None\n        """\n        experiment = experiment or self.experiment\n        dagster_run_id = dagster_run_id or self.dagster_run_id\n        if experiment:\n            # Check if a run with this dagster run id has already been started\n            # in mlflow, will get an empty dataframe if not\n            current_run_df = mlflow.search_runs(\n                experiment_ids=[experiment.experiment_id],\n                filter_string=f"tags.dagster_run_id='{dagster_run_id}'",\n            )\n            if not current_run_df.empty:\n                return current_run_df.run_id.values[0]\n\n    def _set_active_run(self, run_id=None):\n        """This method sets the active run to be that of the specified\n        run_id. If None is passed then a new run is started. The new run also\n        takes care of nested runs.\n\n        Args:\n            run_id (str, optional): Mlflow run_id. Defaults to None.\n        """\n        nested_run = False\n        if self.parent_run_id is not None:\n            self._start_run(run_id=self.parent_run_id, run_name=self.run_name)\n            nested_run = True\n        self._start_run(run_id=run_id, run_name=self.run_name, nested=nested_run)\n\n    def _start_run(self, **kwargs):\n        """Catches the Mlflow exception if a run is already active."""\n        try:\n            run = mlflow.start_run(**kwargs)\n            self.log.info(\n                f"Starting a new mlflow run with id {run.info.run_id} "\n                f"in experiment {self.experiment_name}"\n            )\n        except Exception as ex:\n            run = mlflow.active_run()\n            if "is already active" not in str(ex):\n                raise (ex)\n            self.log.info(f"Run with id {run.info.run_id} is already active.")\n\n    def _set_all_tags(self):\n        """Method collects dagster_run_id plus all env variables/tags that have been\n         
   specified by the user in the config_schema and logs them as tags in mlflow.\n\n        Returns:\n            tags [dict]: Dictionary of all the tags\n        """\n        tags = {tag: environ.get(tag) for tag in self.env_tags_to_log}\n        tags["dagster_run_id"] = self.dagster_run_id\n        if self.extra_tags:\n            tags.update(self.extra_tags)\n\n        mlflow.set_tags(tags)\n\n    def cleanup_on_error(self):\n        """Method ends mlflow run with correct exit status for failed runs. Note that\n        this method does not work when a job running in the webserver fails, it seems\n        that in this case a different process runs the job and when it fails\n        the stack trace is therefore not available. For this case we can use the\n        cleanup_on_failure hook defined below.\n        """\n        any_error = sys.exc_info()\n\n        if any_error[1]:\n            if isinstance(any_error[1], KeyboardInterrupt):\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))\n            else:\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n    @staticmethod\n    def log_params(params: dict):\n        """Overload of the mlflow.log_params. If len(params) >100 then\n        params is sent to mlflow in chunks.\n\n        Args:\n            params (dict): Parameters to be logged\n        """\n        for param_chunk in MlFlow.chunks(params, 100):\n            mlflow.log_params(param_chunk)\n\n    @staticmethod\n    def chunks(params: dict, size: int = 100):\n        """Method that chunks a dictionary into batches of size.\n\n        Args:\n            params (dict): Dictionary set to be batched\n            size (int, optional): Number of batches. Defaults to 100.\n\n        Yields:\n            (dict): Batch of dictionary\n        """\n        it = iter(params)\n        for _ in range(0, len(params), size):\n            yield {k: params[k] for k in islice(it, size)}\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=CONFIG_SCHEMA)\ndef mlflow_tracking(context):\n """This resource initializes an MLflow run that's used for all steps within a Dagster run.\n\n This resource provides access to all of mlflow's methods as well as the mlflow tracking client's\n methods.\n\n Usage:\n\n 1. Add the mlflow resource to any ops in which you want to invoke mlflow tracking APIs.\n 2. Add the `end_mlflow_on_run_finished` hook to your job to end the MLflow run\n when the Dagster run is finished.\n\n Examples:\n .. code-block:: python\n\n from dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n @op(required_resource_keys={"mlflow"})\n def mlflow_op(context):\n mlflow.log_params(some_params)\n mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n @end_mlflow_on_run_finished\n @job(resource_defs={"mlflow": mlflow_tracking})\n def mlf_example():\n mlflow_op()\n\n # example using an mlflow instance with s3 storage\n mlf_example.execute_in_process(run_config={\n "resources": {\n "mlflow": {\n "config": {\n "experiment_name": my_experiment,\n "mlflow_tracking_uri": "http://localhost:5000",\n\n # if want to run a nested run, provide parent_run_id\n "parent_run_id": an_existing_mlflow_run_id,\n\n # env variables to pass to mlflow\n "env": {\n "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n "AWS_ACCESS_KEY_ID": my_aws_key_id,\n "AWS_SECRET_ACCESS_KEY": my_secret,\n },\n\n # env variables you want to log as mlflow tags\n "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n # key-value tags to add to your experiment\n "extra_tags": {"super": "experiment"},\n }\n }\n }\n })\n """\n mlf = MlFlow(context)\n yield mlf\n mlf.cleanup_on_error()
\n
", "current_page_name": "_modules/dagster_mlflow/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mlflow.resources"}}, "dagster_msteams": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import failure_hook, success_hook\nfrom dagster._core.execution.context.hook import HookContext\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom dagster_msteams.card import Card\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return f"Op {context.op.name} on job {context.job_name} {status}!\\nRun ID: {context.run_id}"\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef teams_on_failure(\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_failure(webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op.name} failed!"\n\n @op\n def a_op(context):\n pass\n\n @job(...)\n def my_job():\n a_op.with_hooks(hook_defs={teams_on_failure("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @failure_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"<a href='{webserver_base_url}/runs/{context.run_id}'>View in Dagster UI</a>"\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef teams_on_success(\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_success(webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op.name} failed!"\n\n @op\n def a_op(context):\n pass\n\n @job(...)\n def my_job():\n a_op.with_hooks(hook_defs={teams_on_success("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @success_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"<a href='{webserver_base_url}/runs/{context.run_id}'>View in webserver</a>"\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_msteams/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\nfrom dagster_msteams.client import TeamsClient\n\n\n
[docs]class MSTeamsResource(ConfigurableResource):\n """This resource is for connecting to Microsoft Teams.\n\n Provides a `dagster_msteams.TeamsClient` which can be used to\n interface with the MS Teams API.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster op,\n asset, schedule, or sensor:\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import op, job, Definitions, EnvVar\n from dagster_msteams import Card, MSTeamsResource\n\n\n @op\n def teams_op(msteams: MSTeamsResource):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n msteams.get_client().post_message(payload=card.payload)\n\n\n @job\n def teams_job():\n teams_op()\n\n defs = Definitions(\n jobs=[teams_job],\n resources={\n "msteams": MSTeamsResource(\n hook_url=EnvVar("TEAMS_WEBHOOK_URL")\n )\n }\n )\n """\n\n hook_url: str = Field(\n default=None,\n description=(\n "To send messages to MS Teams channel, an incoming webhook has to be created. The"\n " incoming webhook url must be given as a part of the resource config to the"\n " MSTeamsResource in Dagster. For more information on how to create an incoming"\n " webhook, see"\n " https://docs.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook"\n ),\n )\n http_proxy: str = Field(default=None, description="HTTP proxy URL")\n https_proxy: str = Field(default=None, description="HTTPS proxy URL")\n timeout: float = Field(default=60, description="Timeout for requests to MS Teams")\n verify: bool = Field(\n default=True, description="Whether to verify SSL certificates, defaults to True"\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> TeamsClient:\n return TeamsClient(\n hook_url=self.hook_url,\n http_proxy=self.http_proxy,\n https_proxy=self.https_proxy,\n timeout=self.timeout,\n verify=self.verify,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=MSTeamsResource.to_config_schema(),\n description="This resource is for connecting to MS Teams",\n)\ndef msteams_resource(context) -> TeamsClient:\n """This resource is for connecting to Microsoft Teams.\n\n The resource object is a `dagster_msteams.TeamsClient`.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster solid:\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import op, job\n from dagster_msteams import Card, msteams_resource\n\n\n @op(required_resource_keys={"msteams"})\n def teams_op(context):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n context.resources.msteams.post_message(payload=card.payload)\n\n\n @job(resource_defs={"msteams": msteams_resource})\n def teams_job():\n teams_op()\n\n\n teams_job.execute_in_process(\n {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}}\n )\n """\n return MSTeamsResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_msteams/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.resources"}, "sensors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.sensors

\nfrom typing import TYPE_CHECKING, Callable, Optional, Sequence, Union\n\nfrom dagster import DefaultSensorStatus\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import GraphDefinition, JobDefinition\nfrom dagster._core.definitions.run_status_sensor_definition import (\n    RunFailureSensorContext,\n    run_failure_sensor,\n)\nfrom dagster._core.definitions.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom dagster_msteams.card import Card\nfrom dagster_msteams.client import TeamsClient\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.selector import JobSelector, RepositorySelector\n\n\ndef _default_failure_message(context: RunFailureSensorContext) -> str:\n    return "\\n".join(\n        [\n            f"Job {context.dagster_run.job_name} failed!",\n            f"Run ID: {context.dagster_run.run_id}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef make_teams_on_run_failure_sensor(\n hook_url: str,\n message_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message,\n http_proxy: Optional[str] = None,\n https_proxy: Optional[str] = None,\n timeout: Optional[float] = 60,\n verify: Optional[bool] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor on run failures that will message the given MS Teams webhook URL.\n\n Args:\n hook_url (str): MS Teams incoming webhook URL.\n message_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n http_proxy : (Optional[str]): Proxy for requests using http protocol.\n https_proxy : (Optional[str]): Proxy for requests using https protocol.\n timeout: (Optional[float]): Connection timeout in seconds. Defaults to 60.\n verify: (Optional[bool]): Whether to verify the servers TLS certificate.\n name: (Optional[str]): The name of the sensor. Defaults to "teams_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector]]]):\n Jobs in the current repository that will be monitored by this sensor. Defaults to None,\n which means the alert will be sent when any job in the repository matches the requested\n run_status. To monitor jobs in external repositories, use RepositorySelector and JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n\n Examples:\n .. code-block:: python\n\n teams_on_run_failure = make_teams_on_run_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n )\n\n @repository\n def my_repo():\n return [my_job + teams_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return "Job {job_name} failed! 
Error: {error}".format(\n job_name=context.dagster_run.job_name,\n error=context.failure_event.message,\n )\n\n teams_on_run_failure = make_teams_on_run_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n message_fn=my_message_fn,\n webserver_base_url="http://localhost:3000",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n teams_client = TeamsClient(\n hook_url=hook_url,\n http_proxy=http_proxy,\n https_proxy=https_proxy,\n timeout=timeout,\n verify=verify,\n )\n\n @run_failure_sensor(\n name=name,\n default_status=default_status,\n monitored_jobs=monitored_jobs,\n monitor_all_repositories=monitor_all_repositories,\n )\n def teams_on_run_failure(context: RunFailureSensorContext):\n text = message_fn(context)\n if webserver_base_url:\n text += "<a href='{base_url}/runs/{run_id}'>View in Dagit</a>".format(\n base_url=webserver_base_url,\n run_id=context.dagster_run.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n teams_client.post_message(payload=card.payload)\n\n return teams_on_run_failure
\n
", "current_page_name": "_modules/dagster_msteams/sensors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.sensors"}}, "dagster_mysql": {"event_log": {"event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.event_log.event_log

\nfrom typing import ContextManager, Optional, cast\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.exc as db_exc\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.event_log import (\n    AssetKeyTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlPollingEventWatcher,\n)\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_INTERSECT_VERSION = "8.0.31"\n\n\n
[docs]class MySQLEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """MySQL-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_event_log\n :end-before: end_marker_event_log\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = check.str_param(mysql_url, "mysql_url")\n self._disposed = False\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n self._secondary_index_cache = {}\n\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "event_logs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n # mark all secondary indexes to be used\n self.reindex_events()\n self.reindex_assets()\n\n self._mysql_version = self.get_server_version()\n super().__init__()\n\n def _init_db(self) -> None:\n with self._connect() as conn:\n 
SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n def upgrade(self) -> None:\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLEventLogStorage":\n return MySQLEventLogStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(conn_string: str) -> "MySQLEventLogStorage":\n MySQLEventLogStorage.wipe_storage(conn_string)\n return MySQLEventLogStorage(conn_string)\n\n def get_server_version(self) -> Optional[str]:\n with self.index_connection() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n def store_asset_event(self, event: EventLogEntry, event_id: int) -> None:\n # last_materialization_timestamp is updated upon observation, materialization, materialization_planned\n # See SqlEventLogStorage.store_asset_event method for more details\n\n values = self._get_asset_entry_values(\n 
event, event_id, self.has_secondary_index(ASSET_KEY_INDEX_COLS)\n )\n with self.index_connection() as conn:\n if values:\n conn.execute(\n db_dialects.mysql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(), # type: ignore # (possible none)\n **values,\n )\n .on_duplicate_key_update(\n **values,\n )\n )\n else:\n try:\n conn.execute(\n db_dialects.mysql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(), # type: ignore # (possible none)\n )\n )\n except db_exc.IntegrityError:\n pass\n\n def _connect(self) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "event log")\n\n def run_connection(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return self._connect()\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n with self._connect() as conn:\n return table_name in db.inspect(conn).get_table_names()\n\n def has_secondary_index(self, name: str) -> bool:\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n MySQLEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name: str) -> None:\n super(MySQLEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n self._event_watcher.unwatch_run(run_id, handler)\n\n @property\n def supports_intersect(self) -> bool:\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version( 
# type: ignore # (possible none)\n MINIMUM_MYSQL_INTERSECT_VERSION\n )\n\n @property\n def event_watcher(self) -> SqlPollingEventWatcher:\n return self._event_watcher\n\n def __del__(self) -> None:\n self.dispose()\n\n def dispose(self) -> None:\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/event_log/event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.run_storage.run_storage

\nfrom typing import ContextManager, Mapping, Optional, cast\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster._core.storage.runs.schema import KeyValueStoreTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_BUCKET_VERSION = "8.0.0"\nMINIMUM_MYSQL_INTERSECT_VERSION = "8.0.31"\n\n\n
[docs]class MySQLRunStorage(SqlRunStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_runs\n :end-before: end_marker_runs\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "runs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n self._mysql_version = self.get_server_version()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n RunStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, 
statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold 1 open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n def get_server_version(self) -> Optional[str]:\n with self.connect() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLRunStorage":\n return MySQLRunStorage(inst_data=inst_data, mysql_url=mysql_url_from_config(config_value))\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url: str) -> "MySQLRunStorage":\n MySQLRunStorage.wipe_storage(mysql_url)\n return MySQLRunStorage(mysql_url)\n\n def connect(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "run")\n\n def upgrade(self) -> None:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_built_index(self, migration_name: str) -> None:\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n MySQLRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name: str) -> None:\n 
super(MySQLRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n @property\n def supports_intersect(self) -> bool:\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version( # type: ignore\n MINIMUM_MYSQL_INTERSECT_VERSION\n )\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n conn.execute(\n db_dialects.mysql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n .on_duplicate_key_update(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n db_values = [{"key": k, "value": v} for k, v in pairs.items()]\n\n with self.connect() as conn:\n insert_stmt = db_dialects.mysql.insert(KeyValueStoreTable).values(db_values)\n conn.execute(\n insert_stmt.on_duplicate_key_update(\n value=insert_stmt.inserted.value,\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/run_storage/run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.schedule_storage.schedule_storage

\nfrom typing import ContextManager, Optional, cast\n\nimport dagster._check as check\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster._core.storage.schedules.schema import InstigatorsTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_BATCH_VERSION = "8.0.0"\n\n\n
[docs]class MySQLScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_schedules\n :end-before: end_marker_schedules\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n if "jobs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n\n self._mysql_version = self.get_server_version()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When 
running in dagster-webserver, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLScheduleStorage":\n return MySQLScheduleStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url: str) -> "MySQLScheduleStorage":\n MySQLScheduleStorage.wipe_storage(mysql_url)\n return MySQLScheduleStorage(mysql_url)\n\n def connect(self) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "schedule")\n\n @property\n def supports_batch_queries(self) -> bool:\n if not self._mysql_version:\n return False\n\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version(\n MINIMUM_MYSQL_BATCH_VERSION\n )\n\n def get_server_version(self) -> Optional[str]:\n with self.connect() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n def upgrade(self) -> None:\n with self.connect() as conn:\n alembic_config = mysql_alembic_config(__file__)\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn: Connection, state) -> None:\n selector_id = state.selector_id\n conn.execute(\n db_dialects.mysql.insert(InstigatorsTable)\n 
.values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n .on_duplicate_key_update(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/schedule_storage/schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.schedule_storage.schedule_storage"}}}, "dagster_pagerduty": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pagerduty.resources

\nfrom typing import Dict, Optional, cast\n\nimport pypd\nfrom dagster import ConfigurableResource, resource\nfrom dagster._config.pythonic_config import infer_schema_from_config_class\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.warnings import suppress_dagster_warnings\nfrom pydantic import Field as PyField\n\n\n
[docs]class PagerDutyService(ConfigurableResource):\n """This resource is for posting events to PagerDuty."""\n\n """Integrates with PagerDuty via the pypd library.\n\n See:\n https://v2.developer.pagerduty.com/docs/events-api-v2\n https://v2.developer.pagerduty.com/docs/send-an-event-events-api-v2\n https://support.pagerduty.com/docs/services-and-integrations#section-events-api-v2\n https://github.com/PagerDuty/pagerduty-api-python-client\n\n for documentation and more information.\n """\n\n routing_key: str = PyField(\n ...,\n description=(\n "The routing key provisions access to your PagerDuty service. You"\n "will need to include the integration key for your new integration, as a"\n "routing_key in the event payload."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def EventV2_create(\n self,\n summary: str,\n source: str,\n severity: str,\n event_action: str = "trigger",\n dedup_key: Optional[str] = None,\n timestamp: Optional[str] = None,\n component: Optional[str] = None,\n group: Optional[str] = None,\n event_class: Optional[str] = None,\n custom_details: Optional[object] = None,\n ) -> object:\n """Events API v2 enables you to add PagerDuty's advanced event and incident management\n functionality to any system that can make an outbound HTTP connection.\n\n Args:\n summary (str):\n A high-level, text summary message of the event. Will be used to construct an\n alert's description. Example:\n\n "PING OK - Packet loss = 0%, RTA = 1.41 ms" "Host\n 'acme-andromeda-sv1-c40 :: 179.21.24.50' is DOWN"\n\n source (str):\n Specific human-readable unique identifier, such as a hostname, for the system having\n the problem. Examples:\n\n "prod05.theseus.acme-widgets.com"\n "171.26.23.22"\n "aws:elasticache:us-east-1:852511987:cluster/api-stats-prod-003"\n "9c09acd49a25"\n\n severity (str):\n How impacted the affected system is. Displayed to users in lists and influences the\n priority of any created incidents. 
Must be one of {info, warning, error, critical}\n\n Keyword Args:\n event_action (str):\n There are three types of events that PagerDuty recognizes, and are used to represent\n different types of activity in your monitored systems. (default: 'trigger')\n\n * trigger: When PagerDuty receives a trigger event, it will either open a new alert,\n or add a new trigger log entry to an existing alert, depending on the\n provided dedup_key. Your monitoring tools should send PagerDuty a trigger\n when a new problem has been detected. You may send additional triggers\n when a previously detected problem has occurred again.\n\n * acknowledge: acknowledge events cause the referenced incident to enter the\n acknowledged state. While an incident is acknowledged, it won't\n generate any additional notifications, even if it receives new\n trigger events. Your monitoring tools should send PagerDuty an\n acknowledge event when they know someone is presently working on the\n problem.\n\n * resolve: resolve events cause the referenced incident to enter the resolved state.\n Once an incident is resolved, it won't generate any additional\n notifications. New trigger events with the same dedup_key as a resolved\n incident won't re-open the incident. Instead, a new incident will be\n created. Your monitoring tools should send PagerDuty a resolve event when\n the problem that caused the initial trigger event has been fixed.\n\n dedup_key (str):\n Deduplication key for correlating triggers and resolves. The maximum permitted\n length of this property is 255 characters.\n\n timestamp (str):\n Timestamp (ISO 8601). When the upstream system detected / created the event. This is\n useful if a system batches or holds events before sending them to PagerDuty. This\n will be auto-generated by PagerDuty if not provided. Example:\n\n 2015-07-17T08:42:58.315+0000\n\n component (str):\n The part or component of the affected system that is broken. 
Examples:\n\n "keepalive"\n "webping"\n "mysql"\n "wqueue"\n\n group (str):\n A cluster or grouping of sources. For example, sources "prod-datapipe-02" and\n "prod-datapipe-03" might both be part of "prod-datapipe". Examples:\n\n "prod-datapipe"\n "www"\n "web_stack"\n\n event_class (str):\n The class/type of the event. Examples:\n\n "High CPU"\n "Latency"\n "500 Error"\n\n custom_details (Dict[str, str]):\n Additional details about the event and affected system. Example:\n\n {"ping time": "1500ms", "load avg": 0.75 }\n """\n data = {\n "routing_key": self.routing_key,\n "event_action": event_action,\n "payload": {"summary": summary, "source": source, "severity": severity},\n }\n\n if dedup_key is not None:\n data["dedup_key"] = dedup_key\n\n payload: Dict[str, object] = cast(Dict[str, object], data["payload"])\n\n if timestamp is not None:\n payload["timestamp"] = timestamp\n\n if component is not None:\n payload["component"] = component\n\n if group is not None:\n payload["group"] = group\n\n if event_class is not None:\n payload["class"] = event_class\n\n if custom_details is not None:\n payload["custom_details"] = custom_details\n\n return pypd.EventV2.create(data=data)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=infer_schema_from_config_class(PagerDutyService),\n description="""This resource is for posting events to PagerDuty.""",\n)\n@suppress_dagster_warnings\ndef pagerduty_resource(context) -> PagerDutyService:\n """A resource for posting events (alerts) to PagerDuty.\n\n Example:\n .. code-block:: python\n\n @op\n def pagerduty_op(pagerduty: PagerDutyService):\n pagerduty.EventV2_create(\n summary='alert from dagster'\n source='localhost',\n severity='error',\n event_action='trigger',\n )\n\n @job(resource_defs={ 'pagerduty': pagerduty_resource })\n def pagerduty_test():\n pagerduty_op()\n\n pagerduty_test.execute_in_process(\n run_config={\n "resources": {\n 'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n }\n }\n )\n """\n return PagerDutyService(**context.resource_config)
\n
", "current_page_name": "_modules/dagster_pagerduty/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pagerduty.resources"}}, "dagster_pandas": {"constraints": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.constraints

\nimport sys\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom functools import wraps\n\nimport pandas as pd\nfrom dagster import (\n    DagsterType,\n    TypeCheck,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom pandas import DataFrame\nfrom typing_extensions import Final\n\nCONSTRAINT_METADATA_KEY: Final = "constraint_metadata"\n\n\nclass ConstraintViolationException(Exception):\n    """Indicates that a constraint has been violated."""\n\n\nclass ConstraintWithMetadataException(Exception):\n    """This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a\n    failed typecheck or an exception.\n\n    Args:\n        constraint_name (str):  the name of the violated constraint\n        constraint_description (Optional[str]): the description of the violated constraint\n        expectation (Optional[Union[dict,list, str, set]]): what result was expected -- typically a jsonlike, though it can be a string\n        offending (Optional[Union[dict,list, str, set]]):  which pieces of the dataframe violated the expectation, typically list or string\n        actual (Optional[Union[dict,list, str, set]]): what those pieces of the dataframe actually were -- typically a jsonlike\n    """\n\n    def __init__(\n        self,\n        constraint_name,\n        constraint_description="",\n        expectation=None,\n        offending=None,\n        actual=None,\n    ):\n        self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))\n        self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))\n        self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))\n        super(ConstraintWithMetadataException, self).__init__(\n            "Violated {} - {}, {} was/were expected, but 
we received {} which was/were {}".format(\n                constraint_name,\n                constraint_description,\n                expectation,\n                offending,\n                actual,\n            )\n        )\n\n    def normalize_metadata_json_value(self, val):\n        if isinstance(val, set):\n            return list(val)\n        else:\n            return val\n\n    def convert_to_metadata(self):\n        return {\n            CONSTRAINT_METADATA_KEY: {\n                "constraint_name": self.constraint_name,\n                "constraint_description": self.constraint_description,\n                "expected": self.normalize_metadata_json_value(self.expectation),\n                "offending": self.normalize_metadata_json_value(self.offending),\n                "actual": self.normalize_metadata_json_value(self.actual),\n            },\n        }\n\n    def return_as_typecheck(self):\n        return TypeCheck(\n            success=False, description=self.args[0], metadata=self.convert_to_metadata()\n        )\n\n\nclass DataFrameConstraintViolationException(ConstraintViolationException):\n    """Indicates a dataframe level constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description):\n        super(DataFrameConstraintViolationException, self).__init__(\n            f"Violated {constraint_name} - {constraint_description}"\n        )\n\n\nclass DataFrameWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, actual):\n        super(DataFrameWithMetadataException, self).__init__(\n            constraint_name, constraint_description, expectation, "a malformed dataframe", actual\n        )\n\n\nclass ColumnConstraintViolationException(ConstraintViolationException):\n    """Indicates that a column constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description, column_name, offending_rows=None):\n        
self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.column_name = column_name\n        self.offending_rows = offending_rows\n        super(ColumnConstraintViolationException, self).__init__(self.construct_message())\n\n    def construct_message(self):\n        base_message = (\n            'Violated "{constraint_name}" for column "{column_name}" - {constraint_description}'\n            .format(\n                constraint_name=self.constraint_name,\n                constraint_description=self.constraint_description,\n                column_name=self.column_name,\n            )\n        )\n        if self.offending_rows is not None:\n            base_message += "The offending (index, row values) are the following: {}".format(\n                self.offending_rows\n            )\n        return base_message\n\n\nclass ColumnWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, offending, actual):\n        super(ColumnWithMetadataException, self).__init__(\n            "the column constraint " + constraint_name,\n            constraint_description,\n            expectation,\n            offending,\n            actual,\n        )\n\n\nclass Constraint:\n    """Base constraint object that all constraints inherit from.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        self.name = self.__class__.__name__\n        self.markdown_description = check.str_param(markdown_description, "markdown_description")\n        self.error_description = check.str_param(error_description, 
"error_description")\n\n\n@experimental\nclass ConstraintWithMetadata:\n    """This class defines a base constraint over pandas DFs with organized metadata.\n\n    Args:\n        description (str): description of the constraint\n        validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n                    the validation function to run over inputted data\n                    This function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    # TODO:  validation_fn returning metadata is sorta broken.  
maybe have it yield typecheck events and grab metadata?\n\n    def __init__(\n        self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None\n    ):\n        if name is None:\n            self.name = self.__class__.__name__\n        else:\n            self.name = name\n        self.description = description\n        # should return a tuple of (bool, and either an empty dict or a dict of extra params)\n        self.validation_fn = validation_fn\n        self.resulting_exception = resulting_exception\n        self.raise_or_typecheck = raise_or_typecheck\n\n    def validate(self, data, *args, **kwargs):\n        res = self.validation_fn(data, *args, **kwargs)\n        if not res[0]:\n            exc = self.resulting_exception(\n                constraint_name=self.name, constraint_description=self.description, **res[1]\n            )\n\n            if self.raise_or_typecheck:\n                raise exc\n            else:\n                return exc.return_as_typecheck()\n\n        else:\n            if res[0]:\n                return TypeCheck(success=True)\n\n    # TODO:  composition of validations\n    def as_dagster_type(self, *args, **kwargs):\n        if self.raise_or_typecheck:\n            raise Exception(\n                "Dagster types can only be constructed from constraints that return typechecks"\n            )\n        return DagsterType(\n            name=self.name,\n            description=f"A Pandas DataFrame with the following validation: {self.description}",\n            type_check_fn=lambda x: self.validate(x, *args),\n            **kwargs,\n        )\n\n\nclass MultiConstraintWithMetadata(ConstraintWithMetadata):\n    """Use this class if you have multiple constraints to check over the entire dataframe.\n\n    Args:\n        description (str): description of the constraint\n        validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n                    a list of the 
validation functions to run over inputted data\n                    Each function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    def __init__(\n        self,\n        description,\n        validation_fn_arr,\n        resulting_exception,\n        raise_or_typecheck=True,\n        name=None,\n    ):\n        validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")\n\n        def validation_fn(data, *args, **kwargs):\n            results = [f(data, *args, **kwargs) for f in validation_fn_arr]\n            truthparam = all(item[0] for item in results)\n            metadict = defaultdict(dict)\n            for i, dicta in enumerate(item[1] for item in results):\n                if len(dicta.keys()) > 0:\n                    for key in dicta:\n                        metadict[key][validation_fn_arr[i].__name__] = dicta[key]\n            return (truthparam, metadict)\n\n        super(MultiConstraintWithMetadata, self).__init__(\n            description,\n            validation_fn,\n            resulting_exception,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass StrictColumnsWithMetadata(ConstraintWithMetadata):\n    def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):\n        self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n        self.column_list = 
check.list_param(column_list, "strict_column_list", of_type=str)\n\n        def validation_fcn(inframe):\n            if list(inframe.columns) == column_list:\n                return (True, {})\n            else:\n                if self.enforce_ordering:\n                    resdict = {"expectation": self.column_list, "actual": list(inframe.columns)}\n                    return (False, resdict)\n                else:\n                    if set(inframe.columns) == set(column_list):\n                        return (True, {})\n                    else:\n                        extra = [x for x in inframe.columns if x not in set(column_list)]\n                        missing = [x for x in set(column_list) if x not in inframe.columns]\n                        resdict = {\n                            "expectation": self.column_list,\n                            "actual": {"extra_columns": extra, "missing_columns": missing},\n                        }\n                        return (False, resdict)\n\n        basestr = f"ensuring that the right columns, {self.column_list} were present"\n        if enforce_ordering:\n            basestr += " in the right order"\n        super(StrictColumnsWithMetadata, self).__init__(\n            basestr,\n            validation_fcn,\n            DataFrameWithMetadataException,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass DataFrameConstraint(Constraint):\n    """Base constraint object that represent Dataframe shape constraints.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        super(DataFrameConstraint, self).__init__(\n            
error_description=error_description, markdown_description=markdown_description\n        )\n\n    def validate(self, dataframe):\n        raise NotImplementedError()\n\n\n
[docs]class StrictColumnsConstraint(DataFrameConstraint):\n """A dataframe constraint that validates column existence and ordering.\n\n Args:\n strict_column_list (List[str]): The exact list of columns that your dataframe must have.\n enforce_ordering (Optional[bool]): If true, will enforce that the ordering of column names must match.\n Default is False.\n """\n\n def __init__(self, strict_column_list, enforce_ordering=False):\n self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n self.strict_column_list = check.list_param(\n strict_column_list, "strict_column_list", of_type=str\n )\n description = f"No columns outside of {self.strict_column_list} allowed. "\n if enforce_ordering:\n description += "Columns must be in that order."\n super(StrictColumnsConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n columns_received = list(dataframe.columns)\n if self.enforce_ordering:\n if self.strict_column_list != columns_received:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n "Expected the following ordering of columns {expected}. Received:"\n " {received}".format(\n expected=self.strict_column_list, received=columns_received\n )\n ),\n )\n for column in columns_received:\n if column not in self.strict_column_list:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {}. Recevied {}.".format(\n self.strict_column_list, columns_received\n ),\n )
\n\n\n
[docs]class RowCountConstraint(DataFrameConstraint):\n """A dataframe constraint that validates the expected count of rows.\n\n Args:\n num_allowed_rows (int): The number of allowed rows in your dataframe.\n error_tolerance (Optional[int]): The acceptable threshold if you are not completely certain. Defaults to 0.\n """\n\n def __init__(self, num_allowed_rows, error_tolerance=0):\n self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")\n self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))\n if self.error_tolerance > self.num_allowed_rows:\n raise ValueError("Tolerance can't be greater than the number of rows you expect.")\n description = f"Dataframe must have {self.num_allowed_rows} +- {self.error_tolerance} rows."\n super(RowCountConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n\n if not (\n self.num_allowed_rows - self.error_tolerance\n <= len(dataframe)\n <= self.num_allowed_rows + self.error_tolerance\n ):\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n "Expected {expected} +- {tolerance} rows. Got {received}".format(\n expected=self.num_allowed_rows,\n tolerance=self.error_tolerance,\n received=len(dataframe),\n )\n ),\n )
\n\n\ndef apply_ignore_missing_data_to_mask(mask, column):\n return mask & ~column.isnull()\n\n\nclass ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):\n """Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.\n\n Args:\n description (str): description of the constraint\n validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n\n offending_columns = set()\n offending_values = {}\n for column in columns:\n # TODO: grab extra metadata\n res = self.validation_fn(relevant_data[column])\n if not res[0]:\n offending_columns.add(column)\n if res[1].get("actual") is not None:\n offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]\n else:\n offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]\n if len(offending_columns) == 0 and not self.raise_or_typecheck:\n return TypeCheck(success=True)\n elif len(offending_columns) > 0:\n metadict = {\n "expectation": self.description.replace("Confirms", ""),\n "actual": offending_values,\n "offending": offending_columns,\n }\n exc = self.resulting_exception(\n 
constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass ColumnConstraintWithMetadata(ConstraintWithMetadata):\n """This class is useful for constructing single constraints that you want to apply to multiple\n columns of your dataframe.\n\n The main difference from the base class in terms of construction is that now, your validation_fns should operate on\n individual values.\n\n Args:\n description (str): description of the constraint\n validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n offending = {}\n offending_values = {}\n # TODO: grab metadata from here\n inverse_validation = lambda x: not self.validation_fn(x)[0]\n for column in columns:\n results = relevant_data[relevant_data[column].apply(inverse_validation)]\n if len(results.index.tolist()) > 0:\n offending[column] = ["row " + str(i) for i in (results.index.tolist())]\n offending_values[column] = results[column].tolist()\n if len(offending) == 0:\n if not self.raise_or_typecheck:\n return TypeCheck(success=True)\n else:\n metadict = {\n 
"expectation": self.validation_fn.__doc__,\n "actual": offending_values,\n "offending": offending,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):\n """This class is useful for constructing more complicated relationships between columns\n and expectations -- i.e. you want some validations on column A, others on column B, etc.\n This lets you package up the metadata neatly, and also allows for cases like 'fail if any one of\n these constraints fails but still run all of them'.\n\n Args:\n description (str): description of the overall set of validations\n fn_and_columns_dict (Dict[str, List[Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is 'a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. 
Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n type_for_internal=ColumnConstraintWithMetadata,\n name=None,\n ):\n # TODO: support multiple descriptions\n self.column_to_fn_dict = check.dict_param(\n fn_and_columns_dict, "fn_and_columns_dict", key_type=str\n )\n\n def validation_fn(data, *args, **kwargs):\n metadict = defaultdict(dict)\n truthparam = True\n for column, fn_arr in self.column_to_fn_dict.items():\n if column not in data.columns:\n continue\n for fn in fn_arr:\n # TODO: do this more effectively\n new_validator = type_for_internal(\n fn.__doc__, fn, ColumnWithMetadataException, raise_or_typecheck=False\n )\n result = new_validator.validate(\n DataFrame(data[column]), column, *args, **kwargs\n )\n result_val = result.success\n if result_val:\n continue\n result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data\n truthparam = truthparam and result_val\n for key in result_dict.keys():\n if "constraint" not in key:\n if key == "expected":\n new_key = "expectation"\n result_dict[key] = result_dict[key].replace("returns", "").strip()\n if column not in metadict[new_key] or new_key not in metadict:\n metadict[new_key][column] = dict()\n metadict[new_key][column][fn.__name__] = result_dict[key]\n else:\n if column not in metadict[key] or key not in metadict:\n metadict[key][column] = dict()\n if isinstance(result_dict[key], dict):\n metadict[key][column][fn.__name__] = result_dict[key][column]\n else:\n metadict[key][column][fn.__name__] = "a violation"\n return truthparam, metadict\n\n super(MultiColumnConstraintWithMetadata, self).__init__(\n description,\n validation_fn,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n def validate(self, data, *args, **kwargs):\n return ConstraintWithMetadata.validate(self, data, *args, 
**kwargs)\n\n\nclass MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):\n """This class is similar to multicolumn, but takes in functions that operate on the whole column at once\n rather than ones that operate on each value --\n consider this similar to the difference between apply-map and apply aggregate.\n\n Args:\n description (str): description of the overall set of validations (TODO: support multiple descriptions)\n fn_and_columns_dict (Dict[str, List[Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n name=None,\n ):\n super(MultiAggregateConstraintWithMetadata, self).__init__(\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n type_for_internal=ColumnAggregateConstraintWithMetadata,\n name=name,\n )\n\n\ndef non_null_validation(x):\n """Validates that a particular value in a column is not null.\n\n Usage:\n pass this as a column validator to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Generally, you should prefer to use nonnull as a decorator/wrapper rather than using this\n directly.\n """\n return not pd.isnull(x), {}\n\n\ndef all_unique_validator(column, ignore_missing_vals=False):\n 
"""Validates that all values in an iterable are unique.\n\n Returns duplicated values as metadata.\n\n Usage:\n As a validation function for a\n :py:class:'~dagster_pandas.constraints.ColumnAggregateConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiAggregateConstraintWithMetadata'\n Example:\n .. code-block:: python\n aggregate_validator = MultiAggregateConstraintWithMetadata(\n "confirms all values are unique",\n {'bar': [all_unique_validator]},\n ConstraintWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_aggregate_validator=aggregate_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 3], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'bar': {'all_unique_validator': 'a violation'}}\n metadata['actual'] == {'bar': {'all_unique_validator': [10.0]}}\n """\n column = pd.Series(column)\n duplicated = column.duplicated()\n if ignore_missing_vals:\n duplicated = apply_ignore_missing_data_to_mask(duplicated, column)\n return not duplicated.any(), {"actual": column[duplicated]}\n\n\ndef nonnull(func):\n """Decorator for column validation functions to make them error on nulls.\n\n Usage:\n pass decorated functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Args:\n func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n the column validator you want to error on nulls.\n """\n\n @wraps(func)\n def nvalidator(val):\n origval = func(val)\n nval = non_null_validation(val)\n return origval[0] and nval[0], {}\n\n nvalidator.__doc__ += " and ensures no values are null"\n\n return nvalidator\n\n\ndef column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):\n 
"""Factory for validators testing if column values are within a range.\n\n Args:\n minim(Optional[Comparable]): the low end of the range\n maxim(Optional[Comparable]): the high end of the range\n ignore_missing_vals(Optional[bool]): whether to ignore nulls.\n\n Returns: a validation function for this constraint\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Examples:\n .. code-block:: python\n in_range_validator = column_range_validation_factory(1, 3, ignore_missing_vals=True)\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [in_range_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'in_range_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'in_range_validation_fn': [7]}}\n\n """\n if minim is None:\n if isinstance(maxim, datetime):\n minim = datetime.min\n else:\n minim = -1 * (sys.maxsize - 1)\n if maxim is None:\n if isinstance(minim, datetime):\n maxim = datetime.max\n else:\n maxim = sys.maxsize\n\n def in_range_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}\n\n in_range_validation_fn.__doc__ = f"checks whether values are between {minim} and {maxim}"\n if ignore_missing_vals:\n in_range_validation_fn.__doc__ += ", ignoring nulls"\n\n return in_range_validation_fn\n\n\ndef categorical_column_validator_factory(categories, 
ignore_missing_vals=False):\n """Factory for validators testing if all values are in some set.\n\n Args:\n categories(Union[Sequence, set]): the set of allowed values\n ignore_missing_vals(Optional[bool]): whether to ignore nulls.\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Example:\n .. code-block:: python\n categorical_validation_fn = categorical_column_validator_factory([1, 2])\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [categorical_validation_fn]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}\n\n """\n categories = set(categories)\n\n def categorical_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (x in categories), {}\n\n categorical_validation_fn.__doc__ = (\n f"checks whether values are within this set of values: {categories}"\n )\n if ignore_missing_vals:\n categorical_validation_fn.__doc__ += ", ignoring nulls"\n\n return categorical_validation_fn\n\n\ndef dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):\n """Factory for testing if the dtype of a val falls within some allowed set.\n\n Args:\n datatypes(Union[set[type], type]): which datatype/datatypes are allowed\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a 
validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Examples:\n .. code-block:: python\n dtype_is_num_validator = dtype_in_set_validation_factory((int, float, int64, float64))\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [dtype_is_num_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 'a', 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 1']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': ['a']}}\n\n """\n\n def dtype_in_set_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return isinstance(x, datatypes), {}\n\n dtype_in_set_validation_fn.__doc__ = f"checks whether values are this type/types: {datatypes}"\n if ignore_missing_vals:\n dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"\n\n return dtype_in_set_validation_fn\n\n\nclass ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):\n def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):\n self.name = self.__class__.__name__\n\n description = f"Confirms values are between {minim} and {maxim}"\n super(ColumnRangeConstraintWithMetadata, self).__init__(\n description=description,\n validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),\n resulting_exception=ColumnWithMetadataException,\n raise_or_typecheck=raise_or_typecheck,\n )\n self.columns = columns\n\n def validate(self, data, *args, 
**kwargs):\n if self.columns is None:\n self.columns = list(data.columns)\n self.columns.extend(args)\n return super(ColumnRangeConstraintWithMetadata, self).validate(\n data, *self.columns, **kwargs\n )\n\n\nclass ColumnConstraint(Constraint):\n """Base constraint object that represent dataframe column shape constraints.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n super(ColumnConstraint, self).__init__(\n error_description=error_description, markdown_description=markdown_description\n )\n\n def validate(self, dataframe, column_name):\n pass\n\n @staticmethod\n def get_offending_row_pairs(dataframe, column_name):\n return zip(dataframe.index.tolist(), dataframe[column_name].tolist())\n\n\nclass ColumnDTypeFnConstraint(ColumnConstraint):\n """A column constraint that applies a pandas dtype validation function to a columns dtype.\n\n Args:\n type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and\n returns if those dtypes match the types it expects. 
See pandas.core.dtypes.common for examples.\n """\n\n def __init__(self, type_fn):\n self.type_fn = check.callable_param(type_fn, "type_fn")\n description = f'Dtype must satisfy "{self.type_fn.__name__}"'\n super(ColumnDTypeFnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n column_dtype = dataframe[column_name].dtype\n if not self.type_fn(column_dtype):\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=f'{self.error_description}, but was "{column_dtype}"',\n column_name=column_name,\n )\n\n\nclass ColumnDTypeInSetConstraint(ColumnConstraint):\n """A column constraint that validates the pandas column dtypes based on the expected set of dtypes.\n\n Args:\n expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.\n """\n\n def __init__(self, expected_dtype_set):\n self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")\n description = f"Column dtype must be in the following set {self.expected_dtype_set}."\n super(ColumnDTypeInSetConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n received_dtypes = dataframe[column_name].dtype\n if str(received_dtypes) not in self.expected_dtype_set:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n f"{self.error_description}. 
DTypes received: {received_dtypes}"\n ),\n column_name=column_name,\n )\n\n\nclass NonNullableColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are not null."""\n\n def __init__(self):\n description = "No Null values allowed."\n super(NonNullableColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n rows_with_null_columns = dataframe[dataframe[column_name].isna()]\n if not rows_with_null_columns.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=self.get_offending_row_pairs(rows_with_null_columns, column_name),\n )\n\n\nclass UniqueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are unique.\n\n Args:\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, ignore_missing_vals):\n description = "Column must be unique."\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(UniqueColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name].duplicated()\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_duplicated_values = dataframe[invalid]\n if not rows_with_duplicated_values.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_duplicated_values,\n )\n\n\nclass CategoricalColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are a valid category.\n\n 
Args:\n categories (Set[str]): Set of categories that values in your pandas column must match.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, categories, ignore_missing_vals):\n self.categories = list(check.set_param(categories, "categories", of_type=str))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(CategoricalColumnConstraint, self).__init__(\n error_description=f"Expected Categories are {self.categories}",\n markdown_description=f"Category examples are {self.categories[:5]}...",\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].isin(self.categories)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_unexpected_buckets = dataframe[invalid]\n if not rows_with_unexpected_buckets.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_unexpected_buckets,\n )\n\n\nclass MinValueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are greater than the provided\n lower bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, min_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MinValueColumnConstraint, self).__init__(\n markdown_description=f"values > {self.min_value}",\n error_description=f"Column must have values > {self.min_value}",\n )\n\n def validate(self, dataframe, column_name):\n invalid = 
dataframe[column_name] < self.min_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass MaxValueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are less than the provided\n upper bound [inclusive].\n\n Args:\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, max_value, ignore_missing_vals):\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MaxValueColumnConstraint, self).__init__(\n markdown_description=f"values < {self.max_value}",\n error_description=f"Column must have values < {self.max_value}",\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] > self.max_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass InRangeColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are between the lower and upper\n bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n 
ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non\n missing values.\n """\n\n def __init__(self, min_value, max_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(InRangeColumnConstraint, self).__init__(\n markdown_description=f"{self.min_value} < values < {self.max_value}",\n error_description="Column must have values between {} and {} inclusive.".format(\n self.min_value, self.max_value\n ),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].between(self.min_value, self.max_value)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n
", "current_page_name": "_modules/dagster_pandas/constraints", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.constraints"}, "data_frame": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.data_frame

\nimport pandas as pd\nfrom dagster import (\n    DagsterInvariantViolationError,\n    DagsterType,\n    Field,\n    MetadataValue,\n    StringSource,\n    TableColumn,\n    TableSchema,\n    TableSchemaMetadataValue,\n    TypeCheck,\n    _check as check,\n    dagster_type_loader,\n)\nfrom dagster._annotations import experimental\nfrom dagster._config import Selector\nfrom dagster._core.definitions.metadata import normalize_metadata\nfrom dagster._utils import dict_without_keys\n\nfrom dagster_pandas.constraints import (\n    CONSTRAINT_METADATA_KEY,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    ConstraintViolationException,\n)\nfrom dagster_pandas.validation import PandasColumn, validate_constraints\n\nCONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}\n\n\n@dagster_type_loader(\n    Selector(\n        {\n            "csv": {\n                "path": StringSource,\n                "sep": Field(StringSource, is_required=False, default_value=","),\n            },\n            "parquet": {"path": StringSource},\n            "table": {"path": StringSource},\n            "pickle": {"path": StringSource},\n        },\n    )\n)\ndef dataframe_loader(_context, config):\n    file_type, file_options = next(iter(config.items()))\n\n    if file_type == "csv":\n        path = file_options["path"]\n        return pd.read_csv(path, **dict_without_keys(file_options, "path"))\n    elif file_type == "parquet":\n        return pd.read_parquet(file_options["path"])\n    elif file_type == "table":\n        return pd.read_csv(file_options["path"], sep="\\t")\n    elif file_type == "pickle":\n        return pd.read_pickle(file_options["path"])\n    else:\n        raise DagsterInvariantViolationError(f"Unsupported file_type {file_type}")\n\n\ndef df_type_check(_, value):\n    if not isinstance(value, pd.DataFrame):\n        return TypeCheck(success=False)\n    return TypeCheck(\n        success=True,\n        metadata={\n            
"row_count": str(len(value)),\n            # string cast columns since they may be things like datetime\n            "metadata": {"columns": list(map(str, value.columns))},\n        },\n    )\n\n\nDataFrame = DagsterType(\n    name="PandasDataFrame",\n    description="""Two-dimensional size-mutable, potentially heterogeneous\n    tabular data structure with labeled axes (rows and columns).\n    See http://pandas.pydata.org/""",\n    loader=dataframe_loader,\n    type_check_fn=df_type_check,\n    typing_type=pd.DataFrame,\n)\n\n\ndef _construct_constraint_list(constraints):\n    def add_bullet(constraint_list, constraint_description):\n        return constraint_list + f"+ {constraint_description}\\n"\n\n    constraint_list = ""\n    for constraint in constraints:\n        if constraint.__class__ not in CONSTRAINT_BLACKLIST:\n            constraint_list = add_bullet(constraint_list, constraint.markdown_description)\n    return constraint_list\n\n\ndef _build_column_header(column_name, constraints):\n    header = f"**{column_name}**"\n    for constraint in constraints:\n        if isinstance(constraint, ColumnDTypeInSetConstraint):\n            dtypes_tuple = tuple(constraint.expected_dtype_set)\n            return header + f": `{dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]}`"\n        elif isinstance(constraint, ColumnDTypeFnConstraint):\n            return header + f": Validator `{constraint.type_fn.__name__}`"\n    return header\n\n\ndef create_dagster_pandas_dataframe_description(description, columns):\n    title = "\\n".join([description, "### Columns", ""])\n    buildme = title\n    for column in columns:\n        buildme += "{}\\n{}\\n".format(\n            _build_column_header(column.name, column.constraints),\n            _construct_constraint_list(column.constraints),\n        )\n    return buildme\n\n\ndef create_table_schema_metadata_from_dataframe(\n    pandas_df: pd.DataFrame,\n) -> TableSchemaMetadataValue:\n    """This function takes a 
pandas DataFrame and returns its metadata as a Dagster TableSchema.\n\n    Args:\n        pandas_df (pandas.DataFrame): A pandas DataFrame for which to create metadata.\n\n    Returns:\n        TableSchemaMetadataValue: returns an object with the TableSchema for the DataFrame.\n    """\n    check.inst(pandas_df, pd.DataFrame, "Input must be a pandas DataFrame object")\n    return MetadataValue.table_schema(\n        TableSchema(\n            columns=[\n                TableColumn(name=str(name), type=str(dtype))\n                for name, dtype in pandas_df.dtypes.items()\n            ]\n        )\n    )\n\n\n
[docs]def create_dagster_pandas_dataframe_type(\n name,\n description=None,\n columns=None,\n metadata_fn=None,\n dataframe_constraints=None,\n loader=None,\n):\n """Constructs a custom pandas dataframe dagster type.\n\n Args:\n name (str): Name of the dagster pandas type.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects\n which express dataframe column schemas and constraints.\n metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]])\n A callable which takes your dataframe and returns a dict with string label keys and\n MetadataValue values.\n dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from\n :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n """\n # We allow for the plugging in of a dagster_type_loader so that users can load their custom\n # dataframes via configuration their own way if the default configs don't suffice. This is\n # purely optional.\n check.str_param(name, "name")\n metadata_fn = check.opt_callable_param(metadata_fn, "metadata_fn")\n description = create_dagster_pandas_dataframe_description(\n check.opt_str_param(description, "description", default=""),\n check.opt_list_param(columns, "columns", of_type=PandasColumn),\n )\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description=(\n f"Must be a pandas.DataFrame. Got value of type. 
{type(value).__name__}"\n ),\n )\n\n try:\n validate_constraints(\n value,\n pandas_columns=columns,\n dataframe_constraints=dataframe_constraints,\n )\n except ConstraintViolationException as e:\n return TypeCheck(success=False, description=str(e))\n\n return TypeCheck(\n success=True,\n metadata=_execute_summary_stats(name, value, metadata_fn) if metadata_fn else None,\n )\n\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n description=description,\n typing_type=pd.DataFrame,\n )
\n\n\n@experimental\ndef create_structured_dataframe_type(\n name,\n description=None,\n columns_validator=None,\n columns_aggregate_validator=None,\n dataframe_validator=None,\n loader=None,\n):\n """Args:\n name (str): the name of the new type\n description (Optional[str]): the description of the new type\n columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):\n what column-level row by row validation you want to have applied.\n Leave empty for no column-level row by row validation.\n columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,\n MultiAggregateConstraintWithMetadata]]):\n what column-level aggregate validation you want to have applied,\n Leave empty for no column-level aggregate validation.\n dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):\n what dataframe-wide validation you want to have applied.\n Leave empty for no dataframe-wide validation.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n\n Returns:\n a DagsterType with the corresponding name and packaged validation.\n\n """\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description=(\n f"Must be a pandas.DataFrame. Got value of type. 
{type(value).__name__}"\n ),\n )\n individual_result_dict = {}\n\n if dataframe_validator is not None:\n individual_result_dict["dataframe"] = dataframe_validator.validate(value)\n if columns_validator is not None:\n individual_result_dict["columns"] = columns_validator.validate(value)\n\n if columns_aggregate_validator is not None:\n individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(\n value\n )\n\n typechecks_succeeded = True\n metadata = {}\n overall_description = "Failed Constraints: {}"\n constraint_clauses = []\n for key, result in individual_result_dict.items():\n result_val = result.success\n if result_val:\n continue\n typechecks_succeeded = typechecks_succeeded and result_val\n result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data\n metadata[f"{key}-constraint-metadata"] = MetadataValue.json(result_dict)\n constraint_clauses.append(f"{key} failing constraints, {result.description}")\n # returns aggregates, then column, then dataframe\n return TypeCheck(\n success=typechecks_succeeded,\n description=overall_description.format(constraint_clauses),\n metadata=metadata,\n )\n\n description = check.opt_str_param(description, "description", default="")\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n description=description,\n )\n\n\ndef _execute_summary_stats(type_name, value, metadata_fn):\n if not metadata_fn:\n return []\n\n user_metadata = metadata_fn(value)\n try:\n return normalize_metadata(user_metadata)\n except:\n raise DagsterInvariantViolationError(\n "The return value of the user-defined summary_statistics function for pandas "\n f"data frame type {type_name} returned {value}. This function must return "\n "Dict[str, RawMetadataValue]."\n )\n
", "current_page_name": "_modules/dagster_pandas/data_frame", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.data_frame"}, "validation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.validation

\nfrom dagster import (\n    DagsterInvariantViolationError,\n    _check as check,\n)\nfrom pandas import DataFrame, Timestamp\nfrom pandas.core.dtypes.common import (\n    is_bool_dtype,\n    is_float_dtype,\n    is_integer_dtype,\n    is_numeric_dtype,\n    is_string_dtype,\n)\n\nfrom dagster_pandas.constraints import (\n    CategoricalColumnConstraint,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    Constraint,\n    ConstraintViolationException,\n    DataFrameConstraint,\n    InRangeColumnConstraint,\n    NonNullableColumnConstraint,\n    UniqueColumnConstraint,\n)\n\nPANDAS_NUMERIC_TYPES = {"int64", "float"}\n\n\ndef _construct_keyword_constraints(non_nullable, unique, ignore_missing_vals):\n    non_nullable = check.bool_param(non_nullable, "exists")\n    unique = check.bool_param(unique, "unique")\n    ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n    if non_nullable and ignore_missing_vals:\n        raise DagsterInvariantViolationError(\n            "PandasColumn cannot have a non-null constraint while also ignore missing values"\n        )\n    constraints = []\n    if non_nullable:\n        constraints.append(NonNullableColumnConstraint())\n    if unique:\n        constraints.append(UniqueColumnConstraint(ignore_missing_vals=ignore_missing_vals))\n    return constraints\n\n\n
[docs]class PandasColumn:\n """The main API for expressing column level schemas and constraints for your custom dataframe\n types.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If th column exists, the validate function will validate the column. Defaults to True.\n constraints (Optional[List[Constraint]]): List of constraint objects that indicate the\n validation rules for the pandas column.\n """\n\n def __init__(self, name, constraints=None, is_required=None):\n self.name = check.str_param(name, "name")\n self.is_required = check.opt_bool_param(is_required, "is_required", default=True)\n self.constraints = check.opt_list_param(constraints, "constraints", of_type=Constraint)\n\n def validate(self, dataframe):\n if self.name not in dataframe.columns:\n # Ignore validation if column is missing from dataframe and is not required\n if self.is_required:\n raise ConstraintViolationException(\n f"Required column {self.name} not in dataframe with columns {dataframe.columns}"\n )\n else:\n for constraint in self.constraints:\n constraint.validate(dataframe, self.name)\n\n @staticmethod\n def exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None):\n """Simple constructor for PandasColumns that expresses existence constraints.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. 
Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=_construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def boolean_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_bool_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def numeric_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_numeric_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def integer_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_integer_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def float_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_float_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def datetime_column(\n name,\n min_datetime=Timestamp.min,\n max_datetime=Timestamp.max,\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n tz=None,\n ):\n """Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_datetime (Optional[Union[int,float]]): The lower bound for values you expect in this column.\n Defaults to pandas.Timestamp.min.\n max_datetime (Optional[Union[int,float]]): The upper bound for values you expect in this column.\n Defaults to pandas.Timestamp.max.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin', tz='US/Eastern'.\n Defaults to None, meaning naive datetime values.\n """\n if tz is None:\n datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})\n else:\n datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})\n # One day more/less than absolute min/max to prevent OutOfBoundsDatetime errors when converting min/max to be tz aware\n if min_datetime.tz_localize(None) == Timestamp.min:\n min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")\n if max_datetime.tz_localize(None) == Timestamp.max:\n max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")\n # Convert bounds to same tz\n if Timestamp(min_datetime).tz is None:\n min_datetime = Timestamp(min_datetime).tz_localize(tz)\n if Timestamp(max_datetime).tz is None:\n max_datetime = Timestamp(max_datetime).tz_localize(tz)\n\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n datetime_constraint,\n InRangeColumnConstraint(\n min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def string_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """Simple constructor for PandasColumns that expresses constraints on string dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. 
If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_string_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def categorical_column(\n name,\n categories,\n of_types=frozenset({"category", "object"}),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n categories (List[Any]): The valid set of buckets that all values in the column must match.\n of_types (Optional[Union[str, Set[str]]]): The expected dtype[s] that your categories and values must\n abide by.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in\n the column ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the\n constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n of_types = {of_types} if isinstance(of_types, str) else of_types\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeInSetConstraint(of_types),\n CategoricalColumnConstraint(categories, ignore_missing_vals=ignore_missing_vals),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n\ndef validate_constraints(dataframe, pandas_columns=None, dataframe_constraints=None):\n dataframe = check.inst_param(dataframe, "dataframe", DataFrame)\n pandas_columns = check.opt_list_param(\n pandas_columns, "column_constraints", of_type=PandasColumn\n )\n dataframe_constraints = check.opt_list_param(\n dataframe_constraints, "dataframe_constraints", of_type=DataFrameConstraint\n )\n\n if pandas_columns:\n for column in pandas_columns:\n column.validate(dataframe)\n\n if dataframe_constraints:\n for dataframe_constraint in dataframe_constraints:\n dataframe_constraint.validate(dataframe)\n
", "current_page_name": "_modules/dagster_pandas/validation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.validation"}}, "dagster_postgres": {"event_log": {"event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.event_log.event_log

\nfrom typing import Any, ContextManager, Mapping, Optional, Sequence\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.config import pg_config\nfrom dagster._core.storage.event_log import (\n    AssetKeyTable,\n    DynamicPartitionsTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlEventLogStorageTable,\n)\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster._core.storage.event_log.polling_event_watcher import SqlPollingEventWatcher\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlalchemy_compat import db_select\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, deserialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\nCHANNEL_NAME = "run_events"\n\n\n
[docs]class PostgresEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """Postgres-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your event log storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 12-21\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = check.str_param(postgres_url, "postgres_url")\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n self._disposed = False\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n 
self._secondary_index_cache = {}\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n if "event_logs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self._connect() as conn:\n with conn.begin():\n SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n def upgrade(self) -> None:\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: Mapping[str, Any]\n ) -> "PostgresEventLogStorage":\n return PostgresEventLogStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n conn_string: str, 
should_autocreate_tables: bool = True\n ) -> "PostgresEventLogStorage":\n engine = create_engine(\n conn_string, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n return PostgresEventLogStorage(conn_string, should_autocreate_tables)\n\n def store_event(self, event: EventLogEntry) -> None:\n """Store an event corresponding to a run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event) # from SqlEventLogStorage.py\n with self._connect() as conn:\n result = conn.execute(\n insert_event_statement.returning(\n SqlEventLogStorageTable.c.run_id, SqlEventLogStorageTable.c.id\n )\n )\n res = result.fetchone()\n result.close()\n\n # LISTEN/NOTIFY no longer used for pg event watch - preserved here to support version skew\n conn.execute(\n db.text(f"""NOTIFY {CHANNEL_NAME}, :notify_id; """),\n {"notify_id": res[0] + "_" + str(res[1])}, # type: ignore\n )\n event_id = int(res[1]) # type: ignore\n\n if (\n event.is_dagster_event\n and event.dagster_event_type in ASSET_EVENTS\n and event.dagster_event.asset_key # type: ignore\n ):\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, event_id)\n\n def store_asset_event(self, event: EventLogEntry, event_id: int) -> None:\n check.inst_param(event, "event", EventLogEntry)\n if not (event.dagster_event and event.dagster_event.asset_key):\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # job, run_id, 
etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in the Dagster UI.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. 
This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n\n values = self._get_asset_entry_values(\n event, event_id, self.has_secondary_index(ASSET_KEY_INDEX_COLS)\n )\n with self.index_connection() as conn:\n query = db_dialects.postgresql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(),\n **values,\n )\n if values:\n query = query.on_conflict_do_update(\n index_elements=[AssetKeyTable.c.asset_key],\n set_=dict(**values),\n )\n else:\n query = query.on_conflict_do_nothing()\n conn.execute(query)\n\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n if not partition_keys:\n return\n\n # Overload base implementation to push upsert logic down into the db layer\n self._check_partitions_table()\n with self.index_connection() as conn:\n conn.execute(\n db_dialects.postgresql.insert(DynamicPartitionsTable)\n .values(\n [\n dict(partitions_def_name=partitions_def_name, partition=partition_key)\n for partition_key in partition_keys\n ]\n )\n .on_conflict_do_nothing(),\n )\n\n def _connect(self) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def run_connection(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return self._connect()\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n return bool(self._engine.dialect.has_table(self._engine.connect(), table_name))\n\n def has_secondary_index(self, name: str) -> bool:\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n PostgresEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name: str) -> None:\n 
super(PostgresEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(\n self,\n run_id: str,\n cursor: Optional[str],\n callback: EventHandlerFn,\n ) -> None:\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def _gen_event_log_entry_from_cursor(self, cursor) -> EventLogEntry:\n with self._engine.connect() as conn:\n cursor_res = conn.execute(\n db_select([SqlEventLogStorageTable.c.event]).where(\n SqlEventLogStorageTable.c.id == cursor\n ),\n )\n return deserialize_value(cursor_res.scalar(), EventLogEntry) # type: ignore\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n self._event_watcher.unwatch_run(run_id, handler)\n\n def __del__(self) -> None:\n # Keep the inherent limitations of __del__ in Python in mind!\n self.dispose()\n\n def dispose(self) -> None:\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/event_log/event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.run_storage.run_storage

\nimport zlib\nfrom typing import ContextManager, Mapping, Optional\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import PostgresStorageConfig, pg_config\nfrom dagster._core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster._core.storage.runs.schema import KeyValueStoreTable, SnapshotsTable\nfrom dagster._core.storage.runs.sql_run_storage import SnapshotType\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresRunStorage(SqlRunStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your run storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 1-10\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db_pool.NullPool,\n )\n\n self._index_migration_cache = {}\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because 
alembic config may be shared with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n if "runs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n with conn.begin():\n RunStorageSqlMetadata.create_all(conn)\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold 1 open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: PostgresStorageConfig\n ):\n return PostgresRunStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n postgres_url: str, should_autocreate_tables: bool = True\n ) -> "PostgresRunStorage":\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n 
RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresRunStorage(postgres_url, should_autocreate_tables)\n\n def connect(self) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def upgrade(self) -> None:\n with self.connect() as conn:\n run_alembic_upgrade(pg_alembic_config(__file__), conn)\n\n def has_built_index(self, migration_name: str) -> bool:\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n PostgresRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name: str) -> None:\n super(PostgresRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n # insert or update if already present, using postgres specific on_conflict\n conn.execute(\n db_dialects.postgresql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n .on_conflict_do_update(\n index_elements=[DaemonHeartbeatsTable.c.daemon_type],\n set_={\n "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n "daemon_id": daemon_heartbeat.daemon_id,\n "body": serialize_value(daemon_heartbeat),\n },\n )\n .returning(\n # required because sqlalchemy might by default return the declared primary key,\n # which might not exist\n DaemonHeartbeatsTable.c.daemon_type,\n )\n )\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n\n # pg speciic on_conflict_do_update\n insert_stmt = 
db_dialects.postgresql.insert(KeyValueStoreTable).values(\n [{"key": k, "value": v} for k, v in pairs.items()]\n )\n upsert_stmt = insert_stmt.on_conflict_do_update(\n index_elements=[\n KeyValueStoreTable.c.key,\n ],\n set_={"value": insert_stmt.excluded.value},\n ).returning(\n # required because sqlalchemy might by default return the declared primary key,\n # which might not exist\n KeyValueStoreTable.c.key\n )\n\n with self.connect() as conn:\n conn.execute(upsert_stmt)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n with self.connect() as conn:\n snapshot_insert = (\n db_dialects.postgresql.insert(SnapshotsTable)\n .values(\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(serialize_value(snapshot_obj).encode("utf-8")),\n snapshot_type=snapshot_type.value,\n )\n .on_conflict_do_nothing()\n )\n conn.execute(snapshot_insert)\n return snapshot_id\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/run_storage/run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.schedule_storage.schedule_storage

\nfrom typing import ContextManager, Optional\n\nimport dagster._check as check\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.scheduler.instigation import InstigatorState\nfrom dagster._core.storage.config import PostgresStorageConfig, pg_config\nfrom dagster._core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster._core.storage.schedules.schema import InstigatorsTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your schedule storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 23-32\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared 
with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n missing_main_table = "schedules" not in table_names and "jobs" not in table_names\n if missing_main_table:\n retry_pg_creation_fn(self._init_db)\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n with conn.begin():\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: PostgresStorageConfig\n ) -> "PostgresScheduleStorage":\n return PostgresScheduleStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n postgres_url: str, should_autocreate_tables: bool = True\n ) -> "PostgresScheduleStorage":\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n 
finally:\n engine.dispose()\n return PostgresScheduleStorage(postgres_url, should_autocreate_tables)\n\n def connect(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def upgrade(self) -> None:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn: Connection, state: InstigatorState) -> None:\n selector_id = state.selector_id\n conn.execute(\n db_dialects.postgresql.insert(InstigatorsTable)\n .values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n .on_conflict_do_update(\n index_elements=[InstigatorsTable.c.selector_id],\n set_={\n "status": state.status.value,\n "instigator_type": state.instigator_type.value,\n "instigator_body": serialize_value(state),\n "update_timestamp": pendulum.now("UTC"),\n },\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/schedule_storage/schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.schedule_storage.schedule_storage"}}}, "dagster_prometheus": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_prometheus.resources

\nimport prometheus_client\nfrom dagster import (\n    ConfigurableResource,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom prometheus_client.exposition import default_handler\nfrom pydantic import Field, PrivateAttr\n\n\n
[docs]class PrometheusClient:\n """Integrates with Prometheus via the prometheus_client library."""
\n\n\n
[docs]class PrometheusResource(ConfigurableResource):\n """This resource is used to send metrics to a Prometheus Pushgateway.\n\n **Example:**\n\n .. code-block:: python\n\n from dagster_prometheus import PrometheusResource\n from dagster import Definitions, job, op\n\n @op\n def example_prometheus_op(prometheus: PrometheusResource):\n prometheus.push_to_gateway(job="my_job")\n\n @job\n def my_job():\n example_prometheus_op()\n\n defs = Definitions(\n jobs=[my_job],\n resources={"prometheus": PrometheusResource(gateway="http://pushgateway.local")},\n )\n\n """\n\n gateway: str = Field(\n description=(\n "The url for your push gateway. Either of the"\n " form 'http://pushgateway.local', or 'pushgateway.local'."\n " Scheme defaults to 'http' if none is provided"\n )\n )\n timeout: int = Field(\n default=30,\n description="is how long delete will attempt to connect before giving up. Defaults to 30s.",\n )\n _registry: prometheus_client.CollectorRegistry = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def setup_for_execution(self, context: InitResourceContext) -> None:\n self._registry = prometheus_client.CollectorRegistry()\n\n @property\n def registry(self) -> prometheus_client.CollectorRegistry:\n return self._registry\n\n def push_to_gateway(self, job, grouping_key=None, handler=default_handler) -> None:\n """Push metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n If not None, the argument must be a function which accepts\n the following arguments:\n url, method, timeout, headers, and content\n May be used to implement additional functionality not\n supported by the built-in 
default handler (such as SSL\n client certicates, and HTTP authentication mechanisms).\n 'url' is the URL for the request, the 'gateway' argument\n described earlier will form the basis of this URL.\n 'method' is the HTTP method which should be used when\n carrying out the request.\n 'timeout' requests not successfully completed after this\n many seconds should be aborted. If timeout is None, then\n the handler should not set a timeout.\n 'headers' is a list of ("header-name","header-value") tuples\n which must be passed to the pushgateway in the form of HTTP\n request headers.\n The function should raise an exception (e.g. IOError) on\n failure.\n 'content' is the data which should be used to form the HTTP\n Message Body.\n This overwrites all metrics with the same job and grouping_key.\n This uses the PUT HTTP method.\n """\n prometheus_client.push_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self._registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def pushadd_to_gateway(self, job, grouping_key=None, handler=default_handler) -> None:\n """PushAdd metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `registry` is an instance of CollectorRegistry\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This replaces metrics with the same name, job and grouping_key.\n This uses the POST HTTP method.\n """\n prometheus_client.pushadd_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self._registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def delete_from_gateway(self, job, grouping_key=None, 
handler=default_handler) -> None:\n """Delete metrics from the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This deletes metrics with the given job and grouping_key.\n This uses the DELETE HTTP method.\n """\n prometheus_client.delete_from_gateway(\n gateway=self.gateway,\n job=job,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=PrometheusResource.to_config_schema(),\n description="""This resource is for sending metrics to a Prometheus Pushgateway.""",\n)\ndef prometheus_resource(context):\n return PrometheusResource(\n gateway=context.resource_config["gateway"], timeout=context.resource_config["timeout"]\n )
\n
", "current_page_name": "_modules/dagster_prometheus/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_prometheus.resources"}}, "dagster_pyspark": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pyspark.resources

\nfrom typing import Any, Dict\n\nimport dagster._check as check\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster_spark.configs_spark import spark_config\nfrom dagster_spark.utils import flatten_dict\nfrom pydantic import PrivateAttr\nfrom pyspark.sql import SparkSession\n\n\ndef spark_session_from_config(spark_conf=None):\n    spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n    builder = SparkSession.builder\n    flat = flatten_dict(spark_conf)\n    for key, value in flat:\n        builder = builder.config(key, value)\n\n    return builder.getOrCreate()\n\n\n
[docs]class PySparkResource(ConfigurableResource):\n """This resource provides access to a PySpark Session for executing PySpark code within Dagster.\n\n Example:\n .. code-block:: python\n\n @op\n def my_op(pyspark: PySparkResource)\n spark_session = pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n\n @job(\n resource_defs={\n "pyspark": PySparkResource(\n spark_config={\n "spark.executor.memory": "2g"\n }\n )\n }\n )\n def my_spark_job():\n my_op()\n """\n\n spark_config: Dict[str, Any]\n _spark_session = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def setup_for_execution(self, context: InitResourceContext) -> None:\n self._spark_session = spark_session_from_config(self.spark_config)\n\n @property\n def spark_session(self) -> Any:\n return self._spark_session\n\n @property\n def spark_context(self) -> Any:\n return self.spark_session.sparkContext
\n\n\n
[docs]@dagster_maintained_resource\n@resource({"spark_conf": spark_config()})\ndef pyspark_resource(init_context) -> PySparkResource:\n """This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.\n\n Example:\n .. code-block:: python\n\n @op(required_resource_keys={"pyspark"})\n def my_op(context):\n spark_session = context.resources.pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n """\n context_updated_config = init_context.replace_config(\n {"spark_config": init_context.resource_config["spark_conf"]}\n )\n return PySparkResource.from_resource_context(context_updated_config)
\n\n\nclass LazyPySparkResource(ConfigurableResource):\n """This resource provides access to a lazily-created PySpark SparkSession for executing PySpark\n code within Dagster, avoiding the creation of a SparkSession object until the .spark_session attribute\n of the resource is accessed. This is helpful for avoiding the creation (and startup penalty) of a SparkSession\n until it is actually needed / accessed by an op or IOManager.\n\n Example:\n .. code-block:: python\n\n @op\n def my_op(lazy_pyspark: LazyPySparkResource)\n spark_session = lazy_pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n @job(\n resource_defs={\n "lazy_pyspark": LazyPySparkResource(\n spark_config={\n "spark.executor.memory": "2g"\n }\n )\n }\n )\n def my_spark_job():\n my_op()\n """\n\n spark_config: Dict[str, Any]\n _spark_session = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def _init_session(self) -> None:\n if self._spark_session is None:\n self._spark_session = spark_session_from_config(self.spark_config)\n\n @property\n def spark_session(self) -> Any:\n self._init_session()\n return self._spark_session\n\n @property\n def spark_context(self) -> Any:\n self._init_session()\n return self._spark_session.sparkContext\n\n\n@dagster_maintained_resource\n@resource({"spark_conf": spark_config()})\ndef lazy_pyspark_resource(init_context: InitResourceContext) -> LazyPySparkResource:\n """This resource provides access to a lazily-created PySpark SparkSession for executing PySpark\n code within Dagster, avoiding the creation of a SparkSession object until the .spark_session attribute\n of the resource is accessed. This is helpful for avoiding the creation (and startup penalty) of a SparkSession\n until it is actually needed / accessed by an op or IOManager.\n\n Example:\n .. 
code-block:: python\n\n @op(required_resource_keys={"lazy_pyspark"})\n def my_op(context):\n spark_session = context.resources.lazy_pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = lazy_pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"lazy_pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n """\n context_updated_config = init_context.replace_config(\n {"spark_config": init_context.resource_config["spark_conf"]}\n )\n return LazyPySparkResource.from_resource_context(context_updated_config)\n
", "current_page_name": "_modules/dagster_pyspark/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pyspark.resources"}}, "dagster_shell": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_shell.ops

\nimport os\nfrom enum import Enum\nfrom typing import AbstractSet, Any, Dict, Mapping, Optional\n\nfrom dagster import (\n    Config,\n    Failure,\n    In,\n    Nothing,\n    OpExecutionContext,\n    Out,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom pydantic import Field\n\nfrom .utils import execute, execute_script_file\n\n\nclass OutputType(Enum):\n    STREAM = "STREAM"\n    """Stream script stdout/stderr."""\n\n    BUFFER = "BUFFER"\n    """Buffer shell script stdout/stderr, then log upon completion."""\n\n    NONE = "NONE"\n    """No logging."""\n\n\nclass ShellOpConfig(Config):\n    env: Optional[Dict[str, str]] = Field(\n        default=None,\n        description="An optional dict of environment variables to pass to the subprocess.",\n    )\n    output_logging: OutputType = Field(\n        OutputType.BUFFER.value,\n    )\n    cwd: Optional[str] = Field(\n        default=None, description="Working directory in which to execute shell script"\n    )\n\n    def to_execute_params(self) -> Dict[str, Any]:\n        return {\n            "env": {**os.environ, **(self.env or {})},\n            "output_logging": self.output_logging.value,\n            "cwd": self.cwd,\n        }\n\n\n
[docs]@op(\n name="shell_op",\n description=(\n "This op executes a shell command it receives as input.\\n\\n"\n "This op is suitable for uses where the command to execute is generated dynamically by "\n "upstream ops. If you know the command to execute at job construction time, "\n "consider `shell_command_op` instead."\n ),\n ins={"shell_command": In(str)},\n out=Out(str),\n)\ndef shell_op(context: OpExecutionContext, shell_command: str, config: ShellOpConfig) -> str:\n """This op executes a shell command it receives as input.\n This op is suitable for uses where the command to execute is generated dynamically by\n upstream ops. If you know the command to execute at job construction time,\n consider ``shell_command_op`` instead.\n\n Args:\n shell_command: The shell command to be executed\n config (ShellOpConfig): A ShellOpConfig object specifying configuration options\n\n Examples:\n .. code-block:: python\n\n @op\n def create_shell_command():\n return "echo hello world!"\n\n @graph\n def echo_graph():\n shell_op(create_shell_command())\n """\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output
\n\n\n
[docs]def create_shell_command_op(\n shell_command: str,\n name: str,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n) -> OpDefinition:\n """This function is a factory that constructs ops to execute a shell command.\n\n Note that you can only use ``shell_command_op`` if you know the command you'd like to execute\n at job construction time. If you'd like to construct shell commands dynamically during\n job execution and pass them between ops, you should use ``shell_op`` instead.\n\n The resulting op can take a single ``start`` argument that is a\n `Nothing dependency <https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies>`__\n to allow you to run ops before the shell op.\n\n Examples:\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_op.py\n :language: python\n\n .. code-block:: python\n\n @op\n def run_before_shell_op():\n do_some_work()\n\n @graph\n def my_graph():\n my_echo_op = create_shell_command_op("echo hello world!", name="echo_op")\n my_echo_op(start=run_before_shell_op())\n\n\n Args:\n shell_command (str): The shell command that the constructed op will execute.\n name (str): The name of the constructed op.\n description (Optional[str]): Human-readable description of this op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n\n @op(\n name=name,\n description=description,\n ins={"start": In(Nothing)},\n out=Out(str),\n required_resource_keys=required_resource_keys,\n tags=tags,\n )\n def _shell_fn(context, config: ShellOpConfig):\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output\n\n return _shell_fn
\n\n\n
[docs]def create_shell_script_op(\n shell_script_path,\n name="create_shell_script_op",\n ins: Optional[Mapping[str, In]] = None,\n **kwargs: Any,\n) -> OpDefinition:\n """This function is a factory which constructs an op that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@op\n <dagster.op>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@graph <dagster.graph>` to wrap this op\n in the cases where you'd like to configure the shell op with different config fields.\n\n If no ``ins`` are passed then the resulting op can take a single ``start`` argument that is a\n `Nothing dependency <https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies>`__\n to allow you to run ops before the shell op.\n\n\n Examples:\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_op.py\n :language: python\n\n .. code-block:: python\n\n @op\n def run_before_shell_op():\n do_some_work()\n\n @graph\n def my_graph():\n my_echo_op = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="echo_op")\n my_echo_op(start=run_before_shell_op())\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (Optional[str]): The name of this op. Defaults to "create_shell_script_op".\n ins (Optional[Mapping[str, In]]): Ins for the op. 
Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n check.str_param(shell_script_path, "shell_script_path")\n name = check.str_param(name, "name")\n check.opt_mapping_param(ins, "ins", value_type=In)\n\n if "config" in kwargs:\n raise TypeError("Overriding config for shell op is not supported.")\n\n @op(\n name=name,\n description=kwargs.pop("description", "An op to invoke a shell command."),\n ins=ins or {"start": In(Nothing)},\n out=Out(str),\n **kwargs,\n )\n def _shell_script_fn(context, config: ShellOpConfig):\n output, return_code = execute_script_file(\n shell_script_path=shell_script_path, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output\n\n return _shell_script_fn
\n
", "current_page_name": "_modules/dagster_shell/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_shell.ops"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_shell.utils

\n#\n# NOTE: This file is based on the bash operator from Apache Airflow, which can be found here:\n# https://github.com/apache/airflow/blob/master/airflow/operators/bash.py\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# "License"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n#   http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\nimport os\nimport signal\nfrom logging import Logger\nfrom subprocess import PIPE, STDOUT, Popen\nfrom typing import Mapping, Optional, Tuple\n\nimport dagster._check as check\nfrom dagster._utils import safe_tempfile_path\nfrom typing_extensions import Final\n\nOUTPUT_LOGGING_OPTIONS: Final = ["STREAM", "BUFFER", "NONE"]\n\n\ndef execute_script_file(\n    shell_script_path: str,\n    output_logging: str,\n    log: Logger,\n    cwd: Optional[str] = None,\n    env: Optional[Mapping[str, str]] = None,\n) -> Tuple[str, int]:\n    """Execute a shell script file specified by the argument ``shell_script_path``. The script will be\n    invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.\n\n    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr\n    output is retrieved.\n\n    Examples:\n        .. 
literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_utility.py\n           :language: python\n\n    Args:\n        shell_script_path (str): The shell script to execute.\n        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.\n        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()\n        cwd (str, optional): Working directory for the shell command to use. Defaults to the\n            temporary path where we store the shell command in a script file.\n        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.\n            Unused by default.\n\n    Raises:\n        Exception: When an invalid output_logging is selected. Unreachable from op-based\n            invocation since the config system will check output_logging against the config\n            enum.\n\n    Returns:\n        Tuple[str, int]: A tuple where the first element is the combined stdout/stderr output of running the shell\n        command and the second element is the return code.\n    """\n    check.str_param(shell_script_path, "shell_script_path")\n    check.str_param(output_logging, "output_logging")\n    check.opt_str_param(cwd, "cwd", default=os.path.dirname(shell_script_path))\n    env = check.opt_nullable_dict_param(env, "env", key_type=str, value_type=str)\n\n    if output_logging not in OUTPUT_LOGGING_OPTIONS:\n        raise Exception("Unrecognized output_logging %s" % output_logging)\n\n    def pre_exec():\n        # Restore default signal disposition and invoke setsid\n        for sig in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):\n            if hasattr(signal, sig):\n                signal.signal(getattr(signal, sig), signal.SIG_DFL)\n        os.setsid()\n\n    with open(shell_script_path, "rb") as f:\n        shell_command = f.read().decode("utf-8")\n\n    log.info(f"Running command:\\n{shell_command}")\n\n    sub_process = None\n  
  try:\n        stdout_pipe = PIPE\n        stderr_pipe = STDOUT\n        if output_logging == "NONE":\n            stdout_pipe = stderr_pipe = None\n\n        sub_process = Popen(\n            ["bash", shell_script_path],\n            stdout=stdout_pipe,\n            stderr=stderr_pipe,\n            cwd=cwd,\n            env=env,\n            preexec_fn=pre_exec,  # noqa: PLW1509\n            encoding="UTF-8",\n        )\n\n        log.info(f"Command pid: {sub_process.pid}")\n\n        output = ""\n        if output_logging == "STREAM":\n            assert sub_process.stdout is not None, "Setting stdout=PIPE should always set stdout."\n            # Stream back logs as they are emitted\n            lines = []\n            for line in sub_process.stdout:\n                log.info(line.rstrip())\n                lines.append(line)\n            output = "".join(lines)\n        elif output_logging == "BUFFER":\n            # Collect and buffer all logs, then emit\n            output, _ = sub_process.communicate()\n            log.info(output)\n\n        sub_process.wait()\n        log.info(f"Command exited with return code {sub_process.returncode}")\n\n        return output, sub_process.returncode\n    finally:\n        # Always terminate subprocess, including in cases where the run is terminated\n        if sub_process:\n            sub_process.terminate()\n\n\ndef execute(\n    shell_command: str,\n    output_logging: str,\n    log: Logger,\n    cwd: Optional[str] = None,\n    env: Optional[Mapping[str, str]] = None,\n) -> Tuple[str, int]:\n    """This function is a utility for executing shell commands from within a Dagster op (or from Python in general).\n    It can be used to execute shell commands on either op input data, or any data generated within a generic python op.\n\n    Internally, it executes a shell script specified by the argument ``shell_command``. 
The script will be written\n    to a temporary file first and invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.\n\n    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr\n    output is retrieved.\n\n    Examples:\n        .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_utility.py\n           :language: python\n\n    Args:\n        shell_command (str): The shell command to execute\n        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.\n        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()\n        cwd (str, optional): Working directory for the shell command to use. Defaults to the\n            temporary path where we store the shell command in a script file.\n        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.\n            Unused by default.\n\n    Returns:\n        Tuple[str, int]: A tuple where the first element is the combined stdout/stderr output of running the shell\n        command and the second element is the return code.\n    """\n    check.str_param(shell_command, "shell_command")\n    # other args checked in execute_file\n\n    with safe_tempfile_path() as tmp_file_path:\n        tmp_path = os.path.dirname(tmp_file_path)\n        log.info("Using temporary directory: %s" % tmp_path)\n\n        with open(tmp_file_path, "wb") as tmp_file:\n            tmp_file.write(shell_command.encode("utf-8"))\n            tmp_file.flush()\n            script_location = os.path.abspath(tmp_file.name)\n            log.info(f"Temporary script location: {script_location}")\n            return execute_script_file(\n                shell_script_path=tmp_file.name,\n                output_logging=output_logging,\n                log=log,\n                cwd=(cwd or tmp_path),\n                env=env,\n            )\n
", "current_page_name": "_modules/dagster_shell/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_shell.utils"}}, "dagster_slack": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import failure_hook, success_hook\nfrom dagster._core.execution.context.hook import HookContext\nfrom dagster._utils.warnings import normalize_renamed_param\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return f"Op {context.op.name} on job {context.job_name} {status}!\\nRun ID: {context.run_id}"\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef slack_on_failure(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_failure("#foo", webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} failed!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @failure_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"\\n<{webserver_base_url}/runs/{context.run_id}|View in Dagster UI>"\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef slack_on_success(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_success("#foo", webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} worked!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @success_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"\\n<{webserver_base_url}/runs/{context.run_id}|View in Dagster UI>"\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_slack/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\nfrom slack_sdk.web.client import WebClient\n\n\n
[docs]class SlackResource(ConfigurableResource):\n """This resource is for connecting to Slack.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import EnvVar, job, op\n from dagster_slack import SlackResource\n\n\n @op\n def slack_op(slack: SlackResource):\n slack.get_client().chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job\n def slack_job():\n slack_op()\n\n defs = Definitions(\n jobs=[slack_job],\n resources={\n "slack": SlackResource(token=EnvVar("MY_SLACK_TOKEN")),\n },\n )\n """\n\n token: str = Field(\n description=(\n "To configure access to the Slack API, you'll need an access"\n " token provisioned with access to your Slack workspace."\n " Tokens are typically either user tokens or bot tokens. For programmatic posting"\n " to Slack from this resource, you probably want to provision and use a bot token."\n " More in the Slack API documentation here: https://api.slack.com/docs/token-types"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> WebClient:\n """Returns a ``slack_sdk.WebClient`` for interacting with the Slack API."""\n return WebClient(self.token)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=SlackResource.to_config_schema(),\n)\ndef slack_resource(context) -> WebClient:\n """This resource is for connecting to Slack.\n\n The resource object is a `slack_sdk.WebClient`.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import job, op\n from dagster_slack import slack_resource\n\n\n @op(required_resource_keys={'slack'})\n def slack_op(context):\n context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job(resource_defs={'slack': slack_resource})\n def slack_job():\n slack_op()\n\n slack_job.execute_in_process(\n run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n )\n """\n return SlackResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_slack/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.resources"}, "sensors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.sensors

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n)\n\nfrom dagster import (\n    AssetSelection,\n    DefaultSensorStatus,\n    FreshnessPolicySensorContext,\n    freshness_policy_sensor,\n)\nfrom dagster._annotations import deprecated_param, experimental\nfrom dagster._core.definitions import GraphDefinition, JobDefinition\nfrom dagster._core.definitions.run_status_sensor_definition import (\n    RunFailureSensorContext,\n    run_failure_sensor,\n)\nfrom dagster._core.definitions.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom dagster._utils.warnings import normalize_renamed_param\nfrom slack_sdk.web.client import WebClient\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.selector import (\n        CodeLocationSelector,\n        JobSelector,\n        RepositorySelector,\n    )\n\nT = TypeVar("T", RunFailureSensorContext, FreshnessPolicySensorContext)\n\n\ndef _build_slack_blocks_and_text(\n    context: T,\n    text_fn: Callable[[T], str],\n    blocks_fn: Optional[Callable[[T], List[Dict[Any, Any]]]],\n    webserver_base_url: Optional[str],\n) -> Tuple[List[Dict[str, Any]], str]:\n    main_body_text = text_fn(context)\n    blocks: List[Dict[Any, Any]] = []\n    if blocks_fn:\n        blocks.extend(blocks_fn(context))\n    else:\n        if isinstance(context, RunFailureSensorContext):\n            text = (\n                f'*Job "{context.dagster_run.job_name}" failed.'\n                f' `{context.dagster_run.run_id.split("-")[0]}`*'\n            )\n        else:\n            text = (\n                f'*Asset "{context.asset_key.to_user_string()}" is now'\n                f' {"on time" if context.minutes_overdue == 0 else f"{context.minutes_overdue:.2f} minutes late.*"}'\n            )\n\n        blocks.extend(\n            [\n                {\n                    "type": "section",\n                    "text": {\n     
                   "type": "mrkdwn",\n                        "text": text,\n                    },\n                },\n                {\n                    "type": "section",\n                    "text": {"type": "mrkdwn", "text": main_body_text},\n                },\n            ]\n        )\n\n    if webserver_base_url:\n        if isinstance(context, RunFailureSensorContext):\n            url = f"{webserver_base_url}/runs/{context.dagster_run.run_id}"\n        else:\n            url = f"{webserver_base_url}/assets/{'/'.join(context.asset_key.path)}"\n        blocks.append(\n            {\n                "type": "actions",\n                "elements": [\n                    {\n                        "type": "button",\n                        "text": {"type": "plain_text", "text": "View in Dagster UI"},\n                        "url": url,\n                    }\n                ],\n            }\n        )\n    return blocks, main_body_text\n\n\ndef _default_failure_message_text_fn(context: RunFailureSensorContext) -> str:\n    return f"Error: ```{context.failure_event.message}```"\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\n@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef make_slack_on_run_failure_sensor(\n channel: str,\n slack_token: str,\n text_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message_text_fn,\n blocks_fn: Optional[Callable[[RunFailureSensorContext], List[Dict[Any, Any]]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor on job failures that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. 
If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with markdown.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]): Function which takes in\n the ``RunFailureSensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed job run.\n minimum_interval_seconds: (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]): The jobs in the\n current repository that will be monitored by this failure sensor. Defaults to None, which\n means the alert will be sent when any job in the repository fails. To monitor jobs in external repositories, use RepositorySelector and JobSelector\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]): (deprecated in favor of monitored_jobs)\n The jobs in the current repository that will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. 
Specify this to allow\n messages to include deeplinks to the failed job run.\n\n Examples:\n .. code-block:: python\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN")\n )\n\n @repository\n def my_repo():\n return [my_job + slack_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.dagster_run.job_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n slack_client = WebClient(token=slack_token)\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_failure_sensor(\n name=name,\n minimum_interval_seconds=minimum_interval_seconds,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n )\n def slack_on_run_failure(context: RunFailureSensorContext):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context,\n text_fn=text_fn,\n blocks_fn=blocks_fn,\n webserver_base_url=webserver_base_url,\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_run_failure
\n\n\ndef _default_freshness_message_text_fn(context: FreshnessPolicySensorContext) -> str:\n return (\n f"Asset `{context.asset_key.to_user_string()}` is now {context.minutes_overdue:.2f} minutes"\n " late."\n )\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\n@experimental\ndef make_slack_on_freshness_policy_status_change_sensor(\n channel: str,\n slack_token: str,\n asset_selection: AssetSelection,\n warn_after_minutes_overdue: float = 0,\n notify_when_back_on_time: bool = False,\n text_fn: Callable[[FreshnessPolicySensorContext], str] = _default_freshness_message_text_fn,\n blocks_fn: Optional[Callable[[FreshnessPolicySensorContext], List[Dict[Any, Any]]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor that will message the given Slack channel whenever an asset in the provided\n AssetSelection becomes out of date. Messages are only fired when the state changes, meaning\n only a single slack message will be sent (when the asset begins to be out of date). If\n `notify_when_back_on_time` is set to `True`, a second slack message will be sent once the asset\n is on time again.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. 
More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n asset_selection (AssetSelection): The selection of assets which this sensor will monitor.\n Alerts will only be fired for assets that have a FreshnessPolicy defined.\n warn_after_minutes_overdue (float): How many minutes past the specified FreshnessPolicy this\n sensor will wait before firing an alert (by default, an alert will be fired as soon as\n the policy is violated).\n notify_when_back_on_time (bool): If a success message should be sent when the asset becomes on\n time again.\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``FreshnessPolicySensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains the relevant asset key, and the number of\n minutes past its defined freshness policy it currently is.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with markdown.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[FreshnessPolicySensorContext], List[Dict]]): Function which takes in\n the ``FreshnessPolicySensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_freshness_policy".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the relevant asset page.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n webserver_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the relevant asset page.\n\n Examples:\n .. code-block:: python\n\n slack_on_freshness_policy = make_slack_on_freshness_policy_status_change_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN"),\n )\n\n .. code-block:: python\n\n def my_message_fn(context: FreshnessPolicySensorContext) -> str:\n if context.minutes_overdue == 0:\n return f"Asset {context.asset_key} is currently on time :)"\n return (\n f"Asset {context.asset_key} is currently {context.minutes_overdue} minutes late!!"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n slack_client = WebClient(token=slack_token)\n\n @freshness_policy_sensor(\n name=name, asset_selection=asset_selection, default_status=default_status\n )\n def slack_on_freshness_policy(context: FreshnessPolicySensorContext):\n if context.minutes_overdue is None or context.previous_minutes_overdue is None:\n return\n\n if (\n context.minutes_overdue > warn_after_minutes_overdue\n and context.previous_minutes_overdue <= warn_after_minutes_overdue\n ) or (\n notify_when_back_on_time\n and context.minutes_overdue == 0\n and context.previous_minutes_overdue != 0\n ):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context,\n text_fn=text_fn,\n blocks_fn=blocks_fn,\n webserver_base_url=webserver_base_url,\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_freshness_policy
\n
", "current_page_name": "_modules/dagster_slack/sensors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.sensors"}}, "dagster_snowflake": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.ops

\nfrom dagster import (\n    Nothing,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.input import In\n\n\ndef _core_create_snowflake_command(dagster_decorator, decorator_name, sql, parameters=None):\n    check.str_param(sql, "sql")\n    check.opt_dict_param(parameters, "parameters")\n\n    @dagster_decorator(\n        name=f"snowflake_{decorator_name}",\n        ins={"start": In(Nothing)},\n        required_resource_keys={"snowflake"},\n        tags={"kind": "sql", "sql": sql},\n    )\n    def snowflake_fn(context):\n        context.resources.snowflake.execute_query(sql=sql, parameters=parameters)\n\n    return snowflake_fn\n\n\ndef snowflake_solid_for_query(sql, parameters=None):\n    """This function is a solid factory that constructs solids to execute a snowflake query.\n\n    Note that you can only use `snowflake_solid_for_query` if you know the query you'd like to\n    execute at job construction time. If you'd like to execute queries dynamically during\n    job execution, you should manually execute those queries in your custom solid using the\n    snowflake resource.\n\n    Args:\n        sql (str): The sql query that will execute against the provided snowflake resource.\n        parameters (dict): The parameters for the sql query.\n\n    Returns:\n        SolidDefinition: Returns the constructed solid definition.\n    """\n    return _core_create_snowflake_command(op, "solid", sql, parameters)\n\n\n
[docs]def snowflake_op_for_query(sql, parameters=None):\n """This function is an op factory that constructs an op to execute a snowflake query.\n\n Note that you can only use `snowflake_op_for_query` if you know the query you'd like to\n execute at graph construction time. If you'd like to execute queries dynamically during\n job execution, you should manually execute those queries in your custom op using the\n snowflake resource.\n\n Args:\n sql (str): The sql query that will execute against the provided snowflake resource.\n parameters (dict): The parameters for the sql query.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return _core_create_snowflake_command(op, "op", sql, parameters)
\n
", "current_page_name": "_modules/dagster_snowflake/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.resources

\nimport base64\nimport sys\nimport warnings\nfrom contextlib import closing, contextmanager\nfrom typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom cryptography.hazmat.backends import default_backend\nfrom cryptography.hazmat.primitives import serialization\nfrom dagster import (\n    ConfigurableResource,\n    IAttachDifferentObjectToOpContext,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._annotations import public\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.storage.event_log.sql_event_log import SqlDbConnection\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field, root_validator, validator\n\ntry:\n    import snowflake.connector\nexcept ImportError:\n    msg = (\n        "Could not import snowflake.connector. This could mean you have an incompatible version "\n        "of azure-storage-blob installed. dagster-snowflake requires azure-storage-blob<12.0.0; "\n        "this conflicts with dagster-azure which requires azure-storage-blob~=12.0.0 and is "\n        "incompatible with dagster-snowflake. Please uninstall dagster-azure and reinstall "\n        "dagster-snowflake to fix this error."\n    )\n    warnings.warn(msg)\n    raise\n\n\n
[docs]class SnowflakeResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """A resource for connecting to the Snowflake data warehouse.\n\n If connector configuration is not set, SnowflakeResource.get_connection() will return a\n `snowflake.connector.Connection <https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#object-connection>`__\n object. If connector="sqlalchemy" configuration is set, then SnowflakeResource.get_connection() will\n return a `SQLAlchemy Connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Connection>`__\n or a `SQLAlchemy raw connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine.raw_connection>`__.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import SnowflakeResource\n\n @op\n def get_one(snowflake_resource: SnowflakeResource):\n with snowflake_resource.get_connection() as conn:\n # conn is a snowflake.connector.Connection object\n conn.cursor().execute("SELECT 1")\n\n @job\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n resources={\n 'snowflake_resource': SnowflakeResource(\n account=EnvVar("SNOWFLAKE_ACCOUNT"),\n user=EnvVar("SNOWFLAKE_USER"),\n password=EnvVar("SNOWFLAKE_PASSWORD")\n database="MY_DATABASE",\n schema="MY_SCHEMA",\n warehouse="MY_WAREHOUSE"\n )\n }\n )\n """\n\n account: Optional[str] = Field(\n default=None,\n description=(\n "Your Snowflake account name. 
For more details, see the `Snowflake documentation."\n " <https://docs.snowflake.com/developer-guide/python-connector/python-connector-api>`__"\n ),\n )\n\n user: str = Field(description="User login name.")\n\n password: Optional[str] = Field(default=None, description="User password.")\n\n database: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default database to use. After login, you can use ``USE DATABASE`` "\n " to change the database."\n ),\n )\n\n schema_: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default schema to use. After login, you can use ``USE SCHEMA`` to "\n "change the schema."\n ),\n alias="schema",\n ) # schema is a reserved word for pydantic\n\n role: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default role to use. After login, you can use ``USE ROLE`` to change "\n " the role."\n ),\n )\n\n warehouse: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default warehouse to use. After login, you can use ``USE WAREHOUSE`` "\n "to change the role."\n ),\n )\n\n private_key: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Alternately, set private_key_path and private_key_password. To avoid issues with"\n " newlines in the keys, you can base64 encode the key. You can retrieve the base64"\n " encoded key with this shell command: ``cat rsa_key.p8 | base64``"\n ),\n )\n\n private_key_password: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key password to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Required for both ``private_key`` and ``private_key_path`` if the private key is"\n " encrypted. 
For unencrypted keys, this config can be omitted or set to None."\n ),\n )\n\n private_key_path: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key path to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Alternately, set the raw private key as ``private_key``."\n ),\n )\n\n autocommit: Optional[bool] = Field(\n default=None,\n description=(\n "None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True "\n "or False to enable or disable autocommit mode in the session, respectively."\n ),\n )\n\n client_prefetch_threads: Optional[int] = Field(\n default=None,\n description=(\n "Number of threads used to download the results sets (4 by default). "\n "Increasing the value improves fetch performance but requires more memory."\n ),\n )\n\n client_session_keep_alive: Optional[bool] = Field(\n default=None,\n description=(\n "False by default. Set this to True to keep the session active indefinitely, "\n "even if there is no activity from the user. Make certain to call the close method to "\n "terminate the thread properly or the process may hang."\n ),\n )\n\n login_timeout: Optional[int] = Field(\n default=None,\n description=(\n "Timeout in seconds for login. By default, 60 seconds. The login request gives "\n 'up after the timeout length if the HTTP response is "success".'\n ),\n )\n\n network_timeout: Optional[int] = Field(\n default=None,\n description=(\n "Timeout in seconds for all other operations. By default, none/infinite. A general"\n " request gives up after the timeout length if the HTTP response is not 'success'."\n ),\n )\n\n ocsp_response_cache_filename: Optional[str] = Field(\n default=None,\n description=(\n "URI for the OCSP response cache file. 
By default, the OCSP response cache "\n "file is created in the cache directory."\n ),\n )\n\n validate_default_parameters: Optional[bool] = Field(\n default=None,\n description=(\n "If True, raise an exception if the warehouse, database, or schema doesn't exist."\n " Defaults to False."\n ),\n )\n\n paramstyle: Optional[str] = Field(\n default=None,\n description=(\n "pyformat by default for client side binding. Specify qmark or numeric to "\n "change bind variable formats for server side binding."\n ),\n )\n\n timezone: Optional[str] = Field(\n default=None,\n description=(\n "None by default, which honors the Snowflake parameter TIMEZONE. Set to a "\n "valid time zone (e.g. America/Los_Angeles) to set the session time zone."\n ),\n )\n\n connector: Optional[str] = Field(\n default=None,\n description=(\n "Indicate alternative database connection engine. Permissible option is "\n "'sqlalchemy' otherwise defaults to use the Snowflake Connector for Python."\n ),\n is_required=False,\n )\n\n cache_column_metadata: Optional[str] = Field(\n default=None,\n description=(\n "Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a"\n " flag ``cache_column_metadata=True`` such that all of column metadata for all tables"\n ' are "cached"'\n ),\n )\n\n numpy: Optional[bool] = Field(\n default=None,\n description=(\n "Optional parameter when connector is set to sqlalchemy. 
To enable fetching "\n "NumPy data types, add numpy=True to the connection parameters."\n ),\n )\n\n authenticator: Optional[str] = Field(\n default=None,\n description="Optional parameter to specify the authentication mechanism to use.",\n )\n\n @validator("paramstyle")\n def validate_paramstyle(cls, v: Optional[str]) -> Optional[str]:\n valid_config = ["pyformat", "qmark", "numeric"]\n if v is not None and v not in valid_config:\n raise ValueError(\n "Snowflake Resource: 'paramstyle' configuration value must be one of:"\n f" {','.join(valid_config)}."\n )\n return v\n\n @validator("connector")\n def validate_connector(cls, v: Optional[str]) -> Optional[str]:\n if v is not None and v != "sqlalchemy":\n raise ValueError(\n "Snowflake Resource: 'connector' configuration value must be None or sqlalchemy."\n )\n return v\n\n @root_validator\n def validate_authentication(cls, values):\n auths_set = 0\n auths_set += 1 if values.get("password") is not None else 0\n auths_set += 1 if values.get("private_key") is not None else 0\n auths_set += 1 if values.get("private_key_path") is not None else 0\n\n # if authenticator is set, there can be 0 or 1 additional auth method;\n # otherwise, ensure at least 1 method is provided\n check.invariant(\n auths_set > 0 or values.get("authenticator") is not None,\n "Missing config: Password, private key, or authenticator authentication required"\n " for Snowflake resource.",\n )\n\n # ensure that only 1 non-authenticator method is provided\n check.invariant(\n auths_set <= 1,\n "Incorrect config: Cannot provide both password and private key authentication to"\n " Snowflake Resource.",\n )\n\n return values\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _connection_args(self) -> Mapping[str, Any]:\n conn_args = {\n k: self._resolved_config_dict.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "autocommit",\n 
"client_prefetch_threads",\n "client_session_keep_alive",\n "login_timeout",\n "network_timeout",\n "ocsp_response_cache_filename",\n "validate_default_parameters",\n "paramstyle",\n "timezone",\n "authenticator",\n )\n if self._resolved_config_dict.get(k) is not None\n }\n if (\n self._resolved_config_dict.get("private_key", None) is not None\n or self._resolved_config_dict.get("private_key_path", None) is not None\n ):\n conn_args["private_key"] = self._snowflake_private_key(self._resolved_config_dict)\n\n return conn_args\n\n @property\n @cached_method\n def _sqlalchemy_connection_args(self) -> Mapping[str, Any]:\n conn_args: Dict[str, Any] = {\n k: self._resolved_config_dict.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "cache_column_metadata",\n "numpy",\n )\n if self._resolved_config_dict.get(k) is not None\n }\n\n return conn_args\n\n @property\n @cached_method\n def _sqlalchemy_engine_args(self) -> Mapping[str, Any]:\n config = self._resolved_config_dict\n sqlalchemy_engine_args = {}\n if (\n config.get("private_key", None) is not None\n or config.get("private_key_path", None) is not None\n ):\n # sqlalchemy passes private key args separately, so store them in a new dict\n sqlalchemy_engine_args["private_key"] = self._snowflake_private_key(config)\n if config.get("authenticator", None) is not None:\n sqlalchemy_engine_args["authenticator"] = config["authenticator"]\n\n return sqlalchemy_engine_args\n\n def _snowflake_private_key(self, config) -> bytes:\n # If the user has defined a path to a private key, we will use that.\n if config.get("private_key_path", None) is not None:\n # read the file from the path.\n with open(config.get("private_key_path"), "rb") as key:\n private_key = key.read()\n else:\n private_key = config.get("private_key", None)\n\n kwargs = {}\n if config.get("private_key_password", None) is not None:\n kwargs["password"] = config["private_key_password"].encode()\n else:\n 
kwargs["password"] = None\n\n try:\n p_key = serialization.load_pem_private_key(\n private_key, backend=default_backend(), **kwargs\n )\n except TypeError:\n try:\n private_key = base64.b64decode(private_key)\n p_key = serialization.load_pem_private_key(\n private_key, backend=default_backend(), **kwargs\n )\n except ValueError:\n raise ValueError(\n "Unable to load private key. You may need to base64 encode your private key."\n " You can retrieve the base64 encoded key with this shell command: cat"\n " rsa_key.p8 | base64"\n )\n\n pkb = p_key.private_bytes(\n encoding=serialization.Encoding.DER,\n format=serialization.PrivateFormat.PKCS8,\n encryption_algorithm=serialization.NoEncryption(),\n )\n\n return pkb\n\n @public\n @contextmanager\n def get_connection(\n self, raw_conn: bool = True\n ) -> Iterator[Union[SqlDbConnection, snowflake.connector.SnowflakeConnection]]:\n """Gets a connection to Snowflake as a context manager.\n\n If connector configuration is not set, SnowflakeResource.get_connection() will return a\n `snowflake.connector.Connection <https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#object-connection>`__\n If connector="sqlalchemy" configuration is set, then SnowflakeResource.get_connection() will\n return a `SQLAlchemy Connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Connection>`__\n or a `SQLAlchemy raw connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine.raw_connection>`__\n if raw_conn=True.\n\n\n Args:\n raw_conn (bool): If using the sqlalchemy connector, you can set raw_conn to True to create a raw\n connection. Defaults to True.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def get_query_status(snowflake: SnowflakeResource, query_id):\n with snowflake.get_connection() as conn:\n # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n return conn.get_query_status(query_id)\n\n """\n if self.connector == "sqlalchemy":\n from snowflake.sqlalchemy import URL\n from sqlalchemy import create_engine\n\n engine = create_engine(\n URL(**self._sqlalchemy_connection_args), connect_args=self._sqlalchemy_engine_args\n )\n conn = engine.raw_connection() if raw_conn else engine.connect()\n\n yield conn\n conn.close()\n engine.dispose()\n else:\n conn = snowflake.connector.connect(**self._connection_args)\n\n yield conn\n if not self.autocommit:\n conn.commit()\n conn.close()\n\n def get_object_to_set_on_execution_context(self) -> Any:\n # Directly create a SnowflakeConnection here for backcompat since the SnowflakeConnection\n # has methods this resource does not have\n return SnowflakeConnection(\n config=self._resolved_config_dict,\n log=get_dagster_logger(),\n snowflake_connection_resource=self,\n )
\n\n\n
[docs]class SnowflakeConnection:\n """A connection to Snowflake that can execute queries. In general this class should not be\n directly instantiated, but rather used as a resource in an op or asset via the\n :py:func:`snowflake_resource`.\n\n Note that the SnowflakeConnection is only used by the snowflake_resource. The Pythonic SnowflakeResource does\n not use this SnowflakeConnection class.\n """\n\n def __init__(\n self, config: Mapping[str, str], log, snowflake_connection_resource: SnowflakeResource\n ):\n self.snowflake_connection_resource = snowflake_connection_resource\n self.log = log\n\n
[docs] @public\n @contextmanager\n def get_connection(\n self, raw_conn: bool = True\n ) -> Iterator[Union[SqlDbConnection, snowflake.connector.SnowflakeConnection]]:\n """Gets a connection to Snowflake as a context manager.\n\n If using the execute_query, execute_queries, or load_table_from_local_parquet methods,\n you do not need to create a connection using this context manager.\n\n Args:\n raw_conn (bool): If using the sqlalchemy connector, you can set raw_conn to True to create a raw\n connection. Defaults to True.\n\n Examples:\n .. code-block:: python\n\n @op(\n required_resource_keys={"snowflake"}\n )\n def get_query_status(query_id):\n with context.resources.snowflake.get_connection() as conn:\n # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n return conn.get_query_status(query_id)\n\n """\n with self.snowflake_connection_resource.get_connection(raw_conn=raw_conn) as conn:\n yield conn
\n\n
[docs] @public\n def execute_query(\n self,\n sql: str,\n parameters: Optional[Union[Sequence[Any], Mapping[Any, Any]]] = None,\n fetch_results: bool = False,\n use_pandas_result: bool = False,\n ):\n """Execute a query in Snowflake.\n\n Args:\n sql (str): the query to be executed\n parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]): Parameters to be passed to the query. See the\n `Snowflake documentation <https://docs.snowflake.com/en/user-guide/python-connector-example.html#binding-data>`__\n for more information.\n fetch_results (bool): If True, will return the result of the query. Defaults to False. If True\n and use_pandas_result is also True, results will be returned as a Pandas DataFrame.\n use_pandas_result (bool): If True, will return the result of the query as a Pandas DataFrame.\n Defaults to False. If fetch_results is False and use_pandas_result is True, an error will be\n raised.\n\n Returns:\n The result of the query if fetch_results or use_pandas_result is True, otherwise returns None\n\n Examples:\n .. code-block:: python\n\n @op\n def drop_database(snowflake: SnowflakeResource):\n snowflake.execute_query(\n "DROP DATABASE IF EXISTS MY_DATABASE"\n )\n """\n check.str_param(sql, "sql")\n check.opt_inst_param(parameters, "parameters", (list, dict))\n check.bool_param(fetch_results, "fetch_results")\n if not fetch_results and use_pandas_result:\n check.failed("If use_pandas_result is True, fetch_results must also be True.")\n\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n\n self.log.info("Executing query: " + sql)\n parameters = dict(parameters) if isinstance(parameters, Mapping) else parameters\n cursor.execute(sql, parameters)\n if use_pandas_result:\n return cursor.fetch_pandas_all()\n if fetch_results:\n return cursor.fetchall()
\n\n
[docs] @public\n def execute_queries(\n self,\n sql_queries: Sequence[str],\n parameters: Optional[Union[Sequence[Any], Mapping[Any, Any]]] = None,\n fetch_results: bool = False,\n use_pandas_result: bool = False,\n ) -> Optional[Sequence[Any]]:\n """Execute multiple queries in Snowflake.\n\n Args:\n sql_queries (str): List of queries to be executed in series\n parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]): Parameters to be passed to every query. See the\n `Snowflake documentation <https://docs.snowflake.com/en/user-guide/python-connector-example.html#binding-data>`__\n for more information.\n fetch_results (bool): If True, will return the results of the queries as a list. Defaults to False. If True\n and use_pandas_result is also True, results will be returned as Pandas DataFrames.\n use_pandas_result (bool): If True, will return the results of the queries as a list of a Pandas DataFrames.\n Defaults to False. If fetch_results is False and use_pandas_result is True, an error will be\n raised.\n\n Returns:\n The results of the queries as a list if fetch_results or use_pandas_result is True,\n otherwise returns None\n\n Examples:\n .. 
code-block:: python\n\n @op\n def create_fresh_database(snowflake: SnowflakeResource):\n queries = ["DROP DATABASE IF EXISTS MY_DATABASE", "CREATE DATABASE MY_DATABASE"]\n snowflake.execute_queries(\n sql_queries=queries\n )\n\n """\n check.sequence_param(sql_queries, "sql_queries", of_type=str)\n check.opt_inst_param(parameters, "parameters", (list, dict))\n check.bool_param(fetch_results, "fetch_results")\n if not fetch_results and use_pandas_result:\n check.failed("If use_pandas_result is True, fetch_results must also be True.")\n\n results: List[Any] = []\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n for raw_sql in sql_queries:\n sql = raw_sql.encode("utf-8") if sys.version_info[0] < 3 else raw_sql\n self.log.info("Executing query: " + sql)\n parameters = dict(parameters) if isinstance(parameters, Mapping) else parameters\n cursor.execute(sql, parameters)\n if use_pandas_result:\n results = results.append(cursor.fetch_pandas_all()) # type: ignore\n elif fetch_results:\n results.append(cursor.fetchall())\n\n return results if len(results) > 0 else None
\n\n
[docs] @public\n def load_table_from_local_parquet(self, src: str, table: str):\n """Stores the content of a parquet file to a Snowflake table.\n\n Args:\n src (str): the name of the file to store in Snowflake\n table (str): the name of the table to store the data. If the table does not exist, it will\n be created. Otherwise the contents of the table will be replaced with the data in src\n\n Examples:\n .. code-block:: python\n\n import pandas as pd\n import pyarrow as pa\n import pyarrow.parquet as pq\n\n @op\n def write_parquet_file(snowflake: SnowflakeResource):\n df = pd.DataFrame({"one": [1, 2, 3], "ten": [11, 12, 13]})\n table = pa.Table.from_pandas(df)\n pq.write_table(table, "example.parquet')\n snowflake.load_table_from_local_parquet(\n src="example.parquet",\n table="MY_TABLE"\n )\n\n """\n check.str_param(src, "src")\n check.str_param(table, "table")\n\n sql_queries = [\n f"CREATE OR REPLACE TABLE {table} ( data VARIANT DEFAULT NULL);",\n "CREATE OR REPLACE FILE FORMAT parquet_format TYPE = 'parquet';",\n f"PUT {src} @%{table};",\n f"COPY INTO {table} FROM @%{table} FILE_FORMAT = (FORMAT_NAME = 'parquet_format');",\n ]\n\n self.execute_queries(sql_queries)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=SnowflakeResource.to_config_schema(),\n description="This resource is for connecting to the Snowflake data warehouse",\n)\ndef snowflake_resource(context) -> SnowflakeConnection:\n """A resource for connecting to the Snowflake data warehouse. The returned resource object is an\n instance of :py:class:`SnowflakeConnection`.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import snowflake_resource\n\n @op(required_resource_keys={'snowflake'})\n def get_one(context):\n context.resources.snowflake.execute_query('SELECT 1')\n\n @job(resource_defs={'snowflake': snowflake_resource})\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n run_config={\n 'resources': {\n 'snowflake': {\n 'config': {\n 'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n 'user': {'env': 'SNOWFLAKE_USER'},\n 'password': {'env': 'SNOWFLAKE_PASSWORD'},\n 'database': {'env': 'SNOWFLAKE_DATABASE'},\n 'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n 'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n }\n }\n }\n }\n )\n """\n snowflake_resource = SnowflakeResource.from_resource_context(context)\n return SnowflakeConnection(\n config=context, log=context.log, snowflake_connection_resource=snowflake_resource\n )
\n
", "current_page_name": "_modules/dagster_snowflake/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.resources"}, "snowflake_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.snowflake_io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Optional, Sequence, Type, cast\n\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom pydantic import Field\nfrom sqlalchemy.exc import ProgrammingError\n\nfrom .resources import SnowflakeResource\n\nSNOWFLAKE_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_snowflake_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of Snowflake tables and an in-memory type - e.g. a Pandas DataFrame. If only\n one DbTypeHandler is provided, it will be used as teh default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import build_snowflake_io_manager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n snowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()])\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_io_manager.configured({\n "database": "my_database",\n "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n ...\n })\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the schema. For example,\n if the asset ``my_table`` had the key prefix ``["snowflake", "my_schema"]``, the schema ``my_schema`` will be\n used. For ops, the schema can be specified by including a ``schema`` entry in output metadata. If ``schema`` is not provided\n via config or on the asset/op, ``public`` will be used for the schema.\n\n .. 
code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=SnowflakeIOManager.to_config_schema())\n def snowflake_io_manager(init_context):\n return DbIOManager(\n type_handlers=type_handlers,\n db_client=SnowflakeDbClient(),\n io_manager_name="SnowflakeIOManager",\n database=init_context.resource_config["database"],\n schema=init_context.resource_config.get("schema"),\n default_load_type=default_load_type,\n )\n\n return snowflake_io_manager
\n\n\n
[docs]class SnowflakeIOManager(ConfigurableIOManagerFactory):\n """Base class for an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the schema. For example,\n if the asset ``my_table`` had the key prefix ``["snowflake", "my_schema"]``, the schema ``my_schema`` will be\n used. For ops, the schema can be specified by including a ``schema`` entry in output metadata. If ``schema`` is not provided\n via config or on the asset/op, ``public`` will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n database: str = Field(description="Name of the database to use.")\n account: str = Field(\n description=(\n "Your Snowflake account name. For more details, see the `Snowflake documentation."\n " <https://docs.snowflake.com/developer-guide/python-connector/python-connector-api>`__"\n ),\n )\n user: str = Field(description="User login name.")\n schema_: Optional[str] = Field(\n default=None, alias="schema", description="Name of the schema to use."\n ) # schema is a reserved word for pydantic\n password: Optional[str] = Field(default=None, description="User password.")\n warehouse: Optional[str] = Field(default=None, description="Name of the warehouse to use.")\n role: Optional[str] = Field(default=None, description="Name of the role to use.")\n private_key: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details. To"\n " avoid issues with newlines in the keys, you can base64 encode the key. You can"\n " retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64"\n ),\n )\n private_key_path: Optional[str] = Field(\n default=None,\n description=(\n "Path to the private key. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n ),\n )\n private_key_password: Optional[str] = Field(\n default=None,\n description=(\n "The password of the private key. 
See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Required for both private_key and private_key_path if the private key is encrypted."\n " For unencrypted keys, this config can be omitted or set to None."\n ),\n )\n store_timestamps_as_strings: bool = Field(\n default=False,\n description=(\n "If using Pandas DataFrames, whether to convert time data to strings. If True, time"\n " data will be converted to strings when storing the DataFrame and converted back to"\n " time data when loading the DataFrame. If False, time data without a timezone will be"\n " set to UTC timezone to avoid a Snowflake bug. Defaults to False."\n ),\n )\n authenticator: Optional[str] = Field(\n default=None,\n description="Optional parameter to specify the authentication mechanism to use.",\n )\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n """type_handlers should return a list of the TypeHandlers that the I/O manager can use.\n\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n """\n ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n """If an asset or op is not annotated with an return type, default_load_type will be used to\n determine which TypeHandler to use to store and load the output.\n\n If left unimplemented, default_load_type will return None. In that case, if there is only\n one TypeHandler, the I/O manager will default to loading unannotated outputs with that\n TypeHandler.\n\n .. 
code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n import pandas as pd\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame\n """\n return None\n\n def create_io_manager(self, context) -> DbIOManager:\n return DbIOManager(\n db_client=SnowflakeDbClient(),\n io_manager_name="SnowflakeIOManager",\n database=self.database,\n schema=self.schema_,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n )
\n\n\nclass SnowflakeDbClient(DbClient):\n @staticmethod\n @contextmanager\n def connect(context, table_slice):\n no_schema_config = (\n {k: v for k, v in context.resource_config.items() if k != "schema"}\n if context.resource_config\n else {}\n )\n with SnowflakeResource(\n schema=table_slice.schema, connector="sqlalchemy", **no_schema_config\n ).get_connection(raw_conn=False) as conn:\n yield conn\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n schemas = connection.execute(\n f"show schemas like '{table_slice.schema}' in database {table_slice.database}"\n ).fetchall()\n if len(schemas) == 0:\n connection.execute(f"create schema {table_slice.schema};")\n\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.execute(_get_cleanup_statement(table_slice))\n except ProgrammingError:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"SELECT {col_str} FROM"\n f" {table_slice.database}.{table_slice.schema}.{table_slice.table} WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"""\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table} WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n 
return f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n # Snowflake BETWEEN is inclusive; start <= partition expr <= end. We don't want to remove the next partition so we instead\n # write this as start <= partition expr < end.\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_snowflake/snowflake_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.snowflake_io_manager"}}, "dagster_snowflake_pandas": {"snowflake_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake_pandas.snowflake_pandas_type_handler

\nfrom typing import Mapping, Optional, Sequence, Type\n\nimport pandas as pd\nimport pandas.core.dtypes.common as pd_core_dtypes_common\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_snowflake import build_snowflake_io_manager\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient, SnowflakeIOManager\nfrom snowflake.connector.pandas_tools import pd_writer\n\n\ndef _table_exists(table_slice: TableSlice, connection):\n    tables = connection.execute(\n        f"SHOW TABLES LIKE '{table_slice.table}' IN SCHEMA"\n        f" {table_slice.database}.{table_slice.schema}"\n    ).fetchall()\n    return len(tables) > 0\n\n\ndef _get_table_column_types(table_slice: TableSlice, connection) -> Optional[Mapping[str, str]]:\n    if _table_exists(table_slice, connection):\n        schema_list = connection.execute(f"DESCRIBE TABLE {table_slice.table}").fetchall()\n        return {item[0]: item[1] for item in schema_list}\n\n\ndef _convert_timestamp_to_string(\n    s: pd.Series, column_types: Optional[Mapping[str, str]], table_name: str\n) -> pd.Series:\n    """Converts columns of data of type pd.Timestamp to string so that it can be stored in\n    snowflake.\n    """\n    column_name = str(s.name)\n    if pd_core_dtypes_common.is_datetime_or_timedelta_dtype(s):  # type: ignore  # (bad stubs)\n        if column_types:\n            if "VARCHAR" not in column_types[column_name]:\n                raise DagsterInvariantViolationError(\n                    "Snowflake I/O manager: Snowflake I/O manager configured to convert time data"\n                    f" in DataFrame column {column_name} to strings, but the corresponding"\n                    f" {column_name.upper()} column in table {table_name} is not of type 
VARCHAR,"\n                    f" it is of type {column_types[column_name]}. Please set"\n                    " store_timestamps_as_strings=False in the Snowflake I/O manager configuration"\n                    " to store time data as TIMESTAMP types."\n                )\n        return s.dt.strftime("%Y-%m-%d %H:%M:%S.%f %z")\n    else:\n        return s\n\n\ndef _convert_string_to_timestamp(s: pd.Series) -> pd.Series:\n    """Converts columns of strings in Timestamp format to pd.Timestamp to undo the conversion in\n    _convert_timestamp_to_string.\n\n    This will not convert non-timestamp strings into timestamps (pd.to_datetime will raise an\n    exception if the string cannot be converted)\n    """\n    if isinstance(s[0], str):\n        try:\n            return pd.to_datetime(s.values)  # type: ignore  # (bad stubs)\n        except ValueError:\n            return s\n    else:\n        return s\n\n\ndef _add_missing_timezone(\n    s: pd.Series, column_types: Optional[Mapping[str, str]], table_name: str\n) -> pd.Series:\n    column_name = str(s.name)\n    if pd_core_dtypes_common.is_datetime_or_timedelta_dtype(s):  # type: ignore  # (bad stubs)\n        if column_types:\n            if "VARCHAR" in column_types[column_name]:\n                raise DagsterInvariantViolationError(\n                    f"Snowflake I/O manager: The Snowflake column {column_name.upper()} in table"\n                    f" {table_name} is of type {column_types[column_name]} and should be of type"\n                    f" TIMESTAMP to store the time data in dataframe column {column_name}. Please"\n                    " migrate this column to be of time TIMESTAMP_NTZ(9) to store time data."\n                )\n        return s.dt.tz_localize("UTC")\n    return s\n\n\n
[docs]class SnowflakePandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Plugin for the Snowflake I/O Manager that can store and load Pandas DataFrames as Snowflake tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ) -> Mapping[str, RawMetadataValue]:\n from snowflake import connector\n\n connector.paramstyle = "pyformat"\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n column_types = _get_table_column_types(table_slice, connection)\n if context.resource_config and context.resource_config.get(\n "store_timestamps_as_strings", False\n ):\n with_uppercase_cols = with_uppercase_cols.apply(\n lambda x: _convert_timestamp_to_string(x, column_types, table_slice.table),\n axis="index",\n )\n else:\n with_uppercase_cols = with_uppercase_cols.apply(\n lambda x: _add_missing_timezone(x, column_types, table_slice.table), axis="index"\n )\n with_uppercase_cols.to_sql(\n table_slice.table,\n con=connection.engine,\n if_exists="append",\n index=False,\n method=pd_writer,\n )\n\n return {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n 
columns=[\n TableColumn(name=str(name), type=str(dtype))\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n result = pd.read_sql(\n sql=SnowflakeDbClient.get_select_statement(table_slice), con=connection\n )\n if context.resource_config and context.resource_config.get(\n "store_timestamps_as_strings", False\n ):\n result = result.apply(_convert_string_to_timestamp, axis="index")\n result.columns = map(str.lower, result.columns) # type: ignore # (bad stubs)\n return result\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nsnowflake_pandas_io_manager = build_snowflake_io_manager(\n [SnowflakePandasTypeHandler()], default_load_type=pd.DataFrame\n)\nsnowflake_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\nusing the snowflake_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.\n\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_snowflake_pandas import snowflake_pandas_io_manager\n from dagster import asset, Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_pandas_io_manager.configured({\n "database": "my_database",\n "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n ...\n })\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class SnowflakePandasIOManager(SnowflakeIOManager):\n """An I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\n using the SnowflakePandasIOManager, any inputs and outputs without type annotations will be loaded\n as Pandas DataFrames.\n\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake_pandas import SnowflakePandasIOManager\n from dagster import asset, Definitions, EnvVar\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": SnowflakePandasIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_snowflake_pandas/snowflake_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake_pandas.snowflake_pandas_type_handler"}}, "dagster_snowflake_pyspark": {"snowflake_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake_pyspark.snowflake_pyspark_type_handler

\nfrom typing import Mapping, Optional, Sequence, Type\n\nimport dagster._check as check\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_snowflake import SnowflakeIOManager, build_snowflake_io_manager\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient\nfrom pyspark.sql import DataFrame, SparkSession\nfrom pyspark.sql.types import StructType\n\nSNOWFLAKE_CONNECTOR = "net.snowflake.spark.snowflake"\n\n\ndef _get_snowflake_options(config, table_slice: TableSlice) -> Mapping[str, str]:\n    check.invariant(\n        config.get("warehouse", None) is not None,\n        "Missing config: Warehouse is required when using PySpark with the Snowflake I/O manager.",\n    )\n\n    conf = {\n        "sfURL": f"{config['account']}.snowflakecomputing.com",\n        "sfUser": config["user"],\n        "sfPassword": config["password"],\n        "sfDatabase": config["database"],\n        "sfSchema": table_slice.schema,\n        "sfWarehouse": config["warehouse"],\n    }\n\n    return conf\n\n\n
[docs]class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):\n """Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), warehouse="my_warehouse", ...)\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _\n ) -> Mapping[str, RawMetadataValue]:\n options = _get_snowflake_options(context.resource_config, table_slice)\n\n with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])\n\n with_uppercase_cols.write.format(SNOWFLAKE_CONNECTOR).options(**options).option(\n "dbtable", table_slice.table\n ).mode("append").save()\n\n return {\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=field.name, type=field.dataType.typeName())\n for field in obj.schema.fields\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame:\n options = _get_snowflake_options(context.resource_config, table_slice)\n\n spark = SparkSession.builder.getOrCreate() # type: ignore\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n df = 
(\n spark.read.format(SNOWFLAKE_CONNECTOR)\n .options(**options)\n .option("query", SnowflakeDbClient.get_select_statement(table_slice))\n .load()\n )\n return df.toDF(*[c.lower() for c in df.columns])\n\n @property\n def supported_types(self):\n return [DataFrame]
\n\n\nsnowflake_pyspark_io_manager = build_snowflake_io_manager(\n [SnowflakePySparkTypeHandler()], default_load_type=DataFrame\n)\nsnowflake_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\nusing the snowflake_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_snowflake_pyspark import snowflake_pyspark_io_manager\n from pyspark.sql import DataFrame\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_pyspark_io_manager.configured({\n "database": "my_database",\n "warehouse": "my_warehouse", # required for snowflake_pyspark_io_manager\n "account" : {"env": "SNOWFLAKE_ACCOUNT"},\n "password": {"env": "SNOWFLAKE_PASSWORD"},\n ...\n })\n }\n )\n\n Note that the warehouse configuration value is required when using the snowflake_pyspark_io_manager\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: DataFrame) -> DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class SnowflakePySparkIOManager(SnowflakeIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\n using the SnowflakePySparkIOManager, any inputs and outputs without type annotations will be loaded\n as PySpark DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake_pyspark import SnowflakePySparkIOManager\n from pyspark.sql import DataFrame\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": SnowflakePySparkIOManager(\n database="my_database",\n warehouse="my_warehouse", # required for SnowflakePySparkIOManager\n account=EnvVar("SNOWFLAKE_ACCOUNT"),\n password=EnvVar("SNOWFLAKE_PASSWORD"),\n ...\n )\n }\n )\n\n Note that the warehouse configuration value is required when using the SnowflakePySparkIOManager\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: DataFrame) -> DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return DataFrame
\n
", "current_page_name": "_modules/dagster_snowflake_pyspark/snowflake_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake_pyspark.snowflake_pyspark_type_handler"}}, "dagster_spark": {"configs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.configs

\n"""Spark Configuration.\n\nIn this file we define the key configuration parameters for submitting Spark jobs. Spark can be run\nin a variety of deployment contexts. See the Spark documentation at\nhttps://spark.apache.org/docs/latest/submitting-applications.html for a more in-depth summary of\nSpark deployment contexts and configuration.\n"""\nfrom dagster import Field, StringSource\n\nfrom .configs_spark import spark_config\nfrom .types import SparkDeployMode\n\n\n
[docs]def define_spark_config():\n """Spark configuration.\n\n See the Spark documentation for reference:\n https://spark.apache.org/docs/latest/submitting-applications.html\n """\n master_url = Field(\n StringSource,\n description="The master URL for the cluster (e.g. spark://23.195.26.187:7077)",\n is_required=True,\n )\n\n deploy_mode = Field(\n SparkDeployMode,\n description="""Whether to deploy your driver on the worker nodes (cluster) or locally as an\n external client (client) (default: client). A common deployment strategy is to submit your\n application from a gateway machine that is physically co-located with your worker machines\n (e.g. Master node in a standalone EC2 cluster). In this setup, client mode is appropriate.\n In client mode, the driver is launched directly within the spark-submit process which acts\n as a client to the cluster. The input and output of the application is attached to the\n console. Thus, this mode is especially suitable for applications that involve the REPL (e.g.\n Spark shell).""",\n is_required=False,\n )\n\n application_jar = Field(\n StringSource,\n description="""Path to a bundled jar including your application and all\n dependencies. The URL must be globally visible inside of your cluster, for\n instance, an hdfs:// path or a file:// path that is present on all nodes.\n """,\n is_required=True,\n )\n\n application_arguments = Field(\n StringSource,\n description="Arguments passed to the main method of your main class, if any",\n is_required=False,\n )\n\n spark_home = Field(\n StringSource,\n description=(\n "The path to your spark installation. Defaults to $SPARK_HOME at runtime if not"\n " provided."\n ),\n is_required=False,\n )\n\n return {\n "master_url": master_url,\n "deploy_mode": deploy_mode,\n "application_jar": application_jar,\n "spark_conf": spark_config(),\n "spark_home": spark_home,\n "application_arguments": application_arguments,\n }
\n
", "current_page_name": "_modules/dagster_spark/configs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.configs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.ops

\nfrom dagster import (\n    In,\n    Nothing,\n    Out,\n    _check as check,\n    op,\n)\n\nfrom .configs import define_spark_config\n\n\n
[docs]def create_spark_op(\n name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n check.str_param(name, "name")\n check.str_param(main_class, "main_class")\n check.opt_str_param(description, "description", "A parameterized Spark job.")\n check.set_param(required_resource_keys, "required_resource_keys")\n\n @op(\n name=name,\n description=description,\n config_schema=define_spark_config(),\n ins={"start": In(Nothing)},\n out=Out(Nothing),\n tags={"kind": "spark", "main_class": main_class},\n required_resource_keys=required_resource_keys,\n )\n def spark_op(context):\n context.resources.spark.run_spark_job(context.op_config, main_class)\n\n return spark_op
\n
", "current_page_name": "_modules/dagster_spark/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.resources

\nimport os\nimport subprocess\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.log_manager import DagsterLogManager\n\nfrom .types import SparkOpError\nfrom .utils import construct_spark_shell_command\n\n\nclass SparkResource:\n    def __init__(self, logger):\n        self.logger = check.inst_param(logger, "logger", DagsterLogManager)\n\n    def run_spark_job(self, config, main_class):\n        check.dict_param(config, "config")\n        check.str_param(main_class, "main_class")\n\n        # Extract parameters from config\n        (\n            master_url,\n            deploy_mode,\n            application_jar,\n            spark_conf,\n            application_arguments,\n            spark_home,\n        ) = [\n            config.get(k)\n            for k in (\n                "master_url",\n                "deploy_mode",\n                "application_jar",\n                "spark_conf",\n                "application_arguments",\n                "spark_home",\n            )\n        ]\n\n        if not os.path.exists(application_jar):\n            raise SparkOpError(\n                f"Application jar {application_jar} does not exist. A valid jar must be "\n                "built before running this op."\n            )\n\n        spark_shell_cmd = construct_spark_shell_command(\n            application_jar=application_jar,\n            main_class=main_class,\n            master_url=master_url,\n            spark_conf=spark_conf,\n            deploy_mode=deploy_mode,\n            application_arguments=application_arguments,\n            spark_home=spark_home,\n        )\n        self.logger.info("Running spark-submit: " + " ".join(spark_shell_cmd))\n\n        retcode = subprocess.call(" ".join(spark_shell_cmd), shell=True)\n\n        if retcode != 0:\n            raise SparkOpError("Spark job failed. Please consult your logs.")\n\n\n
[docs]@dagster_maintained_resource\n@resource\ndef spark_resource(context):\n return SparkResource(context.log)
\n
", "current_page_name": "_modules/dagster_spark/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.types

\nfrom dagster import Enum, EnumValue\n\nSparkDeployModeCluster = EnumValue("cluster")\nSparkDeployModeClient = EnumValue("client")\nSparkDeployMode = Enum(\n    name="SparkDeployMode", enum_values=[SparkDeployModeCluster, SparkDeployModeClient]\n)\n\n\n
[docs]class SparkOpError(Exception):\n pass
\n
", "current_page_name": "_modules/dagster_spark/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.types"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.utils

\nimport itertools\nimport os\n\nimport dagster._check as check\n\nfrom .types import SparkOpError\n\n\ndef flatten_dict(d):\n    def _flatten_dict(d, result, key_path=None):\n        """Iterates an arbitrarily nested dictionary and yield dot-notation key:value tuples.\n\n        {'foo': {'bar': 3, 'baz': 1}, {'other': {'key': 1}} =>\n            [('foo.bar', 3), ('foo.baz', 1), ('other.key', 1)]\n\n        """\n        for k, v in d.items():\n            new_key_path = (key_path or []) + [k]\n            if isinstance(v, dict):\n                _flatten_dict(v, result, new_key_path)\n            else:\n                result.append((".".join(new_key_path), v))\n\n    result = []\n    if d is not None:\n        _flatten_dict(d, result)\n    return result\n\n\ndef parse_spark_config(spark_conf):\n    """Convert spark conf dict to list of CLI arguments.\n\n    For each key-value pair in spark conf, we need to pass to CLI in format:\n\n    --conf "key=value"\n    """\n    spark_conf_list = flatten_dict(spark_conf)\n    return format_for_cli(spark_conf_list)\n\n\ndef format_for_cli(spark_conf_list):\n    return list(\n        itertools.chain.from_iterable([("--conf", "{}={}".format(*c)) for c in spark_conf_list])\n    )\n\n\n
[docs]def construct_spark_shell_command(\n application_jar,\n main_class,\n master_url=None,\n spark_conf=None,\n deploy_mode=None,\n application_arguments=None,\n spark_home=None,\n):\n """Constructs the spark-submit command for a Spark job."""\n check.opt_str_param(master_url, "master_url")\n check.str_param(application_jar, "application_jar")\n spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n check.opt_str_param(deploy_mode, "deploy_mode")\n check.opt_str_param(application_arguments, "application_arguments")\n check.opt_str_param(spark_home, "spark_home")\n\n spark_home = spark_home if spark_home else os.environ.get("SPARK_HOME")\n if spark_home is None:\n raise SparkOpError(\n "No spark home set. You must either pass spark_home in config or "\n "set $SPARK_HOME in your environment (got None)."\n )\n\n master_url = ["--master", master_url] if master_url else []\n deploy_mode = ["--deploy-mode", deploy_mode] if deploy_mode else []\n\n spark_shell_cmd = (\n [f"{spark_home}/bin/spark-submit", "--class", main_class]\n + master_url\n + deploy_mode\n + parse_spark_config(spark_conf)\n + [application_jar]\n + [application_arguments]\n )\n return spark_shell_cmd
\n
", "current_page_name": "_modules/dagster_spark/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.utils"}}, "dagster_ssh": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_ssh.resources

\nimport getpass\nimport os\nfrom io import StringIO\n\nimport paramiko\nfrom dagster import (\n    BoolSource,\n    Field,\n    IntSource,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils import mkdir_p\nfrom dagster._utils.merger import merge_dicts\nfrom paramiko.config import SSH_PORT\nfrom sshtunnel import SSHTunnelForwarder\n\n\ndef key_from_str(key_str):\n    """Creates a paramiko SSH key from a string."""\n    check.str_param(key_str, "key_str")\n\n    # py2 StringIO doesn't support with\n    key_file = StringIO(key_str)\n    result = paramiko.RSAKey.from_private_key(key_file)\n    key_file.close()\n    return result\n\n\nclass SSHResource:\n    """Resource for ssh remote execution using Paramiko.\n\n    ref: https://github.com/paramiko/paramiko\n    """\n\n    def __init__(\n        self,\n        remote_host,\n        remote_port,\n        username=None,\n        password=None,\n        key_file=None,\n        key_string=None,\n        timeout=10,\n        keepalive_interval=30,\n        compress=True,\n        no_host_key_check=True,\n        allow_host_key_change=False,\n        logger=None,\n    ):\n        self.remote_host = check.str_param(remote_host, "remote_host")\n        self.remote_port = check.opt_int_param(remote_port, "remote_port")\n        self.username = check.opt_str_param(username, "username")\n        self.password = check.opt_str_param(password, "password")\n        self.key_file = check.opt_str_param(key_file, "key_file")\n        self.timeout = check.opt_int_param(timeout, "timeout")\n        self.keepalive_interval = check.opt_int_param(keepalive_interval, "keepalive_interval")\n        self.compress = check.opt_bool_param(compress, "compress")\n        self.no_host_key_check = check.opt_bool_param(no_host_key_check, "no_host_key_check")\n        self.log = logger\n\n        self.host_proxy = None\n\n        # Create 
RSAKey object from private key string\n        self.key_obj = key_from_str(key_string) if key_string is not None else None\n\n        # Auto detecting username values from system\n        if not self.username:\n            logger.debug(\n                "username to ssh to host: %s is not specified. Using system's default provided by"\n                " getpass.getuser()"\n                % self.remote_host\n            )\n            self.username = getpass.getuser()\n\n        user_ssh_config_filename = os.path.expanduser("~/.ssh/config")\n        if os.path.isfile(user_ssh_config_filename):\n            ssh_conf = paramiko.SSHConfig()\n            ssh_conf.parse(open(user_ssh_config_filename, encoding="utf8"))\n            host_info = ssh_conf.lookup(self.remote_host)\n            if host_info and host_info.get("proxycommand"):\n                self.host_proxy = paramiko.ProxyCommand(host_info.get("proxycommand"))\n\n            if not (self.password or self.key_file):\n                if host_info and host_info.get("identityfile"):\n                    self.key_file = host_info.get("identityfile")[0]\n\n    def get_connection(self):\n        """Opens a SSH connection to the remote host.\n\n        :rtype: paramiko.client.SSHClient\n        """\n        client = paramiko.SSHClient()\n        client.load_system_host_keys()\n        if self.no_host_key_check:\n            self.log.warning(\n                "No Host Key Verification. 
This won't protect against Man-In-The-Middle attacks"\n            )\n            # Default is RejectPolicy\n            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n\n        if self.password and self.password.strip():\n            client.connect(\n                hostname=self.remote_host,\n                username=self.username,\n                password=self.password,\n                key_filename=self.key_file,\n                pkey=self.key_obj,\n                timeout=self.timeout,\n                compress=self.compress,\n                port=self.remote_port,\n                sock=self.host_proxy,\n                look_for_keys=False,\n            )\n        else:\n            client.connect(\n                hostname=self.remote_host,\n                username=self.username,\n                key_filename=self.key_file,\n                pkey=self.key_obj,\n                timeout=self.timeout,\n                compress=self.compress,\n                port=self.remote_port,\n                sock=self.host_proxy,\n            )\n\n        if self.keepalive_interval:\n            client.get_transport().set_keepalive(self.keepalive_interval)\n\n        return client\n\n    def get_tunnel(self, remote_port, remote_host="localhost", local_port=None):\n        check.int_param(remote_port, "remote_port")\n        check.str_param(remote_host, "remote_host")\n        check.opt_int_param(local_port, "local_port")\n\n        if local_port is not None:\n            local_bind_address = ("localhost", local_port)\n        else:\n            local_bind_address = ("localhost",)\n\n        # Will prefer key string if specified, otherwise use the key file\n        pkey = self.key_obj if self.key_obj else self.key_file\n\n        if self.password and self.password.strip():\n            client = SSHTunnelForwarder(\n                self.remote_host,\n                ssh_port=self.remote_port,\n                ssh_username=self.username,\n                
ssh_password=self.password,\n                ssh_pkey=pkey,\n                ssh_proxy=self.host_proxy,\n                local_bind_address=local_bind_address,\n                remote_bind_address=(remote_host, remote_port),\n                logger=self.log,\n            )\n        else:\n            client = SSHTunnelForwarder(\n                self.remote_host,\n                ssh_port=self.remote_port,\n                ssh_username=self.username,\n                ssh_pkey=pkey,\n                ssh_proxy=self.host_proxy,\n                local_bind_address=local_bind_address,\n                remote_bind_address=(remote_host, remote_port),\n                host_pkey_directories=[],\n                logger=self.log,\n            )\n\n        return client\n\n    def sftp_get(self, remote_filepath, local_filepath):\n        check.str_param(remote_filepath, "remote_filepath")\n        check.str_param(local_filepath, "local_filepath")\n        conn = self.get_connection()\n        with conn.open_sftp() as sftp_client:\n            local_folder = os.path.dirname(local_filepath)\n\n            # Create intermediate directories if they don't exist\n            mkdir_p(local_folder)\n\n            self.log.info(f"Starting to transfer from {remote_filepath} to {local_filepath}")\n\n            sftp_client.get(remote_filepath, local_filepath)\n\n        conn.close()\n        return local_filepath\n\n    def sftp_put(self, remote_filepath, local_filepath, confirm=True):\n        check.str_param(remote_filepath, "remote_filepath")\n        check.str_param(local_filepath, "local_filepath")\n        conn = self.get_connection()\n        with conn.open_sftp() as sftp_client:\n            self.log.info(f"Starting to transfer file from {local_filepath} to {remote_filepath}")\n\n            sftp_client.put(local_filepath, remote_filepath, confirm=confirm)\n\n        conn.close()\n        return local_filepath\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "remote_host": Field(\n StringSource, description="remote host to connect to", is_required=True\n ),\n "remote_port": Field(\n IntSource,\n description="port of remote host to connect (Default is paramiko SSH_PORT)",\n is_required=False,\n default_value=SSH_PORT,\n ),\n "username": Field(\n StringSource, description="username to connect to the remote_host", is_required=False\n ),\n "password": Field(\n StringSource,\n description="password of the username to connect to the remote_host",\n is_required=False,\n ),\n "key_file": Field(\n StringSource,\n description="key file to use to connect to the remote_host.",\n is_required=False,\n ),\n "key_string": Field(\n StringSource,\n description="key string to use to connect to remote_host",\n is_required=False,\n ),\n "timeout": Field(\n IntSource,\n description="timeout for the attempt to connect to the remote_host.",\n is_required=False,\n default_value=10,\n ),\n "keepalive_interval": Field(\n IntSource,\n description="send a keepalive packet to remote host every keepalive_interval seconds",\n is_required=False,\n default_value=30,\n ),\n "compress": Field(BoolSource, is_required=False, default_value=True),\n "no_host_key_check": Field(BoolSource, is_required=False, default_value=True),\n "allow_host_key_change": Field(\n BoolSource, description="[Deprecated]", is_required=False, default_value=False\n ),\n }\n)\ndef ssh_resource(init_context):\n args = init_context.resource_config\n args = merge_dicts(init_context.resource_config, {"logger": init_context.log})\n return SSHResource(**args)
\n
", "current_page_name": "_modules/dagster_ssh/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_ssh.resources"}}, "dagster_twilio": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_twilio.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom pydantic import Field\nfrom twilio.rest import Client\n\n\n
[docs]class TwilioResource(ConfigurableResource):\n """This resource is for connecting to Twilio."""\n\n account_sid: str = Field(\n description=(\n "Twilio Account SID, created with yout Twilio account. This can be found on your Twilio"\n " dashboard, see"\n " https://www.twilio.com/blog/twilio-access-tokens-python"\n ),\n )\n auth_token: str = Field(\n description=(\n "Twilio Authentication Token, created with yout Twilio account. This can be found on"\n " your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_client(self) -> Client:\n return Client(self.account_sid, self.auth_token)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=TwilioResource.to_config_schema(),\n description="This resource is for connecting to Twilio",\n)\ndef twilio_resource(context: InitResourceContext) -> Client:\n return TwilioResource.from_resource_context(context).create_client()
\n
", "current_page_name": "_modules/dagster_twilio/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_twilio.resources"}}, "dagster_wandb": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.io_manager

\nimport datetime\nimport os\nimport pickle\nimport platform\nimport shutil\nimport sys\nimport time\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import List, Optional\n\nfrom dagster import (\n    Field,\n    InitResourceContext,\n    InputContext,\n    Int,\n    IOManager,\n    MetadataValue,\n    OutputContext,\n    String,\n    io_manager,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom wandb import Artifact\nfrom wandb.data_types import WBValue\n\nfrom .resources import WANDB_CLOUD_HOST\nfrom .utils.errors import (\n    WandbArtifactsIOManagerError,\n    raise_on_empty_configuration,\n    raise_on_unknown_partition_keys,\n    raise_on_unknown_read_configuration_keys,\n    raise_on_unknown_write_configuration_keys,\n)\nfrom .utils.pickling import (\n    ACCEPTED_SERIALIZATION_MODULES,\n    pickle_artifact_content,\n    unpickle_artifact_content,\n)\nfrom .version import __version__\n\nif sys.version_info >= (3, 8):\n    from typing import TypedDict\nelse:\n    from typing_extensions import TypedDict\n\n\nclass Config(TypedDict):\n    dagster_run_id: str\n    wandb_host: str\n    wandb_entity: str\n    wandb_project: str\n    wandb_run_name: Optional[str]\n    wandb_run_id: Optional[str]\n    wandb_run_tags: Optional[List[str]]\n    base_dir: str\n    cache_duration_in_minutes: Optional[int]\n\n\nclass ArtifactsIOManager(IOManager):\n    """IO Manager to handle Artifacts in Weights & Biases (W&B) .\n\n    It handles 3 different inputs:\n    - Pickable objects (the serialization module is configurable)\n    - W&B Objects (Audio, Table, Image, etc)\n    - W&B Artifacts\n    """\n\n    def __init__(self, wandb_client, config: Config):\n        self.wandb = wandb_client\n\n        dagster_run_id = config["dagster_run_id"]\n        self.dagster_run_id = dagster_run_id\n        self.wandb_host = config["wandb_host"]\n        self.wandb_entity = config["wandb_entity"]\n        self.wandb_project = 
config["wandb_project"]\n        self.wandb_run_id = config.get("wandb_run_id") or dagster_run_id\n        self.wandb_run_name = config.get("wandb_run_name") or f"dagster-run-{dagster_run_id[0:8]}"\n        # augments the run tags\n        wandb_run_tags = config["wandb_run_tags"] or []\n        if "dagster_wandb" not in wandb_run_tags:\n            wandb_run_tags = [*wandb_run_tags, "dagster_wandb"]\n        self.wandb_run_tags = wandb_run_tags\n\n        self.base_dir = config["base_dir"]\n        cache_duration_in_minutes = config["cache_duration_in_minutes"]\n        default_cache_expiration_in_minutes = 60 * 24 * 30  # 60 minutes * 24 hours * 30 days\n        self.cache_duration_in_minutes = (\n            cache_duration_in_minutes\n            if cache_duration_in_minutes is not None\n            else default_cache_expiration_in_minutes\n        )\n\n    def _get_local_storage_path(self):\n        path = self.base_dir\n        if os.path.basename(path) != "storage":\n            path = os.path.join(path, "storage")\n        path = os.path.join(path, "wandb_artifacts_manager")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _get_artifacts_path(self, name, version):\n        local_storage_path = self._get_local_storage_path()\n        path = os.path.join(local_storage_path, "artifacts", f"{name}.{version}")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _get_wandb_logs_path(self):\n        local_storage_path = self._get_local_storage_path()\n        # Adding a random uuid to avoid collisions in multi-process context\n        path = os.path.join(local_storage_path, "runs", self.dagster_run_id, str(uuid.uuid4()))\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _clean_local_storage_path(self):\n        local_storage_path = self._get_local_storage_path()\n        cache_duration_in_minutes = self.cache_duration_in_minutes\n        current_timestamp = int(time.time())\n        
expiration_timestamp = current_timestamp - (\n            cache_duration_in_minutes * 60  # convert to seconds\n        )\n\n        for root, dirs, files in os.walk(local_storage_path, topdown=False):\n            for name in files:\n                current_file_path = os.path.join(root, name)\n                most_recent_access = os.lstat(current_file_path).st_atime\n                if most_recent_access <= expiration_timestamp or cache_duration_in_minutes == 0:\n                    os.remove(current_file_path)\n            for name in dirs:\n                current_dir_path = os.path.join(root, name)\n                if not os.path.islink(current_dir_path):\n                    if len(os.listdir(current_dir_path)) == 0 or cache_duration_in_minutes == 0:\n                        shutil.rmtree(current_dir_path)\n\n    @contextmanager\n    def wandb_run(self):\n        self.wandb.init(\n            id=self.wandb_run_id,\n            name=self.wandb_run_name,\n            project=self.wandb_project,\n            entity=self.wandb_entity,\n            dir=self._get_wandb_logs_path(),\n            tags=self.wandb_run_tags,\n            anonymous="never",\n            resume="allow",\n        )\n        try:\n            yield self.wandb.run\n        finally:\n            self.wandb.finish()\n            self._clean_local_storage_path()\n\n    def _upload_artifact(self, context: OutputContext, obj):\n        if not context.has_partition_key and context.has_asset_partitions:\n            raise WandbArtifactsIOManagerError(\n                "Sorry, but the Weights & Biases (W&B) IO Manager can't handle processing several"\n                " partitions at the same time within a single run. Please process each partition"\n                " separately. 
If you think this might be an error, don't hesitate to reach out to"\n                " Weights & Biases Support."\n            )\n\n        with self.wandb_run() as run:\n            parameters = {}\n            if context.metadata is not None:\n                parameters = context.metadata.get("wandb_artifact_configuration", {})\n\n            raise_on_unknown_write_configuration_keys(parameters)\n\n            serialization_module = parameters.get("serialization_module", {})\n            serialization_module_name = serialization_module.get("name", "pickle")\n\n            if serialization_module_name not in ACCEPTED_SERIALIZATION_MODULES:\n                raise WandbArtifactsIOManagerError(\n                    f"Oops! It looks like the value you provided, '{serialization_module_name}',"\n                    " isn't recognized as a valid serialization module. Here are the ones we do"\n                    f" support: {ACCEPTED_SERIALIZATION_MODULES}."\n                )\n\n            serialization_module_parameters = serialization_module.get("parameters", {})\n            serialization_module_parameters_with_protocol = {\n                "protocol": (\n                    pickle.HIGHEST_PROTOCOL\n                ),  # we use the highest available protocol if we don't pass one\n                **serialization_module_parameters,\n            }\n\n            artifact_type = parameters.get("type", "artifact")\n            artifact_description = parameters.get("description")\n            artifact_metadata = {\n                "source_integration": "dagster_wandb",\n                "source_integration_version": __version__,\n                "source_dagster_run_id": self.dagster_run_id,\n                "source_created_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),\n                "source_python_version": platform.python_version(),\n            }\n            if isinstance(obj, Artifact):\n                if parameters.get("name") is not None:\n       
             raise WandbArtifactsIOManagerError(\n                        "You've provided a 'name' property in the 'wandb_artifact_configuration'"\n                        " settings. However, this 'name' property should only be used when the"\n                        " output isn't already an Artifact object."\n                    )\n\n                if parameters.get("type") is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've provided a 'type' property in the 'wandb_artifact_configuration'"\n                        " settings. However, this 'type' property should only be used when the"\n                        " output isn't already an Artifact object."\n                    )\n\n                if obj.name is None:\n                    raise WandbArtifactsIOManagerError(\n                        "The Weights & Biases (W&B) Artifact you provided is missing a name."\n                        " Please, assign a name to your Artifact."\n                    )\n\n                if context.has_asset_key and obj.name != context.get_asset_identifier()[0]:\n                    asset_identifier = context.get_asset_identifier()[0]\n                    context.log.warning(\n                        f"Please note, the name '{obj.name}' of your Artifact is overwritten by the"\n                        f" name derived from the AssetKey '{asset_identifier}'. 
For consistency and"\n                        " to avoid confusion, we advise sharing a constant for both your asset's"\n                        " name and the artifact's name."\n                    )\n                    obj._name = asset_identifier  # noqa: SLF001\n\n                if context.has_partition_key:\n                    artifact_name = f"{obj.name}.{context.partition_key}"\n                    # The Artifact provided is produced in a partitioned execution we add the\n                    # partition as a suffix to the Artifact name\n                    obj._name = artifact_name  # noqa: SLF001\n\n                if len(serialization_module) != 0:  # not an empty dict\n                    context.log.warning(\n                        "You've included a 'serialization_module' in the"\n                        " 'wandb_artifact_configuration' settings. However, this doesn't have any"\n                        " impact when the output is already an Artifact object."\n                    )\n\n                # The obj is already an Artifact we augment its metadata\n                artifact = obj\n\n                artifact.metadata = {**artifact.metadata, **artifact_metadata}\n\n                if artifact.description is not None and artifact_description is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've given a 'description' in the 'wandb_artifact_configuration'"\n                        " settings for an existing Artifact that already has a description. 
Please,"\n                        " either set the description using 'wandb_artifact_argument' or when"\n                        " creating your Artifact."\n                    )\n                if artifact_description is not None:\n                    artifact.description = artifact_description\n            else:\n                if context.has_asset_key:\n                    if parameters.get("name") is not None:\n                        raise WandbArtifactsIOManagerError(\n                            "You've included a 'name' property in the"\n                            " 'wandb_artifact_configuration' settings. But, a 'name' is only needed"\n                            " when there's no 'AssetKey'. When an Artifact is created from an"\n                            " @asset, it uses the asset name. When it's created from an @op with an"\n                            " 'asset_key' for the output, that value is used. Please remove the"\n                            " 'name' property."\n                        )\n                    artifact_name = context.get_asset_identifier()[0]  # name of asset\n                else:\n                    name_parameter = parameters.get("name")\n                    if name_parameter is None:\n                        raise WandbArtifactsIOManagerError(\n                            "The 'name' property is missing in the 'wandb_artifact_configuration'"\n                            " settings. For Artifacts created from an @op, a 'name' property is"\n                            " needed. 
You could also use an @asset as an alternative."\n                        )\n                    assert name_parameter is not None\n                    artifact_name = name_parameter\n\n                if context.has_partition_key:\n                    artifact_name = f"{artifact_name}.{context.partition_key}"\n\n                # We replace the | character with - because it is not allowed in artifact names\n                # The | character is used in multi-dimensional partition keys\n                artifact_name = str(artifact_name).replace("|", "-")\n\n                # Creates an artifact to hold the obj\n                artifact = self.wandb.Artifact(\n                    name=artifact_name,\n                    type=artifact_type,\n                    description=artifact_description,\n                    metadata=artifact_metadata,\n                )\n                if isinstance(obj, WBValue):\n                    if len(serialization_module) != 0:  # not an empty dict\n                        context.log.warning(\n                            "You've included a 'serialization_module' in the"\n                            " 'wandb_artifact_configuration' settings. 
However, this doesn't have"\n                            " any impact when the output is already an W&B object like e.g Table or"\n                            " Image."\n                        )\n                    # Adds the WBValue object using the class name as the name for the file\n                    artifact.add(obj, obj.__class__.__name__)\n                elif obj is not None:\n                    # The output is not a native wandb Object, we serialize it\n                    pickle_artifact_content(\n                        context,\n                        serialization_module_name,\n                        serialization_module_parameters_with_protocol,\n                        artifact,\n                        obj,\n                    )\n\n            # Add any files: https://docs.wandb.ai/ref/python/artifact#add_file\n            add_files = parameters.get("add_files")\n            if add_files is not None and len(add_files) > 0:\n                for add_file in add_files:\n                    artifact.add_file(**add_file)\n\n            # Add any dirs: https://docs.wandb.ai/ref/python/artifact#add_dir\n            add_dirs = parameters.get("add_dirs")\n            if add_dirs is not None and len(add_dirs) > 0:\n                for add_dir in add_dirs:\n                    artifact.add_dir(**add_dir)\n\n            # Add any reference: https://docs.wandb.ai/ref/python/artifact#add_reference\n            add_references = parameters.get("add_references")\n            if add_references is not None and len(add_references) > 0:\n                for add_reference in add_references:\n                    artifact.add_reference(**add_reference)\n\n            # Augments the aliases\n            aliases = parameters.get("aliases", [])\n            aliases.append(f"dagster-run-{self.dagster_run_id[0:8]}")\n            if "latest" not in aliases:\n                aliases.append("latest")\n\n            # Logs the artifact\n            
self.wandb.log_artifact(artifact, aliases=aliases)\n            artifact.wait()\n\n            # Adds useful metadata to the output or Asset\n            artifacts_base_url = (\n                "https://wandb.ai"\n                if self.wandb_host == WANDB_CLOUD_HOST\n                else self.wandb_host.rstrip("/")\n            )\n            assert artifact.id is not None\n            output_metadata = {\n                "dagster_run_id": MetadataValue.dagster_run(self.dagster_run_id),\n                "wandb_artifact_id": MetadataValue.text(artifact.id),\n                "wandb_artifact_type": MetadataValue.text(artifact.type),\n                "wandb_artifact_version": MetadataValue.text(artifact.version),\n                "wandb_artifact_size": MetadataValue.int(artifact.size),\n                "wandb_artifact_url": MetadataValue.url(\n                    f"{artifacts_base_url}/{run.entity}/{run.project}/artifacts/{artifact.type}/{'/'.join(artifact.name.rsplit(':', 1))}"\n                ),\n                "wandb_entity": MetadataValue.text(run.entity),\n                "wandb_project": MetadataValue.text(run.project),\n                "wandb_run_id": MetadataValue.text(run.id),\n                "wandb_run_name": MetadataValue.text(run.name),\n                "wandb_run_path": MetadataValue.text(run.path),\n                "wandb_run_url": MetadataValue.url(run.url),\n            }\n            context.add_output_metadata(output_metadata)\n\n    def _download_artifact(self, context: InputContext):\n        with self.wandb_run() as run:\n            parameters = {}\n            if context.metadata is not None:\n                parameters = context.metadata.get("wandb_artifact_configuration", {})\n\n            raise_on_unknown_read_configuration_keys(parameters)\n\n            partitions_configuration = parameters.get("partitions", {})\n\n            if not context.has_asset_partitions and len(partitions_configuration) > 0:\n                raise 
WandbArtifactsIOManagerError(\n                    "You've included a 'partitions' value in the 'wandb_artifact_configuration'"\n                    " settings but it's not within a partitioned execution. Please only use"\n                    " 'partitions' within a partitioned context."\n                )\n\n            if context.has_asset_partitions:\n                # Note: this is currently impossible to unit test with current Dagster APIs but was\n                # tested thoroughly manually\n                name = parameters.get("get")\n                path = parameters.get("get_path")\n                if name is not None or path is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've given a value for 'get' and/or 'get_path' in the"\n                        " 'wandb_artifact_configuration' settings during a partitioned execution."\n                        " Please use the 'partitions' property to set 'get' or 'get_path' for each"\n                        " individual partition. 
To set a default value for all partitions, use '*'."\n                    )\n\n                artifact_name = parameters.get("name")\n                if artifact_name is None:\n                    artifact_name = context.asset_key[0][0]  # name of asset\n\n                partitions = [\n                    (key, f"{artifact_name}.{ str(key).replace('|', '-')}")\n                    for key in context.asset_partition_keys\n                ]\n\n                output = {}\n\n                for key, artifact_name in partitions:\n                    context.log.info(f"Handling partition with key '{key}'")\n                    partition_configuration = partitions_configuration.get(\n                        key, partitions_configuration.get("*")\n                    )\n\n                    raise_on_empty_configuration(key, partition_configuration)\n                    raise_on_unknown_partition_keys(key, partition_configuration)\n\n                    partition_version = None\n                    partition_alias = None\n                    if partition_configuration and partition_configuration is not None:\n                        partition_version = partition_configuration.get("version")\n                        partition_alias = partition_configuration.get("alias")\n                        if partition_version is not None and partition_alias is not None:\n                            raise WandbArtifactsIOManagerError(\n                                "You've provided both 'version' and 'alias' for the partition with"\n                                " key '{key}'. You should only use one of these properties at a"\n                                " time. If you choose not to use any, the latest version will be"\n                                " used by default. 
If this partition is configured with the '*'"\n                                " key, please correct the wildcard configuration."\n                            )\n                    partition_identifier = partition_version or partition_alias or "latest"\n\n                    artifact_uri = (\n                        f"{run.entity}/{run.project}/{artifact_name}:{partition_identifier}"\n                    )\n                    try:\n                        api = self.wandb.Api()\n                        api.artifact(artifact_uri)\n                    except Exception as exception:\n                        raise WandbArtifactsIOManagerError(\n                            "The artifact you're attempting to download might not exist, or you"\n                            " might have forgotten to include the 'name' property in the"\n                            " 'wandb_artifact_configuration' settings."\n                        ) from exception\n\n                    artifact = run.use_artifact(artifact_uri)\n\n                    artifacts_path = self._get_artifacts_path(artifact_name, artifact.version)\n                    if partition_configuration and partition_configuration is not None:\n                        partition_name = partition_configuration.get("get")\n                        partition_path = partition_configuration.get("get_path")\n                        if partition_name is not None and partition_path is not None:\n                            raise WandbArtifactsIOManagerError(\n                                "You've provided both 'get' and 'get_path' in the"\n                                " 'wandb_artifact_configuration' settings for the partition with"\n                                " key '{key}'. Only one of these properties should be used. If you"\n                                " choose not to use any, the whole Artifact will be returned. 
If"\n                                " this partition is configured with the '*' key, please correct the"\n                                " wildcard configuration."\n                            )\n\n                        if partition_name is not None:\n                            wandb_object = artifact.get(partition_name)\n                            if wandb_object is not None:\n                                output[key] = wandb_object\n                                continue\n\n                        if partition_path is not None:\n                            path = artifact.get_path(partition_path)\n                            download_path = path.download(root=artifacts_path)\n                            if download_path is not None:\n                                output[key] = download_path\n                                continue\n\n                    artifact_dir = artifact.download(root=artifacts_path, recursive=True)\n                    unpickled_content = unpickle_artifact_content(artifact_dir)\n                    if unpickled_content is not None:\n                        output[key] = unpickled_content\n                        continue\n\n                    artifact.verify(root=artifacts_path)\n                    output[key] = artifact\n\n                if len(output) == 1:\n                    # If there's only one partition, return the value directly\n                    return next(iter(output.values()))\n\n                return output\n\n            elif context.has_asset_key:\n                # Input is an asset\n                if parameters.get("name") is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "A conflict has been detected in the provided configuration settings. 
The"\n                        " 'name' parameter appears to be specified twice - once in the"\n                        " 'wandb_artifact_configuration' metadata dictionary, and again as an"\n                        " AssetKey. Kindly avoid setting the name directly, since the AssetKey will"\n                        " be used for this purpose."\n                    )\n                artifact_name = context.get_asset_identifier()[0]  # name of asset\n            else:\n                artifact_name = parameters.get("name")\n                if artifact_name is None:\n                    raise WandbArtifactsIOManagerError(\n                        "The 'name' property is missing in the 'wandb_artifact_configuration'"\n                        " settings. For Artifacts used in an @op, a 'name' property is required."\n                        " You could use an @asset as an alternative."\n                    )\n\n            if context.has_partition_key:\n                artifact_name = f"{artifact_name}.{context.partition_key}"\n\n            artifact_alias = parameters.get("alias")\n            artifact_version = parameters.get("version")\n\n            if artifact_alias is not None and artifact_version is not None:\n                raise WandbArtifactsIOManagerError(\n                    "You've provided both 'version' and 'alias' in the"\n                    " 'wandb_artifact_configuration' settings. 
Only one should be used at a time."\n                    " If you decide not to use any, the latest version will be applied"\n                    " automatically."\n                )\n\n            artifact_identifier = artifact_alias or artifact_version or "latest"\n            artifact_uri = f"{run.entity}/{run.project}/{artifact_name}:{artifact_identifier}"\n\n            # This try/except block is a workaround for a bug in the W&B SDK, this should be removed\n            # once the bug is fixed.\n            try:\n                artifact = run.use_artifact(artifact_uri)\n            except Exception:\n                api = self.wandb.Api()\n                artifact = api.artifact(artifact_uri)\n\n            name = parameters.get("get")\n            path = parameters.get("get_path")\n            if name is not None and path is not None:\n                raise WandbArtifactsIOManagerError(\n                    "You've provided both 'get' and 'get_path' in the"\n                    " 'wandb_artifact_configuration' settings. 
Only one should be used at a time."\n                    " If you decide not to use any, the entire Artifact will be returned."\n                )\n\n            if name is not None:\n                return artifact.get(name)\n\n            artifacts_path = self._get_artifacts_path(artifact_name, artifact.version)\n            if path is not None:\n                path = artifact.get_path(path)\n                return path.download(root=artifacts_path)\n\n            artifact_dir = artifact.download(root=artifacts_path, recursive=True)\n\n            unpickled_content = unpickle_artifact_content(artifact_dir)\n            if unpickled_content is not None:\n                return unpickled_content\n\n            artifact.verify(root=artifacts_path)\n            return artifact\n\n    def handle_output(self, context: OutputContext, obj) -> None:\n        if obj is None:\n            context.log.warning(\n                "The output value given to the Weights & Biases (W&B) IO Manager is empty. If this"\n                " was intended, you can disregard this warning."\n            )\n        else:\n            try:\n                self._upload_artifact(context, obj)\n            except WandbArtifactsIOManagerError as exception:\n                raise exception\n            except Exception as exception:\n                raise WandbArtifactsIOManagerError() from exception\n\n    def load_input(self, context: InputContext):\n        try:\n            return self._download_artifact(context)\n        except WandbArtifactsIOManagerError as exception:\n            raise exception\n        except Exception as exception:\n            raise WandbArtifactsIOManagerError() from exception\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n required_resource_keys={"wandb_resource", "wandb_config"},\n description="IO manager to read and write W&B Artifacts",\n config_schema={\n "run_name": Field(\n String,\n is_required=False,\n description=(\n "Short display name for this run, which is how you'll identify this run in the UI."\n " By default, it`s set to a string with the following format dagster-run-[8 first"\n " characters of the Dagster Run ID] e.g. dagster-run-7e4df022."\n ),\n ),\n "run_id": Field(\n String,\n is_required=False,\n description=(\n "Unique ID for this run, used for resuming. It must be unique in the project, and"\n " if you delete a run you can't reuse the ID. Use the name field for a short"\n " descriptive name, or config for saving hyperparameters to compare across runs."\n r" The ID cannot contain the following special characters: /\\#?%:.. You need to set"\n " the Run ID when you are doing experiment tracking inside Dagster to allow the IO"\n " Manager to resume the run. By default it`s set to the Dagster Run ID e.g "\n " 7e4df022-1bf2-44b5-a383-bb852df4077e."\n ),\n ),\n "run_tags": Field(\n [String],\n is_required=False,\n description=(\n "A list of strings, which will populate the list of tags on this run in the UI."\n " Tags are useful for organizing runs together, or applying temporary labels like"\n " 'baseline' or 'production'. It's easy to add and remove tags in the UI, or filter"\n " down to just runs with a specific tag. Any W&B Run used by the integration will"\n " have the dagster_wandb tag."\n ),\n ),\n "base_dir": Field(\n String,\n is_required=False,\n description=(\n "Base directory used for local storage and caching. W&B Artifacts and W&B Run logs"\n " will be written and read from that directory. 
By default, it`s using the"\n " DAGSTER_HOME directory."\n ),\n ),\n "cache_duration_in_minutes": Field(\n Int,\n is_required=False,\n description=(\n "Defines the amount of time W&B Artifacts and W&B Run logs should be kept in the"\n " local storage. Only files and directories that were not opened for that amount of"\n " time are removed from the cache. Cache purging happens at the end of an IO"\n " Manager execution. You can set it to 0, if you want to disable caching"\n " completely. Caching improves speed when an Artifact is reused between jobs"\n " running on the same machine. It defaults to 30 days."\n ),\n ),\n },\n)\ndef wandb_artifacts_io_manager(context: InitResourceContext):\n """Dagster IO Manager to create and consume W&B Artifacts.\n\n It allows any Dagster @op or @asset to create and consume W&B Artifacts natively.\n\n For a complete set of documentation, see `Dagster integration <https://docs.wandb.ai/guides/integrations/dagster>`_.\n\n **Example:**\n\n .. code-block:: python\n\n @repository\n def my_repository():\n return [\n *with_resources(\n load_assets_from_current_module(),\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n "wandb_artifacts_manager": wandb_artifacts_io_manager.configured(\n {"cache_duration_in_minutes": 60} # only cache files for one hour\n ),\n },\n resource_config_by_key={\n "wandb_config": {\n "config": {\n "entity": "my_entity",\n "project": "my_project"\n }\n }\n },\n ),\n ]\n\n\n @asset(\n name="my_artifact",\n metadata={\n "wandb_artifact_configuration": {\n "type": "dataset",\n }\n },\n io_manager_key="wandb_artifacts_manager",\n )\n def create_dataset():\n return [1, 2, 3]\n\n """\n wandb_client = context.resources.wandb_resource["sdk"]\n wandb_host = context.resources.wandb_resource["host"]\n wandb_entity = context.resources.wandb_config["entity"]\n wandb_project = 
context.resources.wandb_config["project"]\n\n wandb_run_name = None\n wandb_run_id = None\n wandb_run_tags = None\n base_dir = (\n context.instance.storage_directory() if context.instance else os.environ["DAGSTER_HOME"]\n )\n cache_duration_in_minutes = None\n if context.resource_config is not None:\n wandb_run_name = context.resource_config.get("run_name")\n wandb_run_id = context.resource_config.get("run_id")\n wandb_run_tags = context.resource_config.get("run_tags")\n base_dir = context.resource_config.get("base_dir", base_dir)\n cache_duration_in_minutes = context.resource_config.get("cache_duration_in_minutes")\n\n if "PYTEST_CURRENT_TEST" in os.environ:\n dagster_run_id = "unit-testing"\n else:\n dagster_run_id = context.run_id\n\n assert dagster_run_id is not None\n\n config: Config = {\n "dagster_run_id": dagster_run_id,\n "wandb_host": wandb_host,\n "wandb_entity": wandb_entity,\n "wandb_project": wandb_project,\n "wandb_run_name": wandb_run_name,\n "wandb_run_id": wandb_run_id,\n "wandb_run_tags": wandb_run_tags,\n "base_dir": base_dir,\n "cache_duration_in_minutes": cache_duration_in_minutes,\n }\n return ArtifactsIOManager(wandb_client, config)
\n
", "current_page_name": "_modules/dagster_wandb/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.io_manager"}, "launch": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.launch.ops

\nfrom dagster import OpExecutionContext, op\nfrom wandb.sdk.launch import launch\nfrom wandb.sdk.launch.launch_add import launch_add\n\nfrom .configs import launch_agent_config, launch_config\n\n\ndef raise_on_invalid_config(context: OpExecutionContext):\n    entity = context.resources.wandb_config["entity"]\n    if entity == "":\n        raise RuntimeError(\n            "(dagster_wandb) An empty string was provided for the 'entity' property of the"\n            " 'wandb_config'."\n        )\n\n    project = context.resources.wandb_config["project"]\n    if project == "":\n        raise RuntimeError(\n            "(dagster_wandb) An empty string was provided for the 'project' property of the"\n            " 'wandb_config'."\n        )\n\n\n
[docs]@op(\n required_resource_keys={"wandb_resource", "wandb_config"},\n config_schema=launch_agent_config(),\n)\ndef run_launch_agent(context: OpExecutionContext):\n """It starts a Launch Agent and runs it as a long running process until stopped manually.\n\n Agents are processes that poll launch queues and execute the jobs (or dispatch them to external\n services to be executed) in order.\n\n **Example:**\n\n .. code-block:: YAML\n\n # config.yaml\n\n resources:\n wandb_config:\n config:\n entity: my_entity\n project: my_project\n ops:\n run_launch_agent:\n config:\n max_jobs: -1\n queues:\n - my_dagster_queue\n\n .. code-block:: python\n\n from dagster_wandb.launch.ops import run_launch_agent\n from dagster_wandb.resources import wandb_resource\n\n from dagster import job, make_values_resource\n\n\n @job(\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n },\n )\n def run_launch_agent_example():\n run_launch_agent()\n\n """\n raise_on_invalid_config(context)\n config = {\n "entity": context.resources.wandb_config["entity"],\n "project": context.resources.wandb_config["project"],\n **context.op_config,\n }\n context.log.info(f"Launch agent configuration: {config}")\n context.log.info("Running Launch agent...")\n launch.create_and_run_agent(api=context.resources.wandb_resource["api"], config=config)
\n\n\n
[docs]@op(\n required_resource_keys={\n "wandb_resource",\n "wandb_config",\n },\n config_schema=launch_config(),\n)\ndef run_launch_job(context: OpExecutionContext):\n """Executes a Launch job.\n\n A Launch job is assigned to a queue in order to be executed. You can create a queue or use the\n default one. Make sure you have an active agent listening to that queue. You can run an agent\n inside your Dagster instance but can also consider using a deployable agent in Kubernetes.\n\n **Example:**\n\n .. code-block:: YAML\n\n # config.yaml\n\n resources:\n wandb_config:\n config:\n entity: my_entity\n project: my_project\n ops:\n my_launched_job:\n config:\n entry_point:\n - python\n - train.py\n queue: my_dagster_queue\n uri: https://github.com/wandb/example-dagster-integration-with-launch\n\n .. code-block:: python\n\n from dagster_wandb.launch.ops import run_launch_job\n from dagster_wandb.resources import wandb_resource\n\n from dagster import job, make_values_resource\n\n\n @job(\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n },\n )\n def run_launch_job_example():\n run_launch_job.alias("my_launched_job")() # we rename the job with an alias\n\n """\n raise_on_invalid_config(context)\n config = {\n "entity": context.resources.wandb_config["entity"],\n "project": context.resources.wandb_config["project"],\n **context.op_config,\n }\n context.log.info(f"Launch job configuration: {config}")\n\n queue = context.op_config.get("queue")\n if queue is None:\n context.log.info("No queue provided, running Launch job locally")\n launch.run(api=context.resources.wandb_resource["api"], config=config)\n else:\n synchronous = config.get("synchronous", True)\n config.pop("synchronous", None)\n queued_run = launch_add(**config)\n if synchronous is True:\n context.log.info(\n f"Synchronous Launch job added to queue with name={queue}. 
Waiting for"\n " completion..."\n )\n queued_run.wait_until_finished()\n else:\n context.log.info(f"Asynchronous Launch job added to queue with name={queue}")
\n
", "current_page_name": "_modules/dagster_wandb/launch/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.launch.ops"}}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.resources

\nfrom typing import Any, Dict\n\nimport wandb\nfrom dagster import Field, InitResourceContext, String, StringSource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom wandb.sdk.internal.internal_api import Api\n\nWANDB_CLOUD_HOST: str = "https://api.wandb.ai"\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n description="W&B API key necessary to communicate with the W&B API.",\n is_required=True,\n ),\n "host": Field(\n String,\n description=(\n "API host server you wish to use. Only required if you are using W&B Server."\n ),\n is_required=False,\n default_value=WANDB_CLOUD_HOST,\n ),\n },\n description="Resource for interacting with Weights & Biases",\n)\ndef wandb_resource(context: InitResourceContext) -> Dict[str, Any]:\n """Dagster resource used to communicate with the W&B API. It's useful when you want to use the\n wandb client within your ops and assets. It's a required resources if you are using the W&B IO\n Manager.\n\n It automatically authenticates using the provided API key.\n\n For a complete set of documentation, see `Dagster integration <https://docs.wandb.ai/guides/integrations/dagster>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Example:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_wandb import wandb_resource\n\n my_wandb_resource = wandb_resource.configured({"api_key": {"env": "WANDB_API_KEY"}})\n\n @job(resource_defs={"wandb_resource": my_wandb_resource})\n def my_wandb_job():\n ...\n\n """\n api_key = context.resource_config["api_key"]\n host = context.resource_config["host"]\n wandb.login(\n key=api_key,\n host=host,\n anonymous="never",\n )\n client_settings = wandb.Settings(\n api_key=api_key,\n base_url=host,\n anonymous="never",\n launch=True,\n )\n api = Api(default_settings=client_settings, load_settings=False)\n return {"sdk": wandb, "api": api, "host": host}
\n
", "current_page_name": "_modules/dagster_wandb/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.types

\nimport sys\n\nif sys.version_info >= (3, 8):\n    from typing import TypedDict\nelse:\n    from typing_extensions import TypedDict\n\nfrom typing import Any, Dict, List\n\n\n
[docs]class SerializationModule(TypedDict, total=False):\n """W&B Artifacts IO Manager configuration of the serialization module. Useful for type checking."""\n\n name: str\n parameters: Dict[str, Any]
\n\n\n
[docs]class WandbArtifactConfiguration(TypedDict, total=False):\n """W&B Artifacts IO Manager configuration. Useful for type checking."""\n\n name: str\n type: str\n description: str\n aliases: List[str]\n add_dirs: List[Dict[str, Any]]\n add_files: List[Dict[str, Any]]\n add_references: List[Dict[str, Any]]\n serialization_module: SerializationModule\n partitions: Dict[str, Dict[str, Any]]
\n
", "current_page_name": "_modules/dagster_wandb/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.types"}, "utils": {"errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.utils.errors

\n
[docs]class WandbArtifactsIOManagerError(Exception):\n """Represents an execution error of the W&B Artifacts IO Manager."""\n\n def __init__(self, message="A W&B Artifacts IO Manager error occurred."):\n self.message = message\n super().__init__(self.message)
\n\n\nSUPPORTED_READ_CONFIG_KEYS = [\n "alias",\n "get_path",\n "get",\n "name",\n "partitions",\n "version",\n]\nSUPPORTED_WRITE_CONFIG_KEYS = [\n "add_dirs",\n "add_files",\n "add_references",\n "aliases",\n "description",\n "name",\n "partitions",\n "serialization_module",\n "type",\n]\nSUPPORTED_PARTITION_CONFIG_KEYS = ["get", "get_path", "version", "alias"]\n\n\ndef raise_on_empty_configuration(partition_key, dictionary):\n if dictionary is not None and len(dictionary) == 0:\n raise WandbArtifactsIOManagerError(\n f"The configuration is empty for the partition identified by the key '{partition_key}'."\n " This happened within the 'wandb_artifact_configuration' metadata dictionary."\n )\n\n\ndef raise_on_unknown_keys(supported_config_keys, dictionary, is_read_config):\n if dictionary is None:\n return\n\n unsupported_keys = [key for key in dictionary.keys() if key not in supported_config_keys]\n if len(unsupported_keys) > 0:\n if is_read_config:\n raise WandbArtifactsIOManagerError(\n f"The configuration keys '{unsupported_keys}' you are trying to use are not"\n " supported within the 'wandb_artifact_configuration' metadata dictionary when"\n " reading an Artifact."\n )\n else:\n raise WandbArtifactsIOManagerError(\n f"The configuration keys '{unsupported_keys}' you are trying to use are not"\n " supported within the 'wandb_artifact_configuration' metadata dictionary when"\n " writing an Artifact."\n )\n\n\ndef raise_on_unknown_write_configuration_keys(dictionary):\n raise_on_unknown_keys(SUPPORTED_WRITE_CONFIG_KEYS, dictionary, False)\n\n\ndef raise_on_unknown_read_configuration_keys(dictionary):\n raise_on_unknown_keys(SUPPORTED_READ_CONFIG_KEYS, dictionary, True)\n\n\ndef raise_on_unknown_partition_keys(partition_key, dictionary):\n if dictionary is None:\n return\n\n unsupported_keys = [\n key for key in dictionary.keys() if key not in SUPPORTED_PARTITION_CONFIG_KEYS\n ]\n if len(unsupported_keys) > 0:\n raise WandbArtifactsIOManagerError(\n f"The 
configuration keys '{unsupported_keys}' you are trying to use are not supported"\n f" for the partition identified by the key '{partition_key}'. This happened within the"\n " 'wandb_artifact_configuration' metadata dictionary."\n )\n
", "current_page_name": "_modules/dagster_wandb/utils/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.utils.errors"}}}, "dagstermill": {"asset_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.asset_factory

\nimport pickle\nimport tempfile\nfrom typing import Any, Callable, Iterable, Mapping, Optional, Set, Type, Union, cast\n\nimport dagster._check as check\nfrom dagster import (\n    AssetIn,\n    AssetKey,\n    AssetsDefinition,\n    Failure,\n    Output,\n    PartitionsDefinition,\n    ResourceDefinition,\n    RetryPolicy,\n    RetryRequested,\n    SourceAsset,\n    asset,\n)\nfrom dagster._config.pythonic_config import Config, infer_schema_from_config_class\nfrom dagster._config.pythonic_config.type_check_utils import safe_is_subclass\nfrom dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.execution.context.compute import OpExecutionContext\n\nfrom dagstermill.factory import _clean_path_for_windows, execute_notebook\n\n\ndef _make_dagstermill_asset_compute_fn(\n    name: str,\n    notebook_path: str,\n    save_notebook_on_failure: bool,\n) -> Callable:\n    def _t_fn(context: OpExecutionContext, **inputs) -> Iterable:\n        check.param_invariant(\n            isinstance(context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            executed_notebook_path = execute_notebook(\n                context.get_step_execution_context(),\n                name=name,\n                inputs=inputs,\n                save_notebook_on_failure=save_notebook_on_failure,\n                notebook_path=notebook_path,\n                output_notebook_dir=output_notebook_dir,\n            )\n\n            with open(executed_notebook_path, "rb") as fd:\n                yield Output(fd.read())\n\n            # deferred import for perf\n            import scrapbook\n\n            output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n            for key, value in output_nb.scraps.items():\n                if 
key.startswith("event-"):\n                    with open(value.data, "rb") as fd:\n                        event = pickle.loads(fd.read())\n                        if isinstance(event, (Failure, RetryRequested)):\n                            raise event\n                        else:\n                            yield event\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_asset(\n name: str,\n notebook_path: str,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n save_notebook_on_failure: bool = False,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n) -> AssetsDefinition:\n """Creates a Dagster asset for a Jupyter notebook.\n\n Arguments:\n name (str): The name for the asset\n notebook_path (str): Path to the backing notebook\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in dagster (ie only\n contains letters, numbers, and _) and may not contain python reserved keywords.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]): The assets\n that are upstream dependencies, but do not pass an input value to the notebook.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. 
If not set, Dagster will accept any config provided for the op.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the notebook.\n description (Optional[str]): Description of the asset to display in the Dagster UI.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If not provided,\n the name "default" is used.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]):\n (Experimental) A mapping of resource keys to resource definitions. These resources\n will be initialized during execution, and can be accessed from the\n context within the notebook.\n io_manager_key (Optional[str]): A string key for the IO manager used to store the output notebook.\n If not provided, the default key output_notebook_io_manager will be used.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n save_notebook_on_failure (bool): If True and the notebook fails during execution, the failed notebook will be\n written to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\n Defaults to False.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead. Set of asset keys that are\n upstream dependencies, but do not pass an input to the asset.\n\n Examples:\n .. 
code-block:: python\n\n from dagstermill import define_dagstermill_asset\n from dagster import asset, AssetIn, AssetKey\n from sklearn import datasets\n import pandas as pd\n import numpy as np\n\n @asset\n def iris_dataset():\n sk_iris = datasets.load_iris()\n return pd.DataFrame(\n data=np.c_[sk_iris["data"], sk_iris["target"]],\n columns=sk_iris["feature_names"] + ["target"],\n )\n\n iris_kmeans_notebook = define_dagstermill_asset(\n name="iris_kmeans_notebook",\n notebook_path="/path/to/iris_kmeans.ipynb",\n ins={\n "iris": AssetIn(key=AssetKey("iris_dataset"))\n }\n )\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.bool_param(save_notebook_on_failure, "save_notebook_on_failure")\n\n required_resource_keys = set(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n ins = check.opt_mapping_param(ins, "ins", key_type=str, value_type=AssetIn)\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n key_prefix = check.opt_list_param(key_prefix, "key_prefix", of_type=str)\n\n default_description = f"This asset is backed by the notebook at {notebook_path}"\n description = check.opt_str_param(description, "description", default=default_description)\n\n io_mgr_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="output_notebook_io_manager"\n )\n\n user_tags = validate_tags(op_tags)\n if op_tags is not None:\n check.invariant(\n "notebook_path" not in op_tags,\n "user-defined op tags contains the `notebook_path` key, but the `notebook_path` key"\n " is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in op_tags,\n "user-defined op tags contains the `kind` key, but the `kind` key is reserved for"\n " use by Dagster",\n )\n\n default_tags = {"notebook_path": _clean_path_for_windows(notebook_path), "kind": "ipynb"}\n\n if safe_is_subclass(config_schema, Config):\n config_schema = infer_schema_from_config_class(cast(Type[Config], 
config_schema))\n\n return asset(\n name=name,\n key_prefix=key_prefix,\n ins=ins,\n deps=deps,\n metadata=metadata,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n partitions_def=partitions_def,\n op_tags={**user_tags, **default_tags},\n group_name=group_name,\n output_required=False,\n io_manager_key=io_mgr_key,\n retry_policy=retry_policy,\n non_argument_deps=non_argument_deps,\n )(\n _make_dagstermill_asset_compute_fn(\n name=name,\n notebook_path=notebook_path,\n save_notebook_on_failure=save_notebook_on_failure,\n )\n )
\n
", "current_page_name": "_modules/dagstermill/asset_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.asset_factory"}, "context": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.context

\nfrom typing import AbstractSet, Any, Mapping, Optional, cast\n\nfrom dagster import (\n    DagsterRun,\n    JobDefinition,\n    OpDefinition,\n    _check as check,\n)\nfrom dagster._annotations import public\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.execution.context.compute import AbstractComputeExecutionContext\nfrom dagster._core.execution.context.system import PlanExecutionContext, StepExecutionContext\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]class DagstermillExecutionContext(AbstractComputeExecutionContext):\n """Dagstermill-specific execution context.\n\n Do not initialize directly: use :func:`dagstermill.get_context`.\n """\n\n def __init__(\n self,\n job_context: PlanExecutionContext,\n job_def: JobDefinition,\n resource_keys_to_init: AbstractSet[str],\n op_name: str,\n node_handle: NodeHandle,\n op_config: Any = None,\n ):\n self._job_context = check.inst_param(job_context, "job_context", PlanExecutionContext)\n self._job_def = check.inst_param(job_def, "job_def", JobDefinition)\n self._resource_keys_to_init = check.set_param(\n resource_keys_to_init, "resource_keys_to_init", of_type=str\n )\n self.op_name = check.str_param(op_name, "op_name")\n self.node_handle = check.inst_param(node_handle, "node_handle", NodeHandle)\n self._op_config = op_config\n\n def has_tag(self, key: str) -> bool:\n """Check if a logging tag is defined on the context.\n\n Args:\n key (str): The key to check.\n\n Returns:\n bool\n """\n check.str_param(key, "key")\n return self._job_context.has_tag(key)\n\n def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag defined on the context.\n\n Args:\n key (str): The key to get.\n\n Returns:\n str\n """\n check.str_param(key, "key")\n return self._job_context.get_tag(key)\n\n @public\n @property\n def run_id(self) -> str:\n """str: The run_id for the context."""\n return self._job_context.run_id\n\n @public\n @property\n def run_config(self) -> Mapping[str, Any]:\n """dict: The run_config for the context."""\n return self._job_context.run_config\n\n @property\n def resolved_run_config(self) -> ResolvedRunConfig:\n """:class:`dagster.ResolvedRunConfig`: The resolved_run_config for the context."""\n return self._job_context.resolved_run_config\n\n @public\n @property\n def logging_tags(self) -> Mapping[str, str]:\n """dict: The logging tags for the context."""\n return self._job_context.logging_tags\n\n @public\n @property\n def job_name(self) -> str:\n """str: 
The name of the executing job."""\n return self._job_context.job_name\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """:class:`dagster.JobDefinition`: The job definition for the context.\n\n This will be a dagstermill-specific shim.\n """\n return self._job_def\n\n @property\n def resources(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n resources.\n """\n return self._job_context.scoped_resources_builder.build(\n required_resource_keys=self._resource_keys_to_init,\n )\n\n @public\n @property\n def run(self) -> DagsterRun:\n """:class:`dagster.DagsterRun`: The job run for the context."""\n return cast(DagsterRun, self._job_context.dagster_run)\n\n @property\n def log(self) -> DagsterLogManager:\n """:class:`dagster.DagsterLogManager`: The log manager for the context.\n\n Call, e.g., ``log.info()`` to log messages through the Dagster machinery.\n """\n return self._job_context.log\n\n @public\n @property\n def op_def(self) -> OpDefinition:\n """:class:`dagster.OpDefinition`: The op definition for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether an\n op definition was passed to ``dagstermill.get_context``.\n """\n return cast(OpDefinition, self._job_def.node_def_named(self.op_name))\n\n @property\n def node(self) -> Node:\n """:class:`dagster.Node`: The node for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether an\n op definition was passed to ``dagstermill.get_context``.\n """\n return self.job_def.get_node(self.node_handle)\n\n @public\n @property\n def op_config(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n op-specific config.\n """\n if self._op_config:\n return self._op_config\n\n op_config = self.resolved_run_config.ops.get(self.op_name)\n return op_config.config if op_config else None
\n\n\nclass DagstermillRuntimeExecutionContext(DagstermillExecutionContext):\n def __init__(\n self,\n job_context: PlanExecutionContext,\n job_def: JobDefinition,\n resource_keys_to_init: AbstractSet[str],\n op_name: str,\n step_context: StepExecutionContext,\n node_handle: NodeHandle,\n op_config: Any = None,\n ):\n self._step_context = check.inst_param(step_context, "step_context", StepExecutionContext)\n super().__init__(\n job_context,\n job_def,\n resource_keys_to_init,\n op_name,\n node_handle,\n op_config,\n )\n\n @property\n def step_context(self) -> StepExecutionContext:\n return self._step_context\n
", "current_page_name": "_modules/dagstermill/context", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.context"}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.errors

\nfrom dagster._core.errors import DagsterError\n\n\n
[docs]class DagstermillError(DagsterError):\n """Base class for errors raised by dagstermill."""
\n
", "current_page_name": "_modules/dagstermill/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.errors"}, "factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.factory

\nimport copy\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport uuid\nfrom typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Set, Type, Union, cast\n\nimport nbformat\nimport papermill\nfrom dagster import (\n    In,\n    OpDefinition,\n    Out,\n    Output,\n    _check as check,\n    _seven,\n)\nfrom dagster._config.pythonic_config import Config, infer_schema_from_config_class\nfrom dagster._config.pythonic_config.type_check_utils import safe_is_subclass\nfrom dagster._core.definitions.events import AssetMaterialization, Failure, RetryRequested\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dagster._core.execution.context.input import build_input_context\nfrom dagster._core.execution.context.system import StepExecutionContext\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._serdes import pack_value\nfrom dagster._seven import get_system_temp_directory\nfrom dagster._utils import mkdir_p, safe_tempfile_path\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom papermill.engines import papermill_engines\nfrom papermill.iorw import load_notebook_node, write_ipynb\n\nfrom .compat import ExecutionError\nfrom .engine import DagstermillEngine\nfrom .errors import DagstermillError\nfrom .translator import DagsterTranslator\n\n\ndef _clean_path_for_windows(notebook_path: str) -> str:\n    """In windows, the notebook can't render in the Dagster UI unless the C: prefix is removed.\n    os.path.splitdrive will split the path into (drive, tail), so just return the tail.\n    """\n    return os.path.splitdrive(notebook_path)[1]\n\n\n# https://github.com/nteract/papermill/blob/17d4bbb3960c30c263bca835e48baf34322a3530/papermill/parameterize.py\ndef 
_find_first_tagged_cell_index(nb, tag):\n    parameters_indices = []\n    for idx, cell in enumerate(nb.cells):\n        if tag in cell.metadata.tags:\n            parameters_indices.append(idx)\n    if not parameters_indices:\n        return -1\n    return parameters_indices[0]\n\n\n# This is based on papermill.parameterize.parameterize_notebook\n# Typically, papermill injects the injected-parameters cell *below* the parameters cell\n# but we want to *replace* the parameters cell, which is what this function does.\ndef replace_parameters(context, nb, parameters):\n    """Assigned parameters into the appropriate place in the input notebook.\n\n    Args:\n        nb (NotebookNode): Executable notebook object\n        parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.\n    """\n    check.dict_param(parameters, "parameters")\n\n    # Copy the nb object to avoid polluting the input\n    nb = copy.deepcopy(nb)\n\n    # papermill method chooses translator based on kernel_name and language, but we just call the\n    # DagsterTranslator to generate parameter content based on the kernel_name\n    param_content = DagsterTranslator.codify(parameters)\n\n    newcell = nbformat.v4.new_code_cell(source=param_content)\n    newcell.metadata["tags"] = ["injected-parameters"]\n\n    param_cell_index = _find_first_tagged_cell_index(nb, "parameters")\n    injected_cell_index = _find_first_tagged_cell_index(nb, "injected-parameters")\n    if injected_cell_index >= 0:\n        # Replace the injected cell with a new version\n        before = nb.cells[:injected_cell_index]\n        after = nb.cells[injected_cell_index + 1 :]\n        check.int_value_param(param_cell_index, -1, "param_cell_index")\n        # We should have blown away the parameters cell if there is an injected-parameters cell\n    elif param_cell_index >= 0:\n        # Replace the parameter cell with the injected-parameters cell\n        before = nb.cells[:param_cell_index]\n        after = 
nb.cells[param_cell_index + 1 :]\n    else:\n        # Inject to the top of the notebook, presumably first cell includes dagstermill import\n        context.log.debug(\n            "Executing notebook with no tagged parameters cell: injecting boilerplate in first "\n            "cell."\n        )\n        before = []\n        after = nb.cells\n\n    nb.cells = before + [newcell] + after\n    nb.metadata.papermill["parameters"] = _seven.json.dumps(parameters)\n\n    return nb\n\n\ndef get_papermill_parameters(\n    step_context: StepExecutionContext,\n    inputs: Mapping[str, object],\n    output_log_path: str,\n    compute_descriptor: str,\n) -> Mapping[str, object]:\n    check.param_invariant(\n        isinstance(step_context.run_config, dict),\n        "step_context",\n        "StepExecutionContext must have valid run_config",\n    )\n\n    run_id = step_context.run_id\n    temp_dir = get_system_temp_directory()\n    marshal_dir = os.path.normpath(os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))\n    mkdir_p(marshal_dir)\n\n    if not isinstance(step_context.job, ReconstructableJob):\n        if compute_descriptor == "asset":\n            raise DagstermillError(\n                "Can't execute a dagstermill asset that is not reconstructable. "\n                "Use the reconstructable() function if executing from python"\n            )\n        else:\n            raise DagstermillError(\n                "Can't execute a dagstermill op from a job that is not reconstructable. 
"\n                "Use the reconstructable() function if executing from python"\n            )\n\n    dm_executable_dict = step_context.job.to_dict()\n\n    dm_context_dict = {\n        "output_log_path": output_log_path,\n        "marshal_dir": marshal_dir,\n        "run_config": step_context.run_config,\n    }\n\n    dm_node_handle_kwargs = step_context.node_handle._asdict()\n    dm_step_key = step_context.step.key\n\n    parameters = {}\n\n    parameters["__dm_context"] = dm_context_dict\n    parameters["__dm_executable_dict"] = dm_executable_dict\n    parameters["__dm_pipeline_run_dict"] = pack_value(step_context.dagster_run)\n    parameters["__dm_node_handle_kwargs"] = dm_node_handle_kwargs\n    parameters["__dm_instance_ref_dict"] = pack_value(step_context.instance.get_ref())\n    parameters["__dm_step_key"] = dm_step_key\n    parameters["__dm_input_names"] = list(inputs.keys())\n\n    return parameters\n\n\ndef execute_notebook(\n    step_context: StepExecutionContext,\n    name: str,\n    save_notebook_on_failure: bool,\n    notebook_path: str,\n    output_notebook_dir: str,\n    inputs: Mapping[str, object],\n) -> str:\n    with safe_tempfile_path() as output_log_path:\n        prefix = str(uuid.uuid4())\n        parameterized_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-inter.ipynb")\n\n        executed_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-out.ipynb")\n\n        # Scaffold the registration here\n        nb = load_notebook_node(notebook_path)\n        compute_descriptor = "op"\n        nb_no_parameters = replace_parameters(\n            step_context,\n            nb,\n            get_papermill_parameters(\n                step_context,\n                inputs,\n                output_log_path,\n                compute_descriptor,\n            ),\n        )\n        write_ipynb(nb_no_parameters, parameterized_notebook_path)\n\n        try:\n            papermill_engines.register("dagstermill", DagstermillEngine)\n    
        papermill.execute_notebook(\n                input_path=parameterized_notebook_path,\n                output_path=executed_notebook_path,\n                engine_name="dagstermill",\n                log_output=True,\n            )\n\n        except Exception as ex:\n            step_context.log.warn(\n                "Error when attempting to materialize executed notebook: {exc}".format(\n                    exc=str(serializable_error_info_from_exc_info(sys.exc_info()))\n                )\n            )\n\n            if isinstance(ex, ExecutionError):\n                exception_name = ex.ename  # type: ignore\n                if exception_name in ["RetryRequested", "Failure"]:\n                    step_context.log.warn(\n                        f"Encountered raised {exception_name} in notebook. Use"\n                        " dagstermill.yield_event with RetryRequested or Failure to trigger"\n                        " their behavior."\n                    )\n\n            if save_notebook_on_failure:\n                storage_dir = step_context.instance.storage_directory()\n                storage_path = os.path.join(storage_dir, f"{prefix}-out.ipynb")\n                with open(storage_path, "wb") as dest_file_obj:\n                    with open(executed_notebook_path, "rb") as obj:\n                        dest_file_obj.write(obj.read())\n\n                step_context.log.info(f"Failed notebook written to {storage_path}")\n\n            raise\n\n    step_context.log.debug(f"Notebook execution complete for {name} at {executed_notebook_path}.")\n\n    return executed_notebook_path\n\n\ndef _handle_events_from_notebook(\n    step_context: StepExecutionContext, executed_notebook_path: str\n) -> Iterable:\n    # deferred import for perf\n    import scrapbook\n\n    output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n    for output_name in step_context.op_def.output_dict.keys():\n        data_dict = output_nb.scraps.data_dict\n        if output_name 
in data_dict:\n            # read outputs that were passed out of process via io manager from `yield_result`\n            step_output_handle = StepOutputHandle(\n                step_key=step_context.step.key,\n                output_name=output_name,\n            )\n            output_context = step_context.get_output_context(step_output_handle)\n            io_manager = step_context.get_io_manager(step_output_handle)\n            value = io_manager.load_input(\n                build_input_context(\n                    upstream_output=output_context, dagster_type=output_context.dagster_type\n                )\n            )\n\n            yield Output(value, output_name)\n\n    for key, value in output_nb.scraps.items():\n        if key.startswith("event-"):\n            with open(value.data, "rb") as fd:\n                event = pickle.loads(fd.read())\n                if isinstance(event, (Failure, RetryRequested)):\n                    raise event\n                else:\n                    yield event\n\n\ndef _make_dagstermill_compute_fn(\n    dagster_factory_name: str,\n    name: str,\n    notebook_path: str,\n    output_notebook_name: Optional[str] = None,\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    output_notebook: Optional[str] = None,\n    save_notebook_on_failure: bool = False,\n) -> Callable:\n    def _t_fn(op_context: OpExecutionContext, inputs: Mapping[str, object]) -> Iterable:\n        check.param_invariant(\n            isinstance(op_context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        step_context = op_context.get_step_execution_context()\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            executed_notebook_path = execute_notebook(\n                step_context,\n                name=name,\n                inputs=inputs,\n                save_notebook_on_failure=save_notebook_on_failure,\n                
notebook_path=notebook_path,\n                output_notebook_dir=output_notebook_dir,\n            )\n\n            if output_notebook_name is not None:\n                # yield output notebook binary stream as an op output\n                with open(executed_notebook_path, "rb") as fd:\n                    yield Output(fd.read(), output_notebook_name)\n\n            else:\n                # backcompat\n                executed_notebook_file_handle = None\n                try:\n                    # use binary mode when when moving the file since certain file_managers such as S3\n                    # may try to hash the contents\n                    with open(executed_notebook_path, "rb") as fd:\n                        executed_notebook_file_handle = op_context.resources.file_manager.write(\n                            fd, mode="wb", ext="ipynb"\n                        )\n                        executed_notebook_materialization_path = (\n                            executed_notebook_file_handle.path_desc\n                        )\n\n                    yield AssetMaterialization(\n                        asset_key=[*(asset_key_prefix or []), f"{name}_output_notebook"],\n                        description="Location of output notebook in file manager",\n                        metadata={\n                            "path": MetadataValue.path(executed_notebook_materialization_path),\n                        },\n                    )\n\n                except Exception:\n                    # if file manager writing errors, e.g. 
file manager is not provided, we throw a warning\n                    # and fall back to the previously stored temp executed notebook.\n                    op_context.log.warning(\n                        "Error when attempting to materialize executed notebook using file"\n                        " manager:"\n                        f" {serializable_error_info_from_exc_info(sys.exc_info())}\\nNow"\n                        " falling back to local: notebook execution was temporarily materialized"\n                        f" at {executed_notebook_path}\\nIf you have supplied a file manager and"\n                        " expect to use it for materializing the notebook, please include"\n                        ' "file_manager" in the `required_resource_keys` argument to'\n                        f" `{dagster_factory_name}`"\n                    )\n\n                if output_notebook is not None:\n                    yield Output(executed_notebook_file_handle, output_notebook)\n\n            yield from _handle_events_from_notebook(step_context, executed_notebook_path)\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_op(\n name: str,\n notebook_path: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n output_notebook_name: Optional[str] = None,\n asset_key_prefix: Optional[Union[Sequence[str], str]] = None,\n description: Optional[str] = None,\n tags: Optional[Mapping[str, Any]] = None,\n io_manager_key: Optional[str] = None,\n save_notebook_on_failure: bool = False,\n) -> OpDefinition:\n """Wrap a Jupyter notebook in a op.\n\n Arguments:\n name (str): The name of the op.\n notebook_path (str): Path to the backing notebook.\n ins (Optional[Mapping[str, In]]): The op's inputs.\n outs (Optional[Mapping[str, Out]]): The op's outputs. Your notebook should\n call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n output_notebook_name: (Optional[str]): If set, will be used as the name of an injected output\n of type of :py:class:`~dagster.BufferedIOBase` that is the file object of the executed\n notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always\n created). 
It allows the downstream ops to access the executed notebook via a file\n object.\n asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n asset keys for materialized notebooks.\n description (Optional[str]): If set, description used for op.\n tags (Optional[Dict[str, str]]): If set, additional tags used to annotate op.\n Dagster uses the tag keys `notebook_path` and `kind`, which cannot be\n overwritten by the user.\n io_manager_key (Optional[str]): If using output_notebook_name, you can additionally provide\n a string key for the IO manager used to store the output notebook.\n If not provided, the default key output_notebook_io_manager will be used.\n save_notebook_on_failure (bool): If True and the notebook fails during execution, the failed notebook will be\n written to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\n Defaults to False.\n\n Returns:\n :py:class:`~dagster.OpDefinition`\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.bool_param(save_notebook_on_failure, "save_notebook_on_failure")\n\n required_resource_keys = set(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n outs = check.opt_mapping_param(outs, "outs", key_type=str, value_type=Out)\n ins = check.opt_mapping_param(ins, "ins", key_type=str, value_type=In)\n\n if output_notebook_name is not None:\n io_mgr_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="output_notebook_io_manager"\n )\n required_resource_keys.add(io_mgr_key)\n outs = {\n **outs,\n cast(str, output_notebook_name): Out(io_manager_key=io_mgr_key),\n }\n\n if isinstance(asset_key_prefix, str):\n asset_key_prefix = [asset_key_prefix]\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n default_description = f"This op is backed by the notebook at {notebook_path}"\n description = 
check.opt_str_param(description, "description", default=default_description)\n\n user_tags = validate_tags(tags)\n if tags is not None:\n check.invariant(\n "notebook_path" not in tags,\n "user-defined op tags contains the `notebook_path` key, but the `notebook_path` key"\n " is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in tags,\n "user-defined op tags contains the `kind` key, but the `kind` key is reserved for"\n " use by Dagster",\n )\n default_tags = {"notebook_path": _clean_path_for_windows(notebook_path), "kind": "ipynb"}\n\n if safe_is_subclass(config_schema, Config):\n config_schema = infer_schema_from_config_class(cast(Type[Config], config_schema))\n\n return OpDefinition(\n name=name,\n compute_fn=_make_dagstermill_compute_fn(\n "define_dagstermill_op",\n name,\n notebook_path,\n output_notebook_name,\n asset_key_prefix=asset_key_prefix,\n save_notebook_on_failure=save_notebook_on_failure,\n ),\n ins=ins,\n outs=outs,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n description=description,\n tags={**user_tags, **default_tags},\n )
\n
", "current_page_name": "_modules/dagstermill/factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.factory"}, "io_managers": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.io_managers

\nimport os\nfrom pathlib import Path\nfrom typing import Any, List, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    ConfigurableIOManagerFactory,\n    InitResourceContext,\n    IOManager,\n)\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager, io_manager\nfrom dagster._utils import mkdir_p\nfrom pydantic import Field\n\nfrom dagstermill.factory import _clean_path_for_windows\n\n\nclass OutputNotebookIOManager(IOManager):\n    def __init__(self, asset_key_prefix: Optional[Sequence[str]] = None):\n        self.asset_key_prefix = asset_key_prefix if asset_key_prefix else []\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        raise NotImplementedError\n\n    def load_input(self, context: InputContext) -> Any:\n        raise NotImplementedError\n\n\nclass LocalOutputNotebookIOManager(OutputNotebookIOManager):\n    def __init__(self, base_dir: str, asset_key_prefix: Optional[Sequence[str]] = None):\n        super(LocalOutputNotebookIOManager, self).__init__(asset_key_prefix=asset_key_prefix)\n        self.base_dir = base_dir\n        self.write_mode = "wb"\n        self.read_mode = "rb"\n\n    def _get_path(self, context: OutputContext) -> str:\n        """Automatically construct filepath."""\n        if context.has_asset_key:\n            keys = context.get_asset_identifier()\n        else:\n            keys = context.get_run_scoped_output_identifier()\n        return str(Path(self.base_dir, *keys).with_suffix(".ipynb"))\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        """obj: bytes."""\n        check.inst_param(context, "context", OutputContext)\n\n        # the output notebook itself is stored at output_file_path\n        
output_notebook_path = self._get_path(context)\n        mkdir_p(os.path.dirname(output_notebook_path))\n        with open(output_notebook_path, self.write_mode) as dest_file_obj:\n            dest_file_obj.write(obj)\n\n        metadata = {\n            "Executed notebook": MetadataValue.notebook(\n                _clean_path_for_windows(output_notebook_path)\n            )\n        }\n\n        if context.has_asset_key:\n            context.add_output_metadata(metadata)\n        else:\n            context.log_event(\n                AssetMaterialization(\n                    asset_key=AssetKey(\n                        [*self.asset_key_prefix, f"{context.step_key}_output_notebook"]\n                    ),\n                    metadata=metadata,\n                )\n            )\n\n    def load_input(self, context: InputContext) -> bytes:\n        check.inst_param(context, "context", InputContext)\n        # pass output notebook to downstream ops as File Object\n        output_context = check.not_none(context.upstream_output)\n        with open(self._get_path(output_context), self.read_mode) as file_obj:\n            return file_obj.read()\n\n\n
[docs]class ConfigurableLocalOutputNotebookIOManager(ConfigurableIOManagerFactory):\n """Built-in IO Manager for handling output notebook."""\n\n base_dir: Optional[str] = Field(\n default=None,\n description=(\n "Base directory to use for output notebooks. Defaults to the Dagster instance storage"\n " directory if not provided."\n ),\n )\n asset_key_prefix: List[str] = Field(\n default=[],\n description=(\n "Asset key prefix to apply to assets materialized for output notebooks. Defaults to no"\n " prefix."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_io_manager(self, context: InitResourceContext) -> "LocalOutputNotebookIOManager":\n return LocalOutputNotebookIOManager(\n base_dir=self.base_dir or check.not_none(context.instance).storage_directory(),\n asset_key_prefix=self.asset_key_prefix,\n )
\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema=ConfigurableLocalOutputNotebookIOManager.to_config_schema())\ndef local_output_notebook_io_manager(init_context) -> LocalOutputNotebookIOManager:\n """Built-in IO Manager that handles output notebooks."""\n return ConfigurableLocalOutputNotebookIOManager.from_resource_context(init_context)\n
", "current_page_name": "_modules/dagstermill/io_managers", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.io_managers"}, "manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.manager

\nimport os\nimport pickle\nimport uuid\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Mapping, Optional, cast\n\nfrom dagster import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    Failure,\n    LoggerDefinition,\n    ResourceDefinition,\n    StepExecutionContext,\n    TypeCheck,\n    _check as check,\n)\nfrom dagster._core.definitions.dependency import NodeHandle\nfrom dagster._core.definitions.events import RetryRequested\nfrom dagster._core.definitions.graph_definition import GraphDefinition\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.api import create_execution_plan, scoped_job_context\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.plan.step import ExecutionStep\nfrom dagster._core.execution.resources_init import (\n    get_required_resource_keys_to_init,\n    resource_initialization_event_generator,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.system_config.objects import ResolvedRunConfig, ResourceConfig\nfrom dagster._core.utils import make_new_run_id\nfrom dagster._loggers import colored_console_logger\nfrom dagster._serdes import unpack_value\nfrom dagster._utils import EventGenerationManager\n\nfrom .context import DagstermillExecutionContext, 
DagstermillRuntimeExecutionContext\nfrom .errors import DagstermillError\nfrom .serialize import PICKLE_PROTOCOL\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.node_definition import NodeDefinition\n\n\nclass DagstermillResourceEventGenerationManager(EventGenerationManager):\n    """Utility class to explicitly manage setup/teardown of resource events. Overrides the default\n    `generate_teardown_events` method so that teardown is deferred until explicitly called by the\n    dagstermill Manager.\n    """\n\n    def generate_teardown_events(self):\n        return iter(())\n\n    def teardown(self):\n        return [\n            teardown_event\n            for teardown_event in super(\n                DagstermillResourceEventGenerationManager, self\n            ).generate_teardown_events()\n        ]\n\n\nclass Manager:\n    def __init__(self):\n        self.job = None\n        self.op_def: Optional[NodeDefinition] = None\n        self.in_job: bool = False\n        self.marshal_dir: Optional[str] = None\n        self.context = None\n        self.resource_manager = None\n\n    def _setup_resources(\n        self,\n        resource_defs: Mapping[str, ResourceDefinition],\n        resource_configs: Mapping[str, ResourceConfig],\n        log_manager: DagsterLogManager,\n        execution_plan: Optional[ExecutionPlan],\n        dagster_run: Optional[DagsterRun],\n        resource_keys_to_init: Optional[AbstractSet[str]],\n        instance: Optional[DagsterInstance],\n        emit_persistent_events: Optional[bool],\n    ):\n        """Drop-in replacement for\n        `dagster._core.execution.resources_init.resource_initialization_manager`.  
It uses a\n        `DagstermillResourceEventGenerationManager` and explicitly calls `teardown` on it.\n        """\n        generator = resource_initialization_event_generator(\n            resource_defs=resource_defs,\n            resource_configs=resource_configs,\n            log_manager=log_manager,\n            execution_plan=execution_plan,\n            dagster_run=dagster_run,\n            resource_keys_to_init=resource_keys_to_init,\n            instance=instance,\n            emit_persistent_events=emit_persistent_events,\n        )\n        self.resource_manager = DagstermillResourceEventGenerationManager(\n            generator, ScopedResourcesBuilder\n        )\n        return self.resource_manager\n\n    def reconstitute_job_context(\n        self,\n        executable_dict: Mapping[str, Any],\n        job_run_dict: Mapping[str, Any],\n        node_handle_kwargs: Mapping[str, Any],\n        instance_ref_dict: Mapping[str, Any],\n        step_key: str,\n        output_log_path: Optional[str] = None,\n        marshal_dir: Optional[str] = None,\n        run_config: Optional[Mapping[str, Any]] = None,\n    ):\n        """Reconstitutes a context for dagstermill-managed execution.\n\n        You'll see this function called to reconstruct a job context within the ``injected\n        parameters`` cell of a dagstermill output notebook. Users should not call this function\n        interactively except when debugging output notebooks.\n\n        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a\n        context for interactive exploration and development. 
This call will be replaced by one to\n        :func:`dagstermill.reconstitute_job_context` when the notebook is executed by\n        dagstermill.\n        """\n        check.opt_str_param(output_log_path, "output_log_path")\n        check.opt_str_param(marshal_dir, "marshal_dir")\n        run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n        check.mapping_param(job_run_dict, "job_run_dict")\n        check.mapping_param(executable_dict, "executable_dict")\n        check.mapping_param(node_handle_kwargs, "node_handle_kwargs")\n        check.mapping_param(instance_ref_dict, "instance_ref_dict")\n        check.str_param(step_key, "step_key")\n\n        job = ReconstructableJob.from_dict(executable_dict)\n        job_def = job.get_definition()\n\n        try:\n            instance_ref = unpack_value(instance_ref_dict, InstanceRef)\n            instance = DagsterInstance.from_ref(instance_ref)\n        except Exception as err:\n            raise DagstermillError(\n                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"\n            ) from err\n\n        dagster_run = unpack_value(job_run_dict, DagsterRun)\n\n        node_handle = NodeHandle.from_dict(node_handle_kwargs)\n        op = job_def.get_node(node_handle)\n        op_def = op.definition\n\n        self.marshal_dir = marshal_dir\n        self.in_job = True\n        self.op_def = op_def\n        self.job = job\n\n        ResolvedRunConfig.build(job_def, run_config)\n\n        execution_plan = create_execution_plan(\n            self.job,\n            run_config,\n            step_keys_to_execute=dagster_run.step_keys_to_execute,\n        )\n\n        with scoped_job_context(\n            execution_plan,\n            job,\n            run_config,\n            dagster_run,\n            instance,\n            scoped_resources_builder_cm=self._setup_resources,\n            # Set this flag even though we're not in test for clearer error reporting\n   
         raise_on_error=True,\n        ) as job_context:\n            known_state = None\n            if dagster_run.parent_run_id:\n                known_state = KnownExecutionState.build_for_reexecution(\n                    instance=instance,\n                    parent_run=check.not_none(instance.get_run_by_id(dagster_run.parent_run_id)),\n                )\n            self.context = DagstermillRuntimeExecutionContext(\n                job_context=job_context,\n                job_def=job_def,\n                op_config=run_config.get("ops", {}).get(op.name, {}).get("config"),\n                resource_keys_to_init=get_required_resource_keys_to_init(\n                    execution_plan,\n                    job_def,\n                ),\n                op_name=op.name,\n                node_handle=node_handle,\n                step_context=cast(\n                    StepExecutionContext,\n                    job_context.for_step(\n                        cast(ExecutionStep, execution_plan.get_step_by_key(step_key)),\n                        known_state=known_state,\n                    ),\n                ),\n            )\n\n        return self.context\n\n    def get_context(\n        self,\n        op_config: Any = None,\n        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        run_config: Optional[dict] = None,\n    ) -> DagstermillExecutionContext:\n        """Get a dagstermill execution context for interactive exploration and development.\n\n        Args:\n            op_config (Optional[Any]): If specified, this value will be made available on the\n                context as its ``op_config`` property.\n            resource_defs (Optional[Mapping[str, ResourceDefinition]]): Specifies resources to provide to context.\n            logger_defs (Optional[Mapping[str, LoggerDefinition]]): Specifies loggers to provide to context.\n            
run_config(Optional[dict]): The config dict with which to construct\n                the context.\n\n        Returns:\n            :py:class:`~dagstermill.DagstermillExecutionContext`\n        """\n        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n\n        # If we are running non-interactively, and there is already a context reconstituted, return\n        # that context rather than overwriting it.\n        if self.context is not None and isinstance(\n            self.context, DagstermillRuntimeExecutionContext\n        ):\n            return self.context\n\n        if not logger_defs:\n            logger_defs = {"dagstermill": colored_console_logger}\n            run_config["loggers"] = {"dagstermill": {}}\n        logger_defs = check.opt_mapping_param(logger_defs, "logger_defs")\n        resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n\n        op_def = OpDefinition(\n            name="this_op",\n            compute_fn=lambda *args, **kwargs: None,\n            description="Ephemeral op constructed by dagstermill.get_context()",\n            required_resource_keys=set(resource_defs.keys()),\n        )\n\n        job_def = JobDefinition(\n            graph_def=GraphDefinition(name="ephemeral_dagstermill_pipeline", node_defs=[op_def]),\n            logger_defs=logger_defs,\n            resource_defs=resource_defs,\n        )\n\n        run_id = make_new_run_id()\n\n        # construct stubbed DagsterRun for notebook exploration...\n        # The actual dagster run during job execution will be serialized and reconstituted\n        # in the `reconstitute_job_context` call\n        dagster_run = DagsterRun(\n            job_name=job_def.name,\n            run_id=run_id,\n            run_config=run_config,\n            step_keys_to_execute=None,\n            status=DagsterRunStatus.NOT_STARTED,\n            tags=None,\n        )\n\n        self.in_job = False\n        self.op_def = op_def\n        self.job = 
job_def\n\n        job = InMemoryJob(job_def)\n        execution_plan = create_execution_plan(job, run_config)\n\n        with scoped_job_context(\n            execution_plan,\n            job,\n            run_config,\n            dagster_run,\n            DagsterInstance.ephemeral(),\n            scoped_resources_builder_cm=self._setup_resources,\n        ) as job_context:\n            self.context = DagstermillExecutionContext(\n                job_context=job_context,\n                job_def=job_def,\n                op_config=op_config,\n                resource_keys_to_init=get_required_resource_keys_to_init(\n                    execution_plan,\n                    job_def,\n                ),\n                op_name=op_def.name,\n                node_handle=NodeHandle(op_def.name, parent=None),\n            )\n\n        return self.context\n\n    def yield_result(self, value, output_name="result"):\n        """Yield a result directly from notebook code.\n\n        When called interactively or in development, returns its input.\n\n        Args:\n            value (Any): The value to yield.\n            output_name (Optional[str]): The name of the result to yield (default: ``'result'``).\n        """\n        if not self.in_job:\n            return value\n\n        # deferred import for perf\n        import scrapbook\n\n        if not self.op_def.has_output(output_name):\n            raise DagstermillError(\n                f"Op {self.op_def.name} does not have output named {output_name}.Expected one of"\n                f" {[str(output_def.name) for output_def in self.op_def.output_defs]}"\n            )\n\n        # pass output value cross process boundary using io manager\n        step_context = self.context._step_context  # noqa: SLF001\n        # Note: yield_result currently does not support DynamicOutput\n\n        # dagstermill assets do not support yielding additional results within the notebook:\n        if 
len(step_context.job_def.asset_layer.asset_keys) > 0:\n            raise DagstermillError(\n                "dagstermill assets do not currently support dagstermill.yield_result"\n            )\n\n        step_output_handle = StepOutputHandle(\n            step_key=step_context.step.key, output_name=output_name\n        )\n        output_context = step_context.get_output_context(step_output_handle)\n        io_manager = step_context.get_io_manager(step_output_handle)\n\n        # Note that we assume io manager is symmetric, i.e handle_input(handle_output(X)) == X\n        io_manager.handle_output(output_context, value)\n\n        # record that the output has been yielded\n        scrapbook.glue(output_name, "")\n\n    def yield_event(self, dagster_event):\n        """Yield a dagster event directly from notebook code.\n\n        When called interactively or in development, returns its input.\n\n        Args:\n            dagster_event (Union[:class:`dagster.AssetMaterialization`, :class:`dagster.ExpectationResult`, :class:`dagster.TypeCheck`, :class:`dagster.Failure`, :class:`dagster.RetryRequested`]):\n                An event to yield back to Dagster.\n        """\n        valid_types = (\n            AssetMaterialization,\n            AssetObservation,\n            ExpectationResult,\n            TypeCheck,\n            Failure,\n            RetryRequested,\n        )\n        if not isinstance(dagster_event, valid_types):\n            raise DagstermillError(\n                f"Received invalid type {dagster_event} in yield_event. 
Expected a Dagster event"\n                f" type, one of {valid_types}."\n            )\n\n        if not self.in_job:\n            return dagster_event\n\n        # deferred import for perf\n        import scrapbook\n\n        event_id = f"event-{uuid.uuid4()}"\n        out_file_path = os.path.join(self.marshal_dir, event_id)\n        with open(out_file_path, "wb") as fd:\n            fd.write(pickle.dumps(dagster_event, PICKLE_PROTOCOL))\n\n        scrapbook.glue(event_id, out_file_path)\n\n    def teardown_resources(self):\n        if self.resource_manager is not None:\n            self.resource_manager.teardown()\n\n    def load_input_parameter(self, input_name: str):\n        # load input from source\n        dm_context = check.not_none(self.context)\n        if not isinstance(dm_context, DagstermillRuntimeExecutionContext):\n            check.failed("Expected DagstermillRuntimeExecutionContext")\n        step_context = dm_context.step_context\n        step_input = step_context.step.step_input_named(input_name)\n        input_def = step_context.op_def.input_def_named(input_name)\n        for event_or_input_value in step_input.source.load_input_object(step_context, input_def):\n            if isinstance(event_or_input_value, DagsterEvent):\n                continue\n            else:\n                return event_or_input_value\n\n\nMANAGER_FOR_NOTEBOOK_INSTANCE = Manager()\n
", "current_page_name": "_modules/dagstermill/manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.manager"}}} \ No newline at end of file diff --git a/docs/content/integrations/embedded-elt.mdx b/docs/content/integrations/embedded-elt.mdx index d5aff395a480c..74ad4cef0181d 100644 --- a/docs/content/integrations/embedded-elt.mdx +++ b/docs/content/integrations/embedded-elt.mdx @@ -50,18 +50,19 @@ sling = SlingResource(source_connection=source, ...) Note that no path is required in the source connection, as that is provided by the asset itself. -````python +```python asset_def = build_sling_asset( asset_spec=AssetSpec("my_file"), source_stream=f"file://{path_to_file}", ... ) +``` For database connections, you can provide a connection string or a dictionary of keyword arguments. For example, to connect to a SQLite database, you can provide a path to the database using the `instance` keyword, which is specified in [Sling's SQLite connection](https://docs.slingdata.io/connections/database-connections/sqlite) documentation. 
```python source = SlingSourceConnection(type="sqlite", instance="path/to/sqlite.db") -```` +``` --- @@ -107,29 +108,64 @@ sling_job = build_assets_job( This is an example of how to setup a Sling sync between Postgres and Snowflake: -```python +```python file=/integrations/embedded_elt/postgres_snowflake.py import os -from dagster_embedded_elt.sling import SlingResource, SlingSourceConnection, SlingTargetConnection + +from dagster_embedded_elt.sling import ( + SlingMode, + SlingResource, + SlingSourceConnection, + SlingTargetConnection, + build_sling_asset, +) + +from dagster import AssetSpec source = SlingSourceConnection( - type="postgres", host="localhost", port=5432, database="my_database", - user="my_user", password=os.getenv("PG_PASS") + type="postgres", + host="localhost", + port=5432, + database="my_database", + user="my_user", + password=os.getenv("PG_PASS"), ) target = SlingTargetConnection( - type="snowflake", host="hostname.snowflake", user="username", - database="database", password=os.getenv("SF_PASSWORD"), role="role" + type="snowflake", + host="hostname.snowflake", + user="username", + database="database", + password=os.getenv("SF_PASSWORD"), + role="role", ) sling = SlingResource(source_connection=source, target_connection=target) + +asset_def = build_sling_asset( + asset_spec=AssetSpec("my_asset_name"), + source_stream="public.my_table", + target_object="marts.my_table", + mode=SlingMode.INCREMENTAL, + primary_key="id", +) ``` Similarily, you can define file/storage connections: -```python +```python startafter=start_storage_config endbefore=end_storage_config file=/integrations/embedded_elt/s3_snowflake.py source = SlingSourceConnection( - type="s3", bucket="sling-bucket", + type="s3", + bucket="sling-bucket", access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY") + secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), +) + +sling = SlingResource(source_connection=source, target_connection=target) + 
+asset_def = build_sling_asset( + asset_spec=AssetSpec("my_asset_name"), + source_stream="s3://my-bucket/my_file.parquet", + target_object="marts.my_table", + primary_key="id", ) ``` diff --git a/examples/docs_snippets/docs_snippets/integrations/embedded_elt/__init__.py b/examples/docs_snippets/docs_snippets/integrations/embedded_elt/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/examples/docs_snippets/docs_snippets/integrations/embedded_elt/postgres_snowflake.py b/examples/docs_snippets/docs_snippets/integrations/embedded_elt/postgres_snowflake.py new file mode 100644 index 0000000000000..4f0e756b5ab07 --- /dev/null +++ b/examples/docs_snippets/docs_snippets/integrations/embedded_elt/postgres_snowflake.py @@ -0,0 +1,42 @@ +# pyright: reportGeneralTypeIssues=none +# pyright: reportOptionalMemberAccess=none + +import os + +from dagster_embedded_elt.sling import ( + SlingMode, + SlingResource, + SlingSourceConnection, + SlingTargetConnection, + build_sling_asset, +) + +from dagster import AssetSpec + +source = SlingSourceConnection( + type="postgres", + host="localhost", + port=5432, + database="my_database", + user="my_user", + password=os.getenv("PG_PASS"), +) + +target = SlingTargetConnection( + type="snowflake", + host="hostname.snowflake", + user="username", + database="database", + password=os.getenv("SF_PASSWORD"), + role="role", +) + +sling = SlingResource(source_connection=source, target_connection=target) + +asset_def = build_sling_asset( + asset_spec=AssetSpec("my_asset_name"), + source_stream="public.my_table", + target_object="marts.my_table", + mode=SlingMode.INCREMENTAL, + primary_key="id", +) diff --git a/examples/docs_snippets/docs_snippets/integrations/embedded_elt/s3_snowflake.py b/examples/docs_snippets/docs_snippets/integrations/embedded_elt/s3_snowflake.py new file mode 100644 index 0000000000000..99e0dc7bde8a2 --- /dev/null +++ b/examples/docs_snippets/docs_snippets/integrations/embedded_elt/s3_snowflake.py @@ 
-0,0 +1,42 @@ +# pyright: reportGeneralTypeIssues=none +# pyright: reportOptionalMemberAccess=none + +import os + +from dagster_embedded_elt.sling import ( + SlingResource, + SlingSourceConnection, + SlingTargetConnection, + build_sling_asset, +) + +from dagster import AssetSpec + +target = SlingTargetConnection( + type="snowflake", + host="hostname.snowflake", + user="username", + database="database", + password=os.getenv("SF_PASSWORD"), + role="role", +) + + +# start_storage_config +source = SlingSourceConnection( + type="s3", + bucket="sling-bucket", + access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), +) + +sling = SlingResource(source_connection=source, target_connection=target) + +asset_def = build_sling_asset( + asset_spec=AssetSpec("my_asset_name"), + source_stream="s3://my-bucket/my_file.parquet", + target_object="marts.my_table", + primary_key="id", +) + +# end_storage_config diff --git a/examples/docs_snippets/docs_snippets_tests/integrations_tests/test_embedded_elt.py b/examples/docs_snippets/docs_snippets_tests/integrations_tests/test_embedded_elt.py new file mode 100644 index 0000000000000..8dde56f5a2e43 --- /dev/null +++ b/examples/docs_snippets/docs_snippets_tests/integrations_tests/test_embedded_elt.py @@ -0,0 +1,11 @@ +from docs_snippets.integrations.embedded_elt.postgres_snowflake import ( + asset_def as asset_def_postgres, +) +from docs_snippets.integrations.embedded_elt.s3_snowflake import ( + asset_def as asset_def_s3, +) + + +def test_asset_defs(): + assert asset_def_postgres + assert asset_def_s3 diff --git a/examples/docs_snippets/setup.py b/examples/docs_snippets/setup.py index d93e94694fcd8..468b39155a8c7 100755 --- a/examples/docs_snippets/setup.py +++ b/examples/docs_snippets/setup.py @@ -26,6 +26,7 @@ "dagster-dask", "dagster-duckdb", "dagster-duckdb-pandas", + "dagster-embedded-elt", "dagster-fivetran", "dagster-gcp", "dagster-graphql", diff --git a/examples/docs_snippets/tox.ini 
b/examples/docs_snippets/tox.ini index afe90915da64d..0a8256c0f34b5 100644 --- a/examples/docs_snippets/tox.ini +++ b/examples/docs_snippets/tox.ini @@ -18,6 +18,7 @@ deps = -e ../../python_modules/libraries/dagster-dask -e ../../python_modules/libraries/dagster-duckdb -e ../../python_modules/libraries/dagster-duckdb-pandas + -e ../../python_modules/libraries/dagster-embedded-elt -e ../../python_modules/libraries/dagster-fivetran -e ../../python_modules/libraries/dagster-gcp -e ../../python_modules/libraries/dagster-k8s diff --git a/examples/project_fully_featured/dbt_project/target/manifest.json b/examples/project_fully_featured/dbt_project/target/manifest.json index ab1d994f906e9..c0884d48e70e2 100644 --- a/examples/project_fully_featured/dbt_project/target/manifest.json +++ b/examples/project_fully_featured/dbt_project/target/manifest.json @@ -1 +1 @@ -{"metadata": {"dbt_schema_version": "https://schemas.getdbt.com/dbt/manifest/v10.json", "dbt_version": "1.6.1", "generated_at": "2023-09-08T13:56:25.515305Z", "invocation_id": "7ed65073-eadd-4ffd-86e8-065cbaca587f", "env": {}, "project_name": "hacker_news_dbt", "project_id": "822c572c4d9d54bd8df768554843c306", "user_id": "5679ee77-afd5-4af7-ae96-6b5b2f0cea91", "send_anonymous_usage_stats": true, "adapter_type": "duckdb"}, "nodes": {"model.hacker_news_dbt.comment_daily_stats": {"database": "hackernews", "schema": "activity_analytics", "name": "comment_daily_stats", "resource_type": "model", "package_name": "hacker_news_dbt", "path": "activity_analytics/comment_daily_stats.sql", "original_file_path": "models/activity_analytics/comment_daily_stats.sql", "unique_id": "model.hacker_news_dbt.comment_daily_stats", "fqn": ["hacker_news_dbt", "activity_analytics", "comment_daily_stats"], "alias": "comment_daily_stats", "checksum": {"name": "sha256", "checksum": "e824d040f11e1a1d2ca71fe34f2088b57a6c2882c92b2ee1682565db253f9e1c"}, "config": {"enabled": true, "alias": null, "schema": "activity_analytics", "database": null, 
"tags": [], "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "materialized": "table", "incremental_strategy": null, "persist_docs": {}, "quoting": {}, "column_types": {}, "full_refresh": null, "unique_key": null, "on_schema_change": "ignore", "on_configuration_change": "apply", "grants": {}, "packages": [], "docs": {"show": true, "node_color": null}, "contract": {"enforced": false}, "post-hook": [], "pre-hook": []}, "tags": [], "description": "Summary of comment activity by day", "columns": {"date": {"name": "date", "description": "The date that the stories were posted on.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}, "num_comments": {"name": "num_comments", "description": "The number of comments posted.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}, "posting_users": {"name": "posting_users", "description": "The number of unique users who posted a comment.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}}, "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": "hacker_news_dbt://models/schema.yml", "build_path": null, "deferred": false, "unrendered_config": {"materialized": "table", "schema": "activity_analytics", "meta": {"dagster": {"group": "activity_analytics"}}}, "created_at": 1694181385.9191082, "relation_name": "\"hackernews\".\"activity_analytics\".\"comment_daily_stats\"", "raw_code": "SELECT date_trunc('day', to_timestamp(time::int)) as date,\n count(DISTINCT user_id) AS commenting_users,\n count(*) AS num_comments\nFROM {{ source('core', 'comments') }}\nGROUP BY 1", "language": "sql", "refs": [], "sources": [["core", "comments"]], "metrics": [], "depends_on": {"macros": [], "nodes": ["source.hacker_news_dbt.core.comments"]}, "compiled_path": null, "contract": {"enforced": false, "checksum": null}, "access": "protected", "constraints": [], "version": null, 
"latest_version": null, "deprecation_date": null}, "model.hacker_news_dbt.activity_daily_stats": {"database": "hackernews", "schema": "activity_analytics", "name": "activity_daily_stats", "resource_type": "model", "package_name": "hacker_news_dbt", "path": "activity_analytics/activity_daily_stats.sql", "original_file_path": "models/activity_analytics/activity_daily_stats.sql", "unique_id": "model.hacker_news_dbt.activity_daily_stats", "fqn": ["hacker_news_dbt", "activity_analytics", "activity_daily_stats"], "alias": "activity_daily_stats", "checksum": {"name": "sha256", "checksum": "3bb7e086ca21de591744100ca30375f1b10b0164b34121f269360af41613a749"}, "config": {"enabled": true, "alias": null, "schema": "activity_analytics", "database": null, "tags": [], "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "materialized": "table", "incremental_strategy": null, "persist_docs": {}, "quoting": {}, "column_types": {}, "full_refresh": null, "unique_key": null, "on_schema_change": "ignore", "on_configuration_change": "apply", "grants": {}, "packages": [], "docs": {"show": true, "node_color": null}, "contract": {"enforced": false}, "post-hook": [], "pre-hook": []}, "tags": [], "description": "Combined stats about activity on each day", "columns": {}, "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": "hacker_news_dbt://models/schema.yml", "build_path": null, "deferred": false, "unrendered_config": {"materialized": "table", "schema": "activity_analytics", "meta": {"dagster": {"group": "activity_analytics"}}}, "created_at": 1694181385.919334, "relation_name": "\"hackernews\".\"activity_analytics\".\"activity_daily_stats\"", "raw_code": "SELECT *\nFROM {{ ref('comment_daily_stats') }}\nFULL OUTER JOIN {{ ref('story_daily_stats') }}\nUSING (date)", "language": "sql", "refs": [{"name": "comment_daily_stats", "package": null, "version": null}, {"name": "story_daily_stats", "package": null, 
"version": null}], "sources": [], "metrics": [], "depends_on": {"macros": [], "nodes": ["model.hacker_news_dbt.comment_daily_stats", "model.hacker_news_dbt.story_daily_stats"]}, "compiled_path": null, "contract": {"enforced": false, "checksum": null}, "access": "protected", "constraints": [], "version": null, "latest_version": null, "deprecation_date": null}, "model.hacker_news_dbt.story_daily_stats": {"database": "hackernews", "schema": "activity_analytics", "name": "story_daily_stats", "resource_type": "model", "package_name": "hacker_news_dbt", "path": "activity_analytics/story_daily_stats.sql", "original_file_path": "models/activity_analytics/story_daily_stats.sql", "unique_id": "model.hacker_news_dbt.story_daily_stats", "fqn": ["hacker_news_dbt", "activity_analytics", "story_daily_stats"], "alias": "story_daily_stats", "checksum": {"name": "sha256", "checksum": "93633f2202f8d705d941c2e61c0fbbd2a436ad106046de121328c6e4205e526f"}, "config": {"enabled": true, "alias": null, "schema": "activity_analytics", "database": null, "tags": [], "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "materialized": "table", "incremental_strategy": null, "persist_docs": {}, "quoting": {}, "column_types": {}, "full_refresh": null, "unique_key": null, "on_schema_change": "ignore", "on_configuration_change": "apply", "grants": {}, "packages": [], "docs": {"show": true, "node_color": null}, "contract": {"enforced": false}, "post-hook": [], "pre-hook": []}, "tags": [], "description": "Summary of posting activity by day", "columns": {"date": {"name": "date", "description": "The date that the stories were posted on.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}, "num_stories": {"name": "num_stories", "description": "The number of stories posted.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}, "posting_users": {"name": "posting_users", "description": "The number of unique users who posted a story.", 
"meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}}, "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": "hacker_news_dbt://models/schema.yml", "build_path": null, "deferred": false, "unrendered_config": {"materialized": "table", "schema": "activity_analytics", "meta": {"dagster": {"group": "activity_analytics"}}}, "created_at": 1694181385.918628, "relation_name": "\"hackernews\".\"activity_analytics\".\"story_daily_stats\"", "raw_code": "SELECT date_trunc('day', to_timestamp(time::int)) as date,\n count(DISTINCT user_id) AS posting_users,\n count(*) AS num_stories\nFROM {{ source('core', 'stories') }}\nGROUP BY 1", "language": "sql", "refs": [], "sources": [["core", "stories"]], "metrics": [], "depends_on": {"macros": [], "nodes": ["source.hacker_news_dbt.core.stories"]}, "compiled_path": null, "contract": {"enforced": false, "checksum": null}, "access": "protected", "constraints": [], "version": null, "latest_version": null, "deprecation_date": null}, "test.hacker_news_dbt.assert_true": {"database": "hackernews", "schema": "dbt_test__audit", "name": "assert_true", "resource_type": "test", "package_name": "hacker_news_dbt", "path": "assert_true.sql", "original_file_path": "tests/assert_true.sql", "unique_id": "test.hacker_news_dbt.assert_true", "fqn": ["hacker_news_dbt", "assert_true"], "alias": "assert_true", "checksum": {"name": "sha256", "checksum": "e0c343d949749324dcbf7013326a3451c98dbcc737c10bf21fc073a259112e6d"}, "config": {"enabled": true, "alias": null, "schema": "dbt_test__audit", "database": null, "tags": [], "meta": {}, "group": null, "materialized": "test", "severity": "ERROR", "store_failures": null, "where": null, "limit": null, "fail_calc": "count(*)", "warn_if": "!= 0", "error_if": "!= 0"}, "tags": [], "description": "", "columns": {}, "meta": {}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": null, "build_path": null, 
"deferred": false, "unrendered_config": {}, "created_at": 1694181385.895998, "relation_name": null, "raw_code": "SELECT 'Hello, Tests' LIMIT 0", "language": "sql", "refs": [], "sources": [], "metrics": [], "depends_on": {"macros": [], "nodes": []}, "compiled_path": null, "contract": {"enforced": false, "checksum": null}}, "seed.hacker_news_dbt.full_sample": {"database": "hackernews", "schema": "public", "name": "full_sample", "resource_type": "seed", "package_name": "hacker_news_dbt", "path": "full_sample.csv", "original_file_path": "data/full_sample.csv", "unique_id": "seed.hacker_news_dbt.full_sample", "fqn": ["hacker_news_dbt", "full_sample"], "alias": "full_sample", "checksum": {"name": "sha256", "checksum": "40a06e8e17973fbdc477234066f913ca192dd8e14502f88e5d208eaf679c604b"}, "config": {"enabled": true, "alias": null, "schema": null, "database": null, "tags": [], "meta": {}, "group": null, "materialized": "seed", "incremental_strategy": null, "persist_docs": {}, "quoting": {}, "column_types": {}, "full_refresh": null, "unique_key": null, "on_schema_change": "ignore", "on_configuration_change": "apply", "grants": {}, "packages": [], "docs": {"show": true, "node_color": null}, "contract": {"enforced": false}, "quote_columns": false, "post-hook": [], "pre-hook": []}, "tags": [], "description": "", "columns": {}, "meta": {}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": null, "build_path": null, "deferred": false, "unrendered_config": {"quote_columns": false}, "created_at": 1694181385.904223, "relation_name": "\"hackernews\".\"public\".\"full_sample\"", "raw_code": "", "root_path": "dbt_project", "depends_on": {"macros": []}}}, "sources": {"source.hacker_news_dbt.core.comments": {"database": "hackernews", "schema": "core", "name": "comments", "resource_type": "source", "package_name": "hacker_news_dbt", "path": "models/sources.yml", "original_file_path": "models/sources.yml", "unique_id": "source.hacker_news_dbt.core.comments", "fqn": 
["hacker_news_dbt", "core", "comments"], "source_name": "core", "source_description": "", "loader": "", "identifier": "comments", "quoting": {"database": null, "schema": null, "identifier": null, "column": null}, "loaded_at_field": null, "freshness": {"warn_after": {"count": null, "period": null}, "error_after": {"count": null, "period": null}, "filter": null}, "external": null, "description": "", "columns": {}, "meta": {}, "source_meta": {}, "tags": [], "config": {"enabled": true}, "patch_path": null, "unrendered_config": {}, "relation_name": "\"hackernews\".\"core\".\"comments\"", "created_at": 1694181385.92353}, "source.hacker_news_dbt.core.stories": {"database": "hackernews", "schema": "core", "name": "stories", "resource_type": "source", "package_name": "hacker_news_dbt", "path": "models/sources.yml", "original_file_path": "models/sources.yml", "unique_id": "source.hacker_news_dbt.core.stories", "fqn": ["hacker_news_dbt", "core", "stories"], "source_name": "core", "source_description": "", "loader": "", "identifier": "stories", "quoting": {"database": null, "schema": null, "identifier": null, "column": null}, "loaded_at_field": null, "freshness": {"warn_after": {"count": null, "period": null}, "error_after": {"count": null, "period": null}, "filter": null}, "external": null, "description": "", "columns": {}, "meta": {}, "source_meta": {}, "tags": [], "config": {"enabled": true}, "patch_path": null, "unrendered_config": {}, "relation_name": "\"hackernews\".\"core\".\"stories\"", "created_at": 1694181385.9236422}}, "macros": {"macro.hacker_news_dbt.aggregate_actions": {"name": "aggregate_actions", "resource_type": "macro", "package_name": "hacker_news_dbt", "path": "macros/aggregate_actions.sql", "original_file_path": "macros/aggregate_actions.sql", "unique_id": "macro.hacker_news_dbt.aggregate_actions", "macro_sql": "{% macro aggregate_actions(table) %}\n SELECT\n COUNT(*) as num_actions,\n \"by\"\n FROM {{ table }}\n WHERE \"by\" IS NOT NULL\n GROUP BY 
\"by\"\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.528951, "supported_languages": null}, "macro.hacker_news_dbt.generate_schema_name": {"name": "generate_schema_name", "resource_type": "macro", "package_name": "hacker_news_dbt", "path": "macros/generate_schema_name.sql", "original_file_path": "macros/generate_schema_name.sql", "unique_id": "macro.hacker_news_dbt.generate_schema_name", "macro_sql": "{% macro generate_schema_name(custom_schema_name, node) -%}\n\n {%- set default_schema = target.schema -%}\n {%- if custom_schema_name is none -%}\n\n public\n\n {%- else -%}\n\n {{ custom_schema_name | trim }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5292978, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_binding_char": {"name": "duckdb__get_binding_char", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/seed.sql", "original_file_path": "macros/seed.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_binding_char", "macro_sql": "{% macro duckdb__get_binding_char() %}\n {{ return(adapter.get_binding_char()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5309608, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_batch_size": {"name": "duckdb__get_batch_size", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/seed.sql", "original_file_path": "macros/seed.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_batch_size", "macro_sql": "{% macro duckdb__get_batch_size() %}\n {{ return(10000) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, 
"docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.531092, "supported_languages": null}, "macro.dbt_duckdb.duckdb__load_csv_rows": {"name": "duckdb__load_csv_rows", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/seed.sql", "original_file_path": "macros/seed.sql", "unique_id": "macro.dbt_duckdb.duckdb__load_csv_rows", "macro_sql": "{% macro duckdb__load_csv_rows(model, agate_table) %}\n {% if config.get('fast', true) %}\n {% set seed_file_path = adapter.get_seed_file_path(model) %}\n {% set sql %}\n COPY {{ this.render() }} FROM '{{ seed_file_path }}' (FORMAT CSV, HEADER TRUE)\n {% endset %}\n {% do adapter.add_query(sql, abridge_sql_log=True) %}\n {{ return(sql) }}\n {% endif %}\n\n {% set batch_size = get_batch_size() %}\n {% set agate_table = adapter.convert_datetimes_to_strs(agate_table) %}\n {% set cols_sql = get_seed_column_quoted_csv(model, agate_table.column_names) %}\n {% set bindings = [] %}\n\n {% set statements = [] %}\n\n {% for chunk in agate_table.rows | batch(batch_size) %}\n {% set bindings = [] %}\n\n {% for row in chunk %}\n {% do bindings.extend(row) %}\n {% endfor %}\n\n {% set sql %}\n insert into {{ this.render() }} ({{ cols_sql }}) values\n {% for row in chunk -%}\n ({%- for column in agate_table.column_names -%}\n {{ get_binding_char() }}\n {%- if not loop.last%},{%- endif %}\n {%- endfor -%})\n {%- if not loop.last%},{%- endif %}\n {%- endfor %}\n {% endset %}\n\n {% do adapter.add_query(sql, bindings=bindings, abridge_sql_log=True) %}\n\n {% if loop.index0 == 0 %}\n {% do statements.append(sql) %}\n {% endif %}\n {% endfor %}\n\n {# Return SQL so we can render it out into the compiled files #}\n {{ return(statements[0]) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_batch_size", "macro.dbt.get_seed_column_quoted_csv", "macro.dbt.get_binding_char"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, 
"arguments": [], "created_at": 1694181385.533161, "supported_languages": null}, "macro.dbt_duckdb.duckdb__snapshot_merge_sql": {"name": "duckdb__snapshot_merge_sql", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/snapshot_helper.sql", "original_file_path": "macros/snapshot_helper.sql", "unique_id": "macro.dbt_duckdb.duckdb__snapshot_merge_sql", "macro_sql": "{% macro duckdb__snapshot_merge_sql(target, source, insert_cols) -%}\n {%- set insert_cols_csv = insert_cols | join(', ') -%}\n\n update {{ target }} as DBT_INTERNAL_TARGET\n set dbt_valid_to = DBT_INTERNAL_SOURCE.dbt_valid_to\n from {{ source }} as DBT_INTERNAL_SOURCE\n where DBT_INTERNAL_SOURCE.dbt_scd_id::text = DBT_INTERNAL_TARGET.dbt_scd_id::text\n and DBT_INTERNAL_SOURCE.dbt_change_type::text in ('update'::text, 'delete'::text)\n and DBT_INTERNAL_TARGET.dbt_valid_to is null;\n\n insert into {{ target }} ({{ insert_cols_csv }})\n select {% for column in insert_cols -%}\n DBT_INTERNAL_SOURCE.{{ column }} {%- if not loop.last %}, {%- endif %}\n {%- endfor %}\n from {{ source }} as DBT_INTERNAL_SOURCE\n where DBT_INTERNAL_SOURCE.dbt_change_type::text = 'insert'::text;\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5341089, "supported_languages": null}, "macro.dbt_duckdb.build_snapshot_staging_table": {"name": "build_snapshot_staging_table", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/snapshot_helper.sql", "original_file_path": "macros/snapshot_helper.sql", "unique_id": "macro.dbt_duckdb.build_snapshot_staging_table", "macro_sql": "{% macro build_snapshot_staging_table(strategy, sql, target_relation) %}\n {% set temp_relation = make_temp_relation(target_relation) %}\n\n {% set select = snapshot_staging_table(strategy, sql, target_relation) %}\n\n {% call statement('build_snapshot_staging_relation') %}\n {{ 
create_table_as(False, temp_relation, select) }}\n {% endcall %}\n\n {% do return(temp_relation) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_temp_relation", "macro.dbt.snapshot_staging_table", "macro.dbt.statement", "macro.dbt.create_table_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.534571, "supported_languages": null}, "macro.dbt_duckdb.duckdb__post_snapshot": {"name": "duckdb__post_snapshot", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/snapshot_helper.sql", "original_file_path": "macros/snapshot_helper.sql", "unique_id": "macro.dbt_duckdb.duckdb__post_snapshot", "macro_sql": "{% macro duckdb__post_snapshot(staging_relation) %}\n {% do return(drop_relation(staging_relation)) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.drop_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.534734, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_catalog": {"name": "duckdb__get_catalog", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/catalog.sql", "original_file_path": "macros/catalog.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_catalog", "macro_sql": "{% macro duckdb__get_catalog(information_schema, schemas) -%}\n {%- call statement('catalog', fetch_result=True) -%}\n select\n '{{ database }}' as table_database,\n t.table_schema,\n t.table_name,\n t.table_type,\n '' as table_comment,\n c.column_name,\n c.ordinal_position as column_index,\n c.data_type column_type,\n '' as column_comment,\n '' as table_owner\n FROM information_schema.tables t JOIN information_schema.columns c ON t.table_schema = c.table_schema AND t.table_name = c.table_name\n WHERE (\n {%- for schema in schemas -%}\n upper(t.table_schema) = upper('{{ schema }}'){%- if not loop.last %} or {% endif -%}\n {%- endfor -%}\n 
)\n AND t.table_type IN ('BASE TABLE', 'VIEW')\n ORDER BY\n t.table_schema,\n t.table_name,\n c.ordinal_position\n {%- endcall -%}\n {{ return(load_result('catalog').table) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.53543, "supported_languages": null}, "macro.dbt_duckdb.duckdb__create_schema": {"name": "duckdb__create_schema", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__create_schema", "macro_sql": "{% macro duckdb__create_schema(relation) -%}\n {%- call statement('create_schema') -%}\n create schema if not exists {{ relation.without_identifier() }}\n {%- endcall -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.543443, "supported_languages": null}, "macro.dbt_duckdb.duckdb__drop_schema": {"name": "duckdb__drop_schema", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__drop_schema", "macro_sql": "{% macro duckdb__drop_schema(relation) -%}\n {%- call statement('drop_schema') -%}\n drop schema if exists {{ relation.without_identifier() }} cascade\n {%- endcall -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.543616, "supported_languages": null}, "macro.dbt_duckdb.duckdb__list_schemas": {"name": "duckdb__list_schemas", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": 
"macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__list_schemas", "macro_sql": "{% macro duckdb__list_schemas(database) -%}\n {% set sql %}\n select schema_name\n from system.information_schema.schemata\n {% if database is not none %}\n where catalog_name = '{{ database }}'\n {% endif %}\n {% endset %}\n {{ return(run_query(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5438862, "supported_languages": null}, "macro.dbt_duckdb.duckdb__check_schema_exists": {"name": "duckdb__check_schema_exists", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__check_schema_exists", "macro_sql": "{% macro duckdb__check_schema_exists(information_schema, schema) -%}\n {% set sql -%}\n select count(*)\n from system.information_schema.schemata\n where schema_name = '{{ schema }}'\n and catalog_name = '{{ information_schema.database }}'\n {%- endset %}\n {{ return(run_query(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5441332, "supported_languages": null}, "macro.dbt_duckdb.get_column_names": {"name": "get_column_names", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.get_column_names", "macro_sql": "{% macro get_column_names() %}\n {# loop through user_provided_columns to get column names #}\n {%- set user_provided_columns = model['columns'] -%}\n (\n {% for i in user_provided_columns %}\n {% set col = user_provided_columns[i] %}\n {{ col['name'] }} {{ \",\" if not loop.last }}\n {% endfor %}\n )\n{% endmacro 
%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.544485, "supported_languages": null}, "macro.dbt_duckdb.duckdb__create_table_as": {"name": "duckdb__create_table_as", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__create_table_as", "macro_sql": "{% macro duckdb__create_table_as(temporary, relation, compiled_code, language='sql') -%}\n {%- if language == 'sql' -%}\n {% set contract_config = config.get('contract') %}\n {% if contract_config.enforced %}\n {{ get_assert_columns_equivalent(compiled_code) }}\n {% endif %}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none }}\n\n create {% if temporary: -%}temporary{%- endif %} table\n {{ relation.include(database=(not temporary), schema=(not temporary)) }}\n {% if contract_config.enforced and not temporary %}\n {#-- DuckDB doesnt support constraints on temp tables --#}\n {{ get_table_columns_and_constraints() }} ;\n insert into {{ relation }} {{ get_column_names() }} (\n {{ get_select_subquery(compiled_code) }}\n );\n {% else %}\n as (\n {{ compiled_code }}\n );\n {% endif %}\n {%- elif language == 'python' -%}\n {{ py_write_table(temporary=temporary, relation=relation, compiled_code=compiled_code) }}\n {%- else -%}\n {% do exceptions.raise_compiler_error(\"duckdb__create_table_as macro didn't get supported language, it got %s\" % language) %}\n {%- endif -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_assert_columns_equivalent", "macro.dbt.get_table_columns_and_constraints", "macro.dbt_duckdb.get_column_names", "macro.dbt.get_select_subquery", "macro.dbt_duckdb.py_write_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 
1694181385.5455098, "supported_languages": null}, "macro.dbt_duckdb.py_write_table": {"name": "py_write_table", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.py_write_table", "macro_sql": "{% macro py_write_table(temporary, relation, compiled_code) -%}\n{{ compiled_code }}\n\ndef materialize(df, con):\n try:\n import pyarrow\n pyarrow_available = True\n except ImportError:\n pyarrow_available = False\n finally:\n if pyarrow_available and isinstance(df, pyarrow.Table):\n # https://github.com/duckdb/duckdb/issues/6584\n import pyarrow.dataset\n con.execute('create table {{ relation }} as select * from df')\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.545671, "supported_languages": null}, "macro.dbt_duckdb.duckdb__create_view_as": {"name": "duckdb__create_view_as", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__create_view_as", "macro_sql": "{% macro duckdb__create_view_as(relation, sql) -%}\n {% set contract_config = config.get('contract') %}\n {% if contract_config.enforced %}\n {{ get_assert_columns_equivalent(sql) }}\n {%- endif %}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none }}\n create view {{ relation }} as (\n {{ sql }}\n );\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_assert_columns_equivalent"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.54606, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_columns_in_relation": {"name": "duckdb__get_columns_in_relation", "resource_type": "macro", "package_name": 
"dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_columns_in_relation", "macro_sql": "{% macro duckdb__get_columns_in_relation(relation) -%}\n {% call statement('get_columns_in_relation', fetch_result=True) %}\n select\n column_name,\n data_type,\n character_maximum_length,\n numeric_precision,\n numeric_scale\n\n from system.information_schema.columns\n where table_name = '{{ relation.identifier }}'\n {% if relation.schema %}\n and table_schema = '{{ relation.schema }}'\n {% endif %}\n {% if relation.database %}\n and table_catalog = '{{ relation.database }}'\n {% endif %}\n order by ordinal_position\n\n {% endcall %}\n {% set table = load_result('get_columns_in_relation').table %}\n {{ return(sql_convert_columns_in_relation(table)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement", "macro.dbt.sql_convert_columns_in_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.546537, "supported_languages": null}, "macro.dbt_duckdb.duckdb__list_relations_without_caching": {"name": "duckdb__list_relations_without_caching", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__list_relations_without_caching", "macro_sql": "{% macro duckdb__list_relations_without_caching(schema_relation) %}\n {% call statement('list_relations_without_caching', fetch_result=True) -%}\n select\n '{{ schema_relation.database }}' as database,\n table_name as name,\n table_schema as schema,\n CASE table_type\n WHEN 'BASE TABLE' THEN 'table'\n WHEN 'VIEW' THEN 'view'\n WHEN 'LOCAL TEMPORARY' THEN 'table'\n END as type\n from system.information_schema.tables\n where table_schema = '{{ schema_relation.schema }}'\n and table_catalog = '{{ schema_relation.database }}'\n {% endcall %}\n {{ 
return(load_result('list_relations_without_caching').table) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.546856, "supported_languages": null}, "macro.dbt_duckdb.duckdb__drop_relation": {"name": "duckdb__drop_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__drop_relation", "macro_sql": "{% macro duckdb__drop_relation(relation) -%}\n {% call statement('drop_relation', auto_begin=False) -%}\n drop {{ relation.type }} if exists {{ relation }} cascade\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.54706, "supported_languages": null}, "macro.dbt_duckdb.duckdb__rename_relation": {"name": "duckdb__rename_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__rename_relation", "macro_sql": "{% macro duckdb__rename_relation(from_relation, to_relation) -%}\n {% set target_name = adapter.quote_as_configured(to_relation.identifier, 'identifier') %}\n {% call statement('rename_relation') -%}\n alter {{ to_relation.type }} {{ from_relation }} rename to {{ target_name }}\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5474062, "supported_languages": null}, "macro.dbt_duckdb.duckdb__make_temp_relation": {"name": "duckdb__make_temp_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": 
"macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__make_temp_relation", "macro_sql": "{% macro duckdb__make_temp_relation(base_relation, suffix) %}\n {% set tmp_identifier = base_relation.identifier ~ suffix ~ py_current_timestring() %}\n {% do return(base_relation.incorporate(\n path={\n \"identifier\": tmp_identifier,\n \"schema\": none,\n \"database\": none\n })) -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.py_current_timestring"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5477471, "supported_languages": null}, "macro.dbt_duckdb.duckdb__current_timestamp": {"name": "duckdb__current_timestamp", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__current_timestamp", "macro_sql": "{% macro duckdb__current_timestamp() -%}\n now()\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.547819, "supported_languages": null}, "macro.dbt_duckdb.duckdb__snapshot_string_as_time": {"name": "duckdb__snapshot_string_as_time", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__snapshot_string_as_time", "macro_sql": "{% macro duckdb__snapshot_string_as_time(timestamp) -%}\n {%- set result = \"'\" ~ timestamp ~ \"'::timestamp\" -%}\n {{ return(result) }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.547984, "supported_languages": null}, "macro.dbt_duckdb.duckdb__snapshot_get_time": {"name": 
"duckdb__snapshot_get_time", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__snapshot_get_time", "macro_sql": "{% macro duckdb__snapshot_get_time() -%}\n {{ current_timestamp() }}::timestamp\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.current_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5480819, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_incremental_default_sql": {"name": "duckdb__get_incremental_default_sql", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_incremental_default_sql", "macro_sql": "{% macro duckdb__get_incremental_default_sql(arg_dict) %}\n {% do return(get_incremental_delete_insert_sql(arg_dict)) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_incremental_delete_insert_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.548219, "supported_languages": null}, "macro.dbt_duckdb.location_exists": {"name": "location_exists", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.location_exists", "macro_sql": "{% macro location_exists(location) -%}\n {% do return(adapter.location_exists(location)) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.548358, "supported_languages": null}, "macro.dbt_duckdb.write_to_file": {"name": "write_to_file", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", 
"original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.write_to_file", "macro_sql": "{% macro write_to_file(relation, location, options) -%}\n {% call statement('write_to_file') -%}\n copy {{ relation }} to '{{ location }}' ({{ options }})\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5485702, "supported_languages": null}, "macro.dbt_duckdb.store_relation": {"name": "store_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.store_relation", "macro_sql": "{% macro store_relation(plugin, relation, location, format) -%}\n {%- set column_list = adapter.get_columns_in_relation(relation) -%}\n {% do adapter.store_relation(plugin, relation, column_list, location, format) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.548842, "supported_languages": null}, "macro.dbt_duckdb.render_write_options": {"name": "render_write_options", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.render_write_options", "macro_sql": "{% macro render_write_options(config) -%}\n {% set options = config.get('options', {}) %}\n {% for k in options %}\n {% if options[k] is string %}\n {% set _ = options.update({k: render(options[k])}) %}\n {% else %}\n {% set _ = options.update({k: render(options[k])}) %}\n {% endif %}\n {% endfor %}\n\n {# legacy top-level write options #}\n {% if config.get('format') %}\n {% set _ = options.update({'format': render(config.get('format'))}) %}\n {% endif %}\n {% if config.get('delimiter') %}\n {% set _ 
= options.update({'delimiter': render(config.get('delimiter'))}) %}\n {% endif %}\n\n {% do return(options) %}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.549858, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_delete_insert_merge_sql": {"name": "duckdb__get_delete_insert_merge_sql", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/incremental_helper.sql", "original_file_path": "macros/incremental_helper.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_delete_insert_merge_sql", "macro_sql": "{% macro duckdb__get_delete_insert_merge_sql(target, source, unique_key, dest_columns, incremental_predicates) -%}\n\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n\n {% if unique_key %}\n {% if unique_key is sequence and unique_key is not string %}\n delete from {{target }} as DBT_INCREMENTAL_TARGET\n using {{ source }}\n where (\n {% for key in unique_key %}\n {{ source }}.{{ key }} = DBT_INCREMENTAL_TARGET.{{ key }}\n {{ \"and \" if not loop.last}}\n {% endfor %}\n {% if incremental_predicates %}\n {% for predicate in incremental_predicates %}\n and {{ predicate }}\n {% endfor %}\n {% endif %}\n );\n {% else %}\n delete from {{ target }}\n where (\n {{ unique_key }}) in (\n select ({{ unique_key }})\n from {{ source }}\n )\n {%- if incremental_predicates %}\n {% for predicate in incremental_predicates %}\n and {{ predicate }}\n {% endfor %}\n {%- endif -%};\n\n {% endif %}\n {% endif %}\n\n insert into {{ target }} ({{ dest_cols_csv }})\n (\n select {{ dest_cols_csv }}\n from {{ source }}\n )\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.551906, "supported_languages": null}, 
"macro.dbt_duckdb.duckdb__alter_relation_add_remove_columns": {"name": "duckdb__alter_relation_add_remove_columns", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/columns.sql", "original_file_path": "macros/columns.sql", "unique_id": "macro.dbt_duckdb.duckdb__alter_relation_add_remove_columns", "macro_sql": "{% macro duckdb__alter_relation_add_remove_columns(relation, add_columns, remove_columns) %}\n\n {% if add_columns %}\n {% for column in add_columns %}\n {% set sql -%}\n alter {{ relation.type }} {{ relation }} add column\n {{ column.name }} {{ column.data_type }}\n {%- endset -%}\n {% do run_query(sql) %}\n {% endfor %}\n {% endif %}\n\n {% if remove_columns %}\n {% for column in remove_columns %}\n {% set sql -%}\n alter {{ relation.type }} {{ relation }} drop column\n {{ column.name }}\n {%- endset -%}\n {% do run_query(sql) %}\n {% endfor %}\n {% endif %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.552841, "supported_languages": null}, "macro.dbt_duckdb.materialization_table_duckdb": {"name": "materialization_table_duckdb", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/materializations/table.sql", "original_file_path": "macros/materializations/table.sql", "unique_id": "macro.dbt_duckdb.materialization_table_duckdb", "macro_sql": "{% materialization table, adapter=\"duckdb\", supported_languages=['sql', 'python'] %}\n\n {%- set language = model['language'] -%}\n\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='table') %}\n {%- set intermediate_relation = make_intermediate_relation(target_relation) -%}\n -- the intermediate_relation should not already exist in the database; get_relation\n -- will return None in that case. 
Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%}\n /*\n See ../view/view.sql for more information about this relation.\n */\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n -- as above, the backup_relation should not already exist\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% call statement('main', language=language) -%}\n {{- create_table_as(False, intermediate_relation, compiled_code, language) }}\n {%- endcall %}\n\n -- cleanup\n {% if existing_relation is not none %}\n {{ adapter.rename_relation(existing_relation, backup_relation) }}\n {% endif %}\n\n {{ adapter.rename_relation(intermediate_relation, target_relation) }}\n\n {% do create_indexes(target_relation) %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n -- `COMMIT` happens here\n {{ adapter.commit() }}\n\n -- finally, drop the existing/backup relation after the commit\n {{ drop_relation_if_exists(backup_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) 
}}\n\n {{ return({'relations': [target_relation]}) }}\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.statement", "macro.dbt.create_table_as", "macro.dbt.create_indexes", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5555332, "supported_languages": ["sql", "python"]}, "macro.dbt_duckdb.materialization_external_duckdb": {"name": "materialization_external_duckdb", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/materializations/external.sql", "original_file_path": "macros/materializations/external.sql", "unique_id": "macro.dbt_duckdb.materialization_external_duckdb", "macro_sql": "{% materialization external, adapter=\"duckdb\", supported_languages=['sql', 'python'] %}\n\n {%- set location = render(config.get('location', default=external_location(this, config))) -%})\n {%- set rendered_options = render_write_options(config) -%}\n {%- set format = config.get('format', 'parquet') -%}\n {%- set write_options = adapter.external_write_options(location, rendered_options) -%}\n {%- set read_location = adapter.external_read_location(location, rendered_options) -%}\n\n -- set language - python or sql\n {%- set language = model['language'] -%}\n\n {%- set target_relation = this.incorporate(type='view') %}\n\n -- Continue as normal materialization\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set temp_relation = make_intermediate_relation(this.incorporate(type='table'), suffix='__dbt_tmp') -%}\n {%- set intermediate_relation = make_intermediate_relation(target_relation, suffix='__dbt_int') -%}\n -- the intermediate_relation should not already exist in the database; get_relation\n -- 
will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation\n {%- set preexisting_temp_relation = load_cached_relation(temp_relation) -%}\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%}\n /*\n See ../view/view.sql for more information about this relation.\n */\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n -- as above, the backup_relation should not already exist\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_temp_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% call statement('create_table', language=language) -%}\n {{- create_table_as(False, temp_relation, compiled_code, language) }}\n {%- endcall %}\n\n -- write an temp relation into file\n {{ write_to_file(temp_relation, location, write_options) }}\n -- create a view on top of the location\n {% call statement('main', language='sql') -%}\n create or replace view {{ intermediate_relation }} as (\n select * from '{{ read_location }}'\n );\n {%- endcall %}\n\n -- cleanup\n {% if existing_relation is not none %}\n {{ adapter.rename_relation(existing_relation, backup_relation) }}\n {% endif %}\n\n {{ adapter.rename_relation(intermediate_relation, target_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {% set 
should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n -- `COMMIT` happens here\n {{ adapter.commit() }}\n\n -- finally, drop the existing/backup relation after the commit\n {{ drop_relation_if_exists(backup_relation) }}\n {{ drop_relation_if_exists(temp_relation) }}\n\n -- register table into glue\n {%- set plugin_name = config.get('plugin') -%}\n {%- set glue_register = config.get('glue_register', default=false) -%}\n {% if plugin_name is not none or glue_register is true %}\n {% if glue_register %}\n {# legacy hack to set the glue database name, deprecate this #}\n {%- set plugin_name = 'glue|' ~ config.get('glue_database', 'default') -%}\n {% endif %}\n {% do store_relation(plugin_name, target_relation, location, format) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt_duckdb.external_location", "macro.dbt_duckdb.render_write_options", "macro.dbt.load_cached_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.statement", "macro.dbt.create_table_as", "macro.dbt_duckdb.write_to_file", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt_duckdb.store_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.560417, "supported_languages": ["sql", "python"]}, "macro.dbt_duckdb.materialization_incremental_duckdb": {"name": "materialization_incremental_duckdb", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/materializations/incremental.sql", "original_file_path": "macros/materializations/incremental.sql", "unique_id": 
"macro.dbt_duckdb.materialization_incremental_duckdb", "macro_sql": "{% materialization incremental, adapter=\"duckdb\", supported_languages=['sql', 'python'] -%}\n\n {%- set language = model['language'] -%}\n\n -- relations\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='table') -%}\n {%- set temp_relation = make_temp_relation(target_relation)-%}\n {%- set intermediate_relation = make_intermediate_relation(target_relation)-%}\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n\n -- configs\n {%- set unique_key = config.get('unique_key') -%}\n {%- set full_refresh_mode = (should_full_refresh() or existing_relation.is_view) -%}\n {%- set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') -%}\n\n -- the temp_ and backup_ relations should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation. 
This has to happen before\n -- BEGIN, in a separate transaction\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation)-%}\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n {% set to_drop = [] %}\n\n {% if existing_relation is none %}\n {% set build_sql = create_table_as(False, target_relation, compiled_code, language) %}\n {% elif full_refresh_mode %}\n {% set build_sql = create_table_as(False, intermediate_relation, compiled_code, language) %}\n {% set need_swap = true %}\n {% else %}\n {% if language == 'python' %}\n {% set build_python = create_table_as(False, temp_relation, compiled_code, language) %}\n {% call statement(\"pre\", language=language) %}\n {{- build_python }}\n {% endcall %}\n {% else %} {# SQL #}\n {% do run_query(create_table_as(True, temp_relation, compiled_code, language)) %}\n {% endif %}\n {% do adapter.expand_target_column_types(\n from_relation=temp_relation,\n to_relation=target_relation) %}\n {#-- Process schema changes. Returns dict of changes if successful. 
Use source columns for upserting/merging --#}\n {% set dest_columns = process_schema_changes(on_schema_change, temp_relation, existing_relation) %}\n {% if not dest_columns %}\n {% set dest_columns = adapter.get_columns_in_relation(existing_relation) %}\n {% endif %}\n\n {#-- Get the incremental_strategy, the macro to use for the strategy, and build the sql --#}\n {% set incremental_strategy = config.get('incremental_strategy') or 'default' %}\n {% set incremental_predicates = config.get('predicates', none) or config.get('incremental_predicates', none) %}\n {% set strategy_sql_macro_func = adapter.get_incremental_strategy_macro(context, incremental_strategy) %}\n {% set strategy_arg_dict = ({'target_relation': target_relation, 'temp_relation': temp_relation, 'unique_key': unique_key, 'dest_columns': dest_columns, 'incremental_predicates': incremental_predicates }) %}\n {% set build_sql = strategy_sql_macro_func(strategy_arg_dict) %}\n {% set language = \"sql\" %}\n\n {% endif %}\n\n {% call statement(\"main\", language=language) %}\n {{- build_sql }}\n {% endcall %}\n\n {% if need_swap %}\n {% do adapter.rename_relation(target_relation, backup_relation) %}\n {% do adapter.rename_relation(intermediate_relation, target_relation) %}\n {% do to_drop.append(backup_relation) %}\n {% endif %}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {% if existing_relation is none or existing_relation.is_view or should_full_refresh() %}\n {% do create_indexes(target_relation) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n -- `COMMIT` happens here\n {% do adapter.commit() %}\n\n {% for rel in to_drop %}\n {% do adapter.drop_relation(rel) %}\n {% endfor %}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{%- 
endmaterialization %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.make_temp_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.should_full_refresh", "macro.dbt.incremental_validate_on_schema_change", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.create_table_as", "macro.dbt.statement", "macro.dbt.run_query", "macro.dbt.process_schema_changes", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt.create_indexes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.565703, "supported_languages": ["sql", "python"]}, "macro.dbt_duckdb.duckdb__dateadd": {"name": "duckdb__dateadd", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/dateadd.sql", "original_file_path": "macros/utils/dateadd.sql", "unique_id": "macro.dbt_duckdb.duckdb__dateadd", "macro_sql": "{% macro duckdb__dateadd(datepart, interval, from_date_or_timestamp) %}\n\n {{ from_date_or_timestamp }} + ((interval '1 {{ datepart }}') * ({{ interval }}))\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.56593, "supported_languages": null}, "macro.dbt_duckdb.duckdb__listagg": {"name": "duckdb__listagg", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/listagg.sql", "original_file_path": "macros/utils/listagg.sql", "unique_id": "macro.dbt_duckdb.duckdb__listagg", "macro_sql": "{% macro duckdb__listagg(measure, delimiter_text, order_by_clause, limit_num) -%}\n {% if limit_num -%}\n list_aggr(\n (array_agg(\n {{ measure }}\n {% if order_by_clause -%}\n {{ order_by_clause }}\n {%- endif %}\n ))[1:{{ limit_num }}],\n 'string_agg',\n {{ delimiter_text }}\n )\n {%- else %}\n string_agg(\n {{ measure }},\n {{ 
delimiter_text }}\n {% if order_by_clause -%}\n {{ order_by_clause }}\n {%- endif %}\n )\n {%- endif %}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.56652, "supported_languages": null}, "macro.dbt_duckdb.duckdb__datediff": {"name": "duckdb__datediff", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/datediff.sql", "original_file_path": "macros/utils/datediff.sql", "unique_id": "macro.dbt_duckdb.duckdb__datediff", "macro_sql": "{% macro duckdb__datediff(first_date, second_date, datepart) -%}\n\n {% if datepart == 'year' %}\n (date_part('year', ({{second_date}})::date) - date_part('year', ({{first_date}})::date))\n {% elif datepart == 'quarter' %}\n ({{ datediff(first_date, second_date, 'year') }} * 4 + date_part('quarter', ({{second_date}})::date) - date_part('quarter', ({{first_date}})::date))\n {% elif datepart == 'month' %}\n ({{ datediff(first_date, second_date, 'year') }} * 12 + date_part('month', ({{second_date}})::date) - date_part('month', ({{first_date}})::date))\n {% elif datepart == 'day' %}\n (({{second_date}})::date - ({{first_date}})::date)\n {% elif datepart == 'week' %}\n ({{ datediff(first_date, second_date, 'day') }} / 7 + case\n when date_part('dow', ({{first_date}})::timestamp) <= date_part('dow', ({{second_date}})::timestamp) then\n case when {{first_date}} <= {{second_date}} then 0 else -1 end\n else\n case when {{first_date}} <= {{second_date}} then 1 else 0 end\n end)\n {% elif datepart == 'hour' %}\n ({{ datediff(first_date, second_date, 'day') }} * 24 + date_part('hour', ({{second_date}})::timestamp) - date_part('hour', ({{first_date}})::timestamp))\n {% elif datepart == 'minute' %}\n ({{ datediff(first_date, second_date, 'hour') }} * 60 + date_part('minute', ({{second_date}})::timestamp) - date_part('minute', ({{first_date}})::timestamp))\n {% elif datepart == 'second' 
%}\n ({{ datediff(first_date, second_date, 'minute') }} * 60 + floor(date_part('second', ({{second_date}})::timestamp)) - floor(date_part('second', ({{first_date}})::timestamp)))\n {% elif datepart == 'millisecond' %}\n ({{ datediff(first_date, second_date, 'minute') }} * 60000 + floor(date_part('millisecond', ({{second_date}})::timestamp)) - floor(date_part('millisecond', ({{first_date}})::timestamp)))\n {% elif datepart == 'microsecond' %}\n ({{ datediff(first_date, second_date, 'minute') }} * 60000000 + floor(date_part('microsecond', ({{second_date}})::timestamp)) - floor(date_part('microsecond', ({{first_date}})::timestamp)))\n {% else %}\n {{ exceptions.raise_compiler_error(\"Unsupported datepart for macro datediff in postgres: {!r}\".format(datepart)) }}\n {% endif %}\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.datediff"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5695648, "supported_languages": null}, "macro.dbt_duckdb.duckdb__any_value": {"name": "duckdb__any_value", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/any_value.sql", "original_file_path": "macros/utils/any_value.sql", "unique_id": "macro.dbt_duckdb.duckdb__any_value", "macro_sql": "{% macro duckdb__any_value(expression) -%}\n\n arbitrary({{ expression }})\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5697, "supported_languages": null}, "macro.dbt_duckdb.register_upstream_external_models": {"name": "register_upstream_external_models", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/upstream.sql", "original_file_path": "macros/utils/upstream.sql", "unique_id": "macro.dbt_duckdb.register_upstream_external_models", "macro_sql": "{%- macro register_upstream_external_models() -%}\n{% if execute 
%}\n{% set upstream_nodes = {} %}\n{% set upstream_schemas = {} %}\n{% for node in selected_resources %}\n {% for upstream_node in graph['nodes'][node]['depends_on']['nodes'] %}\n {% if upstream_node not in upstream_nodes and upstream_node not in selected_resources %}\n {% do upstream_nodes.update({upstream_node: None}) %}\n {% set upstream = graph['nodes'].get(upstream_node) %}\n {% if upstream\n and upstream.resource_type in ('model', 'seed')\n and upstream.config.materialized=='external'\n %}\n {%- set upstream_rel = api.Relation.create(\n database=upstream['database'],\n schema=upstream['schema'],\n identifier=upstream['alias']\n ) -%}\n {%- set location = upstream.config.get('location', external_location(upstream_rel, upstream.config)) -%}\n {%- set rendered_options = render_write_options(upstream.config) -%}\n {%- set upstream_location = adapter.external_read_location(location, rendered_options) -%}\n {% if upstream_rel.schema not in upstream_schemas %}\n {% call statement('main', language='sql') -%}\n create schema if not exists {{ upstream_rel.schema }}\n {%- endcall %}\n {% do upstream_schemas.update({upstream_rel.schema: None}) %}\n {% endif %}\n {% call statement('main', language='sql') -%}\n create or replace view {{ upstream_rel }} as (\n select * from '{{ upstream_location }}'\n );\n {%- endcall %}\n {%- endif %}\n {% endif %}\n {% endfor %}\n{% endfor %}\n{% do adapter.commit() %}\n{% endif %}\n{%- endmacro -%}", "depends_on": {"macros": ["macro.dbt_duckdb.external_location", "macro.dbt_duckdb.render_write_options", "macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.572073, "supported_languages": null}, "macro.dbt_duckdb.duckdb__split_part": {"name": "duckdb__split_part", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/splitpart.sql", "original_file_path": "macros/utils/splitpart.sql", "unique_id": 
"macro.dbt_duckdb.duckdb__split_part", "macro_sql": "{% macro duckdb__split_part(string_text, delimiter_text, part_number) %}\n string_split({{ string_text }}, {{ delimiter_text }})[ {{ part_number }} ]\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.572278, "supported_languages": null}, "macro.dbt_duckdb.duckdb__last_day": {"name": "duckdb__last_day", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/lastday.sql", "original_file_path": "macros/utils/lastday.sql", "unique_id": "macro.dbt_duckdb.duckdb__last_day", "macro_sql": "{% macro duckdb__last_day(date, datepart) -%}\n\n {%- if datepart == 'quarter' -%}\n -- duckdb dateadd does not support quarter interval.\n cast(\n {{dbt.dateadd('day', '-1',\n dbt.dateadd('month', '3', dbt.date_trunc(datepart, date))\n )}}\n as date)\n {%- else -%}\n {{dbt.default_last_day(date, datepart)}}\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.dateadd", "macro.dbt.date_trunc", "macro.dbt.default_last_day"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.572741, "supported_languages": null}, "macro.dbt_duckdb.external_location": {"name": "external_location", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/external_location.sql", "original_file_path": "macros/utils/external_location.sql", "unique_id": "macro.dbt_duckdb.external_location", "macro_sql": "{%- macro external_location(relation, config) -%}\n {%- if config.get('options', {}).get('partition_by') is none -%}\n {%- set format = config.get('format', 'parquet') -%}\n {{- adapter.external_root() }}/{{ relation.identifier }}.{{ format }}\n {%- else -%}\n {{- adapter.external_root() }}/{{ relation.identifier }}\n {%- endif -%}\n{%- endmacro -%}", "depends_on": {"macros": 
[]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.573282, "supported_languages": null}, "macro.dbt.run_hooks": {"name": "run_hooks", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": "macro.dbt.run_hooks", "macro_sql": "{% macro run_hooks(hooks, inside_transaction=True) %}\n {% for hook in hooks | selectattr('transaction', 'equalto', inside_transaction) %}\n {% if not inside_transaction and loop.first %}\n {% call statement(auto_begin=inside_transaction) %}\n commit;\n {% endcall %}\n {% endif %}\n {% set rendered = render(hook.get('sql')) | trim %}\n {% if (rendered | length) > 0 %}\n {% call statement(auto_begin=inside_transaction) %}\n {{ rendered }}\n {% endcall %}\n {% endif %}\n {% endfor %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5743082, "supported_languages": null}, "macro.dbt.make_hook_config": {"name": "make_hook_config", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": "macro.dbt.make_hook_config", "macro_sql": "{% macro make_hook_config(sql, inside_transaction) %}\n {{ tojson({\"sql\": sql, \"transaction\": inside_transaction}) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.57449, "supported_languages": null}, "macro.dbt.before_begin": {"name": "before_begin", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": 
"macro.dbt.before_begin", "macro_sql": "{% macro before_begin(sql) %}\n {{ make_hook_config(sql, inside_transaction=False) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_hook_config"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.574634, "supported_languages": null}, "macro.dbt.in_transaction": {"name": "in_transaction", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": "macro.dbt.in_transaction", "macro_sql": "{% macro in_transaction(sql) %}\n {{ make_hook_config(sql, inside_transaction=True) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_hook_config"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.574765, "supported_languages": null}, "macro.dbt.after_commit": {"name": "after_commit", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": "macro.dbt.after_commit", "macro_sql": "{% macro after_commit(sql) %}\n {{ make_hook_config(sql, inside_transaction=False) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_hook_config"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.574899, "supported_languages": null}, "macro.dbt.set_sql_header": {"name": "set_sql_header", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/configs.sql", "original_file_path": "macros/materializations/configs.sql", "unique_id": "macro.dbt.set_sql_header", "macro_sql": "{% macro set_sql_header(config) -%}\n {{ config.set('sql_header', caller()) }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, 
"docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.57525, "supported_languages": null}, "macro.dbt.should_full_refresh": {"name": "should_full_refresh", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/configs.sql", "original_file_path": "macros/materializations/configs.sql", "unique_id": "macro.dbt.should_full_refresh", "macro_sql": "{% macro should_full_refresh() %}\n {% set config_full_refresh = config.get('full_refresh') %}\n {% if config_full_refresh is none %}\n {% set config_full_refresh = flags.FULL_REFRESH %}\n {% endif %}\n {% do return(config_full_refresh) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.575542, "supported_languages": null}, "macro.dbt.should_store_failures": {"name": "should_store_failures", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/configs.sql", "original_file_path": "macros/materializations/configs.sql", "unique_id": "macro.dbt.should_store_failures", "macro_sql": "{% macro should_store_failures() %}\n {% set config_store_failures = config.get('store_failures') %}\n {% if config_store_failures is none %}\n {% set config_store_failures = flags.STORE_FAILURES %}\n {% endif %}\n {% do return(config_store_failures) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5758321, "supported_languages": null}, "macro.dbt.snapshot_merge_sql": {"name": "snapshot_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/snapshot_merge.sql", "original_file_path": "macros/materializations/snapshots/snapshot_merge.sql", "unique_id": "macro.dbt.snapshot_merge_sql", "macro_sql": "{% macro 
snapshot_merge_sql(target, source, insert_cols) -%}\n {{ adapter.dispatch('snapshot_merge_sql', 'dbt')(target, source, insert_cols) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__snapshot_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5762482, "supported_languages": null}, "macro.dbt.default__snapshot_merge_sql": {"name": "default__snapshot_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/snapshot_merge.sql", "original_file_path": "macros/materializations/snapshots/snapshot_merge.sql", "unique_id": "macro.dbt.default__snapshot_merge_sql", "macro_sql": "{% macro default__snapshot_merge_sql(target, source, insert_cols) -%}\n {%- set insert_cols_csv = insert_cols | join(', ') -%}\n\n merge into {{ target }} as DBT_INTERNAL_DEST\n using {{ source }} as DBT_INTERNAL_SOURCE\n on DBT_INTERNAL_SOURCE.dbt_scd_id = DBT_INTERNAL_DEST.dbt_scd_id\n\n when matched\n and DBT_INTERNAL_DEST.dbt_valid_to is null\n and DBT_INTERNAL_SOURCE.dbt_change_type in ('update', 'delete')\n then update\n set dbt_valid_to = DBT_INTERNAL_SOURCE.dbt_valid_to\n\n when not matched\n and DBT_INTERNAL_SOURCE.dbt_change_type = 'insert'\n then insert ({{ insert_cols_csv }})\n values ({{ insert_cols_csv }})\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5765061, "supported_languages": null}, "macro.dbt.strategy_dispatch": {"name": "strategy_dispatch", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.strategy_dispatch", "macro_sql": "{% macro strategy_dispatch(name) -%}\n{% set original_name = name %}\n {% if '.' 
in name %}\n {% set package_name, name = name.split(\".\", 1) %}\n {% else %}\n {% set package_name = none %}\n {% endif %}\n\n {% if package_name is none %}\n {% set package_context = context %}\n {% elif package_name in context %}\n {% set package_context = context[package_name] %}\n {% else %}\n {% set error_msg %}\n Could not find package '{{package_name}}', called with '{{original_name}}'\n {% endset %}\n {{ exceptions.raise_compiler_error(error_msg | trim) }}\n {% endif %}\n\n {%- set search_name = 'snapshot_' ~ name ~ '_strategy' -%}\n\n {% if search_name not in package_context %}\n {% set error_msg %}\n The specified strategy macro '{{name}}' was not found in package '{{ package_name }}'\n {% endset %}\n {{ exceptions.raise_compiler_error(error_msg | trim) }}\n {% endif %}\n {{ return(package_context[search_name]) }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.579957, "supported_languages": null}, "macro.dbt.snapshot_hash_arguments": {"name": "snapshot_hash_arguments", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_hash_arguments", "macro_sql": "{% macro snapshot_hash_arguments(args) -%}\n {{ adapter.dispatch('snapshot_hash_arguments', 'dbt')(args) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__snapshot_hash_arguments"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5801148, "supported_languages": null}, "macro.dbt.default__snapshot_hash_arguments": {"name": "default__snapshot_hash_arguments", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": 
"macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.default__snapshot_hash_arguments", "macro_sql": "{% macro default__snapshot_hash_arguments(args) -%}\n md5({%- for arg in args -%}\n coalesce(cast({{ arg }} as varchar ), '')\n {% if not loop.last %} || '|' || {% endif %}\n {%- endfor -%})\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.580321, "supported_languages": null}, "macro.dbt.snapshot_timestamp_strategy": {"name": "snapshot_timestamp_strategy", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_timestamp_strategy", "macro_sql": "{% macro snapshot_timestamp_strategy(node, snapshotted_rel, current_rel, config, target_exists) %}\n {% set primary_key = config['unique_key'] %}\n {% set updated_at = config['updated_at'] %}\n {% set invalidate_hard_deletes = config.get('invalidate_hard_deletes', false) %}\n\n {#/*\n The snapshot relation might not have an {{ updated_at }} value if the\n snapshot strategy is changed from `check` to `timestamp`. 
We\n should use a dbt-created column for the comparison in the snapshot\n table instead of assuming that the user-supplied {{ updated_at }}\n will be present in the historical data.\n\n See https://github.com/dbt-labs/dbt-core/issues/2350\n */ #}\n {% set row_changed_expr -%}\n ({{ snapshotted_rel }}.dbt_valid_from < {{ current_rel }}.{{ updated_at }})\n {%- endset %}\n\n {% set scd_id_expr = snapshot_hash_arguments([primary_key, updated_at]) %}\n\n {% do return({\n \"unique_key\": primary_key,\n \"updated_at\": updated_at,\n \"row_changed\": row_changed_expr,\n \"scd_id\": scd_id_expr,\n \"invalidate_hard_deletes\": invalidate_hard_deletes\n }) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.snapshot_hash_arguments"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.580993, "supported_languages": null}, "macro.dbt.snapshot_string_as_time": {"name": "snapshot_string_as_time", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_string_as_time", "macro_sql": "{% macro snapshot_string_as_time(timestamp) -%}\n {{ adapter.dispatch('snapshot_string_as_time', 'dbt')(timestamp) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__snapshot_string_as_time"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.581146, "supported_languages": null}, "macro.dbt.default__snapshot_string_as_time": {"name": "default__snapshot_string_as_time", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.default__snapshot_string_as_time", "macro_sql": "{% macro 
default__snapshot_string_as_time(timestamp) %}\n {% do exceptions.raise_not_implemented(\n 'snapshot_string_as_time macro not implemented for adapter '+adapter.type()\n ) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.58131, "supported_languages": null}, "macro.dbt.snapshot_check_all_get_existing_columns": {"name": "snapshot_check_all_get_existing_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_check_all_get_existing_columns", "macro_sql": "{% macro snapshot_check_all_get_existing_columns(node, target_exists, check_cols_config) -%}\n {%- if not target_exists -%}\n {#-- no table yet -> return whatever the query does --#}\n {{ return((false, query_columns)) }}\n {%- endif -%}\n\n {#-- handle any schema changes --#}\n {%- set target_relation = adapter.get_relation(database=node.database, schema=node.schema, identifier=node.alias) -%}\n\n {% if check_cols_config == 'all' %}\n {%- set query_columns = get_columns_in_query(node['compiled_code']) -%}\n\n {% elif check_cols_config is iterable and (check_cols_config | length) > 0 %}\n {#-- query for proper casing/quoting, to support comparison below --#}\n {%- set select_check_cols_from_target -%}\n {#-- N.B. 
The whitespace below is necessary to avoid edge case issue with comments --#}\n {#-- See: https://github.com/dbt-labs/dbt-core/issues/6781 --#}\n select {{ check_cols_config | join(', ') }} from (\n {{ node['compiled_code'] }}\n ) subq\n {%- endset -%}\n {% set query_columns = get_columns_in_query(select_check_cols_from_target) %}\n\n {% else %}\n {% do exceptions.raise_compiler_error(\"Invalid value for 'check_cols': \" ~ check_cols_config) %}\n {% endif %}\n\n {%- set existing_cols = adapter.get_columns_in_relation(target_relation) | map(attribute = 'name') | list -%}\n {%- set ns = namespace() -%} {#-- handle for-loop scoping with a namespace --#}\n {%- set ns.column_added = false -%}\n\n {%- set intersection = [] -%}\n {%- for col in query_columns -%}\n {%- if col in existing_cols -%}\n {%- do intersection.append(adapter.quote(col)) -%}\n {%- else -%}\n {% set ns.column_added = true %}\n {%- endif -%}\n {%- endfor -%}\n {{ return((ns.column_added, intersection)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_columns_in_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5826309, "supported_languages": null}, "macro.dbt.snapshot_check_strategy": {"name": "snapshot_check_strategy", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_check_strategy", "macro_sql": "{% macro snapshot_check_strategy(node, snapshotted_rel, current_rel, config, target_exists) %}\n {% set check_cols_config = config['check_cols'] %}\n {% set primary_key = config['unique_key'] %}\n {% set invalidate_hard_deletes = config.get('invalidate_hard_deletes', false) %}\n {% set updated_at = config.get('updated_at', snapshot_get_time()) %}\n\n {% set column_added = false %}\n\n {% set column_added, check_cols = 
snapshot_check_all_get_existing_columns(node, target_exists, check_cols_config) %}\n\n {%- set row_changed_expr -%}\n (\n {%- if column_added -%}\n {{ get_true_sql() }}\n {%- else -%}\n {%- for col in check_cols -%}\n {{ snapshotted_rel }}.{{ col }} != {{ current_rel }}.{{ col }}\n or\n (\n (({{ snapshotted_rel }}.{{ col }} is null) and not ({{ current_rel }}.{{ col }} is null))\n or\n ((not {{ snapshotted_rel }}.{{ col }} is null) and ({{ current_rel }}.{{ col }} is null))\n )\n {%- if not loop.last %} or {% endif -%}\n {%- endfor -%}\n {%- endif -%}\n )\n {%- endset %}\n\n {% set scd_id_expr = snapshot_hash_arguments([primary_key, updated_at]) %}\n\n {% do return({\n \"unique_key\": primary_key,\n \"updated_at\": updated_at,\n \"row_changed\": row_changed_expr,\n \"scd_id\": scd_id_expr,\n \"invalidate_hard_deletes\": invalidate_hard_deletes\n }) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.snapshot_get_time", "macro.dbt.snapshot_check_all_get_existing_columns", "macro.dbt.get_true_sql", "macro.dbt.snapshot_hash_arguments"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.583895, "supported_languages": null}, "macro.dbt.create_columns": {"name": "create_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.create_columns", "macro_sql": "{% macro create_columns(relation, columns) %}\n {{ adapter.dispatch('create_columns', 'dbt')(relation, columns) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__create_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5878289, "supported_languages": null}, "macro.dbt.default__create_columns": {"name": "default__create_columns", "resource_type": "macro", 
"package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__create_columns", "macro_sql": "{% macro default__create_columns(relation, columns) %}\n {% for column in columns %}\n {% call statement() %}\n alter table {{ relation }} add column \"{{ column.name }}\" {{ column.data_type }};\n {% endcall %}\n {% endfor %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.588088, "supported_languages": null}, "macro.dbt.post_snapshot": {"name": "post_snapshot", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.post_snapshot", "macro_sql": "{% macro post_snapshot(staging_relation) %}\n {{ adapter.dispatch('post_snapshot', 'dbt')(staging_relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__post_snapshot"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5882418, "supported_languages": null}, "macro.dbt.default__post_snapshot": {"name": "default__post_snapshot", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__post_snapshot", "macro_sql": "{% macro default__post_snapshot(staging_relation) %}\n {# no-op #}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.588324, "supported_languages": null}, "macro.dbt.get_true_sql": {"name": "get_true_sql", 
"resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.get_true_sql", "macro_sql": "{% macro get_true_sql() %}\n {{ adapter.dispatch('get_true_sql', 'dbt')() }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_true_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.588455, "supported_languages": null}, "macro.dbt.default__get_true_sql": {"name": "default__get_true_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__get_true_sql", "macro_sql": "{% macro default__get_true_sql() %}\n {{ return('TRUE') }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5885599, "supported_languages": null}, "macro.dbt.snapshot_staging_table": {"name": "snapshot_staging_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.snapshot_staging_table", "macro_sql": "{% macro snapshot_staging_table(strategy, source_sql, target_relation) -%}\n {{ adapter.dispatch('snapshot_staging_table', 'dbt')(strategy, source_sql, target_relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__snapshot_staging_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5887501, "supported_languages": null}, "macro.dbt.default__snapshot_staging_table": {"name": "default__snapshot_staging_table", 
"resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__snapshot_staging_table", "macro_sql": "{% macro default__snapshot_staging_table(strategy, source_sql, target_relation) -%}\n\n with snapshot_query as (\n\n {{ source_sql }}\n\n ),\n\n snapshotted_data as (\n\n select *,\n {{ strategy.unique_key }} as dbt_unique_key\n\n from {{ target_relation }}\n where dbt_valid_to is null\n\n ),\n\n insertions_source_data as (\n\n select\n *,\n {{ strategy.unique_key }} as dbt_unique_key,\n {{ strategy.updated_at }} as dbt_updated_at,\n {{ strategy.updated_at }} as dbt_valid_from,\n nullif({{ strategy.updated_at }}, {{ strategy.updated_at }}) as dbt_valid_to,\n {{ strategy.scd_id }} as dbt_scd_id\n\n from snapshot_query\n ),\n\n updates_source_data as (\n\n select\n *,\n {{ strategy.unique_key }} as dbt_unique_key,\n {{ strategy.updated_at }} as dbt_updated_at,\n {{ strategy.updated_at }} as dbt_valid_from,\n {{ strategy.updated_at }} as dbt_valid_to\n\n from snapshot_query\n ),\n\n {%- if strategy.invalidate_hard_deletes %}\n\n deletes_source_data as (\n\n select\n *,\n {{ strategy.unique_key }} as dbt_unique_key\n from snapshot_query\n ),\n {% endif %}\n\n insertions as (\n\n select\n 'insert' as dbt_change_type,\n source_data.*\n\n from insertions_source_data as source_data\n left outer join snapshotted_data on snapshotted_data.dbt_unique_key = source_data.dbt_unique_key\n where snapshotted_data.dbt_unique_key is null\n or (\n snapshotted_data.dbt_unique_key is not null\n and (\n {{ strategy.row_changed }}\n )\n )\n\n ),\n\n updates as (\n\n select\n 'update' as dbt_change_type,\n source_data.*,\n snapshotted_data.dbt_scd_id\n\n from updates_source_data as source_data\n join snapshotted_data on snapshotted_data.dbt_unique_key = source_data.dbt_unique_key\n where (\n {{ strategy.row_changed }}\n )\n )\n\n {%- if 
strategy.invalidate_hard_deletes -%}\n ,\n\n deletes as (\n\n select\n 'delete' as dbt_change_type,\n source_data.*,\n {{ snapshot_get_time() }} as dbt_valid_from,\n {{ snapshot_get_time() }} as dbt_updated_at,\n {{ snapshot_get_time() }} as dbt_valid_to,\n snapshotted_data.dbt_scd_id\n\n from snapshotted_data\n left join deletes_source_data as source_data on snapshotted_data.dbt_unique_key = source_data.dbt_unique_key\n where source_data.dbt_unique_key is null\n )\n {%- endif %}\n\n select * from insertions\n union all\n select * from updates\n {%- if strategy.invalidate_hard_deletes %}\n union all\n select * from deletes\n {%- endif %}\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.snapshot_get_time"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.58958, "supported_languages": null}, "macro.dbt.build_snapshot_table": {"name": "build_snapshot_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.build_snapshot_table", "macro_sql": "{% macro build_snapshot_table(strategy, sql) -%}\n {{ adapter.dispatch('build_snapshot_table', 'dbt')(strategy, sql) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__build_snapshot_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.589758, "supported_languages": null}, "macro.dbt.default__build_snapshot_table": {"name": "default__build_snapshot_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__build_snapshot_table", "macro_sql": "{% macro default__build_snapshot_table(strategy, sql) %}\n\n select *,\n {{ 
strategy.scd_id }} as dbt_scd_id,\n {{ strategy.updated_at }} as dbt_updated_at,\n {{ strategy.updated_at }} as dbt_valid_from,\n nullif({{ strategy.updated_at }}, {{ strategy.updated_at }}) as dbt_valid_to\n from (\n {{ sql }}\n ) sbq\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.589998, "supported_languages": null}, "macro.dbt.build_snapshot_staging_table": {"name": "build_snapshot_staging_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.build_snapshot_staging_table", "macro_sql": "{% macro build_snapshot_staging_table(strategy, sql, target_relation) %}\n {% set temp_relation = make_temp_relation(target_relation) %}\n\n {% set select = snapshot_staging_table(strategy, sql, target_relation) %}\n\n {% call statement('build_snapshot_staging_relation') %}\n {{ create_table_as(True, temp_relation, select) }}\n {% endcall %}\n\n {% do return(temp_relation) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_temp_relation", "macro.dbt.snapshot_staging_table", "macro.dbt.statement", "macro.dbt.create_table_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.590425, "supported_languages": null}, "macro.dbt.materialization_snapshot_default": {"name": "materialization_snapshot_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/snapshot.sql", "original_file_path": "macros/materializations/snapshots/snapshot.sql", "unique_id": "macro.dbt.materialization_snapshot_default", "macro_sql": "{% materialization snapshot, default %}\n {%- set config = model['config'] -%}\n\n {%- set target_table = model.get('alias', model.get('name')) 
-%}\n\n {%- set strategy_name = config.get('strategy') -%}\n {%- set unique_key = config.get('unique_key') %}\n -- grab current tables grants config for comparision later on\n {%- set grant_config = config.get('grants') -%}\n\n {% set target_relation_exists, target_relation = get_or_create_relation(\n database=model.database,\n schema=model.schema,\n identifier=target_table,\n type='table') -%}\n\n {%- if not target_relation.is_table -%}\n {% do exceptions.relation_wrong_type(target_relation, 'table') %}\n {%- endif -%}\n\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n {% set strategy_macro = strategy_dispatch(strategy_name) %}\n {% set strategy = strategy_macro(model, \"snapshotted_data\", \"source_data\", config, target_relation_exists) %}\n\n {% if not target_relation_exists %}\n\n {% set build_sql = build_snapshot_table(strategy, model['compiled_code']) %}\n {% set final_sql = create_table_as(False, target_relation, build_sql) %}\n\n {% else %}\n\n {{ adapter.valid_snapshot_target(target_relation) }}\n\n {% set staging_table = build_snapshot_staging_table(strategy, sql, target_relation) %}\n\n -- this may no-op if the database does not require column expansion\n {% do adapter.expand_target_column_types(from_relation=staging_table,\n to_relation=target_relation) %}\n\n {% set missing_columns = adapter.get_missing_columns(staging_table, target_relation)\n | rejectattr('name', 'equalto', 'dbt_change_type')\n | rejectattr('name', 'equalto', 'DBT_CHANGE_TYPE')\n | rejectattr('name', 'equalto', 'dbt_unique_key')\n | rejectattr('name', 'equalto', 'DBT_UNIQUE_KEY')\n | list %}\n\n {% do create_columns(target_relation, missing_columns) %}\n\n {% set source_columns = adapter.get_columns_in_relation(staging_table)\n | rejectattr('name', 'equalto', 'dbt_change_type')\n | rejectattr('name', 'equalto', 'DBT_CHANGE_TYPE')\n | rejectattr('name', 'equalto', 'dbt_unique_key')\n | rejectattr('name', 'equalto', 
'DBT_UNIQUE_KEY')\n | list %}\n\n {% set quoted_source_columns = [] %}\n {% for column in source_columns %}\n {% do quoted_source_columns.append(adapter.quote(column.name)) %}\n {% endfor %}\n\n {% set final_sql = snapshot_merge_sql(\n target = target_relation,\n source = staging_table,\n insert_cols = quoted_source_columns\n )\n %}\n\n {% endif %}\n\n {% call statement('main') %}\n {{ final_sql }}\n {% endcall %}\n\n {% set should_revoke = should_revoke(target_relation_exists, full_refresh_mode=False) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {% if not target_relation_exists %}\n {% do create_indexes(target_relation) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {{ adapter.commit() }}\n\n {% if staging_table is defined %}\n {% do post_snapshot(staging_table) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.get_or_create_relation", "macro.dbt.run_hooks", "macro.dbt.strategy_dispatch", "macro.dbt.build_snapshot_table", "macro.dbt.create_table_as", "macro.dbt.build_snapshot_staging_table", "macro.dbt.create_columns", "macro.dbt.snapshot_merge_sql", "macro.dbt.statement", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt.create_indexes", "macro.dbt.post_snapshot"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.5963762, "supported_languages": ["sql"]}, "macro.dbt.materialization_test_default": {"name": "materialization_test_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/test.sql", "original_file_path": "macros/materializations/tests/test.sql", "unique_id": "macro.dbt.materialization_test_default", "macro_sql": "{%- 
materialization test, default -%}\n\n {% set relations = [] %}\n\n {% if should_store_failures() %}\n\n {% set identifier = model['alias'] %}\n {% set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) %}\n {% set target_relation = api.Relation.create(\n identifier=identifier, schema=schema, database=database, type='table') -%} %}\n\n {% if old_relation %}\n {% do adapter.drop_relation(old_relation) %}\n {% endif %}\n\n {% call statement(auto_begin=True) %}\n {{ create_table_as(False, target_relation, sql) }}\n {% endcall %}\n\n {% do relations.append(target_relation) %}\n\n {% set main_sql %}\n select *\n from {{ target_relation }}\n {% endset %}\n\n {{ adapter.commit() }}\n\n {% else %}\n\n {% set main_sql = sql %}\n\n {% endif %}\n\n {% set limit = config.get('limit') %}\n {% set fail_calc = config.get('fail_calc') %}\n {% set warn_if = config.get('warn_if') %}\n {% set error_if = config.get('error_if') %}\n\n {% call statement('main', fetch_result=True) -%}\n\n {{ get_test_sql(main_sql, fail_calc, warn_if, error_if, limit)}}\n\n {%- endcall %}\n\n {{ return({'relations': relations}) }}\n\n{%- endmaterialization -%}", "depends_on": {"macros": ["macro.dbt.should_store_failures", "macro.dbt.statement", "macro.dbt.create_table_as", "macro.dbt.get_test_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.598351, "supported_languages": ["sql"]}, "macro.dbt.get_test_sql": {"name": "get_test_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/helpers.sql", "original_file_path": "macros/materializations/tests/helpers.sql", "unique_id": "macro.dbt.get_test_sql", "macro_sql": "{% macro get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%}\n {{ adapter.dispatch('get_test_sql', 'dbt')(main_sql, fail_calc, warn_if, error_if, limit) }}\n{%- endmacro %}", "depends_on": {"macros": 
["macro.dbt.default__get_test_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.598797, "supported_languages": null}, "macro.dbt.default__get_test_sql": {"name": "default__get_test_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/helpers.sql", "original_file_path": "macros/materializations/tests/helpers.sql", "unique_id": "macro.dbt.default__get_test_sql", "macro_sql": "{% macro default__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%}\n select\n {{ fail_calc }} as failures,\n {{ fail_calc }} {{ warn_if }} as should_warn,\n {{ fail_calc }} {{ error_if }} as should_error\n from (\n {{ main_sql }}\n {{ \"limit \" ~ limit if limit != none }}\n ) dbt_internal_test\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.599085, "supported_languages": null}, "macro.dbt.get_where_subquery": {"name": "get_where_subquery", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/where_subquery.sql", "original_file_path": "macros/materializations/tests/where_subquery.sql", "unique_id": "macro.dbt.get_where_subquery", "macro_sql": "{% macro get_where_subquery(relation) -%}\n {% do return(adapter.dispatch('get_where_subquery', 'dbt')(relation)) %}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_where_subquery"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.599436, "supported_languages": null}, "macro.dbt.default__get_where_subquery": {"name": "default__get_where_subquery", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/where_subquery.sql", "original_file_path": "macros/materializations/tests/where_subquery.sql", 
"unique_id": "macro.dbt.default__get_where_subquery", "macro_sql": "{% macro default__get_where_subquery(relation) -%}\n {% set where = config.get('where', '') %}\n {% if where %}\n {%- set filtered -%}\n (select * from {{ relation }} where {{ where }}) dbt_subquery\n {%- endset -%}\n {% do return(filtered) %}\n {%- else -%}\n {% do return(relation) %}\n {%- endif -%}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.599782, "supported_languages": null}, "macro.dbt.get_quoted_csv": {"name": "get_quoted_csv", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.get_quoted_csv", "macro_sql": "{% macro get_quoted_csv(column_names) %}\n\n {% set quoted = [] %}\n {% for col in column_names -%}\n {%- do quoted.append(adapter.quote(col)) -%}\n {%- endfor %}\n\n {%- set dest_cols_csv = quoted | join(', ') -%}\n {{ return(dest_cols_csv) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.601281, "supported_languages": null}, "macro.dbt.diff_columns": {"name": "diff_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.diff_columns", "macro_sql": "{% macro diff_columns(source_columns, target_columns) %}\n\n {% set result = [] %}\n {% set source_names = source_columns | map(attribute = 'column') | list %}\n {% set target_names = target_columns | map(attribute = 'column') | list %}\n\n {# --check whether the name attribute exists in the target - 
this does not perform a data type check #}\n {% for sc in source_columns %}\n {% if sc.name not in target_names %}\n {{ result.append(sc) }}\n {% endif %}\n {% endfor %}\n\n {{ return(result) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.601794, "supported_languages": null}, "macro.dbt.diff_column_data_types": {"name": "diff_column_data_types", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.diff_column_data_types", "macro_sql": "{% macro diff_column_data_types(source_columns, target_columns) %}\n\n {% set result = [] %}\n {% for sc in source_columns %}\n {% set tc = target_columns | selectattr(\"name\", \"equalto\", sc.name) | list | first %}\n {% if tc %}\n {% if sc.data_type != tc.data_type and not sc.can_expand_to(other_column=tc) %}\n {{ result.append( { 'column_name': tc.name, 'new_type': sc.data_type } ) }}\n {% endif %}\n {% endif %}\n {% endfor %}\n\n {{ return(result) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.602409, "supported_languages": null}, "macro.dbt.get_merge_update_columns": {"name": "get_merge_update_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.get_merge_update_columns", "macro_sql": "{% macro get_merge_update_columns(merge_update_columns, merge_exclude_columns, dest_columns) %}\n {{ return(adapter.dispatch('get_merge_update_columns', 'dbt')(merge_update_columns, 
merge_exclude_columns, dest_columns)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_merge_update_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.602627, "supported_languages": null}, "macro.dbt.default__get_merge_update_columns": {"name": "default__get_merge_update_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.default__get_merge_update_columns", "macro_sql": "{% macro default__get_merge_update_columns(merge_update_columns, merge_exclude_columns, dest_columns) %}\n {%- set default_cols = dest_columns | map(attribute=\"quoted\") | list -%}\n\n {%- if merge_update_columns and merge_exclude_columns -%}\n {{ exceptions.raise_compiler_error(\n 'Model cannot specify merge_update_columns and merge_exclude_columns. 
Please update model to use only one config'\n )}}\n {%- elif merge_update_columns -%}\n {%- set update_columns = merge_update_columns -%}\n {%- elif merge_exclude_columns -%}\n {%- set update_columns = [] -%}\n {%- for column in dest_columns -%}\n {% if column.column | lower not in merge_exclude_columns | map(\"lower\") | list %}\n {%- do update_columns.append(column.quoted) -%}\n {% endif %}\n {%- endfor -%}\n {%- else -%}\n {%- set update_columns = default_cols -%}\n {%- endif -%}\n\n {{ return(update_columns) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.603294, "supported_languages": null}, "macro.dbt.get_merge_sql": {"name": "get_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.get_merge_sql", "macro_sql": "{% macro get_merge_sql(target, source, unique_key, dest_columns, incremental_predicates=none) -%}\n -- back compat for old kwarg name\n {% set incremental_predicates = kwargs.get('predicates', incremental_predicates) %}\n {{ adapter.dispatch('get_merge_sql', 'dbt')(target, source, unique_key, dest_columns, incremental_predicates) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.609513, "supported_languages": null}, "macro.dbt.default__get_merge_sql": {"name": "default__get_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.default__get_merge_sql", "macro_sql": "{% macro 
default__get_merge_sql(target, source, unique_key, dest_columns, incremental_predicates=none) -%}\n {%- set predicates = [] if incremental_predicates is none else [] + incremental_predicates -%}\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n {%- set merge_update_columns = config.get('merge_update_columns') -%}\n {%- set merge_exclude_columns = config.get('merge_exclude_columns') -%}\n {%- set update_columns = get_merge_update_columns(merge_update_columns, merge_exclude_columns, dest_columns) -%}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {% if unique_key %}\n {% if unique_key is sequence and unique_key is not mapping and unique_key is not string %}\n {% for key in unique_key %}\n {% set this_key_match %}\n DBT_INTERNAL_SOURCE.{{ key }} = DBT_INTERNAL_DEST.{{ key }}\n {% endset %}\n {% do predicates.append(this_key_match) %}\n {% endfor %}\n {% else %}\n {% set unique_key_match %}\n DBT_INTERNAL_SOURCE.{{ unique_key }} = DBT_INTERNAL_DEST.{{ unique_key }}\n {% endset %}\n {% do predicates.append(unique_key_match) %}\n {% endif %}\n {% else %}\n {% do predicates.append('FALSE') %}\n {% endif %}\n\n {{ sql_header if sql_header is not none }}\n\n merge into {{ target }} as DBT_INTERNAL_DEST\n using {{ source }} as DBT_INTERNAL_SOURCE\n on {{\"(\" ~ predicates | join(\") and (\") ~ \")\"}}\n\n {% if unique_key %}\n when matched then update set\n {% for column_name in update_columns -%}\n {{ column_name }} = DBT_INTERNAL_SOURCE.{{ column_name }}\n {%- if not loop.last %}, {%- endif %}\n {%- endfor %}\n {% endif %}\n\n when not matched then insert\n ({{ dest_cols_csv }})\n values\n ({{ dest_cols_csv }})\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv", "macro.dbt.get_merge_update_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.611198, "supported_languages": null}, 
"macro.dbt.get_delete_insert_merge_sql": {"name": "get_delete_insert_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.get_delete_insert_merge_sql", "macro_sql": "{% macro get_delete_insert_merge_sql(target, source, unique_key, dest_columns, incremental_predicates) -%}\n {{ adapter.dispatch('get_delete_insert_merge_sql', 'dbt')(target, source, unique_key, dest_columns, incremental_predicates) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_delete_insert_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.611443, "supported_languages": null}, "macro.dbt.default__get_delete_insert_merge_sql": {"name": "default__get_delete_insert_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.default__get_delete_insert_merge_sql", "macro_sql": "{% macro default__get_delete_insert_merge_sql(target, source, unique_key, dest_columns, incremental_predicates) -%}\n\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n\n {% if unique_key %}\n {% if unique_key is sequence and unique_key is not string %}\n delete from {{target }}\n using {{ source }}\n where (\n {% for key in unique_key %}\n {{ source }}.{{ key }} = {{ target }}.{{ key }}\n {{ \"and \" if not loop.last}}\n {% endfor %}\n {% if incremental_predicates %}\n {% for predicate in incremental_predicates %}\n and {{ predicate }}\n {% endfor %}\n {% endif %}\n );\n {% else %}\n delete from {{ target }}\n where (\n {{ unique_key }}) in (\n select ({{ unique_key }})\n from {{ source }}\n )\n {%- if incremental_predicates %}\n 
{% for predicate in incremental_predicates %}\n and {{ predicate }}\n {% endfor %}\n {%- endif -%};\n\n {% endif %}\n {% endif %}\n\n insert into {{ target }} ({{ dest_cols_csv }})\n (\n select {{ dest_cols_csv }}\n from {{ source }}\n )\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.612374, "supported_languages": null}, "macro.dbt.get_insert_overwrite_merge_sql": {"name": "get_insert_overwrite_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.get_insert_overwrite_merge_sql", "macro_sql": "{% macro get_insert_overwrite_merge_sql(target, source, dest_columns, predicates, include_sql_header=false) -%}\n {{ adapter.dispatch('get_insert_overwrite_merge_sql', 'dbt')(target, source, dest_columns, predicates, include_sql_header) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_insert_overwrite_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.612624, "supported_languages": null}, "macro.dbt.default__get_insert_overwrite_merge_sql": {"name": "default__get_insert_overwrite_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.default__get_insert_overwrite_merge_sql", "macro_sql": "{% macro default__get_insert_overwrite_merge_sql(target, source, dest_columns, predicates, include_sql_header) -%}\n {#-- The only time include_sql_header is True: --#}\n {#-- BigQuery + insert_overwrite strategy + \"static\" partitions config --#}\n {#-- We should 
consider including the sql header at the materialization level instead --#}\n\n {%- set predicates = [] if predicates is none else [] + predicates -%}\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none and include_sql_header }}\n\n merge into {{ target }} as DBT_INTERNAL_DEST\n using {{ source }} as DBT_INTERNAL_SOURCE\n on FALSE\n\n when not matched by source\n {% if predicates %} and {{ predicates | join(' and ') }} {% endif %}\n then delete\n\n when not matched then insert\n ({{ dest_cols_csv }})\n values\n ({{ dest_cols_csv }})\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.61323, "supported_languages": null}, "macro.dbt.is_incremental": {"name": "is_incremental", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/is_incremental.sql", "original_file_path": "macros/materializations/models/incremental/is_incremental.sql", "unique_id": "macro.dbt.is_incremental", "macro_sql": "{% macro is_incremental() %}\n {#-- do not run introspective queries in parsing #}\n {% if not execute %}\n {{ return(False) }}\n {% else %}\n {% set relation = adapter.get_relation(this.database, this.schema, this.table) %}\n {{ return(relation is not none\n and relation.type == 'table'\n and model.config.materialized == 'incremental'\n and not should_full_refresh()) }}\n {% endif %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.should_full_refresh"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.613816, "supported_languages": null}, "macro.dbt.get_incremental_append_sql": {"name": "get_incremental_append_sql", "resource_type": "macro", 
"package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_append_sql", "macro_sql": "{% macro get_incremental_append_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_append_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_incremental_append_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.614646, "supported_languages": null}, "macro.dbt.default__get_incremental_append_sql": {"name": "default__get_incremental_append_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.default__get_incremental_append_sql", "macro_sql": "{% macro default__get_incremental_append_sql(arg_dict) %}\n\n {% do return(get_insert_into_sql(arg_dict[\"target_relation\"], arg_dict[\"temp_relation\"], arg_dict[\"dest_columns\"])) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_insert_into_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.614862, "supported_languages": null}, "macro.dbt.get_incremental_delete_insert_sql": {"name": "get_incremental_delete_insert_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_delete_insert_sql", "macro_sql": "{% macro get_incremental_delete_insert_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_delete_insert_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", 
"depends_on": {"macros": ["macro.dbt.default__get_incremental_delete_insert_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.615038, "supported_languages": null}, "macro.dbt.default__get_incremental_delete_insert_sql": {"name": "default__get_incremental_delete_insert_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.default__get_incremental_delete_insert_sql", "macro_sql": "{% macro default__get_incremental_delete_insert_sql(arg_dict) %}\n\n {% do return(get_delete_insert_merge_sql(arg_dict[\"target_relation\"], arg_dict[\"temp_relation\"], arg_dict[\"unique_key\"], arg_dict[\"dest_columns\"], arg_dict[\"incremental_predicates\"])) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_delete_insert_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.615315, "supported_languages": null}, "macro.dbt.get_incremental_merge_sql": {"name": "get_incremental_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_merge_sql", "macro_sql": "{% macro get_incremental_merge_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_merge_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_incremental_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6155012, "supported_languages": null}, "macro.dbt.default__get_incremental_merge_sql": {"name": 
"default__get_incremental_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.default__get_incremental_merge_sql", "macro_sql": "{% macro default__get_incremental_merge_sql(arg_dict) %}\n\n {% do return(get_merge_sql(arg_dict[\"target_relation\"], arg_dict[\"temp_relation\"], arg_dict[\"unique_key\"], arg_dict[\"dest_columns\"], arg_dict[\"incremental_predicates\"])) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6157749, "supported_languages": null}, "macro.dbt.get_incremental_insert_overwrite_sql": {"name": "get_incremental_insert_overwrite_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_insert_overwrite_sql", "macro_sql": "{% macro get_incremental_insert_overwrite_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_insert_overwrite_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_incremental_insert_overwrite_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.615954, "supported_languages": null}, "macro.dbt.default__get_incremental_insert_overwrite_sql": {"name": "default__get_incremental_insert_overwrite_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": 
"macro.dbt.default__get_incremental_insert_overwrite_sql", "macro_sql": "{% macro default__get_incremental_insert_overwrite_sql(arg_dict) %}\n\n {% do return(get_insert_overwrite_merge_sql(arg_dict[\"target_relation\"], arg_dict[\"temp_relation\"], arg_dict[\"dest_columns\"], arg_dict[\"incremental_predicates\"])) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_insert_overwrite_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.616199, "supported_languages": null}, "macro.dbt.get_incremental_default_sql": {"name": "get_incremental_default_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_default_sql", "macro_sql": "{% macro get_incremental_default_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_default_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_incremental_default_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.616377, "supported_languages": null}, "macro.dbt.default__get_incremental_default_sql": {"name": "default__get_incremental_default_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.default__get_incremental_default_sql", "macro_sql": "{% macro default__get_incremental_default_sql(arg_dict) %}\n\n {% do return(get_incremental_append_sql(arg_dict)) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_incremental_append_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": 
null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6165152, "supported_languages": null}, "macro.dbt.get_insert_into_sql": {"name": "get_insert_into_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_insert_into_sql", "macro_sql": "{% macro get_insert_into_sql(target_relation, temp_relation, dest_columns) %}\n\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n\n insert into {{ target_relation }} ({{ dest_cols_csv }})\n (\n select {{ dest_cols_csv }}\n from {{ temp_relation }}\n )\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.616771, "supported_languages": null}, "macro.dbt.materialization_incremental_default": {"name": "materialization_incremental_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/incremental.sql", "original_file_path": "macros/materializations/models/incremental/incremental.sql", "unique_id": "macro.dbt.materialization_incremental_default", "macro_sql": "{% materialization incremental, default -%}\n\n -- relations\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='table') -%}\n {%- set temp_relation = make_temp_relation(target_relation)-%}\n {%- set intermediate_relation = make_intermediate_relation(target_relation)-%}\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n\n -- configs\n {%- set unique_key = config.get('unique_key') -%}\n {%- set full_refresh_mode = (should_full_refresh() or 
existing_relation.is_view) -%}\n {%- set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') -%}\n\n -- the temp_ and backup_ relations should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation. This has to happen before\n -- BEGIN, in a separate transaction\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation)-%}\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n {% set to_drop = [] %}\n\n {% if existing_relation is none %}\n {% set build_sql = get_create_table_as_sql(False, target_relation, sql) %}\n {% elif full_refresh_mode %}\n {% set build_sql = get_create_table_as_sql(False, intermediate_relation, sql) %}\n {% set need_swap = true %}\n {% else %}\n {% do run_query(get_create_table_as_sql(True, temp_relation, sql)) %}\n {% do adapter.expand_target_column_types(\n from_relation=temp_relation,\n to_relation=target_relation) %}\n {#-- Process schema changes. Returns dict of changes if successful. 
Use source columns for upserting/merging --#}\n {% set dest_columns = process_schema_changes(on_schema_change, temp_relation, existing_relation) %}\n {% if not dest_columns %}\n {% set dest_columns = adapter.get_columns_in_relation(existing_relation) %}\n {% endif %}\n\n {#-- Get the incremental_strategy, the macro to use for the strategy, and build the sql --#}\n {% set incremental_strategy = config.get('incremental_strategy') or 'default' %}\n {% set incremental_predicates = config.get('predicates', none) or config.get('incremental_predicates', none) %}\n {% set strategy_sql_macro_func = adapter.get_incremental_strategy_macro(context, incremental_strategy) %}\n {% set strategy_arg_dict = ({'target_relation': target_relation, 'temp_relation': temp_relation, 'unique_key': unique_key, 'dest_columns': dest_columns, 'incremental_predicates': incremental_predicates }) %}\n {% set build_sql = strategy_sql_macro_func(strategy_arg_dict) %}\n\n {% endif %}\n\n {% call statement(\"main\") %}\n {{ build_sql }}\n {% endcall %}\n\n {% if need_swap %}\n {% do adapter.rename_relation(target_relation, backup_relation) %}\n {% do adapter.rename_relation(intermediate_relation, target_relation) %}\n {% do to_drop.append(backup_relation) %}\n {% endif %}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {% if existing_relation is none or existing_relation.is_view or should_full_refresh() %}\n {% do create_indexes(target_relation) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n -- `COMMIT` happens here\n {% do adapter.commit() %}\n\n {% for rel in to_drop %}\n {% do adapter.drop_relation(rel) %}\n {% endfor %}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{%- endmaterialization %}", "depends_on": {"macros": 
["macro.dbt.load_cached_relation", "macro.dbt.make_temp_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.should_full_refresh", "macro.dbt.incremental_validate_on_schema_change", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.get_create_table_as_sql", "macro.dbt.run_query", "macro.dbt.process_schema_changes", "macro.dbt.statement", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt.create_indexes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.621505, "supported_languages": ["sql"]}, "macro.dbt.incremental_validate_on_schema_change": {"name": "incremental_validate_on_schema_change", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/on_schema_change.sql", "original_file_path": "macros/materializations/models/incremental/on_schema_change.sql", "unique_id": "macro.dbt.incremental_validate_on_schema_change", "macro_sql": "{% macro incremental_validate_on_schema_change(on_schema_change, default='ignore') %}\n\n {% if on_schema_change not in ['sync_all_columns', 'append_new_columns', 'fail', 'ignore'] %}\n\n {% set log_message = 'Invalid value for on_schema_change (%s) specified. Setting default value of %s.' 
% (on_schema_change, default) %}\n {% do log(log_message) %}\n\n {{ return(default) }}\n\n {% else %}\n\n {{ return(on_schema_change) }}\n\n {% endif %}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.627012, "supported_languages": null}, "macro.dbt.check_for_schema_changes": {"name": "check_for_schema_changes", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/on_schema_change.sql", "original_file_path": "macros/materializations/models/incremental/on_schema_change.sql", "unique_id": "macro.dbt.check_for_schema_changes", "macro_sql": "{% macro check_for_schema_changes(source_relation, target_relation) %}\n\n {% set schema_changed = False %}\n\n {%- set source_columns = adapter.get_columns_in_relation(source_relation) -%}\n {%- set target_columns = adapter.get_columns_in_relation(target_relation) -%}\n {%- set source_not_in_target = diff_columns(source_columns, target_columns) -%}\n {%- set target_not_in_source = diff_columns(target_columns, source_columns) -%}\n\n {% set new_target_types = diff_column_data_types(source_columns, target_columns) %}\n\n {% if source_not_in_target != [] %}\n {% set schema_changed = True %}\n {% elif target_not_in_source != [] or new_target_types != [] %}\n {% set schema_changed = True %}\n {% elif new_target_types != [] %}\n {% set schema_changed = True %}\n {% endif %}\n\n {% set changes_dict = {\n 'schema_changed': schema_changed,\n 'source_not_in_target': source_not_in_target,\n 'target_not_in_source': target_not_in_source,\n 'source_columns': source_columns,\n 'target_columns': target_columns,\n 'new_target_types': new_target_types\n } %}\n\n {% set msg %}\n In {{ target_relation }}:\n Schema changed: {{ schema_changed }}\n Source columns not in target: {{ source_not_in_target }}\n Target columns not in source: {{ target_not_in_source }}\n New 
column types: {{ new_target_types }}\n {% endset %}\n\n {% do log(msg) %}\n\n {{ return(changes_dict) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.diff_columns", "macro.dbt.diff_column_data_types"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.62815, "supported_languages": null}, "macro.dbt.sync_column_schemas": {"name": "sync_column_schemas", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/on_schema_change.sql", "original_file_path": "macros/materializations/models/incremental/on_schema_change.sql", "unique_id": "macro.dbt.sync_column_schemas", "macro_sql": "{% macro sync_column_schemas(on_schema_change, target_relation, schema_changes_dict) %}\n\n {%- set add_to_target_arr = schema_changes_dict['source_not_in_target'] -%}\n\n {%- if on_schema_change == 'append_new_columns'-%}\n {%- if add_to_target_arr | length > 0 -%}\n {%- do alter_relation_add_remove_columns(target_relation, add_to_target_arr, none) -%}\n {%- endif -%}\n\n {% elif on_schema_change == 'sync_all_columns' %}\n {%- set remove_from_target_arr = schema_changes_dict['target_not_in_source'] -%}\n {%- set new_target_types = schema_changes_dict['new_target_types'] -%}\n\n {% if add_to_target_arr | length > 0 or remove_from_target_arr | length > 0 %}\n {%- do alter_relation_add_remove_columns(target_relation, add_to_target_arr, remove_from_target_arr) -%}\n {% endif %}\n\n {% if new_target_types != [] %}\n {% for ntt in new_target_types %}\n {% set column_name = ntt['column_name'] %}\n {% set new_type = ntt['new_type'] %}\n {% do alter_column_type(target_relation, column_name, new_type) %}\n {% endfor %}\n {% endif %}\n\n {% endif %}\n\n {% set schema_change_message %}\n In {{ target_relation }}:\n Schema change approach: {{ on_schema_change }}\n Columns added: {{ add_to_target_arr }}\n Columns removed: {{ remove_from_target_arr }}\n Data types 
changed: {{ new_target_types }}\n {% endset %}\n\n {% do log(schema_change_message) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.alter_relation_add_remove_columns", "macro.dbt.alter_column_type"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.629262, "supported_languages": null}, "macro.dbt.process_schema_changes": {"name": "process_schema_changes", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/on_schema_change.sql", "original_file_path": "macros/materializations/models/incremental/on_schema_change.sql", "unique_id": "macro.dbt.process_schema_changes", "macro_sql": "{% macro process_schema_changes(on_schema_change, source_relation, target_relation) %}\n\n {% if on_schema_change == 'ignore' %}\n\n {{ return({}) }}\n\n {% else %}\n\n {% set schema_changes_dict = check_for_schema_changes(source_relation, target_relation) %}\n\n {% if schema_changes_dict['schema_changed'] %}\n\n {% if on_schema_change == 'fail' %}\n\n {% set fail_msg %}\n The source and target schemas on this incremental model are out of sync!\n They can be reconciled in several ways:\n - set the `on_schema_change` config to either append_new_columns or sync_all_columns, depending on your situation.\n - Re-run the incremental model with `full_refresh: True` to update the target schema.\n - update the schema manually and re-run the process.\n\n Additional troubleshooting context:\n Source columns not in target: {{ schema_changes_dict['source_not_in_target'] }}\n Target columns not in source: {{ schema_changes_dict['target_not_in_source'] }}\n New column types: {{ schema_changes_dict['new_target_types'] }}\n {% endset %}\n\n {% do exceptions.raise_compiler_error(fail_msg) %}\n\n {# -- unless we ignore, run the sync operation per the config #}\n {% else %}\n\n {% do sync_column_schemas(on_schema_change, target_relation, schema_changes_dict) 
%}\n\n {% endif %}\n\n {% endif %}\n\n {{ return(schema_changes_dict['source_columns']) }}\n\n {% endif %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.check_for_schema_changes", "macro.dbt.sync_column_schemas"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.63006, "supported_languages": null}, "macro.dbt.materialization_materialized_view_default": {"name": "materialization_materialized_view_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialization_materialized_view_default", "macro_sql": "{% materialization materialized_view, default %}\n {% set existing_relation = load_cached_relation(this) %}\n {% set target_relation = this.incorporate(type=this.MaterializedView) %}\n {% set intermediate_relation = make_intermediate_relation(target_relation) %}\n {% set backup_relation_type = target_relation.MaterializedView if existing_relation is none else existing_relation.type %}\n {% set backup_relation = make_backup_relation(target_relation, backup_relation_type) %}\n\n {{ materialized_view_setup(backup_relation, intermediate_relation, pre_hooks) }}\n\n {% set build_sql = materialized_view_get_build_sql(existing_relation, target_relation, backup_relation, intermediate_relation) %}\n\n {% if build_sql == '' %}\n {{ materialized_view_execute_no_op(target_relation) }}\n {% else %}\n {{ materialized_view_execute_build_sql(build_sql, existing_relation, target_relation, post_hooks) }}\n {% endif %}\n\n {{ materialized_view_teardown(backup_relation, intermediate_relation, post_hooks) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", 
"macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.materialized_view_setup", "macro.dbt.materialized_view_get_build_sql", "macro.dbt.materialized_view_execute_no_op", "macro.dbt.materialized_view_execute_build_sql", "macro.dbt.materialized_view_teardown"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6347861, "supported_languages": ["sql"]}, "macro.dbt.materialized_view_setup": {"name": "materialized_view_setup", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_setup", "macro_sql": "{% macro materialized_view_setup(backup_relation, intermediate_relation, pre_hooks) %}\n\n -- backup_relation and intermediate_relation should not already exist in the database\n -- it's possible these exist because of a previous run that exited unexpectedly\n {% set preexisting_backup_relation = load_cached_relation(backup_relation) %}\n {% set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) %}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6351528, "supported_languages": null}, "macro.dbt.materialized_view_teardown": {"name": "materialized_view_teardown", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_teardown", "macro_sql": "{% macro materialized_view_teardown(backup_relation, intermediate_relation, post_hooks) %}\n\n -- drop the temp relations if they exist to leave the database clean for the next run\n {{ drop_relation_if_exists(backup_relation) }}\n {{ drop_relation_if_exists(intermediate_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6353862, "supported_languages": null}, "macro.dbt.materialized_view_get_build_sql": {"name": "materialized_view_get_build_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_get_build_sql", "macro_sql": "{% macro materialized_view_get_build_sql(existing_relation, target_relation, backup_relation, intermediate_relation) %}\n\n {% set full_refresh_mode = should_full_refresh() %}\n\n -- determine the scenario we're in: create, full_refresh, alter, refresh data\n {% if existing_relation is none %}\n {% set build_sql = get_create_materialized_view_as_sql(target_relation, sql) %}\n {% elif full_refresh_mode or not existing_relation.is_materialized_view %}\n {% set build_sql = get_replace_materialized_view_as_sql(target_relation, sql, existing_relation, backup_relation, intermediate_relation) %}\n {% else %}\n\n -- get config options\n {% set on_configuration_change = config.get('on_configuration_change') %}\n {% set configuration_changes = 
get_materialized_view_configuration_changes(existing_relation, config) %}\n\n {% if configuration_changes is none %}\n {% set build_sql = refresh_materialized_view(target_relation) %}\n\n {% elif on_configuration_change == 'apply' %}\n {% set build_sql = get_alter_materialized_view_as_sql(target_relation, configuration_changes, sql, existing_relation, backup_relation, intermediate_relation) %}\n {% elif on_configuration_change == 'continue' %}\n {% set build_sql = '' %}\n {{ exceptions.warn(\"Configuration changes were identified and `on_configuration_change` was set to `continue` for `\" ~ target_relation ~ \"`\") }}\n {% elif on_configuration_change == 'fail' %}\n {{ exceptions.raise_fail_fast_error(\"Configuration changes were identified and `on_configuration_change` was set to `fail` for `\" ~ target_relation ~ \"`\") }}\n\n {% else %}\n -- this only happens if the user provides a value other than `apply`, 'skip', 'fail'\n {{ exceptions.raise_compiler_error(\"Unexpected configuration scenario\") }}\n\n {% endif %}\n\n {% endif %}\n\n {% do return(build_sql) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.should_full_refresh", "macro.dbt.get_create_materialized_view_as_sql", "macro.dbt.get_replace_materialized_view_as_sql", "macro.dbt.get_materialized_view_configuration_changes", "macro.dbt.refresh_materialized_view", "macro.dbt.get_alter_materialized_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.636616, "supported_languages": null}, "macro.dbt.materialized_view_execute_no_op": {"name": "materialized_view_execute_no_op", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_execute_no_op", "macro_sql": "{% macro 
materialized_view_execute_no_op(target_relation) %}\n {% do store_raw_result(\n name=\"main\",\n message=\"skip \" ~ target_relation,\n code=\"skip\",\n rows_affected=\"-1\"\n ) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.636835, "supported_languages": null}, "macro.dbt.materialized_view_execute_build_sql": {"name": "materialized_view_execute_build_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_execute_build_sql", "macro_sql": "{% macro materialized_view_execute_build_sql(build_sql, existing_relation, target_relation, post_hooks) %}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n {% set grant_config = config.get('grants') %}\n\n {% call statement(name=\"main\") %}\n {{ build_sql }}\n {% endcall %}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {{ adapter.commit() }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_hooks", "macro.dbt.statement", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.637456, "supported_languages": null}, "macro.dbt.get_materialized_view_configuration_changes": {"name": "get_materialized_view_configuration_changes", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/materializations/models/materialized_view/get_materialized_view_configuration_changes.sql", "original_file_path": "macros/materializations/models/materialized_view/get_materialized_view_configuration_changes.sql", "unique_id": "macro.dbt.get_materialized_view_configuration_changes", "macro_sql": "{% macro get_materialized_view_configuration_changes(existing_relation, new_config) %}\n /* {#\n It's recommended that configuration changes be formatted as follows:\n {\"\": [{\"action\": \"\", \"context\": ...}]}\n\n For example:\n {\n \"indexes\": [\n {\"action\": \"drop\", \"context\": \"index_abc\"},\n {\"action\": \"create\", \"context\": {\"columns\": [\"column_1\", \"column_2\"], \"type\": \"hash\", \"unique\": True}},\n ],\n }\n\n Either way, `get_materialized_view_configuration_changes` needs to align with `get_alter_materialized_view_as_sql`.\n #} */\n {{- log('Determining configuration changes on: ' ~ existing_relation) -}}\n {%- do return(adapter.dispatch('get_materialized_view_configuration_changes', 'dbt')(existing_relation, new_config)) -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_materialized_view_configuration_changes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.637855, "supported_languages": null}, "macro.dbt.default__get_materialized_view_configuration_changes": {"name": "default__get_materialized_view_configuration_changes", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/get_materialized_view_configuration_changes.sql", "original_file_path": "macros/materializations/models/materialized_view/get_materialized_view_configuration_changes.sql", "unique_id": "macro.dbt.default__get_materialized_view_configuration_changes", "macro_sql": "{% macro default__get_materialized_view_configuration_changes(existing_relation, new_config) %}\n {{ 
exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.63799, "supported_languages": null}, "macro.dbt.get_alter_materialized_view_as_sql": {"name": "get_alter_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/alter_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/alter_materialized_view.sql", "unique_id": "macro.dbt.get_alter_materialized_view_as_sql", "macro_sql": "{% macro get_alter_materialized_view_as_sql(\n relation,\n configuration_changes,\n sql,\n existing_relation,\n backup_relation,\n intermediate_relation\n) %}\n {{- log('Applying ALTER to: ' ~ relation) -}}\n {{- adapter.dispatch('get_alter_materialized_view_as_sql', 'dbt')(\n relation,\n configuration_changes,\n sql,\n existing_relation,\n backup_relation,\n intermediate_relation\n ) -}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_alter_materialized_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.638437, "supported_languages": null}, "macro.dbt.default__get_alter_materialized_view_as_sql": {"name": "default__get_alter_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/alter_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/alter_materialized_view.sql", "unique_id": "macro.dbt.default__get_alter_materialized_view_as_sql", "macro_sql": "{% macro default__get_alter_materialized_view_as_sql(\n relation,\n configuration_changes,\n sql,\n existing_relation,\n backup_relation,\n 
intermediate_relation\n) %}\n {{ exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.638608, "supported_languages": null}, "macro.dbt.refresh_materialized_view": {"name": "refresh_materialized_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/refresh_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/refresh_materialized_view.sql", "unique_id": "macro.dbt.refresh_materialized_view", "macro_sql": "{% macro refresh_materialized_view(relation) %}\n {{- log('Applying REFRESH to: ' ~ relation) -}}\n {{- adapter.dispatch('refresh_materialized_view', 'dbt')(relation) -}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__refresh_materialized_view"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6388931, "supported_languages": null}, "macro.dbt.default__refresh_materialized_view": {"name": "default__refresh_materialized_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/refresh_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/refresh_materialized_view.sql", "unique_id": "macro.dbt.default__refresh_materialized_view", "macro_sql": "{% macro default__refresh_materialized_view(relation) %}\n {{ exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.639024, "supported_languages": null}, 
"macro.dbt.get_replace_materialized_view_as_sql": {"name": "get_replace_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/replace_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/replace_materialized_view.sql", "unique_id": "macro.dbt.get_replace_materialized_view_as_sql", "macro_sql": "{% macro get_replace_materialized_view_as_sql(relation, sql, existing_relation, backup_relation, intermediate_relation) %}\n {{- log('Applying REPLACE to: ' ~ relation) -}}\n {{- adapter.dispatch('get_replace_materialized_view_as_sql', 'dbt')(relation, sql, existing_relation, backup_relation, intermediate_relation) -}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_replace_materialized_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.639423, "supported_languages": null}, "macro.dbt.default__get_replace_materialized_view_as_sql": {"name": "default__get_replace_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/replace_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/replace_materialized_view.sql", "unique_id": "macro.dbt.default__get_replace_materialized_view_as_sql", "macro_sql": "{% macro default__get_replace_materialized_view_as_sql(relation, sql, existing_relation, backup_relation, intermediate_relation) %}\n {{ exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.639648, "supported_languages": null}, "macro.dbt.get_create_materialized_view_as_sql": {"name": 
"get_create_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/create_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/create_materialized_view.sql", "unique_id": "macro.dbt.get_create_materialized_view_as_sql", "macro_sql": "{% macro get_create_materialized_view_as_sql(relation, sql) -%}\n {{- log('Applying CREATE to: ' ~ relation) -}}\n {{- adapter.dispatch('get_create_materialized_view_as_sql', 'dbt')(relation, sql) -}}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_create_materialized_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.639955, "supported_languages": null}, "macro.dbt.default__get_create_materialized_view_as_sql": {"name": "default__get_create_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/create_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/create_materialized_view.sql", "unique_id": "macro.dbt.default__get_create_materialized_view_as_sql", "macro_sql": "{% macro default__get_create_materialized_view_as_sql(relation, sql) -%}\n {{ exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.640086, "supported_languages": null}, "macro.dbt.can_clone_table": {"name": "can_clone_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/can_clone_table.sql", "original_file_path": "macros/materializations/models/clone/can_clone_table.sql", "unique_id": "macro.dbt.can_clone_table", "macro_sql": "{% 
macro can_clone_table() %}\n {{ return(adapter.dispatch('can_clone_table', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__can_clone_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6403022, "supported_languages": null}, "macro.dbt.default__can_clone_table": {"name": "default__can_clone_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/can_clone_table.sql", "original_file_path": "macros/materializations/models/clone/can_clone_table.sql", "unique_id": "macro.dbt.default__can_clone_table", "macro_sql": "{% macro default__can_clone_table() %}\n {{ return(False) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.640428, "supported_languages": null}, "macro.dbt.create_or_replace_clone": {"name": "create_or_replace_clone", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/create_or_replace_clone.sql", "original_file_path": "macros/materializations/models/clone/create_or_replace_clone.sql", "unique_id": "macro.dbt.create_or_replace_clone", "macro_sql": "{% macro create_or_replace_clone(this_relation, defer_relation) %}\n {{ return(adapter.dispatch('create_or_replace_clone', 'dbt')(this_relation, defer_relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__create_or_replace_clone"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6406999, "supported_languages": null}, "macro.dbt.default__create_or_replace_clone": {"name": "default__create_or_replace_clone", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/create_or_replace_clone.sql", "original_file_path": 
"macros/materializations/models/clone/create_or_replace_clone.sql", "unique_id": "macro.dbt.default__create_or_replace_clone", "macro_sql": "{% macro default__create_or_replace_clone(this_relation, defer_relation) %}\n create or replace table {{ this_relation }} clone {{ defer_relation }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6408288, "supported_languages": null}, "macro.dbt.materialization_clone_default": {"name": "materialization_clone_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/clone.sql", "original_file_path": "macros/materializations/models/clone/clone.sql", "unique_id": "macro.dbt.materialization_clone_default", "macro_sql": "{%- materialization clone, default -%}\n\n {%- set relations = {'relations': []} -%}\n\n {%- if not defer_relation -%}\n -- nothing to do\n {{ log(\"No relation found in state manifest for \" ~ model.unique_id, info=True) }}\n {{ return(relations) }}\n {%- endif -%}\n\n {%- set existing_relation = load_cached_relation(this) -%}\n\n {%- if existing_relation and not flags.FULL_REFRESH -%}\n -- noop!\n {{ log(\"Relation \" ~ existing_relation ~ \" already exists\", info=True) }}\n {{ return(relations) }}\n {%- endif -%}\n\n {%- set other_existing_relation = load_cached_relation(defer_relation) -%}\n\n -- If this is a database that can do zero-copy cloning of tables, and the other relation is a table, then this will be a table\n -- Otherwise, this will be a view\n\n {% set can_clone_table = can_clone_table() %}\n\n {%- if other_existing_relation and other_existing_relation.type == 'table' and can_clone_table -%}\n\n {%- set target_relation = this.incorporate(type='table') -%}\n {% if existing_relation is not none and not existing_relation.is_table %}\n {{ log(\"Dropping relation \" ~ existing_relation ~ \" because it is of type \" ~ 
existing_relation.type) }}\n {{ drop_relation_if_exists(existing_relation) }}\n {% endif %}\n\n -- as a general rule, data platforms that can clone tables can also do atomic 'create or replace'\n {% call statement('main') %}\n {{ create_or_replace_clone(target_relation, defer_relation) }}\n {% endcall %}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n {% do persist_docs(target_relation, model) %}\n\n {{ return({'relations': [target_relation]}) }}\n\n {%- else -%}\n\n {%- set target_relation = this.incorporate(type='view') -%}\n\n -- reuse the view materialization\n -- TODO: support actual dispatch for materialization macros\n -- Tracking ticket: https://github.com/dbt-labs/dbt-core/issues/7799\n {% set search_name = \"materialization_view_\" ~ adapter.type() %}\n {% if not search_name in context %}\n {% set search_name = \"materialization_view_default\" %}\n {% endif %}\n {% set materialization_macro = context[search_name] %}\n {% set relations = materialization_macro() %}\n {{ return(relations) }}\n\n {%- endif -%}\n\n{%- endmaterialization -%}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.can_clone_table", "macro.dbt.drop_relation_if_exists", "macro.dbt.statement", "macro.dbt.create_or_replace_clone", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6437929, "supported_languages": ["sql"]}, "macro.dbt.get_table_columns_and_constraints": {"name": "get_table_columns_and_constraints", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.get_table_columns_and_constraints", 
"macro_sql": "{%- macro get_table_columns_and_constraints() -%}\n {{ adapter.dispatch('get_table_columns_and_constraints', 'dbt')() }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__get_table_columns_and_constraints"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.644755, "supported_languages": null}, "macro.dbt.default__get_table_columns_and_constraints": {"name": "default__get_table_columns_and_constraints", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.default__get_table_columns_and_constraints", "macro_sql": "{% macro default__get_table_columns_and_constraints() -%}\n {{ return(table_columns_and_constraints()) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.table_columns_and_constraints"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6448681, "supported_languages": null}, "macro.dbt.table_columns_and_constraints": {"name": "table_columns_and_constraints", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.table_columns_and_constraints", "macro_sql": "{% macro table_columns_and_constraints() %}\n {# loop through user_provided_columns to create DDL with data types and constraints #}\n {%- set raw_column_constraints = adapter.render_raw_columns_constraints(raw_columns=model['columns']) -%}\n {%- set raw_model_constraints = adapter.render_raw_model_constraints(raw_constraints=model['constraints']) -%}\n (\n {% for c in raw_column_constraints -%}\n {{ c }}{{ \",\" if not loop.last or raw_model_constraints 
}}\n {% endfor %}\n {% for c in raw_model_constraints -%}\n {{ c }}{{ \",\" if not loop.last }}\n {% endfor -%}\n )\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.645374, "supported_languages": null}, "macro.dbt.get_assert_columns_equivalent": {"name": "get_assert_columns_equivalent", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.get_assert_columns_equivalent", "macro_sql": "\n\n{%- macro get_assert_columns_equivalent(sql) -%}\n {{ adapter.dispatch('get_assert_columns_equivalent', 'dbt')(sql) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__get_assert_columns_equivalent"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.645528, "supported_languages": null}, "macro.dbt.default__get_assert_columns_equivalent": {"name": "default__get_assert_columns_equivalent", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.default__get_assert_columns_equivalent", "macro_sql": "{% macro default__get_assert_columns_equivalent(sql) -%}\n {{ return(assert_columns_equivalent(sql)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.assert_columns_equivalent"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.645654, "supported_languages": null}, "macro.dbt.assert_columns_equivalent": {"name": "assert_columns_equivalent", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.assert_columns_equivalent", "macro_sql": "{% macro assert_columns_equivalent(sql) %}\n\n {#-- First ensure the user has defined 'columns' in yaml specification --#}\n {%- set user_defined_columns = model['columns'] -%}\n {%- if not user_defined_columns -%}\n {{ exceptions.raise_contract_error([], []) }}\n {%- endif -%}\n\n {#-- Obtain the column schema provided by sql file. #}\n {%- set sql_file_provided_columns = get_column_schema_from_query(sql, config.get('sql_header', none)) -%}\n {#--Obtain the column schema provided by the schema file by generating an 'empty schema' query from the model's columns. #}\n {%- set schema_file_provided_columns = get_column_schema_from_query(get_empty_schema_sql(user_defined_columns)) -%}\n\n {#-- create dictionaries with name and formatted data type and strings for exception #}\n {%- set sql_columns = format_columns(sql_file_provided_columns) -%}\n {%- set yaml_columns = format_columns(schema_file_provided_columns) -%}\n\n {%- if sql_columns|length != yaml_columns|length -%}\n {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}\n {%- endif -%}\n\n {%- for sql_col in sql_columns -%}\n {%- set yaml_col = [] -%}\n {%- for this_col in yaml_columns -%}\n {%- if this_col['name'] == sql_col['name'] -%}\n {%- do yaml_col.append(this_col) -%}\n {%- break -%}\n {%- endif -%}\n {%- endfor -%}\n {%- if not yaml_col -%}\n {#-- Column with name not found in yaml #}\n {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}\n {%- endif -%}\n {%- if sql_col['formatted'] != yaml_col[0]['formatted'] -%}\n {#-- Column data types don't match #}\n {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}\n {%- endif -%}\n {%- endfor -%}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_column_schema_from_query", 
"macro.dbt.get_empty_schema_sql", "macro.dbt.format_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.646909, "supported_languages": null}, "macro.dbt.format_columns": {"name": "format_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.format_columns", "macro_sql": "{% macro format_columns(columns) %}\n {% set formatted_columns = [] %}\n {% for column in columns %}\n {%- set formatted_column = adapter.dispatch('format_column', 'dbt')(column) -%}\n {%- do formatted_columns.append(formatted_column) -%}\n {% endfor %}\n {{ return(formatted_columns) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__format_column"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.647261, "supported_languages": null}, "macro.dbt.default__format_column": {"name": "default__format_column", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.default__format_column", "macro_sql": "{% macro default__format_column(column) -%}\n {% set data_type = column.dtype %}\n {% set formatted = column.column.lower() ~ \" \" ~ data_type %}\n {{ return({'name': column.name, 'data_type': data_type, 'formatted': formatted}) }}\n{%- endmacro -%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.647587, "supported_languages": null}, "macro.dbt.materialization_table_default": {"name": "materialization_table_default", "resource_type": "macro", 
"package_name": "dbt", "path": "macros/materializations/models/table/table.sql", "original_file_path": "macros/materializations/models/table/table.sql", "unique_id": "macro.dbt.materialization_table_default", "macro_sql": "{% materialization table, default %}\n\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='table') %}\n {%- set intermediate_relation = make_intermediate_relation(target_relation) -%}\n -- the intermediate_relation should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%}\n /*\n See ../view/view.sql for more information about this relation.\n */\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n -- as above, the backup_relation should not already exist\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% call statement('main') -%}\n {{ get_create_table_as_sql(False, intermediate_relation, sql) }}\n {%- endcall %}\n\n -- cleanup\n {% if existing_relation is not none %}\n /* Do the equivalent of rename_if_exists. 'existing_relation' could have been dropped\n since the variable was first set. 
*/\n {% set existing_relation = load_cached_relation(existing_relation) %}\n {% if existing_relation is not none %}\n {{ adapter.rename_relation(existing_relation, backup_relation) }}\n {% endif %}\n {% endif %}\n\n {{ adapter.rename_relation(intermediate_relation, target_relation) }}\n\n {% do create_indexes(target_relation) %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n -- `COMMIT` happens here\n {{ adapter.commit() }}\n\n -- finally, drop the existing/backup relation after the commit\n {{ drop_relation_if_exists(backup_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.statement", "macro.dbt.get_create_table_as_sql", "macro.dbt.create_indexes", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6502678, "supported_languages": ["sql"]}, "macro.dbt.get_create_table_as_sql": {"name": "get_create_table_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.get_create_table_as_sql", "macro_sql": "{% macro get_create_table_as_sql(temporary, relation, sql) -%}\n {{ adapter.dispatch('get_create_table_as_sql', 'dbt')(temporary, relation, sql) }}\n{%- endmacro %}", "depends_on": {"macros": 
["macro.dbt.default__get_create_table_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.651077, "supported_languages": null}, "macro.dbt.default__get_create_table_as_sql": {"name": "default__get_create_table_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.default__get_create_table_as_sql", "macro_sql": "{% macro default__get_create_table_as_sql(temporary, relation, sql) -%}\n {{ return(create_table_as(temporary, relation, sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.create_table_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6512482, "supported_languages": null}, "macro.dbt.create_table_as": {"name": "create_table_as", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.create_table_as", "macro_sql": "{% macro create_table_as(temporary, relation, compiled_code, language='sql') -%}\n {# backward compatibility for create_table_as that does not support language #}\n {% if language == \"sql\" %}\n {{ adapter.dispatch('create_table_as', 'dbt')(temporary, relation, compiled_code)}}\n {% else %}\n {{ adapter.dispatch('create_table_as', 'dbt')(temporary, relation, compiled_code, language) }}\n {% endif %}\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__create_table_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.651652, "supported_languages": null}, "macro.dbt.default__create_table_as": {"name": 
"default__create_table_as", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.default__create_table_as", "macro_sql": "{% macro default__create_table_as(temporary, relation, sql) -%}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none }}\n\n create {% if temporary: -%}temporary{%- endif %} table\n {{ relation.include(database=(not temporary), schema=(not temporary)) }}\n {% set contract_config = config.get('contract') %}\n {% if contract_config.enforced %}\n {{ get_assert_columns_equivalent(sql) }}\n {{ get_table_columns_and_constraints() }}\n {%- set sql = get_select_subquery(sql) %}\n {% endif %}\n as (\n {{ sql }}\n );\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_assert_columns_equivalent", "macro.dbt.get_table_columns_and_constraints", "macro.dbt.get_select_subquery"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.652267, "supported_languages": null}, "macro.dbt.default__get_column_names": {"name": "default__get_column_names", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.default__get_column_names", "macro_sql": "{% macro default__get_column_names() %}\n {#- loop through user_provided_columns to get column names -#}\n {%- set user_provided_columns = model['columns'] -%}\n {%- for i in user_provided_columns %}\n {%- set col = user_provided_columns[i] -%}\n {%- set col_name = adapter.quote(col['name']) if col.get('quote') else col['name'] -%}\n {{ col_name }}{{ \", \" if not loop.last }}\n {%- endfor -%}\n{% endmacro %}", "depends_on": {"macros": []}, "description": 
"", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.652705, "supported_languages": null}, "macro.dbt.get_select_subquery": {"name": "get_select_subquery", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.get_select_subquery", "macro_sql": "{% macro get_select_subquery(sql) %}\n {{ return(adapter.dispatch('get_select_subquery', 'dbt')(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_select_subquery"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.652878, "supported_languages": null}, "macro.dbt.default__get_select_subquery": {"name": "default__get_select_subquery", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.default__get_select_subquery", "macro_sql": "{% macro default__get_select_subquery(sql) %}\n select {{ adapter.dispatch('get_column_names', 'dbt')() }}\n from (\n {{ sql }}\n ) as model_subq\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.get_column_names", "macro.dbt.default__get_column_names"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.653041, "supported_languages": null}, "macro.dbt.materialization_view_default": {"name": "materialization_view_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/view.sql", "original_file_path": "macros/materializations/models/view/view.sql", "unique_id": "macro.dbt.materialization_view_default", "macro_sql": "{%- materialization view, 
default -%}\n\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='view') -%}\n {%- set intermediate_relation = make_intermediate_relation(target_relation) -%}\n\n -- the intermediate_relation should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%}\n /*\n This relation (probably) doesn't exist yet. If it does exist, it's a leftover from\n a previous run, and we're going to try to drop it immediately. At the end of this\n materialization, we're going to rename the \"existing_relation\" to this identifier,\n and then we're going to drop it. In order to make sure we run the correct one of:\n - drop view ...\n - drop table ...\n\n We need to set the type of this relation to be the type of the existing_relation, if it exists,\n or else \"view\" as a sane default if it does not. Note that if the existing_relation does not\n exist, then there is nothing to move out of the way and subsequentally drop. 
In that case,\n this relation will be effectively unused.\n */\n {%- set backup_relation_type = 'view' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n -- as above, the backup_relation should not already exist\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% call statement('main') -%}\n {{ get_create_view_as_sql(intermediate_relation, sql) }}\n {%- endcall %}\n\n -- cleanup\n -- move the existing view out of the way\n {% if existing_relation is not none %}\n /* Do the equivalent of rename_if_exists. 'existing_relation' could have been dropped\n since the variable was first set. 
*/\n {% set existing_relation = load_cached_relation(existing_relation) %}\n {% if existing_relation is not none %}\n {{ adapter.rename_relation(existing_relation, backup_relation) }}\n {% endif %}\n {% endif %}\n {{ adapter.rename_relation(intermediate_relation, target_relation) }}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {{ adapter.commit() }}\n\n {{ drop_relation_if_exists(backup_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{%- endmaterialization -%}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.run_hooks", "macro.dbt.drop_relation_if_exists", "macro.dbt.statement", "macro.dbt.get_create_view_as_sql", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.655742, "supported_languages": ["sql"]}, "macro.dbt.handle_existing_table": {"name": "handle_existing_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/helpers.sql", "original_file_path": "macros/materializations/models/view/helpers.sql", "unique_id": "macro.dbt.handle_existing_table", "macro_sql": "{% macro handle_existing_table(full_refresh, old_relation) %}\n {{ adapter.dispatch('handle_existing_table', 'dbt')(full_refresh, old_relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__handle_existing_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.656019, 
"supported_languages": null}, "macro.dbt.default__handle_existing_table": {"name": "default__handle_existing_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/helpers.sql", "original_file_path": "macros/materializations/models/view/helpers.sql", "unique_id": "macro.dbt.default__handle_existing_table", "macro_sql": "{% macro default__handle_existing_table(full_refresh, old_relation) %}\n {{ log(\"Dropping relation \" ~ old_relation ~ \" because it is of type \" ~ old_relation.type) }}\n {{ adapter.drop_relation(old_relation) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.656225, "supported_languages": null}, "macro.dbt.create_or_replace_view": {"name": "create_or_replace_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_or_replace_view.sql", "original_file_path": "macros/materializations/models/view/create_or_replace_view.sql", "unique_id": "macro.dbt.create_or_replace_view", "macro_sql": "{% macro create_or_replace_view() %}\n {%- set identifier = model['alias'] -%}\n\n {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%}\n {%- set exists_as_view = (old_relation is not none and old_relation.is_view) -%}\n\n {%- set target_relation = api.Relation.create(\n identifier=identifier, schema=schema, database=database,\n type='view') -%}\n {% set grant_config = config.get('grants') %}\n\n {{ run_hooks(pre_hooks) }}\n\n -- If there's a table with the same name and we weren't told to full refresh,\n -- that's an error. If we were told to full refresh, drop it. 
This behavior differs\n -- for Snowflake and BigQuery, so multiple dispatch is used.\n {%- if old_relation is not none and old_relation.is_table -%}\n {{ handle_existing_table(should_full_refresh(), old_relation) }}\n {%- endif -%}\n\n -- build model\n {% call statement('main') -%}\n {{ get_create_view_as_sql(target_relation, sql) }}\n {%- endcall %}\n\n {% set should_revoke = should_revoke(exists_as_view, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {{ run_hooks(post_hooks) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_hooks", "macro.dbt.handle_existing_table", "macro.dbt.should_full_refresh", "macro.dbt.statement", "macro.dbt.get_create_view_as_sql", "macro.dbt.should_revoke", "macro.dbt.apply_grants"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.657619, "supported_languages": null}, "macro.dbt.get_create_view_as_sql": {"name": "get_create_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_view_as.sql", "original_file_path": "macros/materializations/models/view/create_view_as.sql", "unique_id": "macro.dbt.get_create_view_as_sql", "macro_sql": "{% macro get_create_view_as_sql(relation, sql) -%}\n {{ adapter.dispatch('get_create_view_as_sql', 'dbt')(relation, sql) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_create_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.658005, "supported_languages": null}, "macro.dbt.default__get_create_view_as_sql": {"name": "default__get_create_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_view_as.sql", "original_file_path": 
"macros/materializations/models/view/create_view_as.sql", "unique_id": "macro.dbt.default__get_create_view_as_sql", "macro_sql": "{% macro default__get_create_view_as_sql(relation, sql) -%}\n {{ return(create_view_as(relation, sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.create_view_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.658157, "supported_languages": null}, "macro.dbt.create_view_as": {"name": "create_view_as", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_view_as.sql", "original_file_path": "macros/materializations/models/view/create_view_as.sql", "unique_id": "macro.dbt.create_view_as", "macro_sql": "{% macro create_view_as(relation, sql) -%}\n {{ adapter.dispatch('create_view_as', 'dbt')(relation, sql) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__create_view_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.658328, "supported_languages": null}, "macro.dbt.default__create_view_as": {"name": "default__create_view_as", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_view_as.sql", "original_file_path": "macros/materializations/models/view/create_view_as.sql", "unique_id": "macro.dbt.default__create_view_as", "macro_sql": "{% macro default__create_view_as(relation, sql) -%}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none }}\n create view {{ relation }}\n {% set contract_config = config.get('contract') %}\n {% if contract_config.enforced %}\n {{ get_assert_columns_equivalent(sql) }}\n {%- endif %}\n as (\n {{ sql }}\n );\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_assert_columns_equivalent"]}, "description": "", "meta": {}, "docs": {"show": true, 
"node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.65872, "supported_languages": null}, "macro.dbt.materialization_seed_default": {"name": "materialization_seed_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/seed.sql", "original_file_path": "macros/materializations/seeds/seed.sql", "unique_id": "macro.dbt.materialization_seed_default", "macro_sql": "{% materialization seed, default %}\n\n {%- set identifier = model['alias'] -%}\n {%- set full_refresh_mode = (should_full_refresh()) -%}\n\n {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%}\n\n {%- set exists_as_table = (old_relation is not none and old_relation.is_table) -%}\n {%- set exists_as_view = (old_relation is not none and old_relation.is_view) -%}\n\n {%- set grant_config = config.get('grants') -%}\n {%- set agate_table = load_agate_table() -%}\n -- grab current tables grants config for comparison later on\n\n {%- do store_result('agate_table', response='OK', agate_table=agate_table) -%}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% set create_table_sql = \"\" %}\n {% if exists_as_view %}\n {{ exceptions.raise_compiler_error(\"Cannot seed to '{}', it is a view\".format(old_relation)) }}\n {% elif exists_as_table %}\n {% set create_table_sql = reset_csv_table(model, full_refresh_mode, old_relation, agate_table) %}\n {% else %}\n {% set create_table_sql = create_csv_table(model, agate_table) %}\n {% endif %}\n\n {% set code = 'CREATE' if full_refresh_mode else 'INSERT' %}\n {% set rows_affected = (agate_table.rows | length) %}\n {% set sql = load_csv_rows(model, agate_table) %}\n\n {% call noop_statement('main', code ~ ' ' ~ rows_affected, code, rows_affected) %}\n {{ get_csv_sql(create_table_sql, sql) }};\n {% endcall %}\n\n {% set target_relation = 
this.incorporate(type='table') %}\n\n {% set should_revoke = should_revoke(old_relation, full_refresh_mode) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {% if full_refresh_mode or not exists_as_table %}\n {% do create_indexes(target_relation) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n -- `COMMIT` happens here\n {{ adapter.commit() }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.should_full_refresh", "macro.dbt.run_hooks", "macro.dbt.reset_csv_table", "macro.dbt.create_csv_table", "macro.dbt.load_csv_rows", "macro.dbt.noop_statement", "macro.dbt.get_csv_sql", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt.create_indexes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.661781, "supported_languages": ["sql"]}, "macro.dbt.create_csv_table": {"name": "create_csv_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.create_csv_table", "macro_sql": "{% macro create_csv_table(model, agate_table) -%}\n {{ adapter.dispatch('create_csv_table', 'dbt')(model, agate_table) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__create_csv_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6666749, "supported_languages": null}, "macro.dbt.default__create_csv_table": {"name": "default__create_csv_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": 
"macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__create_csv_table", "macro_sql": "{% macro default__create_csv_table(model, agate_table) %}\n {%- set column_override = model['config'].get('column_types', {}) -%}\n {%- set quote_seed_column = model['config'].get('quote_columns', None) -%}\n\n {% set sql %}\n create table {{ this.render() }} (\n {%- for col_name in agate_table.column_names -%}\n {%- set inferred_type = adapter.convert_type(agate_table, loop.index0) -%}\n {%- set type = column_override.get(col_name, inferred_type) -%}\n {%- set column_name = (col_name | string) -%}\n {{ adapter.quote_seed_column(column_name, quote_seed_column) }} {{ type }} {%- if not loop.last -%}, {%- endif -%}\n {%- endfor -%}\n )\n {% endset %}\n\n {% call statement('_') -%}\n {{ sql }}\n {%- endcall %}\n\n {{ return(sql) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.667525, "supported_languages": null}, "macro.dbt.reset_csv_table": {"name": "reset_csv_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.reset_csv_table", "macro_sql": "{% macro reset_csv_table(model, full_refresh, old_relation, agate_table) -%}\n {{ adapter.dispatch('reset_csv_table', 'dbt')(model, full_refresh, old_relation, agate_table) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__reset_csv_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.667744, "supported_languages": null}, "macro.dbt.default__reset_csv_table": {"name": "default__reset_csv_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", 
"original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__reset_csv_table", "macro_sql": "{% macro default__reset_csv_table(model, full_refresh, old_relation, agate_table) %}\n {% set sql = \"\" %}\n {% if full_refresh %}\n {{ adapter.drop_relation(old_relation) }}\n {% set sql = create_csv_table(model, agate_table) %}\n {% else %}\n {{ adapter.truncate_relation(old_relation) }}\n {% set sql = \"truncate table \" ~ old_relation %}\n {% endif %}\n\n {{ return(sql) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.create_csv_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.668195, "supported_languages": null}, "macro.dbt.get_csv_sql": {"name": "get_csv_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.get_csv_sql", "macro_sql": "{% macro get_csv_sql(create_or_truncate_sql, insert_sql) %}\n {{ adapter.dispatch('get_csv_sql', 'dbt')(create_or_truncate_sql, insert_sql) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_csv_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.668378, "supported_languages": null}, "macro.dbt.default__get_csv_sql": {"name": "default__get_csv_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__get_csv_sql", "macro_sql": "{% macro default__get_csv_sql(create_or_truncate_sql, insert_sql) %}\n {{ create_or_truncate_sql }};\n -- dbt seed --\n {{ insert_sql }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, 
"patch_path": null, "arguments": [], "created_at": 1694181385.6685, "supported_languages": null}, "macro.dbt.get_binding_char": {"name": "get_binding_char", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.get_binding_char", "macro_sql": "{% macro get_binding_char() -%}\n {{ adapter.dispatch('get_binding_char', 'dbt')() }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_binding_char"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6686468, "supported_languages": null}, "macro.dbt.default__get_binding_char": {"name": "default__get_binding_char", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__get_binding_char", "macro_sql": "{% macro default__get_binding_char() %}\n {{ return('%s') }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.668751, "supported_languages": null}, "macro.dbt.get_batch_size": {"name": "get_batch_size", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.get_batch_size", "macro_sql": "{% macro get_batch_size() -%}\n {{ return(adapter.dispatch('get_batch_size', 'dbt')()) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_batch_size"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.668907, "supported_languages": null}, 
"macro.dbt.default__get_batch_size": {"name": "default__get_batch_size", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__get_batch_size", "macro_sql": "{% macro default__get_batch_size() %}\n {{ return(10000) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.669013, "supported_languages": null}, "macro.dbt.get_seed_column_quoted_csv": {"name": "get_seed_column_quoted_csv", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.get_seed_column_quoted_csv", "macro_sql": "{% macro get_seed_column_quoted_csv(model, column_names) %}\n {%- set quote_seed_column = model['config'].get('quote_columns', None) -%}\n {% set quoted = [] %}\n {% for col in column_names -%}\n {%- do quoted.append(adapter.quote_seed_column(col, quote_seed_column)) -%}\n {%- endfor %}\n\n {%- set dest_cols_csv = quoted | join(', ') -%}\n {{ return(dest_cols_csv) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.669464, "supported_languages": null}, "macro.dbt.load_csv_rows": {"name": "load_csv_rows", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.load_csv_rows", "macro_sql": "{% macro load_csv_rows(model, agate_table) -%}\n {{ adapter.dispatch('load_csv_rows', 'dbt')(model, agate_table) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__load_csv_rows"]}, "description": "", 
"meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.669634, "supported_languages": null}, "macro.dbt.default__load_csv_rows": {"name": "default__load_csv_rows", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__load_csv_rows", "macro_sql": "{% macro default__load_csv_rows(model, agate_table) %}\n\n {% set batch_size = get_batch_size() %}\n\n {% set cols_sql = get_seed_column_quoted_csv(model, agate_table.column_names) %}\n {% set bindings = [] %}\n\n {% set statements = [] %}\n\n {% for chunk in agate_table.rows | batch(batch_size) %}\n {% set bindings = [] %}\n\n {% for row in chunk %}\n {% do bindings.extend(row) %}\n {% endfor %}\n\n {% set sql %}\n insert into {{ this.render() }} ({{ cols_sql }}) values\n {% for row in chunk -%}\n ({%- for column in agate_table.column_names -%}\n {{ get_binding_char() }}\n {%- if not loop.last%},{%- endif %}\n {%- endfor -%})\n {%- if not loop.last%},{%- endif %}\n {%- endfor %}\n {% endset %}\n\n {% do adapter.add_query(sql, bindings=bindings, abridge_sql_log=True) %}\n\n {% if loop.index0 == 0 %}\n {% do statements.append(sql) %}\n {% endif %}\n {% endfor %}\n\n {# Return SQL so we can render it out into the compiled files #}\n {{ return(statements[0]) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_batch_size", "macro.dbt.get_seed_column_quoted_csv", "macro.dbt.get_binding_char"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.67082, "supported_languages": null}, "macro.dbt.generate_alias_name": {"name": "generate_alias_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_alias.sql", "original_file_path": "macros/get_custom_name/get_custom_alias.sql", 
"unique_id": "macro.dbt.generate_alias_name", "macro_sql": "{% macro generate_alias_name(custom_alias_name=none, node=none) -%}\n {% do return(adapter.dispatch('generate_alias_name', 'dbt')(custom_alias_name, node)) %}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__generate_alias_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6712132, "supported_languages": null}, "macro.dbt.default__generate_alias_name": {"name": "default__generate_alias_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_alias.sql", "original_file_path": "macros/get_custom_name/get_custom_alias.sql", "unique_id": "macro.dbt.default__generate_alias_name", "macro_sql": "{% macro default__generate_alias_name(custom_alias_name=none, node=none) -%}\n\n {%- if custom_alias_name -%}\n\n {{ custom_alias_name | trim }}\n\n {%- elif node.version -%}\n\n {{ return(node.name ~ \"_v\" ~ (node.version | replace(\".\", \"_\"))) }}\n\n {%- else -%}\n\n {{ node.name }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.671562, "supported_languages": null}, "macro.dbt.generate_schema_name": {"name": "generate_schema_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_schema.sql", "original_file_path": "macros/get_custom_name/get_custom_schema.sql", "unique_id": "macro.dbt.generate_schema_name", "macro_sql": "{% macro generate_schema_name(custom_schema_name=none, node=none) -%}\n {{ return(adapter.dispatch('generate_schema_name', 'dbt')(custom_schema_name, node)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__generate_schema_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, 
"arguments": [], "created_at": 1694181385.672053, "supported_languages": null}, "macro.dbt.default__generate_schema_name": {"name": "default__generate_schema_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_schema.sql", "original_file_path": "macros/get_custom_name/get_custom_schema.sql", "unique_id": "macro.dbt.default__generate_schema_name", "macro_sql": "{% macro default__generate_schema_name(custom_schema_name, node) -%}\n\n {%- set default_schema = target.schema -%}\n {%- if custom_schema_name is none -%}\n\n {{ default_schema }}\n\n {%- else -%}\n\n {{ default_schema }}_{{ custom_schema_name | trim }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6722932, "supported_languages": null}, "macro.dbt.generate_schema_name_for_env": {"name": "generate_schema_name_for_env", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_schema.sql", "original_file_path": "macros/get_custom_name/get_custom_schema.sql", "unique_id": "macro.dbt.generate_schema_name_for_env", "macro_sql": "{% macro generate_schema_name_for_env(custom_schema_name, node) -%}\n\n {%- set default_schema = target.schema -%}\n {%- if target.name == 'prod' and custom_schema_name is not none -%}\n\n {{ custom_schema_name | trim }}\n\n {%- else -%}\n\n {{ default_schema }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.672556, "supported_languages": null}, "macro.dbt.generate_database_name": {"name": "generate_database_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_database.sql", "original_file_path": "macros/get_custom_name/get_custom_database.sql", 
"unique_id": "macro.dbt.generate_database_name", "macro_sql": "{% macro generate_database_name(custom_database_name=none, node=none) -%}\n {% do return(adapter.dispatch('generate_database_name', 'dbt')(custom_database_name, node)) %}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__generate_database_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.672917, "supported_languages": null}, "macro.dbt.default__generate_database_name": {"name": "default__generate_database_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_database.sql", "original_file_path": "macros/get_custom_name/get_custom_database.sql", "unique_id": "macro.dbt.default__generate_database_name", "macro_sql": "{% macro default__generate_database_name(custom_database_name=none, node=none) -%}\n {%- set default_database = target.database -%}\n {%- if custom_database_name is none -%}\n\n {{ default_database }}\n\n {%- else -%}\n\n {{ custom_database_name }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.67315, "supported_languages": null}, "macro.dbt.default__test_relationships": {"name": "default__test_relationships", "resource_type": "macro", "package_name": "dbt", "path": "macros/generic_test_sql/relationships.sql", "original_file_path": "macros/generic_test_sql/relationships.sql", "unique_id": "macro.dbt.default__test_relationships", "macro_sql": "{% macro default__test_relationships(model, column_name, to, field) %}\n\nwith child as (\n select {{ column_name }} as from_field\n from {{ model }}\n where {{ column_name }} is not null\n),\n\nparent as (\n select {{ field }} as to_field\n from {{ to }}\n)\n\nselect\n from_field\n\nfrom child\nleft join parent\n on child.from_field = 
parent.to_field\n\nwhere parent.to_field is null\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6734662, "supported_languages": null}, "macro.dbt.default__test_not_null": {"name": "default__test_not_null", "resource_type": "macro", "package_name": "dbt", "path": "macros/generic_test_sql/not_null.sql", "original_file_path": "macros/generic_test_sql/not_null.sql", "unique_id": "macro.dbt.default__test_not_null", "macro_sql": "{% macro default__test_not_null(model, column_name) %}\n\n{% set column_list = '*' if should_store_failures() else column_name %}\n\nselect {{ column_list }}\nfrom {{ model }}\nwhere {{ column_name }} is null\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.should_store_failures"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.673737, "supported_languages": null}, "macro.dbt.default__test_unique": {"name": "default__test_unique", "resource_type": "macro", "package_name": "dbt", "path": "macros/generic_test_sql/unique.sql", "original_file_path": "macros/generic_test_sql/unique.sql", "unique_id": "macro.dbt.default__test_unique", "macro_sql": "{% macro default__test_unique(model, column_name) %}\n\nselect\n {{ column_name }} as unique_field,\n count(*) as n_records\n\nfrom {{ model }}\nwhere {{ column_name }} is not null\ngroup by {{ column_name }}\nhaving count(*) > 1\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.673961, "supported_languages": null}, "macro.dbt.default__test_accepted_values": {"name": "default__test_accepted_values", "resource_type": "macro", "package_name": "dbt", "path": "macros/generic_test_sql/accepted_values.sql", "original_file_path": 
"macros/generic_test_sql/accepted_values.sql", "unique_id": "macro.dbt.default__test_accepted_values", "macro_sql": "{% macro default__test_accepted_values(model, column_name, values, quote=True) %}\n\nwith all_values as (\n\n select\n {{ column_name }} as value_field,\n count(*) as n_records\n\n from {{ model }}\n group by {{ column_name }}\n\n)\n\nselect *\nfrom all_values\nwhere value_field not in (\n {% for value in values -%}\n {% if quote -%}\n '{{ value }}'\n {%- else -%}\n {{ value }}\n {%- endif -%}\n {%- if not loop.last -%},{%- endif %}\n {%- endfor %}\n)\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.674487, "supported_languages": null}, "macro.dbt.statement": {"name": "statement", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/statement.sql", "original_file_path": "macros/etc/statement.sql", "unique_id": "macro.dbt.statement", "macro_sql": "\n{%- macro statement(name=None, fetch_result=False, auto_begin=True, language='sql') -%}\n {%- if execute: -%}\n {%- set compiled_code = caller() -%}\n\n {%- if name == 'main' -%}\n {{ log('Writing runtime {} for node \"{}\"'.format(language, model['unique_id'])) }}\n {{ write(compiled_code) }}\n {%- endif -%}\n {%- if language == 'sql'-%}\n {%- set res, table = adapter.execute(compiled_code, auto_begin=auto_begin, fetch=fetch_result) -%}\n {%- elif language == 'python' -%}\n {%- set res = submit_python_job(model, compiled_code) -%}\n {#-- TODO: What should table be for python models? 
--#}\n {%- set table = None -%}\n {%- else -%}\n {% do exceptions.raise_compiler_error(\"statement macro didn't get supported language\") %}\n {%- endif -%}\n\n {%- if name is not none -%}\n {{ store_result(name, response=res, agate_table=table) }}\n {%- endif -%}\n\n {%- endif -%}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6758661, "supported_languages": null}, "macro.dbt.noop_statement": {"name": "noop_statement", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/statement.sql", "original_file_path": "macros/etc/statement.sql", "unique_id": "macro.dbt.noop_statement", "macro_sql": "{% macro noop_statement(name=None, message=None, code=None, rows_affected=None, res=None) -%}\n {%- set sql = caller() -%}\n\n {%- if name == 'main' -%}\n {{ log('Writing runtime SQL for node \"{}\"'.format(model['unique_id'])) }}\n {{ write(sql) }}\n {%- endif -%}\n\n {%- if name is not none -%}\n {{ store_raw_result(name, message=message, code=code, rows_affected=rows_affected, agate_table=res) }}\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.676471, "supported_languages": null}, "macro.dbt.run_query": {"name": "run_query", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/statement.sql", "original_file_path": "macros/etc/statement.sql", "unique_id": "macro.dbt.run_query", "macro_sql": "{% macro run_query(sql) %}\n {% call statement(\"run_query_statement\", fetch_result=true, auto_begin=false) %}\n {{ sql }}\n {% endcall %}\n\n {% do return(load_result(\"run_query_statement\").table) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, 
"arguments": [], "created_at": 1694181385.676744, "supported_languages": null}, "macro.dbt.convert_datetime": {"name": "convert_datetime", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/datetime.sql", "original_file_path": "macros/etc/datetime.sql", "unique_id": "macro.dbt.convert_datetime", "macro_sql": "{% macro convert_datetime(date_str, date_fmt) %}\n\n {% set error_msg -%}\n The provided partition date '{{ date_str }}' does not match the expected format '{{ date_fmt }}'\n {%- endset %}\n\n {% set res = try_or_compiler_error(error_msg, modules.datetime.datetime.strptime, date_str.strip(), date_fmt) %}\n {{ return(res) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6785002, "supported_languages": null}, "macro.dbt.dates_in_range": {"name": "dates_in_range", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/datetime.sql", "original_file_path": "macros/etc/datetime.sql", "unique_id": "macro.dbt.dates_in_range", "macro_sql": "{% macro dates_in_range(start_date_str, end_date_str=none, in_fmt=\"%Y%m%d\", out_fmt=\"%Y%m%d\") %}\n {% set end_date_str = start_date_str if end_date_str is none else end_date_str %}\n\n {% set start_date = convert_datetime(start_date_str, in_fmt) %}\n {% set end_date = convert_datetime(end_date_str, in_fmt) %}\n\n {% set day_count = (end_date - start_date).days %}\n {% if day_count < 0 %}\n {% set msg -%}\n Partiton start date is after the end date ({{ start_date }}, {{ end_date }})\n {%- endset %}\n\n {{ exceptions.raise_compiler_error(msg, model) }}\n {% endif %}\n\n {% set date_list = [] %}\n {% for i in range(0, day_count + 1) %}\n {% set the_date = (modules.datetime.timedelta(days=i) + start_date) %}\n {% if not out_fmt %}\n {% set _ = date_list.append(the_date) %}\n {% else %}\n {% set _ = date_list.append(the_date.strftime(out_fmt)) %}\n {% endif %}\n 
{% endfor %}\n\n {{ return(date_list) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.convert_datetime"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.679612, "supported_languages": null}, "macro.dbt.partition_range": {"name": "partition_range", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/datetime.sql", "original_file_path": "macros/etc/datetime.sql", "unique_id": "macro.dbt.partition_range", "macro_sql": "{% macro partition_range(raw_partition_date, date_fmt='%Y%m%d') %}\n {% set partition_range = (raw_partition_date | string).split(\",\") %}\n\n {% if (partition_range | length) == 1 %}\n {% set start_date = partition_range[0] %}\n {% set end_date = none %}\n {% elif (partition_range | length) == 2 %}\n {% set start_date = partition_range[0] %}\n {% set end_date = partition_range[1] %}\n {% else %}\n {{ exceptions.raise_compiler_error(\"Invalid partition time. Expected format: {Start Date}[,{End Date}]. 
Got: \" ~ raw_partition_date) }}\n {% endif %}\n\n {{ return(dates_in_range(start_date, end_date, in_fmt=date_fmt)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.dates_in_range"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.680309, "supported_languages": null}, "macro.dbt.py_current_timestring": {"name": "py_current_timestring", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/datetime.sql", "original_file_path": "macros/etc/datetime.sql", "unique_id": "macro.dbt.py_current_timestring", "macro_sql": "{% macro py_current_timestring() %}\n {% set dt = modules.datetime.datetime.now() %}\n {% do return(dt.strftime(\"%Y%m%d%H%M%S%f\")) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.680523, "supported_languages": null}, "macro.dbt.except": {"name": "except", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/except.sql", "original_file_path": "macros/utils/except.sql", "unique_id": "macro.dbt.except", "macro_sql": "{% macro except() %}\n {{ return(adapter.dispatch('except', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__except"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6807308, "supported_languages": null}, "macro.dbt.default__except": {"name": "default__except", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/except.sql", "original_file_path": "macros/utils/except.sql", "unique_id": "macro.dbt.default__except", "macro_sql": "{% macro default__except() %}\n\n except\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 
1694181385.680801, "supported_languages": null}, "macro.dbt.replace": {"name": "replace", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/replace.sql", "original_file_path": "macros/utils/replace.sql", "unique_id": "macro.dbt.replace", "macro_sql": "{% macro replace(field, old_chars, new_chars) -%}\n {{ return(adapter.dispatch('replace', 'dbt') (field, old_chars, new_chars)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__replace"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.681101, "supported_languages": null}, "macro.dbt.default__replace": {"name": "default__replace", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/replace.sql", "original_file_path": "macros/utils/replace.sql", "unique_id": "macro.dbt.default__replace", "macro_sql": "{% macro default__replace(field, old_chars, new_chars) %}\n\n replace(\n {{ field }},\n {{ old_chars }},\n {{ new_chars }}\n )\n\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6812558, "supported_languages": null}, "macro.dbt.concat": {"name": "concat", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/concat.sql", "original_file_path": "macros/utils/concat.sql", "unique_id": "macro.dbt.concat", "macro_sql": "{% macro concat(fields) -%}\n {{ return(adapter.dispatch('concat', 'dbt')(fields)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__concat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.681481, "supported_languages": null}, "macro.dbt.default__concat": {"name": "default__concat", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/concat.sql", "original_file_path": 
"macros/utils/concat.sql", "unique_id": "macro.dbt.default__concat", "macro_sql": "{% macro default__concat(fields) -%}\n {{ fields|join(' || ') }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.681612, "supported_languages": null}, "macro.dbt.length": {"name": "length", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/length.sql", "original_file_path": "macros/utils/length.sql", "unique_id": "macro.dbt.length", "macro_sql": "{% macro length(expression) -%}\n {{ return(adapter.dispatch('length', 'dbt') (expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__length"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.681844, "supported_languages": null}, "macro.dbt.default__length": {"name": "default__length", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/length.sql", "original_file_path": "macros/utils/length.sql", "unique_id": "macro.dbt.default__length", "macro_sql": "{% macro default__length(expression) %}\n\n length(\n {{ expression }}\n )\n\n{%- endmacro -%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6819599, "supported_languages": null}, "macro.dbt.dateadd": {"name": "dateadd", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/dateadd.sql", "original_file_path": "macros/utils/dateadd.sql", "unique_id": "macro.dbt.dateadd", "macro_sql": "{% macro dateadd(datepart, interval, from_date_or_timestamp) %}\n {{ return(adapter.dispatch('dateadd', 'dbt')(datepart, interval, from_date_or_timestamp)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__dateadd"]}, "description": "", "meta": {}, "docs": {"show": 
true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.682301, "supported_languages": null}, "macro.dbt.default__dateadd": {"name": "default__dateadd", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/dateadd.sql", "original_file_path": "macros/utils/dateadd.sql", "unique_id": "macro.dbt.default__dateadd", "macro_sql": "{% macro default__dateadd(datepart, interval, from_date_or_timestamp) %}\n\n dateadd(\n {{ datepart }},\n {{ interval }},\n {{ from_date_or_timestamp }}\n )\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.682455, "supported_languages": null}, "macro.dbt.intersect": {"name": "intersect", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/intersect.sql", "original_file_path": "macros/utils/intersect.sql", "unique_id": "macro.dbt.intersect", "macro_sql": "{% macro intersect() %}\n {{ return(adapter.dispatch('intersect', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__intersect"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.682666, "supported_languages": null}, "macro.dbt.default__intersect": {"name": "default__intersect", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/intersect.sql", "original_file_path": "macros/utils/intersect.sql", "unique_id": "macro.dbt.default__intersect", "macro_sql": "{% macro default__intersect() %}\n\n intersect\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.682739, "supported_languages": null}, "macro.dbt.escape_single_quotes": {"name": "escape_single_quotes", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/utils/escape_single_quotes.sql", "original_file_path": "macros/utils/escape_single_quotes.sql", "unique_id": "macro.dbt.escape_single_quotes", "macro_sql": "{% macro escape_single_quotes(expression) %}\n {{ return(adapter.dispatch('escape_single_quotes', 'dbt') (expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__escape_single_quotes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.68298, "supported_languages": null}, "macro.dbt.default__escape_single_quotes": {"name": "default__escape_single_quotes", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/escape_single_quotes.sql", "original_file_path": "macros/utils/escape_single_quotes.sql", "unique_id": "macro.dbt.default__escape_single_quotes", "macro_sql": "{% macro default__escape_single_quotes(expression) -%}\n{{ expression | replace(\"'\",\"''\") }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6831071, "supported_languages": null}, "macro.dbt.right": {"name": "right", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/right.sql", "original_file_path": "macros/utils/right.sql", "unique_id": "macro.dbt.right", "macro_sql": "{% macro right(string_text, length_expression) -%}\n {{ return(adapter.dispatch('right', 'dbt') (string_text, length_expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__right"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.683378, "supported_languages": null}, "macro.dbt.default__right": {"name": "default__right", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/right.sql", "original_file_path": "macros/utils/right.sql", "unique_id": 
"macro.dbt.default__right", "macro_sql": "{% macro default__right(string_text, length_expression) %}\n\n right(\n {{ string_text }},\n {{ length_expression }}\n )\n\n{%- endmacro -%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.683563, "supported_languages": null}, "macro.dbt.listagg": {"name": "listagg", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/listagg.sql", "original_file_path": "macros/utils/listagg.sql", "unique_id": "macro.dbt.listagg", "macro_sql": "{% macro listagg(measure, delimiter_text=\"','\", order_by_clause=none, limit_num=none) -%}\n {{ return(adapter.dispatch('listagg', 'dbt') (measure, delimiter_text, order_by_clause, limit_num)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__listagg"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.684159, "supported_languages": null}, "macro.dbt.default__listagg": {"name": "default__listagg", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/listagg.sql", "original_file_path": "macros/utils/listagg.sql", "unique_id": "macro.dbt.default__listagg", "macro_sql": "{% macro default__listagg(measure, delimiter_text, order_by_clause, limit_num) -%}\n\n {% if limit_num -%}\n array_to_string(\n array_slice(\n array_agg(\n {{ measure }}\n ){% if order_by_clause -%}\n within group ({{ order_by_clause }})\n {%- endif %}\n ,0\n ,{{ limit_num }}\n ),\n {{ delimiter_text }}\n )\n {%- else %}\n listagg(\n {{ measure }},\n {{ delimiter_text }}\n )\n {% if order_by_clause -%}\n within group ({{ order_by_clause }})\n {%- endif %}\n {%- endif %}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.684532, 
"supported_languages": null}, "macro.dbt.datediff": {"name": "datediff", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/datediff.sql", "original_file_path": "macros/utils/datediff.sql", "unique_id": "macro.dbt.datediff", "macro_sql": "{% macro datediff(first_date, second_date, datepart) %}\n {{ return(adapter.dispatch('datediff', 'dbt')(first_date, second_date, datepart)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__datediff"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.684841, "supported_languages": null}, "macro.dbt.default__datediff": {"name": "default__datediff", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/datediff.sql", "original_file_path": "macros/utils/datediff.sql", "unique_id": "macro.dbt.default__datediff", "macro_sql": "{% macro default__datediff(first_date, second_date, datepart) -%}\n\n datediff(\n {{ datepart }},\n {{ first_date }},\n {{ second_date }}\n )\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6849952, "supported_languages": null}, "macro.dbt.safe_cast": {"name": "safe_cast", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/safe_cast.sql", "original_file_path": "macros/utils/safe_cast.sql", "unique_id": "macro.dbt.safe_cast", "macro_sql": "{% macro safe_cast(field, type) %}\n {{ return(adapter.dispatch('safe_cast', 'dbt') (field, type)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__safe_cast"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.685287, "supported_languages": null}, "macro.dbt.default__safe_cast": {"name": "default__safe_cast", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/utils/safe_cast.sql", "original_file_path": "macros/utils/safe_cast.sql", "unique_id": "macro.dbt.default__safe_cast", "macro_sql": "{% macro default__safe_cast(field, type) %}\n {# most databases don't support this function yet\n so we just need to use cast #}\n cast({{field}} as {{type}})\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.685416, "supported_languages": null}, "macro.dbt.hash": {"name": "hash", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/hash.sql", "original_file_path": "macros/utils/hash.sql", "unique_id": "macro.dbt.hash", "macro_sql": "{% macro hash(field) -%}\n {{ return(adapter.dispatch('hash', 'dbt') (field)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__hash"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.685654, "supported_languages": null}, "macro.dbt.default__hash": {"name": "default__hash", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/hash.sql", "original_file_path": "macros/utils/hash.sql", "unique_id": "macro.dbt.default__hash", "macro_sql": "{% macro default__hash(field) -%}\n md5(cast({{ field }} as {{ api.Column.translate_type('string') }}))\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.685799, "supported_languages": null}, "macro.dbt.cast_bool_to_text": {"name": "cast_bool_to_text", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/cast_bool_to_text.sql", "original_file_path": "macros/utils/cast_bool_to_text.sql", "unique_id": "macro.dbt.cast_bool_to_text", "macro_sql": "{% macro cast_bool_to_text(field) %}\n {{ adapter.dispatch('cast_bool_to_text', 'dbt') (field) 
}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__cast_bool_to_text"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.686027, "supported_languages": null}, "macro.dbt.default__cast_bool_to_text": {"name": "default__cast_bool_to_text", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/cast_bool_to_text.sql", "original_file_path": "macros/utils/cast_bool_to_text.sql", "unique_id": "macro.dbt.default__cast_bool_to_text", "macro_sql": "{% macro default__cast_bool_to_text(field) %}\n cast({{ field }} as {{ api.Column.translate_type('string') }})\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.686177, "supported_languages": null}, "macro.dbt.any_value": {"name": "any_value", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/any_value.sql", "original_file_path": "macros/utils/any_value.sql", "unique_id": "macro.dbt.any_value", "macro_sql": "{% macro any_value(expression) -%}\n {{ return(adapter.dispatch('any_value', 'dbt') (expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__any_value"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.686408, "supported_languages": null}, "macro.dbt.default__any_value": {"name": "default__any_value", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/any_value.sql", "original_file_path": "macros/utils/any_value.sql", "unique_id": "macro.dbt.default__any_value", "macro_sql": "{% macro default__any_value(expression) -%}\n\n any_value({{ expression }})\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], 
"created_at": 1694181385.6865048, "supported_languages": null}, "macro.dbt.position": {"name": "position", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/position.sql", "original_file_path": "macros/utils/position.sql", "unique_id": "macro.dbt.position", "macro_sql": "{% macro position(substring_text, string_text) -%}\n {{ return(adapter.dispatch('position', 'dbt') (substring_text, string_text)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__position"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.686778, "supported_languages": null}, "macro.dbt.default__position": {"name": "default__position", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/position.sql", "original_file_path": "macros/utils/position.sql", "unique_id": "macro.dbt.default__position", "macro_sql": "{% macro default__position(substring_text, string_text) %}\n\n position(\n {{ substring_text }} in {{ string_text }}\n )\n\n{%- endmacro -%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.686901, "supported_languages": null}, "macro.dbt.string_literal": {"name": "string_literal", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/literal.sql", "original_file_path": "macros/utils/literal.sql", "unique_id": "macro.dbt.string_literal", "macro_sql": "{%- macro string_literal(value) -%}\n {{ return(adapter.dispatch('string_literal', 'dbt') (value)) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__string_literal"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.687121, "supported_languages": null}, "macro.dbt.default__string_literal": {"name": "default__string_literal", "resource_type": "macro", 
"package_name": "dbt", "path": "macros/utils/literal.sql", "original_file_path": "macros/utils/literal.sql", "unique_id": "macro.dbt.default__string_literal", "macro_sql": "{% macro default__string_literal(value) -%}\n '{{ value }}'\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6872451, "supported_languages": null}, "macro.dbt.type_string": {"name": "type_string", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_string", "macro_sql": "\n\n{%- macro type_string() -%}\n {{ return(adapter.dispatch('type_string', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_string"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.688097, "supported_languages": null}, "macro.dbt.default__type_string": {"name": "default__type_string", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_string", "macro_sql": "{% macro default__type_string() %}\n {{ return(api.Column.translate_type(\"string\")) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.688236, "supported_languages": null}, "macro.dbt.type_timestamp": {"name": "type_timestamp", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_timestamp", "macro_sql": "\n\n{%- macro type_timestamp() -%}\n {{ return(adapter.dispatch('type_timestamp', 'dbt')()) }}\n{%- endmacro 
-%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.688443, "supported_languages": null}, "macro.dbt.default__type_timestamp": {"name": "default__type_timestamp", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_timestamp", "macro_sql": "{% macro default__type_timestamp() %}\n {{ return(api.Column.translate_type(\"timestamp\")) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.688582, "supported_languages": null}, "macro.dbt.type_float": {"name": "type_float", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_float", "macro_sql": "\n\n{%- macro type_float() -%}\n {{ return(adapter.dispatch('type_float', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_float"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.68873, "supported_languages": null}, "macro.dbt.default__type_float": {"name": "default__type_float", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_float", "macro_sql": "{% macro default__type_float() %}\n {{ return(api.Column.translate_type(\"float\")) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6888692, 
"supported_languages": null}, "macro.dbt.type_numeric": {"name": "type_numeric", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_numeric", "macro_sql": "\n\n{%- macro type_numeric() -%}\n {{ return(adapter.dispatch('type_numeric', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_numeric"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.689018, "supported_languages": null}, "macro.dbt.default__type_numeric": {"name": "default__type_numeric", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_numeric", "macro_sql": "{% macro default__type_numeric() %}\n {{ return(api.Column.numeric_type(\"numeric\", 28, 6)) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.689182, "supported_languages": null}, "macro.dbt.type_bigint": {"name": "type_bigint", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_bigint", "macro_sql": "\n\n{%- macro type_bigint() -%}\n {{ return(adapter.dispatch('type_bigint', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_bigint"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.689331, "supported_languages": null}, "macro.dbt.default__type_bigint": {"name": "default__type_bigint", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": 
"macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_bigint", "macro_sql": "{% macro default__type_bigint() %}\n {{ return(api.Column.translate_type(\"bigint\")) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6894689, "supported_languages": null}, "macro.dbt.type_int": {"name": "type_int", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_int", "macro_sql": "\n\n{%- macro type_int() -%}\n {{ return(adapter.dispatch('type_int', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_int"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.689625, "supported_languages": null}, "macro.dbt.default__type_int": {"name": "default__type_int", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_int", "macro_sql": "{%- macro default__type_int() -%}\n {{ return(api.Column.translate_type(\"integer\")) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.689758, "supported_languages": null}, "macro.dbt.type_boolean": {"name": "type_boolean", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_boolean", "macro_sql": "\n\n{%- macro type_boolean() -%}\n {{ return(adapter.dispatch('type_boolean', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_boolean"]}, "description": "", 
"meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.689908, "supported_languages": null}, "macro.dbt.default__type_boolean": {"name": "default__type_boolean", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_boolean", "macro_sql": "{%- macro default__type_boolean() -%}\n {{ return(api.Column.translate_type(\"boolean\")) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.690038, "supported_languages": null}, "macro.dbt.array_concat": {"name": "array_concat", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_concat.sql", "original_file_path": "macros/utils/array_concat.sql", "unique_id": "macro.dbt.array_concat", "macro_sql": "{% macro array_concat(array_1, array_2) -%}\n {{ return(adapter.dispatch('array_concat', 'dbt')(array_1, array_2)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__array_concat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.690295, "supported_languages": null}, "macro.dbt.default__array_concat": {"name": "default__array_concat", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_concat.sql", "original_file_path": "macros/utils/array_concat.sql", "unique_id": "macro.dbt.default__array_concat", "macro_sql": "{% macro default__array_concat(array_1, array_2) -%}\n array_cat({{ array_1 }}, {{ array_2 }})\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.690416, "supported_languages": null}, "macro.dbt.bool_or": {"name": 
"bool_or", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/bool_or.sql", "original_file_path": "macros/utils/bool_or.sql", "unique_id": "macro.dbt.bool_or", "macro_sql": "{% macro bool_or(expression) -%}\n {{ return(adapter.dispatch('bool_or', 'dbt') (expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__bool_or"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6906419, "supported_languages": null}, "macro.dbt.default__bool_or": {"name": "default__bool_or", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/bool_or.sql", "original_file_path": "macros/utils/bool_or.sql", "unique_id": "macro.dbt.default__bool_or", "macro_sql": "{% macro default__bool_or(expression) -%}\n\n bool_or({{ expression }})\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.690739, "supported_languages": null}, "macro.dbt.last_day": {"name": "last_day", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/last_day.sql", "original_file_path": "macros/utils/last_day.sql", "unique_id": "macro.dbt.last_day", "macro_sql": "{% macro last_day(date, datepart) %}\n {{ return(adapter.dispatch('last_day', 'dbt') (date, datepart)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__last_day"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6910589, "supported_languages": null}, "macro.dbt.default_last_day": {"name": "default_last_day", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/last_day.sql", "original_file_path": "macros/utils/last_day.sql", "unique_id": "macro.dbt.default_last_day", "macro_sql": "\n\n{%- macro default_last_day(date, datepart) -%}\n cast(\n 
{{dbt.dateadd('day', '-1',\n dbt.dateadd(datepart, '1', dbt.date_trunc(datepart, date))\n )}}\n as date)\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.dateadd", "macro.dbt.date_trunc"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6913059, "supported_languages": null}, "macro.dbt.default__last_day": {"name": "default__last_day", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/last_day.sql", "original_file_path": "macros/utils/last_day.sql", "unique_id": "macro.dbt.default__last_day", "macro_sql": "{% macro default__last_day(date, datepart) -%}\n {{dbt.default_last_day(date, datepart)}}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default_last_day"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.691443, "supported_languages": null}, "macro.dbt.split_part": {"name": "split_part", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/split_part.sql", "original_file_path": "macros/utils/split_part.sql", "unique_id": "macro.dbt.split_part", "macro_sql": "{% macro split_part(string_text, delimiter_text, part_number) %}\n {{ return(adapter.dispatch('split_part', 'dbt') (string_text, delimiter_text, part_number)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__split_part"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.691941, "supported_languages": null}, "macro.dbt.default__split_part": {"name": "default__split_part", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/split_part.sql", "original_file_path": "macros/utils/split_part.sql", "unique_id": "macro.dbt.default__split_part", "macro_sql": "{% macro default__split_part(string_text, delimiter_text, part_number) %}\n\n split_part(\n {{ 
string_text }},\n {{ delimiter_text }},\n {{ part_number }}\n )\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.692151, "supported_languages": null}, "macro.dbt._split_part_negative": {"name": "_split_part_negative", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/split_part.sql", "original_file_path": "macros/utils/split_part.sql", "unique_id": "macro.dbt._split_part_negative", "macro_sql": "{% macro _split_part_negative(string_text, delimiter_text, part_number) %}\n\n split_part(\n {{ string_text }},\n {{ delimiter_text }},\n length({{ string_text }})\n - length(\n replace({{ string_text }}, {{ delimiter_text }}, '')\n ) + 2 + {{ part_number }}\n )\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.692354, "supported_languages": null}, "macro.dbt.date_trunc": {"name": "date_trunc", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/date_trunc.sql", "original_file_path": "macros/utils/date_trunc.sql", "unique_id": "macro.dbt.date_trunc", "macro_sql": "{% macro date_trunc(datepart, date) -%}\n {{ return(adapter.dispatch('date_trunc', 'dbt') (datepart, date)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__date_trunc"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.692611, "supported_languages": null}, "macro.dbt.default__date_trunc": {"name": "default__date_trunc", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/date_trunc.sql", "original_file_path": "macros/utils/date_trunc.sql", "unique_id": "macro.dbt.default__date_trunc", "macro_sql": "{% macro default__date_trunc(datepart, date) -%}\n date_trunc('{{datepart}}', 
{{date}})\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.692728, "supported_languages": null}, "macro.dbt.array_construct": {"name": "array_construct", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_construct.sql", "original_file_path": "macros/utils/array_construct.sql", "unique_id": "macro.dbt.array_construct", "macro_sql": "{% macro array_construct(inputs=[], data_type=api.Column.translate_type('integer')) -%}\n {{ return(adapter.dispatch('array_construct', 'dbt')(inputs, data_type)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__array_construct"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.693068, "supported_languages": null}, "macro.dbt.default__array_construct": {"name": "default__array_construct", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_construct.sql", "original_file_path": "macros/utils/array_construct.sql", "unique_id": "macro.dbt.default__array_construct", "macro_sql": "{% macro default__array_construct(inputs, data_type) -%}\n {% if inputs|length > 0 %}\n array[ {{ inputs|join(' , ') }} ]\n {% else %}\n array[]::{{data_type}}[]\n {% endif %}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6932878, "supported_languages": null}, "macro.dbt.array_append": {"name": "array_append", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_append.sql", "original_file_path": "macros/utils/array_append.sql", "unique_id": "macro.dbt.array_append", "macro_sql": "{% macro array_append(array, new_element) -%}\n {{ return(adapter.dispatch('array_append', 'dbt')(array, new_element)) }}\n{%- 
endmacro %}", "depends_on": {"macros": ["macro.dbt.default__array_append"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.693543, "supported_languages": null}, "macro.dbt.default__array_append": {"name": "default__array_append", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_append.sql", "original_file_path": "macros/utils/array_append.sql", "unique_id": "macro.dbt.default__array_append", "macro_sql": "{% macro default__array_append(array, new_element) -%}\n array_append({{ array }}, {{ new_element }})\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.693664, "supported_languages": null}, "macro.dbt.create_schema": {"name": "create_schema", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/schema.sql", "original_file_path": "macros/adapters/schema.sql", "unique_id": "macro.dbt.create_schema", "macro_sql": "{% macro create_schema(relation) -%}\n {{ adapter.dispatch('create_schema', 'dbt')(relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__create_schema"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6940129, "supported_languages": null}, "macro.dbt.default__create_schema": {"name": "default__create_schema", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/schema.sql", "original_file_path": "macros/adapters/schema.sql", "unique_id": "macro.dbt.default__create_schema", "macro_sql": "{% macro default__create_schema(relation) -%}\n {%- call statement('create_schema') -%}\n create schema if not exists {{ relation.without_identifier() }}\n {% endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": 
{}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6941829, "supported_languages": null}, "macro.dbt.drop_schema": {"name": "drop_schema", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/schema.sql", "original_file_path": "macros/adapters/schema.sql", "unique_id": "macro.dbt.drop_schema", "macro_sql": "{% macro drop_schema(relation) -%}\n {{ adapter.dispatch('drop_schema', 'dbt')(relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__drop_schema"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.694331, "supported_languages": null}, "macro.dbt.default__drop_schema": {"name": "default__drop_schema", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/schema.sql", "original_file_path": "macros/adapters/schema.sql", "unique_id": "macro.dbt.default__drop_schema", "macro_sql": "{% macro default__drop_schema(relation) -%}\n {%- call statement('drop_schema') -%}\n drop schema if exists {{ relation.without_identifier() }} cascade\n {% endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6945002, "supported_languages": null}, "macro.dbt.current_timestamp": {"name": "current_timestamp", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.current_timestamp", "macro_sql": "{%- macro current_timestamp() -%}\n {{ adapter.dispatch('current_timestamp', 'dbt')() }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__current_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], 
"created_at": 1694181385.694944, "supported_languages": null}, "macro.dbt.default__current_timestamp": {"name": "default__current_timestamp", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.default__current_timestamp", "macro_sql": "{% macro default__current_timestamp() -%}\n {{ exceptions.raise_not_implemented(\n 'current_timestamp macro not implemented for adapter ' + adapter.type()) }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.695081, "supported_languages": null}, "macro.dbt.snapshot_get_time": {"name": "snapshot_get_time", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.snapshot_get_time", "macro_sql": "\n\n{%- macro snapshot_get_time() -%}\n {{ adapter.dispatch('snapshot_get_time', 'dbt')() }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__snapshot_get_time"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.695213, "supported_languages": null}, "macro.dbt.default__snapshot_get_time": {"name": "default__snapshot_get_time", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.default__snapshot_get_time", "macro_sql": "{% macro default__snapshot_get_time() %}\n {{ current_timestamp() }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.current_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.695308, "supported_languages": null}, 
"macro.dbt.current_timestamp_backcompat": {"name": "current_timestamp_backcompat", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.current_timestamp_backcompat", "macro_sql": "{% macro current_timestamp_backcompat() %}\n {{ return(adapter.dispatch('current_timestamp_backcompat', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__current_timestamp_backcompat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6954591, "supported_languages": null}, "macro.dbt.default__current_timestamp_backcompat": {"name": "default__current_timestamp_backcompat", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.default__current_timestamp_backcompat", "macro_sql": "{% macro default__current_timestamp_backcompat() %}\n current_timestamp::timestamp\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.695529, "supported_languages": null}, "macro.dbt.current_timestamp_in_utc_backcompat": {"name": "current_timestamp_in_utc_backcompat", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.current_timestamp_in_utc_backcompat", "macro_sql": "{% macro current_timestamp_in_utc_backcompat() %}\n {{ return(adapter.dispatch('current_timestamp_in_utc_backcompat', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__current_timestamp_in_utc_backcompat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], 
"created_at": 1694181385.695678, "supported_languages": null}, "macro.dbt.default__current_timestamp_in_utc_backcompat": {"name": "default__current_timestamp_in_utc_backcompat", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.default__current_timestamp_in_utc_backcompat", "macro_sql": "{% macro default__current_timestamp_in_utc_backcompat() %}\n {{ return(adapter.dispatch('current_timestamp_backcompat', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.current_timestamp_backcompat", "macro.dbt.default__current_timestamp_backcompat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.695893, "supported_languages": null}, "macro.dbt.get_create_index_sql": {"name": "get_create_index_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.get_create_index_sql", "macro_sql": "{% macro get_create_index_sql(relation, index_dict) -%}\n {{ return(adapter.dispatch('get_create_index_sql', 'dbt')(relation, index_dict)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_create_index_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.696683, "supported_languages": null}, "macro.dbt.default__get_create_index_sql": {"name": "default__get_create_index_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.default__get_create_index_sql", "macro_sql": "{% macro default__get_create_index_sql(relation, index_dict) -%}\n {% do return(None) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, 
"docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.696807, "supported_languages": null}, "macro.dbt.create_indexes": {"name": "create_indexes", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.create_indexes", "macro_sql": "{% macro create_indexes(relation) -%}\n {{ adapter.dispatch('create_indexes', 'dbt')(relation) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__create_indexes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.6969512, "supported_languages": null}, "macro.dbt.default__create_indexes": {"name": "default__create_indexes", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.default__create_indexes", "macro_sql": "{% macro default__create_indexes(relation) -%}\n {%- set _indexes = config.get('indexes', default=[]) -%}\n\n {% for _index_dict in _indexes %}\n {% set create_index_sql = get_create_index_sql(relation, _index_dict) %}\n {% if create_index_sql %}\n {% do run_query(create_index_sql) %}\n {% endif %}\n {% endfor %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_create_index_sql", "macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.697322, "supported_languages": null}, "macro.dbt.get_drop_index_sql": {"name": "get_drop_index_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.get_drop_index_sql", "macro_sql": "{% macro get_drop_index_sql(relation, index_name) -%}\n {{ adapter.dispatch('get_drop_index_sql', 
'dbt')(relation, index_name) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_drop_index_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.697492, "supported_languages": null}, "macro.dbt.default__get_drop_index_sql": {"name": "default__get_drop_index_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.default__get_drop_index_sql", "macro_sql": "{% macro default__get_drop_index_sql(relation, index_name) -%}\n {{ exceptions.raise_compiler_error(\"`get_drop_index_sql has not been implemented for this adapter.\") }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.697619, "supported_languages": null}, "macro.dbt.get_show_indexes_sql": {"name": "get_show_indexes_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.get_show_indexes_sql", "macro_sql": "{% macro get_show_indexes_sql(relation) -%}\n {{ adapter.dispatch('get_show_indexes_sql', 'dbt')(relation) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_show_indexes_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.697763, "supported_languages": null}, "macro.dbt.default__get_show_indexes_sql": {"name": "default__get_show_indexes_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.default__get_show_indexes_sql", "macro_sql": "{% macro default__get_show_indexes_sql(relation) -%}\n {{ 
exceptions.raise_compiler_error(\"`get_show_indexes_sql has not been implemented for this adapter.\") }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.697883, "supported_languages": null}, "macro.dbt.make_intermediate_relation": {"name": "make_intermediate_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.make_intermediate_relation", "macro_sql": "{% macro make_intermediate_relation(base_relation, suffix='__dbt_tmp') %}\n {{ return(adapter.dispatch('make_intermediate_relation', 'dbt')(base_relation, suffix)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__make_intermediate_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7005382, "supported_languages": null}, "macro.dbt.default__make_intermediate_relation": {"name": "default__make_intermediate_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__make_intermediate_relation", "macro_sql": "{% macro default__make_intermediate_relation(base_relation, suffix) %}\n {{ return(default__make_temp_relation(base_relation, suffix)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__make_temp_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7006888, "supported_languages": null}, "macro.dbt.make_temp_relation": {"name": "make_temp_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": 
"macro.dbt.make_temp_relation", "macro_sql": "{% macro make_temp_relation(base_relation, suffix='__dbt_tmp') %}\n {{ return(adapter.dispatch('make_temp_relation', 'dbt')(base_relation, suffix)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__make_temp_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.700892, "supported_languages": null}, "macro.dbt.default__make_temp_relation": {"name": "default__make_temp_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__make_temp_relation", "macro_sql": "{% macro default__make_temp_relation(base_relation, suffix) %}\n {%- set temp_identifier = base_relation.identifier ~ suffix -%}\n {%- set temp_relation = base_relation.incorporate(\n path={\"identifier\": temp_identifier}) -%}\n\n {{ return(temp_relation) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.701169, "supported_languages": null}, "macro.dbt.make_backup_relation": {"name": "make_backup_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.make_backup_relation", "macro_sql": "{% macro make_backup_relation(base_relation, backup_relation_type, suffix='__dbt_backup') %}\n {{ return(adapter.dispatch('make_backup_relation', 'dbt')(base_relation, backup_relation_type, suffix)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__make_backup_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7013938, "supported_languages": null}, 
"macro.dbt.default__make_backup_relation": {"name": "default__make_backup_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__make_backup_relation", "macro_sql": "{% macro default__make_backup_relation(base_relation, backup_relation_type, suffix) %}\n {%- set backup_identifier = base_relation.identifier ~ suffix -%}\n {%- set backup_relation = base_relation.incorporate(\n path={\"identifier\": backup_identifier},\n type=backup_relation_type\n ) -%}\n {{ return(backup_relation) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.701685, "supported_languages": null}, "macro.dbt.truncate_relation": {"name": "truncate_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.truncate_relation", "macro_sql": "{% macro truncate_relation(relation) -%}\n {{ return(adapter.dispatch('truncate_relation', 'dbt')(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__truncate_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.701857, "supported_languages": null}, "macro.dbt.default__truncate_relation": {"name": "default__truncate_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__truncate_relation", "macro_sql": "{% macro default__truncate_relation(relation) -%}\n {% call statement('truncate_relation') -%}\n truncate table {{ relation }}\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", 
"meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.702008, "supported_languages": null}, "macro.dbt.rename_relation": {"name": "rename_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.rename_relation", "macro_sql": "{% macro rename_relation(from_relation, to_relation) -%}\n {{ return(adapter.dispatch('rename_relation', 'dbt')(from_relation, to_relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__rename_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.702192, "supported_languages": null}, "macro.dbt.default__rename_relation": {"name": "default__rename_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__rename_relation", "macro_sql": "{% macro default__rename_relation(from_relation, to_relation) -%}\n {% set target_name = adapter.quote_as_configured(to_relation.identifier, 'identifier') %}\n {% call statement('rename_relation') -%}\n alter table {{ from_relation }} rename to {{ target_name }}\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.702507, "supported_languages": null}, "macro.dbt.get_or_create_relation": {"name": "get_or_create_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.get_or_create_relation", "macro_sql": "{% macro get_or_create_relation(database, schema, identifier, type) -%}\n {{ 
return(adapter.dispatch('get_or_create_relation', 'dbt')(database, schema, identifier, type)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_or_create_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.702741, "supported_languages": null}, "macro.dbt.default__get_or_create_relation": {"name": "default__get_or_create_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__get_or_create_relation", "macro_sql": "{% macro default__get_or_create_relation(database, schema, identifier, type) %}\n {%- set target_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) %}\n\n {% if target_relation %}\n {% do return([true, target_relation]) %}\n {% endif %}\n\n {%- set new_relation = api.Relation.create(\n database=database,\n schema=schema,\n identifier=identifier,\n type=type\n ) -%}\n {% do return([false, new_relation]) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.703263, "supported_languages": null}, "macro.dbt.load_cached_relation": {"name": "load_cached_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.load_cached_relation", "macro_sql": "{% macro load_cached_relation(relation) %}\n {% do return(adapter.get_relation(\n database=relation.database,\n schema=relation.schema,\n identifier=relation.identifier\n )) -%}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.703477, 
"supported_languages": null}, "macro.dbt.load_relation": {"name": "load_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.load_relation", "macro_sql": "{% macro load_relation(relation) %}\n {{ return(load_cached_relation(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.703609, "supported_languages": null}, "macro.dbt.drop_relation_if_exists": {"name": "drop_relation_if_exists", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.drop_relation_if_exists", "macro_sql": "{% macro drop_relation_if_exists(relation) %}\n {% if relation is not none %}\n {{ adapter.drop_relation(relation) }}\n {% endif %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.703785, "supported_languages": null}, "macro.dbt.collect_freshness": {"name": "collect_freshness", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/freshness.sql", "original_file_path": "macros/adapters/freshness.sql", "unique_id": "macro.dbt.collect_freshness", "macro_sql": "{% macro collect_freshness(source, loaded_at_field, filter) %}\n {{ return(adapter.dispatch('collect_freshness', 'dbt')(source, loaded_at_field, filter))}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__collect_freshness"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.704171, "supported_languages": null}, "macro.dbt.default__collect_freshness": {"name": 
"default__collect_freshness", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/freshness.sql", "original_file_path": "macros/adapters/freshness.sql", "unique_id": "macro.dbt.default__collect_freshness", "macro_sql": "{% macro default__collect_freshness(source, loaded_at_field, filter) %}\n {% call statement('collect_freshness', fetch_result=True, auto_begin=False) -%}\n select\n max({{ loaded_at_field }}) as max_loaded_at,\n {{ current_timestamp() }} as snapshotted_at\n from {{ source }}\n {% if filter %}\n where {{ filter }}\n {% endif %}\n {% endcall %}\n {{ return(load_result('collect_freshness')) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement", "macro.dbt.current_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.704551, "supported_languages": null}, "macro.dbt.validate_sql": {"name": "validate_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/validate_sql.sql", "original_file_path": "macros/adapters/validate_sql.sql", "unique_id": "macro.dbt.validate_sql", "macro_sql": "{% macro validate_sql(sql) -%}\n {{ return(adapter.dispatch('validate_sql', 'dbt')(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__validate_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.70481, "supported_languages": null}, "macro.dbt.default__validate_sql": {"name": "default__validate_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/validate_sql.sql", "original_file_path": "macros/adapters/validate_sql.sql", "unique_id": "macro.dbt.default__validate_sql", "macro_sql": "{% macro default__validate_sql(sql) -%}\n {% call statement('validate_sql') -%}\n explain {{ sql }}\n {% endcall %}\n {{ return(load_result('validate_sql')) }}\n{% endmacro %}", "depends_on": {"macros": 
["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.705013, "supported_languages": null}, "macro.dbt.copy_grants": {"name": "copy_grants", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.copy_grants", "macro_sql": "{% macro copy_grants() %}\n {{ return(adapter.dispatch('copy_grants', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__copy_grants"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.706601, "supported_languages": null}, "macro.dbt.default__copy_grants": {"name": "default__copy_grants", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__copy_grants", "macro_sql": "{% macro default__copy_grants() %}\n {{ return(True) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.706709, "supported_languages": null}, "macro.dbt.support_multiple_grantees_per_dcl_statement": {"name": "support_multiple_grantees_per_dcl_statement", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.support_multiple_grantees_per_dcl_statement", "macro_sql": "{% macro support_multiple_grantees_per_dcl_statement() %}\n {{ return(adapter.dispatch('support_multiple_grantees_per_dcl_statement', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__support_multiple_grantees_per_dcl_statement"]}, "description": "", "meta": {}, "docs": 
{"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7068648, "supported_languages": null}, "macro.dbt.default__support_multiple_grantees_per_dcl_statement": {"name": "default__support_multiple_grantees_per_dcl_statement", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__support_multiple_grantees_per_dcl_statement", "macro_sql": "\n\n{%- macro default__support_multiple_grantees_per_dcl_statement() -%}\n {{ return(True) }}\n{%- endmacro -%}\n\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7069728, "supported_languages": null}, "macro.dbt.should_revoke": {"name": "should_revoke", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.should_revoke", "macro_sql": "{% macro should_revoke(existing_relation, full_refresh_mode=True) %}\n\n {% if not existing_relation %}\n {#-- The table doesn't already exist, so no grants to copy over --#}\n {{ return(False) }}\n {% elif full_refresh_mode %}\n {#-- The object is being REPLACED -- whether grants are copied over depends on the value of user config --#}\n {{ return(copy_grants()) }}\n {% else %}\n {#-- The table is being merged/upserted/inserted -- grants will be carried over --#}\n {{ return(True) }}\n {% endif %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.copy_grants"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.707289, "supported_languages": null}, "macro.dbt.get_show_grant_sql": {"name": "get_show_grant_sql", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.get_show_grant_sql", "macro_sql": "{% macro get_show_grant_sql(relation) %}\n {{ return(adapter.dispatch(\"get_show_grant_sql\", \"dbt\")(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_show_grant_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.707461, "supported_languages": null}, "macro.dbt.default__get_show_grant_sql": {"name": "default__get_show_grant_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__get_show_grant_sql", "macro_sql": "{% macro default__get_show_grant_sql(relation) %}\n show grants on {{ relation }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.707553, "supported_languages": null}, "macro.dbt.get_grant_sql": {"name": "get_grant_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.get_grant_sql", "macro_sql": "{% macro get_grant_sql(relation, privilege, grantees) %}\n {{ return(adapter.dispatch('get_grant_sql', 'dbt')(relation, privilege, grantees)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_grant_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.70777, "supported_languages": null}, "macro.dbt.default__get_grant_sql": {"name": "default__get_grant_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": 
"macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__get_grant_sql", "macro_sql": "\n\n{%- macro default__get_grant_sql(relation, privilege, grantees) -%}\n grant {{ privilege }} on {{ relation }} to {{ grantees | join(', ') }}\n{%- endmacro -%}\n\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.70795, "supported_languages": null}, "macro.dbt.get_revoke_sql": {"name": "get_revoke_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.get_revoke_sql", "macro_sql": "{% macro get_revoke_sql(relation, privilege, grantees) %}\n {{ return(adapter.dispatch('get_revoke_sql', 'dbt')(relation, privilege, grantees)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_revoke_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.708166, "supported_languages": null}, "macro.dbt.default__get_revoke_sql": {"name": "default__get_revoke_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__get_revoke_sql", "macro_sql": "\n\n{%- macro default__get_revoke_sql(relation, privilege, grantees) -%}\n revoke {{ privilege }} on {{ relation }} from {{ grantees | join(', ') }}\n{%- endmacro -%}\n\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7085152, "supported_languages": null}, "macro.dbt.get_dcl_statement_list": {"name": "get_dcl_statement_list", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": 
"macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.get_dcl_statement_list", "macro_sql": "{% macro get_dcl_statement_list(relation, grant_config, get_dcl_macro) %}\n {{ return(adapter.dispatch('get_dcl_statement_list', 'dbt')(relation, grant_config, get_dcl_macro)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_dcl_statement_list"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7087302, "supported_languages": null}, "macro.dbt.default__get_dcl_statement_list": {"name": "default__get_dcl_statement_list", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__get_dcl_statement_list", "macro_sql": "\n\n{%- macro default__get_dcl_statement_list(relation, grant_config, get_dcl_macro) -%}\n {#\n -- Unpack grant_config into specific privileges and the set of users who need them granted/revoked.\n -- Depending on whether this database supports multiple grantees per statement, pass in the list of\n -- all grantees per privilege, or (if not) template one statement per privilege-grantee pair.\n -- `get_dcl_macro` will be either `get_grant_sql` or `get_revoke_sql`\n #}\n {%- set dcl_statements = [] -%}\n {%- for privilege, grantees in grant_config.items() %}\n {%- if support_multiple_grantees_per_dcl_statement() and grantees -%}\n {%- set dcl = get_dcl_macro(relation, privilege, grantees) -%}\n {%- do dcl_statements.append(dcl) -%}\n {%- else -%}\n {%- for grantee in grantees -%}\n {% set dcl = get_dcl_macro(relation, privilege, [grantee]) %}\n {%- do dcl_statements.append(dcl) -%}\n {% endfor -%}\n {%- endif -%}\n {%- endfor -%}\n {{ return(dcl_statements) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.support_multiple_grantees_per_dcl_statement"]}, "description": "", "meta": {}, "docs": {"show": true, 
"node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.709354, "supported_languages": null}, "macro.dbt.call_dcl_statements": {"name": "call_dcl_statements", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.call_dcl_statements", "macro_sql": "{% macro call_dcl_statements(dcl_statement_list) %}\n {{ return(adapter.dispatch(\"call_dcl_statements\", \"dbt\")(dcl_statement_list)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__call_dcl_statements"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.70953, "supported_languages": null}, "macro.dbt.default__call_dcl_statements": {"name": "default__call_dcl_statements", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__call_dcl_statements", "macro_sql": "{% macro default__call_dcl_statements(dcl_statement_list) %}\n {#\n -- By default, supply all grant + revoke statements in a single semicolon-separated block,\n -- so that they're all processed together.\n\n -- Some databases do not support this. 
Those adapters will need to override this macro\n -- to run each statement individually.\n #}\n {% call statement('grants') %}\n {% for dcl_statement in dcl_statement_list %}\n {{ dcl_statement }};\n {% endfor %}\n {% endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7097542, "supported_languages": null}, "macro.dbt.apply_grants": {"name": "apply_grants", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.apply_grants", "macro_sql": "{% macro apply_grants(relation, grant_config, should_revoke) %}\n {{ return(adapter.dispatch(\"apply_grants\", \"dbt\")(relation, grant_config, should_revoke)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__apply_grants"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.70997, "supported_languages": null}, "macro.dbt.default__apply_grants": {"name": "default__apply_grants", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__apply_grants", "macro_sql": "{% macro default__apply_grants(relation, grant_config, should_revoke=True) %}\n {#-- If grant_config is {} or None, this is a no-op --#}\n {% if grant_config %}\n {% if should_revoke %}\n {#-- We think previous grants may have carried over --#}\n {#-- Show current grants and calculate diffs --#}\n {% set current_grants_table = run_query(get_show_grant_sql(relation)) %}\n {% set current_grants_dict = adapter.standardize_grants_dict(current_grants_table) %}\n {% set needs_granting = diff_of_two_dicts(grant_config, current_grants_dict) %}\n {% set needs_revoking = 
diff_of_two_dicts(current_grants_dict, grant_config) %}\n {% if not (needs_granting or needs_revoking) %}\n {{ log('On ' ~ relation ~': All grants are in place, no revocation or granting needed.')}}\n {% endif %}\n {% else %}\n {#-- We don't think there's any chance of previous grants having carried over. --#}\n {#-- Jump straight to granting what the user has configured. --#}\n {% set needs_revoking = {} %}\n {% set needs_granting = grant_config %}\n {% endif %}\n {% if needs_granting or needs_revoking %}\n {% set revoke_statement_list = get_dcl_statement_list(relation, needs_revoking, get_revoke_sql) %}\n {% set grant_statement_list = get_dcl_statement_list(relation, needs_granting, get_grant_sql) %}\n {% set dcl_statement_list = revoke_statement_list + grant_statement_list %}\n {% if dcl_statement_list %}\n {{ call_dcl_statements(dcl_statement_list) }}\n {% endif %}\n {% endif %}\n {% endif %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query", "macro.dbt.get_show_grant_sql", "macro.dbt.get_dcl_statement_list", "macro.dbt.call_dcl_statements"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.711026, "supported_languages": null}, "macro.dbt.alter_column_comment": {"name": "alter_column_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.alter_column_comment", "macro_sql": "{% macro alter_column_comment(relation, column_dict) -%}\n {{ return(adapter.dispatch('alter_column_comment', 'dbt')(relation, column_dict)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__alter_column_comment"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.711659, "supported_languages": null}, "macro.dbt.default__alter_column_comment": {"name": 
"default__alter_column_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.default__alter_column_comment", "macro_sql": "{% macro default__alter_column_comment(relation, column_dict) -%}\n {{ exceptions.raise_not_implemented(\n 'alter_column_comment macro not implemented for adapter '+adapter.type()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7118158, "supported_languages": null}, "macro.dbt.alter_relation_comment": {"name": "alter_relation_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.alter_relation_comment", "macro_sql": "{% macro alter_relation_comment(relation, relation_comment) -%}\n {{ return(adapter.dispatch('alter_relation_comment', 'dbt')(relation, relation_comment)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__alter_relation_comment"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.712002, "supported_languages": null}, "macro.dbt.default__alter_relation_comment": {"name": "default__alter_relation_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.default__alter_relation_comment", "macro_sql": "{% macro default__alter_relation_comment(relation, relation_comment) -%}\n {{ exceptions.raise_not_implemented(\n 'alter_relation_comment macro not implemented for adapter '+adapter.type()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, 
"patch_path": null, "arguments": [], "created_at": 1694181385.712158, "supported_languages": null}, "macro.dbt.persist_docs": {"name": "persist_docs", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.persist_docs", "macro_sql": "{% macro persist_docs(relation, model, for_relation=true, for_columns=true) -%}\n {{ return(adapter.dispatch('persist_docs', 'dbt')(relation, model, for_relation, for_columns)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.712404, "supported_languages": null}, "macro.dbt.default__persist_docs": {"name": "default__persist_docs", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.default__persist_docs", "macro_sql": "{% macro default__persist_docs(relation, model, for_relation, for_columns) -%}\n {% if for_relation and config.persist_relation_docs() and model.description %}\n {% do run_query(alter_relation_comment(relation, model.description)) %}\n {% endif %}\n\n {% if for_columns and config.persist_column_docs() and model.columns %}\n {% do run_query(alter_column_comment(relation, model.columns)) %}\n {% endif %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query", "macro.dbt.alter_relation_comment", "macro.dbt.alter_column_comment"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.712852, "supported_languages": null}, "macro.dbt.get_catalog": {"name": "get_catalog", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": 
"macro.dbt.get_catalog", "macro_sql": "{% macro get_catalog(information_schema, schemas) -%}\n {{ return(adapter.dispatch('get_catalog', 'dbt')(information_schema, schemas)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_catalog"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.714269, "supported_languages": null}, "macro.dbt.default__get_catalog": {"name": "default__get_catalog", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__get_catalog", "macro_sql": "{% macro default__get_catalog(information_schema, schemas) -%}\n\n {% set typename = adapter.type() %}\n {% set msg -%}\n get_catalog not implemented for {{ typename }}\n {%- endset %}\n\n {{ exceptions.raise_compiler_error(msg) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.714504, "supported_languages": null}, "macro.dbt.information_schema_name": {"name": "information_schema_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.information_schema_name", "macro_sql": "{% macro information_schema_name(database) %}\n {{ return(adapter.dispatch('information_schema_name', 'dbt')(database)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__information_schema_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.714676, "supported_languages": null}, "macro.dbt.default__information_schema_name": {"name": "default__information_schema_name", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__information_schema_name", "macro_sql": "{% macro default__information_schema_name(database) -%}\n {%- if database -%}\n {{ database }}.INFORMATION_SCHEMA\n {%- else -%}\n INFORMATION_SCHEMA\n {%- endif -%}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.714821, "supported_languages": null}, "macro.dbt.list_schemas": {"name": "list_schemas", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.list_schemas", "macro_sql": "{% macro list_schemas(database) -%}\n {{ return(adapter.dispatch('list_schemas', 'dbt')(database)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__list_schemas"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.714985, "supported_languages": null}, "macro.dbt.default__list_schemas": {"name": "default__list_schemas", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__list_schemas", "macro_sql": "{% macro default__list_schemas(database) -%}\n {% set sql %}\n select distinct schema_name\n from {{ information_schema_name(database) }}.SCHEMATA\n where catalog_name ilike '{{ database }}'\n {% endset %}\n {{ return(run_query(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.information_schema_name", "macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.715211, "supported_languages": null}, "macro.dbt.check_schema_exists": {"name": 
"check_schema_exists", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.check_schema_exists", "macro_sql": "{% macro check_schema_exists(information_schema, schema) -%}\n {{ return(adapter.dispatch('check_schema_exists', 'dbt')(information_schema, schema)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__check_schema_exists"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.715403, "supported_languages": null}, "macro.dbt.default__check_schema_exists": {"name": "default__check_schema_exists", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__check_schema_exists", "macro_sql": "{% macro default__check_schema_exists(information_schema, schema) -%}\n {% set sql -%}\n select count(*)\n from {{ information_schema.replace(information_schema_view='SCHEMATA') }}\n where catalog_name='{{ information_schema.database }}'\n and schema_name='{{ schema }}'\n {%- endset %}\n {{ return(run_query(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.replace", "macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7157812, "supported_languages": null}, "macro.dbt.list_relations_without_caching": {"name": "list_relations_without_caching", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.list_relations_without_caching", "macro_sql": "{% macro list_relations_without_caching(schema_relation) %}\n {{ return(adapter.dispatch('list_relations_without_caching', 'dbt')(schema_relation)) }}\n{% endmacro %}", 
"depends_on": {"macros": ["macro.dbt_duckdb.duckdb__list_relations_without_caching"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7159538, "supported_languages": null}, "macro.dbt.default__list_relations_without_caching": {"name": "default__list_relations_without_caching", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__list_relations_without_caching", "macro_sql": "{% macro default__list_relations_without_caching(schema_relation) %}\n {{ exceptions.raise_not_implemented(\n 'list_relations_without_caching macro not implemented for adapter '+adapter.type()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.716104, "supported_languages": null}, "macro.dbt.drop_relation": {"name": "drop_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.drop_relation", "macro_sql": "{% macro drop_relation(relation) -%}\n {{ return(adapter.dispatch('drop_relation', 'dbt')(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__drop_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.716994, "supported_languages": null}, "macro.dbt.default__drop_relation": {"name": "default__drop_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.default__drop_relation", "macro_sql": "{% macro default__drop_relation(relation) -%}\n {% call 
statement('drop_relation', auto_begin=False) -%}\n {%- if relation.is_table -%}\n {{- drop_table(relation) -}}\n {%- elif relation.is_view -%}\n {{- drop_view(relation) -}}\n {%- elif relation.is_materialized_view -%}\n {{- drop_materialized_view(relation) -}}\n {%- else -%}\n drop {{ relation.type }} if exists {{ relation }} cascade\n {%- endif -%}\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement", "macro.dbt.drop_table", "macro.dbt.drop_view", "macro.dbt.drop_materialized_view"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.717406, "supported_languages": null}, "macro.dbt.drop_table": {"name": "drop_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.drop_table", "macro_sql": "{% macro drop_table(relation) -%}\n {{ return(adapter.dispatch('drop_table', 'dbt')(relation)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__drop_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.717574, "supported_languages": null}, "macro.dbt.default__drop_table": {"name": "default__drop_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.default__drop_table", "macro_sql": "{% macro default__drop_table(relation) -%}\n drop table if exists {{ relation }} cascade\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.717669, "supported_languages": null}, "macro.dbt.drop_view": {"name": "drop_view", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.drop_view", "macro_sql": "{% macro drop_view(relation) -%}\n {{ return(adapter.dispatch('drop_view', 'dbt')(relation)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__drop_view"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.717831, "supported_languages": null}, "macro.dbt.default__drop_view": {"name": "default__drop_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.default__drop_view", "macro_sql": "{% macro default__drop_view(relation) -%}\n drop view if exists {{ relation }} cascade\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.717925, "supported_languages": null}, "macro.dbt.drop_materialized_view": {"name": "drop_materialized_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.drop_materialized_view", "macro_sql": "{% macro drop_materialized_view(relation) -%}\n {{ return(adapter.dispatch('drop_materialized_view', 'dbt')(relation)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__drop_materialized_view"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.718084, "supported_languages": null}, "macro.dbt.default__drop_materialized_view": {"name": "default__drop_materialized_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", 
"unique_id": "macro.dbt.default__drop_materialized_view", "macro_sql": "{% macro default__drop_materialized_view(relation) -%}\n drop materialized view if exists {{ relation }} cascade\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7181811, "supported_languages": null}, "macro.dbt.get_columns_in_relation": {"name": "get_columns_in_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_columns_in_relation", "macro_sql": "{% macro get_columns_in_relation(relation) -%}\n {{ return(adapter.dispatch('get_columns_in_relation', 'dbt')(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_columns_in_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.720292, "supported_languages": null}, "macro.dbt.default__get_columns_in_relation": {"name": "default__get_columns_in_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__get_columns_in_relation", "macro_sql": "{% macro default__get_columns_in_relation(relation) -%}\n {{ exceptions.raise_not_implemented(\n 'get_columns_in_relation macro not implemented for adapter '+adapter.type()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7204459, "supported_languages": null}, "macro.dbt.sql_convert_columns_in_relation": {"name": "sql_convert_columns_in_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": 
"macros/adapters/columns.sql", "unique_id": "macro.dbt.sql_convert_columns_in_relation", "macro_sql": "{% macro sql_convert_columns_in_relation(table) -%}\n {% set columns = [] %}\n {% for row in table %}\n {% do columns.append(api.Column(*row)) %}\n {% endfor %}\n {{ return(columns) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.720735, "supported_languages": null}, "macro.dbt.get_empty_subquery_sql": {"name": "get_empty_subquery_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_empty_subquery_sql", "macro_sql": "{% macro get_empty_subquery_sql(select_sql, select_sql_header=none) -%}\n {{ return(adapter.dispatch('get_empty_subquery_sql', 'dbt')(select_sql, select_sql_header)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_empty_subquery_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.720941, "supported_languages": null}, "macro.dbt.default__get_empty_subquery_sql": {"name": "default__get_empty_subquery_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__get_empty_subquery_sql", "macro_sql": "{% macro default__get_empty_subquery_sql(select_sql, select_sql_header=none) %}\n {%- if select_sql_header is not none -%}\n {{ select_sql_header }}\n {%- endif -%}\n select * from (\n {{ select_sql }}\n ) as __dbt_sbq\n where false\n limit 0\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.72113, "supported_languages": null}, 
"macro.dbt.get_empty_schema_sql": {"name": "get_empty_schema_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_empty_schema_sql", "macro_sql": "{% macro get_empty_schema_sql(columns) -%}\n {{ return(adapter.dispatch('get_empty_schema_sql', 'dbt')(columns)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_empty_schema_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7212958, "supported_languages": null}, "macro.dbt.default__get_empty_schema_sql": {"name": "default__get_empty_schema_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__get_empty_schema_sql", "macro_sql": "{% macro default__get_empty_schema_sql(columns) %}\n {%- set col_err = [] -%}\n select\n {% for i in columns %}\n {%- set col = columns[i] -%}\n {%- if col['data_type'] is not defined -%}\n {{ col_err.append(col['name']) }}\n {%- endif -%}\n {% set col_name = adapter.quote(col['name']) if col.get('quote') else col['name'] %}\n cast(null as {{ col['data_type'] }}) as {{ col_name }}{{ \", \" if not loop.last }}\n {%- endfor -%}\n {%- if (col_err | length) > 0 -%}\n {{ exceptions.column_type_missing(column_names=col_err) }}\n {%- endif -%}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.722066, "supported_languages": null}, "macro.dbt.get_column_schema_from_query": {"name": "get_column_schema_from_query", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_column_schema_from_query", 
"macro_sql": "{% macro get_column_schema_from_query(select_sql, select_sql_header=none) -%}\n {% set columns = [] %}\n {# -- Using an 'empty subquery' here to get the same schema as the given select_sql statement, without necessitating a data scan.#}\n {% set sql = get_empty_subquery_sql(select_sql, select_sql_header) %}\n {% set column_schema = adapter.get_column_schema_from_query(sql) %}\n {{ return(column_schema) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_empty_subquery_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7223911, "supported_languages": null}, "macro.dbt.get_columns_in_query": {"name": "get_columns_in_query", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_columns_in_query", "macro_sql": "{% macro get_columns_in_query(select_sql) -%}\n {{ return(adapter.dispatch('get_columns_in_query', 'dbt')(select_sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_columns_in_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.722563, "supported_languages": null}, "macro.dbt.default__get_columns_in_query": {"name": "default__get_columns_in_query", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__get_columns_in_query", "macro_sql": "{% macro default__get_columns_in_query(select_sql) %}\n {% call statement('get_columns_in_query', fetch_result=True, auto_begin=False) -%}\n {{ get_empty_subquery_sql(select_sql) }}\n {% endcall %}\n {{ return(load_result('get_columns_in_query').table.columns | map(attribute='name') | list) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement", 
"macro.dbt.get_empty_subquery_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7228858, "supported_languages": null}, "macro.dbt.alter_column_type": {"name": "alter_column_type", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.alter_column_type", "macro_sql": "{% macro alter_column_type(relation, column_name, new_column_type) -%}\n {{ return(adapter.dispatch('alter_column_type', 'dbt')(relation, column_name, new_column_type)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__alter_column_type"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.723116, "supported_languages": null}, "macro.dbt.default__alter_column_type": {"name": "default__alter_column_type", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__alter_column_type", "macro_sql": "{% macro default__alter_column_type(relation, column_name, new_column_type) -%}\n {#\n 1. Create a new column (w/ temp name and correct type)\n 2. Copy data over to it\n 3. Drop the existing column (cascade!)\n 4. 
Rename the new column to existing column\n #}\n {%- set tmp_column = column_name + \"__dbt_alter\" -%}\n\n {% call statement('alter_column_type') %}\n alter table {{ relation }} add column {{ adapter.quote(tmp_column) }} {{ new_column_type }};\n update {{ relation }} set {{ adapter.quote(tmp_column) }} = {{ adapter.quote(column_name) }};\n alter table {{ relation }} drop column {{ adapter.quote(column_name) }} cascade;\n alter table {{ relation }} rename column {{ adapter.quote(tmp_column) }} to {{ adapter.quote(column_name) }}\n {% endcall %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.72368, "supported_languages": null}, "macro.dbt.alter_relation_add_remove_columns": {"name": "alter_relation_add_remove_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.alter_relation_add_remove_columns", "macro_sql": "{% macro alter_relation_add_remove_columns(relation, add_columns = none, remove_columns = none) -%}\n {{ return(adapter.dispatch('alter_relation_add_remove_columns', 'dbt')(relation, add_columns, remove_columns)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__alter_relation_add_remove_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.723917, "supported_languages": null}, "macro.dbt.default__alter_relation_add_remove_columns": {"name": "default__alter_relation_add_remove_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__alter_relation_add_remove_columns", "macro_sql": "{% macro default__alter_relation_add_remove_columns(relation, 
add_columns, remove_columns) %}\n\n {% if add_columns is none %}\n {% set add_columns = [] %}\n {% endif %}\n {% if remove_columns is none %}\n {% set remove_columns = [] %}\n {% endif %}\n\n {% set sql -%}\n\n alter {{ relation.type }} {{ relation }}\n\n {% for column in add_columns %}\n add column {{ column.name }} {{ column.data_type }}{{ ',' if not loop.last }}\n {% endfor %}{{ ',' if add_columns and remove_columns }}\n\n {% for column in remove_columns %}\n drop column {{ column.name }}{{ ',' if not loop.last }}\n {% endfor %}\n\n {%- endset -%}\n\n {% do run_query(sql) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7246342, "supported_languages": null}, "macro.dbt.resolve_model_name": {"name": "resolve_model_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.resolve_model_name", "macro_sql": "{% macro resolve_model_name(input_model_name) %}\n {{ return(adapter.dispatch('resolve_model_name', 'dbt')(input_model_name)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__resolve_model_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.726177, "supported_languages": null}, "macro.dbt.default__resolve_model_name": {"name": "default__resolve_model_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.default__resolve_model_name", "macro_sql": "\n\n{%- macro default__resolve_model_name(input_model_name) -%}\n {{ input_model_name | string | replace('\"', '\\\"') }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, 
"docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.726323, "supported_languages": null}, "macro.dbt.build_ref_function": {"name": "build_ref_function", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.build_ref_function", "macro_sql": "{% macro build_ref_function(model) %}\n\n {%- set ref_dict = {} -%}\n {%- for _ref in model.refs -%}\n {% set _ref_args = [_ref.get('package'), _ref['name']] if _ref.get('package') else [_ref['name'],] %}\n {%- set resolved = ref(*_ref_args, v=_ref.get('version')) -%}\n {%- if _ref.get('version') -%}\n {% do _ref_args.extend([\"v\" ~ _ref['version']]) %}\n {%- endif -%}\n {%- do ref_dict.update({_ref_args | join('.'): resolve_model_name(resolved)}) -%}\n {%- endfor -%}\n\ndef ref(*args, **kwargs):\n refs = {{ ref_dict | tojson }}\n key = '.'.join(args)\n version = kwargs.get(\"v\") or kwargs.get(\"version\")\n if version:\n key += f\".v{version}\"\n dbt_load_df_function = kwargs.get(\"dbt_load_df_function\")\n return dbt_load_df_function(refs[key])\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.resolve_model_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.727083, "supported_languages": null}, "macro.dbt.build_source_function": {"name": "build_source_function", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.build_source_function", "macro_sql": "{% macro build_source_function(model) %}\n\n {%- set source_dict = {} -%}\n {%- for _source in model.sources -%}\n {%- set resolved = source(*_source) -%}\n {%- do source_dict.update({_source | join('.'): resolve_model_name(resolved)}) -%}\n {%- endfor -%}\n\ndef source(*args, 
dbt_load_df_function):\n sources = {{ source_dict | tojson }}\n key = '.'.join(args)\n return dbt_load_df_function(sources[key])\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.resolve_model_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.727464, "supported_languages": null}, "macro.dbt.build_config_dict": {"name": "build_config_dict", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.build_config_dict", "macro_sql": "{% macro build_config_dict(model) %}\n {%- set config_dict = {} -%}\n {% set config_dbt_used = zip(model.config.config_keys_used, model.config.config_keys_defaults) | list %}\n {%- for key, default in config_dbt_used -%}\n {# weird type testing with enum, would be much easier to write this logic in Python! #}\n {%- if key == \"language\" -%}\n {%- set value = \"python\" -%}\n {%- endif -%}\n {%- set value = model.config.get(key, default) -%}\n {%- do config_dict.update({key: value}) -%}\n {%- endfor -%}\nconfig_dict = {{ config_dict }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.728001, "supported_languages": null}, "macro.dbt.py_script_postfix": {"name": "py_script_postfix", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.py_script_postfix", "macro_sql": "{% macro py_script_postfix(model) %}\n# This part is user provided model code\n# you will need to copy the next section to run the code\n# COMMAND ----------\n# this part is dbt logic for get ref work, do not modify\n\n{{ build_ref_function(model ) }}\n{{ build_source_function(model ) }}\n{{ build_config_dict(model) 
}}\n\nclass config:\n def __init__(self, *args, **kwargs):\n pass\n\n @staticmethod\n def get(key, default=None):\n return config_dict.get(key, default)\n\nclass this:\n \"\"\"dbt.this() or dbt.this.identifier\"\"\"\n database = \"{{ this.database }}\"\n schema = \"{{ this.schema }}\"\n identifier = \"{{ this.identifier }}\"\n {% set this_relation_name = resolve_model_name(this) %}\n def __repr__(self):\n return '{{ this_relation_name }}'\n\n\nclass dbtObj:\n def __init__(self, load_df_function) -> None:\n self.source = lambda *args: source(*args, dbt_load_df_function=load_df_function)\n self.ref = lambda *args, **kwargs: ref(*args, **kwargs, dbt_load_df_function=load_df_function)\n self.config = config\n self.this = this()\n self.is_incremental = {{ is_incremental() }}\n\n# COMMAND ----------\n{{py_script_comment()}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.build_ref_function", "macro.dbt.build_source_function", "macro.dbt.build_config_dict", "macro.dbt.resolve_model_name", "macro.dbt.is_incremental", "macro.dbt.py_script_comment"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.728439, "supported_languages": null}, "macro.dbt.py_script_comment": {"name": "py_script_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.py_script_comment", "macro_sql": "{%macro py_script_comment()%}\n{%endmacro%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7285051, "supported_languages": null}, "macro.dbt.test_unique": {"name": "test_unique", "resource_type": "macro", "package_name": "dbt", "path": "tests/generic/builtin.sql", "original_file_path": "tests/generic/builtin.sql", "unique_id": "macro.dbt.test_unique", "macro_sql": "{% 
test unique(model, column_name) %}\n {% set macro = adapter.dispatch('test_unique', 'dbt') %}\n {{ macro(model, column_name) }}\n{% endtest %}", "depends_on": {"macros": ["macro.dbt.default__test_unique"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.7289732, "supported_languages": null}, "macro.dbt.test_not_null": {"name": "test_not_null", "resource_type": "macro", "package_name": "dbt", "path": "tests/generic/builtin.sql", "original_file_path": "tests/generic/builtin.sql", "unique_id": "macro.dbt.test_not_null", "macro_sql": "{% test not_null(model, column_name) %}\n {% set macro = adapter.dispatch('test_not_null', 'dbt') %}\n {{ macro(model, column_name) }}\n{% endtest %}", "depends_on": {"macros": ["macro.dbt.default__test_not_null"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.729186, "supported_languages": null}, "macro.dbt.test_accepted_values": {"name": "test_accepted_values", "resource_type": "macro", "package_name": "dbt", "path": "tests/generic/builtin.sql", "original_file_path": "tests/generic/builtin.sql", "unique_id": "macro.dbt.test_accepted_values", "macro_sql": "{% test accepted_values(model, column_name, values, quote=True) %}\n {% set macro = adapter.dispatch('test_accepted_values', 'dbt') %}\n {{ macro(model, column_name, values, quote) }}\n{% endtest %}", "depends_on": {"macros": ["macro.dbt.default__test_accepted_values"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.729454, "supported_languages": null}, "macro.dbt.test_relationships": {"name": "test_relationships", "resource_type": "macro", "package_name": "dbt", "path": "tests/generic/builtin.sql", "original_file_path": "tests/generic/builtin.sql", "unique_id": "macro.dbt.test_relationships", "macro_sql": "{% test 
relationships(model, column_name, to, field) %}\n {% set macro = adapter.dispatch('test_relationships', 'dbt') %}\n {{ macro(model, column_name, to, field) }}\n{% endtest %}", "depends_on": {"macros": ["macro.dbt.default__test_relationships"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1694181385.729708, "supported_languages": null}}, "docs": {"doc.dbt.__overview__": {"name": "__overview__", "resource_type": "doc", "package_name": "dbt", "path": "overview.md", "original_file_path": "docs/overview.md", "unique_id": "doc.dbt.__overview__", "block_contents": "### Welcome!\n\nWelcome to the auto-generated documentation for your dbt project!\n\n### Navigation\n\nYou can use the `Project` and `Database` navigation tabs on the left side of the window to explore the models\nin your project.\n\n#### Project Tab\nThe `Project` tab mirrors the directory structure of your dbt project. In this tab, you can see all of the\nmodels defined in your dbt project, as well as models imported from dbt packages.\n\n#### Database Tab\nThe `Database` tab also exposes your models, but in a format that looks more like a database explorer. This view\nshows relations (tables and views) grouped into database schemas. Note that ephemeral models are _not_ shown\nin this interface, as they do not exist in the database.\n\n### Graph Exploration\nYou can click the blue icon on the bottom-right corner of the page to view the lineage graph of your models.\n\nOn model pages, you'll see the immediate parents and children of the model you're exploring. By clicking the `Expand`\nbutton at the top-right of this lineage pane, you'll be able to see all of the models that are used to build,\nor are built from, the model you're exploring.\n\nOnce expanded, you'll be able to use the `--select` and `--exclude` model selection syntax to filter the\nmodels in the graph. 
For more information on model selection, check out the [dbt docs](https://docs.getdbt.com/docs/model-selection-syntax).\n\nNote that you can also right-click on models to interactively filter and explore the graph.\n\n---\n\n### More information\n\n- [What is dbt](https://docs.getdbt.com/docs/introduction)?\n- Read the [dbt viewpoint](https://docs.getdbt.com/docs/viewpoint)\n- [Installation](https://docs.getdbt.com/docs/installation)\n- Join the [dbt Community](https://www.getdbt.com/community/) for questions and discussion"}}, "exposures": {}, "metrics": {}, "groups": {}, "selectors": {}, "disabled": {}, "parent_map": {"model.hacker_news_dbt.comment_daily_stats": ["source.hacker_news_dbt.core.comments"], "model.hacker_news_dbt.activity_daily_stats": ["model.hacker_news_dbt.comment_daily_stats", "model.hacker_news_dbt.story_daily_stats"], "model.hacker_news_dbt.story_daily_stats": ["source.hacker_news_dbt.core.stories"], "test.hacker_news_dbt.assert_true": [], "seed.hacker_news_dbt.full_sample": [], "source.hacker_news_dbt.core.comments": [], "source.hacker_news_dbt.core.stories": []}, "child_map": {"model.hacker_news_dbt.comment_daily_stats": ["model.hacker_news_dbt.activity_daily_stats"], "model.hacker_news_dbt.activity_daily_stats": [], "model.hacker_news_dbt.story_daily_stats": ["model.hacker_news_dbt.activity_daily_stats"], "test.hacker_news_dbt.assert_true": [], "seed.hacker_news_dbt.full_sample": [], "source.hacker_news_dbt.core.comments": ["model.hacker_news_dbt.comment_daily_stats"], "source.hacker_news_dbt.core.stories": ["model.hacker_news_dbt.story_daily_stats"]}, "group_map": {}, "semantic_models": {}} \ No newline at end of file +{"metadata": {"dbt_schema_version": "https://schemas.getdbt.com/dbt/manifest/v10.json", "dbt_version": "1.6.6", "generated_at": "2023-10-11T23:11:16.454865Z", "invocation_id": "cbb264ee-f50c-4a81-a45b-4002bb2020ec", "env": {}, "project_name": "hacker_news_dbt", "project_id": "822c572c4d9d54bd8df768554843c306", "user_id": 
null, "send_anonymous_usage_stats": true, "adapter_type": "duckdb"}, "nodes": {"model.hacker_news_dbt.comment_daily_stats": {"database": "hackernews", "schema": "activity_analytics", "name": "comment_daily_stats", "resource_type": "model", "package_name": "hacker_news_dbt", "path": "activity_analytics/comment_daily_stats.sql", "original_file_path": "models/activity_analytics/comment_daily_stats.sql", "unique_id": "model.hacker_news_dbt.comment_daily_stats", "fqn": ["hacker_news_dbt", "activity_analytics", "comment_daily_stats"], "alias": "comment_daily_stats", "checksum": {"name": "sha256", "checksum": "e824d040f11e1a1d2ca71fe34f2088b57a6c2882c92b2ee1682565db253f9e1c"}, "config": {"enabled": true, "alias": null, "schema": "activity_analytics", "database": null, "tags": [], "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "materialized": "table", "incremental_strategy": null, "persist_docs": {}, "quoting": {}, "column_types": {}, "full_refresh": null, "unique_key": null, "on_schema_change": "ignore", "on_configuration_change": "apply", "grants": {}, "packages": [], "docs": {"show": true, "node_color": null}, "contract": {"enforced": false}, "post-hook": [], "pre-hook": []}, "tags": [], "description": "Summary of comment activity by day", "columns": {"date": {"name": "date", "description": "The date that the stories were posted on.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}, "num_comments": {"name": "num_comments", "description": "The number of comments posted.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}, "posting_users": {"name": "posting_users", "description": "The number of unique users who posted a comment.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}}, "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": "hacker_news_dbt://models/schema.yml", "build_path": null, 
"deferred": false, "unrendered_config": {"materialized": "table", "schema": "activity_analytics", "meta": {"dagster": {"group": "activity_analytics"}}}, "created_at": 1697065878.052214, "relation_name": "\"hackernews\".\"activity_analytics\".\"comment_daily_stats\"", "raw_code": "SELECT date_trunc('day', to_timestamp(time::int)) as date,\n count(DISTINCT user_id) AS commenting_users,\n count(*) AS num_comments\nFROM {{ source('core', 'comments') }}\nGROUP BY 1", "language": "sql", "refs": [], "sources": [["core", "comments"]], "metrics": [], "depends_on": {"macros": [], "nodes": ["source.hacker_news_dbt.core.comments"]}, "compiled_path": null, "contract": {"enforced": false, "checksum": null}, "access": "protected", "constraints": [], "version": null, "latest_version": null, "deprecation_date": null}, "model.hacker_news_dbt.activity_daily_stats": {"database": "hackernews", "schema": "activity_analytics", "name": "activity_daily_stats", "resource_type": "model", "package_name": "hacker_news_dbt", "path": "activity_analytics/activity_daily_stats.sql", "original_file_path": "models/activity_analytics/activity_daily_stats.sql", "unique_id": "model.hacker_news_dbt.activity_daily_stats", "fqn": ["hacker_news_dbt", "activity_analytics", "activity_daily_stats"], "alias": "activity_daily_stats", "checksum": {"name": "sha256", "checksum": "3bb7e086ca21de591744100ca30375f1b10b0164b34121f269360af41613a749"}, "config": {"enabled": true, "alias": null, "schema": "activity_analytics", "database": null, "tags": [], "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "materialized": "table", "incremental_strategy": null, "persist_docs": {}, "quoting": {}, "column_types": {}, "full_refresh": null, "unique_key": null, "on_schema_change": "ignore", "on_configuration_change": "apply", "grants": {}, "packages": [], "docs": {"show": true, "node_color": null}, "contract": {"enforced": false}, "post-hook": [], "pre-hook": []}, "tags": [], "description": "Combined stats 
about activity on each day", "columns": {}, "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": "hacker_news_dbt://models/schema.yml", "build_path": null, "deferred": false, "unrendered_config": {"materialized": "table", "schema": "activity_analytics", "meta": {"dagster": {"group": "activity_analytics"}}}, "created_at": 1697065878.052497, "relation_name": "\"hackernews\".\"activity_analytics\".\"activity_daily_stats\"", "raw_code": "SELECT *\nFROM {{ ref('comment_daily_stats') }}\nFULL OUTER JOIN {{ ref('story_daily_stats') }}\nUSING (date)", "language": "sql", "refs": [{"name": "comment_daily_stats", "package": null, "version": null}, {"name": "story_daily_stats", "package": null, "version": null}], "sources": [], "metrics": [], "depends_on": {"macros": [], "nodes": ["model.hacker_news_dbt.comment_daily_stats", "model.hacker_news_dbt.story_daily_stats"]}, "compiled_path": null, "contract": {"enforced": false, "checksum": null}, "access": "protected", "constraints": [], "version": null, "latest_version": null, "deprecation_date": null}, "model.hacker_news_dbt.story_daily_stats": {"database": "hackernews", "schema": "activity_analytics", "name": "story_daily_stats", "resource_type": "model", "package_name": "hacker_news_dbt", "path": "activity_analytics/story_daily_stats.sql", "original_file_path": "models/activity_analytics/story_daily_stats.sql", "unique_id": "model.hacker_news_dbt.story_daily_stats", "fqn": ["hacker_news_dbt", "activity_analytics", "story_daily_stats"], "alias": "story_daily_stats", "checksum": {"name": "sha256", "checksum": "93633f2202f8d705d941c2e61c0fbbd2a436ad106046de121328c6e4205e526f"}, "config": {"enabled": true, "alias": null, "schema": "activity_analytics", "database": null, "tags": [], "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "materialized": "table", "incremental_strategy": null, "persist_docs": {}, "quoting": {}, "column_types": {}, 
"full_refresh": null, "unique_key": null, "on_schema_change": "ignore", "on_configuration_change": "apply", "grants": {}, "packages": [], "docs": {"show": true, "node_color": null}, "contract": {"enforced": false}, "post-hook": [], "pre-hook": []}, "tags": [], "description": "Summary of posting activity by day", "columns": {"date": {"name": "date", "description": "The date that the stories were posted on.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}, "num_stories": {"name": "num_stories", "description": "The number of stories posted.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}, "posting_users": {"name": "posting_users", "description": "The number of unique users who posted a story.", "meta": {}, "data_type": null, "constraints": [], "quote": null, "tags": []}}, "meta": {"dagster": {"group": "activity_analytics"}}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": "hacker_news_dbt://models/schema.yml", "build_path": null, "deferred": false, "unrendered_config": {"materialized": "table", "schema": "activity_analytics", "meta": {"dagster": {"group": "activity_analytics"}}}, "created_at": 1697065878.051369, "relation_name": "\"hackernews\".\"activity_analytics\".\"story_daily_stats\"", "raw_code": "SELECT date_trunc('day', to_timestamp(time::int)) as date,\n count(DISTINCT user_id) AS posting_users,\n count(*) AS num_stories\nFROM {{ source('core', 'stories') }}\nGROUP BY 1", "language": "sql", "refs": [], "sources": [["core", "stories"]], "metrics": [], "depends_on": {"macros": [], "nodes": ["source.hacker_news_dbt.core.stories"]}, "compiled_path": null, "contract": {"enforced": false, "checksum": null}, "access": "protected", "constraints": [], "version": null, "latest_version": null, "deprecation_date": null}, "test.hacker_news_dbt.assert_true": {"database": "hackernews", "schema": "dbt_test__audit", "name": "assert_true", "resource_type": "test", "package_name": 
"hacker_news_dbt", "path": "assert_true.sql", "original_file_path": "tests/assert_true.sql", "unique_id": "test.hacker_news_dbt.assert_true", "fqn": ["hacker_news_dbt", "assert_true"], "alias": "assert_true", "checksum": {"name": "sha256", "checksum": "e0c343d949749324dcbf7013326a3451c98dbcc737c10bf21fc073a259112e6d"}, "config": {"enabled": true, "alias": null, "schema": "dbt_test__audit", "database": null, "tags": [], "meta": {}, "group": null, "materialized": "test", "severity": "ERROR", "store_failures": null, "where": null, "limit": null, "fail_calc": "count(*)", "warn_if": "!= 0", "error_if": "!= 0"}, "tags": [], "description": "", "columns": {}, "meta": {}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": null, "build_path": null, "deferred": false, "unrendered_config": {}, "created_at": 1697065877.830505, "relation_name": null, "raw_code": "SELECT 'Hello, Tests' LIMIT 0", "language": "sql", "refs": [], "sources": [], "metrics": [], "depends_on": {"macros": [], "nodes": []}, "compiled_path": null, "contract": {"enforced": false, "checksum": null}}, "seed.hacker_news_dbt.full_sample": {"database": "hackernews", "schema": "public", "name": "full_sample", "resource_type": "seed", "package_name": "hacker_news_dbt", "path": "full_sample.csv", "original_file_path": "data/full_sample.csv", "unique_id": "seed.hacker_news_dbt.full_sample", "fqn": ["hacker_news_dbt", "full_sample"], "alias": "full_sample", "checksum": {"name": "sha256", "checksum": "40a06e8e17973fbdc477234066f913ca192dd8e14502f88e5d208eaf679c604b"}, "config": {"enabled": true, "alias": null, "schema": null, "database": null, "tags": [], "meta": {}, "group": null, "materialized": "seed", "incremental_strategy": null, "persist_docs": {}, "quoting": {}, "column_types": {}, "full_refresh": null, "unique_key": null, "on_schema_change": "ignore", "on_configuration_change": "apply", "grants": {}, "packages": [], "docs": {"show": true, "node_color": null}, "contract": {"enforced": 
false}, "quote_columns": false, "post-hook": [], "pre-hook": []}, "tags": [], "description": "", "columns": {}, "meta": {}, "group": null, "docs": {"show": true, "node_color": null}, "patch_path": null, "build_path": null, "deferred": false, "unrendered_config": {"quote_columns": false}, "created_at": 1697065877.839774, "relation_name": "\"hackernews\".\"public\".\"full_sample\"", "raw_code": "", "root_path": "/Users/pedram/projects/dagster/examples/project_fully_featured/dbt_project", "depends_on": {"macros": []}}}, "sources": {"source.hacker_news_dbt.core.comments": {"database": "hackernews", "schema": "core", "name": "comments", "resource_type": "source", "package_name": "hacker_news_dbt", "path": "models/sources.yml", "original_file_path": "models/sources.yml", "unique_id": "source.hacker_news_dbt.core.comments", "fqn": ["hacker_news_dbt", "core", "comments"], "source_name": "core", "source_description": "", "loader": "", "identifier": "comments", "quoting": {"database": null, "schema": null, "identifier": null, "column": null}, "loaded_at_field": null, "freshness": {"warn_after": {"count": null, "period": null}, "error_after": {"count": null, "period": null}, "filter": null}, "external": null, "description": "", "columns": {}, "meta": {}, "source_meta": {}, "tags": [], "config": {"enabled": true}, "patch_path": null, "unrendered_config": {}, "relation_name": "\"hackernews\".\"core\".\"comments\"", "created_at": 1697065878.071954}, "source.hacker_news_dbt.core.stories": {"database": "hackernews", "schema": "core", "name": "stories", "resource_type": "source", "package_name": "hacker_news_dbt", "path": "models/sources.yml", "original_file_path": "models/sources.yml", "unique_id": "source.hacker_news_dbt.core.stories", "fqn": ["hacker_news_dbt", "core", "stories"], "source_name": "core", "source_description": "", "loader": "", "identifier": "stories", "quoting": {"database": null, "schema": null, "identifier": null, "column": null}, "loaded_at_field": null, 
"freshness": {"warn_after": {"count": null, "period": null}, "error_after": {"count": null, "period": null}, "filter": null}, "external": null, "description": "", "columns": {}, "meta": {}, "source_meta": {}, "tags": [], "config": {"enabled": true}, "patch_path": null, "unrendered_config": {}, "relation_name": "\"hackernews\".\"core\".\"stories\"", "created_at": 1697065878.072097}}, "macros": {"macro.hacker_news_dbt.aggregate_actions": {"name": "aggregate_actions", "resource_type": "macro", "package_name": "hacker_news_dbt", "path": "macros/aggregate_actions.sql", "original_file_path": "macros/aggregate_actions.sql", "unique_id": "macro.hacker_news_dbt.aggregate_actions", "macro_sql": "{% macro aggregate_actions(table) %}\n SELECT\n COUNT(*) as num_actions,\n \"by\"\n FROM {{ table }}\n WHERE \"by\" IS NOT NULL\n GROUP BY \"by\"\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.490777, "supported_languages": null}, "macro.hacker_news_dbt.generate_schema_name": {"name": "generate_schema_name", "resource_type": "macro", "package_name": "hacker_news_dbt", "path": "macros/generate_schema_name.sql", "original_file_path": "macros/generate_schema_name.sql", "unique_id": "macro.hacker_news_dbt.generate_schema_name", "macro_sql": "{% macro generate_schema_name(custom_schema_name, node) -%}\n\n {%- set default_schema = target.schema -%}\n {%- if custom_schema_name is none -%}\n\n public\n\n {%- else -%}\n\n {{ custom_schema_name | trim }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.491178, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_binding_char": {"name": "duckdb__get_binding_char", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/seed.sql", 
"original_file_path": "macros/seed.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_binding_char", "macro_sql": "{% macro duckdb__get_binding_char() %}\n {{ return(adapter.get_binding_char()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.520342, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_batch_size": {"name": "duckdb__get_batch_size", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/seed.sql", "original_file_path": "macros/seed.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_batch_size", "macro_sql": "{% macro duckdb__get_batch_size() %}\n {{ return(10000) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.520564, "supported_languages": null}, "macro.dbt_duckdb.duckdb__load_csv_rows": {"name": "duckdb__load_csv_rows", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/seed.sql", "original_file_path": "macros/seed.sql", "unique_id": "macro.dbt_duckdb.duckdb__load_csv_rows", "macro_sql": "{% macro duckdb__load_csv_rows(model, agate_table) %}\n {% if config.get('fast', true) %}\n {% set seed_file_path = adapter.get_seed_file_path(model) %}\n {% set sql %}\n COPY {{ this.render() }} FROM '{{ seed_file_path }}' (FORMAT CSV, HEADER TRUE)\n {% endset %}\n {% do adapter.add_query(sql, abridge_sql_log=True) %}\n {{ return(sql) }}\n {% endif %}\n\n {% set batch_size = get_batch_size() %}\n {% set agate_table = adapter.convert_datetimes_to_strs(agate_table) %}\n {% set cols_sql = get_seed_column_quoted_csv(model, agate_table.column_names) %}\n {% set bindings = [] %}\n\n {% set statements = [] %}\n\n {% for chunk in agate_table.rows | batch(batch_size) %}\n {% set bindings = [] %}\n\n {% for row in chunk %}\n {% do bindings.extend(row) %}\n 
{% endfor %}\n\n {% set sql %}\n insert into {{ this.render() }} ({{ cols_sql }}) values\n {% for row in chunk -%}\n ({%- for column in agate_table.column_names -%}\n {{ get_binding_char() }}\n {%- if not loop.last%},{%- endif %}\n {%- endfor -%})\n {%- if not loop.last%},{%- endif %}\n {%- endfor %}\n {% endset %}\n\n {% do adapter.add_query(sql, bindings=bindings, abridge_sql_log=True) %}\n\n {% if loop.index0 == 0 %}\n {% do statements.append(sql) %}\n {% endif %}\n {% endfor %}\n\n {# Return SQL so we can render it out into the compiled files #}\n {{ return(statements[0]) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_batch_size", "macro.dbt.get_seed_column_quoted_csv", "macro.dbt.get_binding_char"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.522795, "supported_languages": null}, "macro.dbt_duckdb.duckdb__snapshot_merge_sql": {"name": "duckdb__snapshot_merge_sql", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/snapshot_helper.sql", "original_file_path": "macros/snapshot_helper.sql", "unique_id": "macro.dbt_duckdb.duckdb__snapshot_merge_sql", "macro_sql": "{% macro duckdb__snapshot_merge_sql(target, source, insert_cols) -%}\n {%- set insert_cols_csv = insert_cols | join(', ') -%}\n\n update {{ target }} as DBT_INTERNAL_TARGET\n set dbt_valid_to = DBT_INTERNAL_SOURCE.dbt_valid_to\n from {{ source }} as DBT_INTERNAL_SOURCE\n where DBT_INTERNAL_SOURCE.dbt_scd_id::text = DBT_INTERNAL_TARGET.dbt_scd_id::text\n and DBT_INTERNAL_SOURCE.dbt_change_type::text in ('update'::text, 'delete'::text)\n and DBT_INTERNAL_TARGET.dbt_valid_to is null;\n\n insert into {{ target }} ({{ insert_cols_csv }})\n select {% for column in insert_cols -%}\n DBT_INTERNAL_SOURCE.{{ column }} {%- if not loop.last %}, {%- endif %}\n {%- endfor %}\n from {{ source }} as DBT_INTERNAL_SOURCE\n where DBT_INTERNAL_SOURCE.dbt_change_type::text = 'insert'::text;\n\n{% 
endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.523863, "supported_languages": null}, "macro.dbt_duckdb.build_snapshot_staging_table": {"name": "build_snapshot_staging_table", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/snapshot_helper.sql", "original_file_path": "macros/snapshot_helper.sql", "unique_id": "macro.dbt_duckdb.build_snapshot_staging_table", "macro_sql": "{% macro build_snapshot_staging_table(strategy, sql, target_relation) %}\n {% set temp_relation = make_temp_relation(target_relation) %}\n\n {% set select = snapshot_staging_table(strategy, sql, target_relation) %}\n\n {% call statement('build_snapshot_staging_relation') %}\n {{ create_table_as(False, temp_relation, select) }}\n {% endcall %}\n\n {% do return(temp_relation) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_temp_relation", "macro.dbt.snapshot_staging_table", "macro.dbt.statement", "macro.dbt.create_table_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.524377, "supported_languages": null}, "macro.dbt_duckdb.duckdb__post_snapshot": {"name": "duckdb__post_snapshot", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/snapshot_helper.sql", "original_file_path": "macros/snapshot_helper.sql", "unique_id": "macro.dbt_duckdb.duckdb__post_snapshot", "macro_sql": "{% macro duckdb__post_snapshot(staging_relation) %}\n {% do return(drop_relation(staging_relation)) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.drop_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.524565, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_catalog": {"name": "duckdb__get_catalog", "resource_type": "macro", 
"package_name": "dbt_duckdb", "path": "macros/catalog.sql", "original_file_path": "macros/catalog.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_catalog", "macro_sql": "{% macro duckdb__get_catalog(information_schema, schemas) -%}\n {%- call statement('catalog', fetch_result=True) -%}\n select\n '{{ database }}' as table_database,\n t.table_schema,\n t.table_name,\n t.table_type,\n '' as table_comment,\n c.column_name,\n c.ordinal_position as column_index,\n c.data_type column_type,\n '' as column_comment,\n '' as table_owner\n FROM information_schema.tables t JOIN information_schema.columns c ON t.table_schema = c.table_schema AND t.table_name = c.table_name\n WHERE (\n {%- for schema in schemas -%}\n upper(t.table_schema) = upper('{{ schema }}'){%- if not loop.last %} or {% endif -%}\n {%- endfor -%}\n )\n AND t.table_type IN ('BASE TABLE', 'VIEW')\n ORDER BY\n t.table_schema,\n t.table_name,\n c.ordinal_position\n {%- endcall -%}\n {{ return(load_result('catalog').table) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.5253031, "supported_languages": null}, "macro.dbt_duckdb.duckdb__create_schema": {"name": "duckdb__create_schema", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__create_schema", "macro_sql": "{% macro duckdb__create_schema(relation) -%}\n {%- call statement('create_schema') -%}\n create schema if not exists {{ relation.without_identifier() }}\n {%- endcall -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.6036868, "supported_languages": null}, "macro.dbt_duckdb.duckdb__drop_schema": {"name": 
"duckdb__drop_schema", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__drop_schema", "macro_sql": "{% macro duckdb__drop_schema(relation) -%}\n {%- call statement('drop_schema') -%}\n drop schema if exists {{ relation.without_identifier() }} cascade\n {%- endcall -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.60392, "supported_languages": null}, "macro.dbt_duckdb.duckdb__list_schemas": {"name": "duckdb__list_schemas", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__list_schemas", "macro_sql": "{% macro duckdb__list_schemas(database) -%}\n {% set sql %}\n select schema_name\n from system.information_schema.schemata\n {% if database is not none %}\n where catalog_name = '{{ database }}'\n {% endif %}\n {% endset %}\n {{ return(run_query(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.6042318, "supported_languages": null}, "macro.dbt_duckdb.duckdb__check_schema_exists": {"name": "duckdb__check_schema_exists", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__check_schema_exists", "macro_sql": "{% macro duckdb__check_schema_exists(information_schema, schema) -%}\n {% set sql -%}\n select count(*)\n from system.information_schema.schemata\n where schema_name = '{{ schema }}'\n and catalog_name = '{{ information_schema.database }}'\n {%- endset %}\n {{ return(run_query(sql)) }}\n{% 
endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.604512, "supported_languages": null}, "macro.dbt_duckdb.get_column_names": {"name": "get_column_names", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.get_column_names", "macro_sql": "{% macro get_column_names() %}\n {# loop through user_provided_columns to get column names #}\n {%- set user_provided_columns = model['columns'] -%}\n (\n {% for i in user_provided_columns %}\n {% set col = user_provided_columns[i] %}\n {{ col['name'] }} {{ \",\" if not loop.last }}\n {% endfor %}\n )\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.604918, "supported_languages": null}, "macro.dbt_duckdb.duckdb__create_table_as": {"name": "duckdb__create_table_as", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__create_table_as", "macro_sql": "{% macro duckdb__create_table_as(temporary, relation, compiled_code, language='sql') -%}\n {%- if language == 'sql' -%}\n {% set contract_config = config.get('contract') %}\n {% if contract_config.enforced %}\n {{ get_assert_columns_equivalent(compiled_code) }}\n {% endif %}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none }}\n\n create {% if temporary: -%}temporary{%- endif %} table\n {{ relation.include(database=(not temporary), schema=(not temporary)) }}\n {% if contract_config.enforced and not temporary %}\n {#-- DuckDB doesnt support constraints on temp tables --#}\n {{ get_table_columns_and_constraints() }} ;\n insert 
into {{ relation }} {{ get_column_names() }} (\n {{ get_select_subquery(compiled_code) }}\n );\n {% else %}\n as (\n {{ compiled_code }}\n );\n {% endif %}\n {%- elif language == 'python' -%}\n {{ py_write_table(temporary=temporary, relation=relation, compiled_code=compiled_code) }}\n {%- else -%}\n {% do exceptions.raise_compiler_error(\"duckdb__create_table_as macro didn't get supported language, it got %s\" % language) %}\n {%- endif -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_assert_columns_equivalent", "macro.dbt.get_table_columns_and_constraints", "macro.dbt_duckdb.get_column_names", "macro.dbt.get_select_subquery", "macro.dbt_duckdb.py_write_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.606209, "supported_languages": null}, "macro.dbt_duckdb.py_write_table": {"name": "py_write_table", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.py_write_table", "macro_sql": "{% macro py_write_table(temporary, relation, compiled_code) -%}\n{{ compiled_code }}\n\ndef materialize(df, con):\n try:\n import pyarrow\n pyarrow_available = True\n except ImportError:\n pyarrow_available = False\n finally:\n if pyarrow_available and isinstance(df, pyarrow.Table):\n # https://github.com/duckdb/duckdb/issues/6584\n import pyarrow.dataset\n con.execute('create table {{ relation }} as select * from df')\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.6063979, "supported_languages": null}, "macro.dbt_duckdb.duckdb__create_view_as": {"name": "duckdb__create_view_as", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": 
"macro.dbt_duckdb.duckdb__create_view_as", "macro_sql": "{% macro duckdb__create_view_as(relation, sql) -%}\n {% set contract_config = config.get('contract') %}\n {% if contract_config.enforced %}\n {{ get_assert_columns_equivalent(sql) }}\n {%- endif %}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none }}\n create view {{ relation }} as (\n {{ sql }}\n );\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_assert_columns_equivalent"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.606864, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_columns_in_relation": {"name": "duckdb__get_columns_in_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_columns_in_relation", "macro_sql": "{% macro duckdb__get_columns_in_relation(relation) -%}\n {% call statement('get_columns_in_relation', fetch_result=True) %}\n select\n column_name,\n data_type,\n character_maximum_length,\n numeric_precision,\n numeric_scale\n\n from system.information_schema.columns\n where table_name = '{{ relation.identifier }}'\n {% if relation.schema %}\n and table_schema = '{{ relation.schema }}'\n {% endif %}\n {% if relation.database %}\n and table_catalog = '{{ relation.database }}'\n {% endif %}\n order by ordinal_position\n\n {% endcall %}\n {% set table = load_result('get_columns_in_relation').table %}\n {{ return(sql_convert_columns_in_relation(table)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement", "macro.dbt.sql_convert_columns_in_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.6074328, "supported_languages": null}, "macro.dbt_duckdb.duckdb__list_relations_without_caching": 
{"name": "duckdb__list_relations_without_caching", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__list_relations_without_caching", "macro_sql": "{% macro duckdb__list_relations_without_caching(schema_relation) %}\n {% call statement('list_relations_without_caching', fetch_result=True) -%}\n select\n '{{ schema_relation.database }}' as database,\n table_name as name,\n table_schema as schema,\n CASE table_type\n WHEN 'BASE TABLE' THEN 'table'\n WHEN 'VIEW' THEN 'view'\n WHEN 'LOCAL TEMPORARY' THEN 'table'\n END as type\n from system.information_schema.tables\n where table_schema = '{{ schema_relation.schema }}'\n and table_catalog = '{{ schema_relation.database }}'\n {% endcall %}\n {{ return(load_result('list_relations_without_caching').table) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.607811, "supported_languages": null}, "macro.dbt_duckdb.duckdb__drop_relation": {"name": "duckdb__drop_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__drop_relation", "macro_sql": "{% macro duckdb__drop_relation(relation) -%}\n {% call statement('drop_relation', auto_begin=False) -%}\n drop {{ relation.type }} if exists {{ relation }} cascade\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.614732, "supported_languages": null}, "macro.dbt_duckdb.duckdb__rename_relation": {"name": "duckdb__rename_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", 
"original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__rename_relation", "macro_sql": "{% macro duckdb__rename_relation(from_relation, to_relation) -%}\n {% set target_name = adapter.quote_as_configured(to_relation.identifier, 'identifier') %}\n {% call statement('rename_relation') -%}\n alter {{ to_relation.type }} {{ from_relation }} rename to {{ target_name }}\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.654179, "supported_languages": null}, "macro.dbt_duckdb.duckdb__make_temp_relation": {"name": "duckdb__make_temp_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__make_temp_relation", "macro_sql": "{% macro duckdb__make_temp_relation(base_relation, suffix) %}\n {% set tmp_identifier = base_relation.identifier ~ suffix ~ py_current_timestring() %}\n {% do return(base_relation.incorporate(\n path={\n \"identifier\": tmp_identifier,\n \"schema\": none,\n \"database\": none\n })) -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.py_current_timestring"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.654651, "supported_languages": null}, "macro.dbt_duckdb.duckdb__current_timestamp": {"name": "duckdb__current_timestamp", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__current_timestamp", "macro_sql": "{% macro duckdb__current_timestamp() -%}\n now()\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], 
"created_at": 1697065876.654756, "supported_languages": null}, "macro.dbt_duckdb.duckdb__snapshot_string_as_time": {"name": "duckdb__snapshot_string_as_time", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__snapshot_string_as_time", "macro_sql": "{% macro duckdb__snapshot_string_as_time(timestamp) -%}\n {%- set result = \"'\" ~ timestamp ~ \"'::timestamp\" -%}\n {{ return(result) }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.654958, "supported_languages": null}, "macro.dbt_duckdb.duckdb__snapshot_get_time": {"name": "duckdb__snapshot_get_time", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__snapshot_get_time", "macro_sql": "{% macro duckdb__snapshot_get_time() -%}\n {{ current_timestamp() }}::timestamp\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.current_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.655075, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_incremental_default_sql": {"name": "duckdb__get_incremental_default_sql", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_incremental_default_sql", "macro_sql": "{% macro duckdb__get_incremental_default_sql(arg_dict) %}\n {% do return(get_incremental_delete_insert_sql(arg_dict)) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_incremental_delete_insert_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], 
"created_at": 1697065876.655243, "supported_languages": null}, "macro.dbt_duckdb.location_exists": {"name": "location_exists", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.location_exists", "macro_sql": "{% macro location_exists(location) -%}\n {% do return(adapter.location_exists(location)) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.655499, "supported_languages": null}, "macro.dbt_duckdb.write_to_file": {"name": "write_to_file", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.write_to_file", "macro_sql": "{% macro write_to_file(relation, location, options) -%}\n {% call statement('write_to_file') -%}\n copy {{ relation }} to '{{ location }}' ({{ options }})\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.655756, "supported_languages": null}, "macro.dbt_duckdb.store_relation": {"name": "store_relation", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.store_relation", "macro_sql": "{% macro store_relation(plugin, relation, location, format) -%}\n {%- set column_list = adapter.get_columns_in_relation(relation) -%}\n {% do adapter.store_relation(plugin, relation, column_list, location, format) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.6560671, "supported_languages": 
null}, "macro.dbt_duckdb.render_write_options": {"name": "render_write_options", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/adapters.sql", "original_file_path": "macros/adapters.sql", "unique_id": "macro.dbt_duckdb.render_write_options", "macro_sql": "{% macro render_write_options(config) -%}\n {% set options = config.get('options', {}) %}\n {% for k in options %}\n {% if options[k] is string %}\n {% set _ = options.update({k: render(options[k])}) %}\n {% else %}\n {% set _ = options.update({k: render(options[k])}) %}\n {% endif %}\n {% endfor %}\n\n {# legacy top-level write options #}\n {% if config.get('format') %}\n {% set _ = options.update({'format': render(config.get('format'))}) %}\n {% endif %}\n {% if config.get('delimiter') %}\n {% set _ = options.update({'delimiter': render(config.get('delimiter'))}) %}\n {% endif %}\n\n {% do return(options) %}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.667265, "supported_languages": null}, "macro.dbt_duckdb.duckdb__get_delete_insert_merge_sql": {"name": "duckdb__get_delete_insert_merge_sql", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/incremental_helper.sql", "original_file_path": "macros/incremental_helper.sql", "unique_id": "macro.dbt_duckdb.duckdb__get_delete_insert_merge_sql", "macro_sql": "{% macro duckdb__get_delete_insert_merge_sql(target, source, unique_key, dest_columns, incremental_predicates) -%}\n\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n\n {% if unique_key %}\n {% if unique_key is sequence and unique_key is not string %}\n delete from {{target }} as DBT_INCREMENTAL_TARGET\n using {{ source }}\n where (\n {% for key in unique_key %}\n {{ source }}.{{ key }} = DBT_INCREMENTAL_TARGET.{{ key }}\n {{ \"and \" if not loop.last}}\n {% endfor %}\n {% if 
incremental_predicates %}\n {% for predicate in incremental_predicates %}\n and {{ predicate }}\n {% endfor %}\n {% endif %}\n );\n {% else %}\n delete from {{ target }}\n where (\n {{ unique_key }}) in (\n select ({{ unique_key }})\n from {{ source }}\n )\n {%- if incremental_predicates %}\n {% for predicate in incremental_predicates %}\n and {{ predicate }}\n {% endfor %}\n {%- endif -%};\n\n {% endif %}\n {% endif %}\n\n insert into {{ target }} ({{ dest_cols_csv }})\n (\n select {{ dest_cols_csv }}\n from {{ source }}\n )\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.685994, "supported_languages": null}, "macro.dbt_duckdb.duckdb__alter_relation_add_remove_columns": {"name": "duckdb__alter_relation_add_remove_columns", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/columns.sql", "original_file_path": "macros/columns.sql", "unique_id": "macro.dbt_duckdb.duckdb__alter_relation_add_remove_columns", "macro_sql": "{% macro duckdb__alter_relation_add_remove_columns(relation, add_columns, remove_columns) %}\n\n {% if add_columns %}\n {% for column in add_columns %}\n {% set sql -%}\n alter {{ relation.type }} {{ relation }} add column\n {{ column.name }} {{ column.data_type }}\n {%- endset -%}\n {% do run_query(sql) %}\n {% endfor %}\n {% endif %}\n\n {% if remove_columns %}\n {% for column in remove_columns %}\n {% set sql -%}\n alter {{ relation.type }} {{ relation }} drop column\n {{ column.name }}\n {%- endset -%}\n {% do run_query(sql) %}\n {% endfor %}\n {% endif %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.690801, "supported_languages": null}, "macro.dbt_duckdb.materialization_table_duckdb": {"name": 
"materialization_table_duckdb", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/materializations/table.sql", "original_file_path": "macros/materializations/table.sql", "unique_id": "macro.dbt_duckdb.materialization_table_duckdb", "macro_sql": "{% materialization table, adapter=\"duckdb\", supported_languages=['sql', 'python'] %}\n\n {%- set language = model['language'] -%}\n\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='table') %}\n {%- set intermediate_relation = make_intermediate_relation(target_relation) -%}\n -- the intermediate_relation should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%}\n /*\n See ../view/view.sql for more information about this relation.\n */\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n -- as above, the backup_relation should not already exist\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% call statement('main', language=language) -%}\n {{- create_table_as(False, intermediate_relation, compiled_code, language) }}\n {%- endcall %}\n\n -- cleanup\n {% if 
existing_relation is not none %}\n {{ adapter.rename_relation(existing_relation, backup_relation) }}\n {% endif %}\n\n {{ adapter.rename_relation(intermediate_relation, target_relation) }}\n\n {% do create_indexes(target_relation) %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n -- `COMMIT` happens here\n {{ adapter.commit() }}\n\n -- finally, drop the existing/backup relation after the commit\n {{ drop_relation_if_exists(backup_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.statement", "macro.dbt.create_table_as", "macro.dbt.create_indexes", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.693953, "supported_languages": ["sql", "python"]}, "macro.dbt_duckdb.materialization_external_duckdb": {"name": "materialization_external_duckdb", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/materializations/external.sql", "original_file_path": "macros/materializations/external.sql", "unique_id": "macro.dbt_duckdb.materialization_external_duckdb", "macro_sql": "{% materialization external, adapter=\"duckdb\", supported_languages=['sql', 'python'] %}\n\n {%- set location = render(config.get('location', default=external_location(this, config))) -%})\n {%- set rendered_options = render_write_options(config) -%}\n {%- set format = config.get('format', 
'parquet') -%}\n {%- set write_options = adapter.external_write_options(location, rendered_options) -%}\n {%- set read_location = adapter.external_read_location(location, rendered_options) -%}\n\n -- set language - python or sql\n {%- set language = model['language'] -%}\n\n {%- set target_relation = this.incorporate(type='view') %}\n\n -- Continue as normal materialization\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set temp_relation = make_intermediate_relation(this.incorporate(type='table'), suffix='__dbt_tmp') -%}\n {%- set intermediate_relation = make_intermediate_relation(target_relation, suffix='__dbt_int') -%}\n -- the intermediate_relation should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation\n {%- set preexisting_temp_relation = load_cached_relation(temp_relation) -%}\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%}\n /*\n See ../view/view.sql for more information about this relation.\n */\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n -- as above, the backup_relation should not already exist\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_temp_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n 
{% call statement('create_table', language=language) -%}\n {{- create_table_as(False, temp_relation, compiled_code, language) }}\n {%- endcall %}\n\n -- write an temp relation into file\n {{ write_to_file(temp_relation, location, write_options) }}\n -- create a view on top of the location\n {% call statement('main', language='sql') -%}\n create or replace view {{ intermediate_relation }} as (\n select * from '{{ read_location }}'\n );\n {%- endcall %}\n\n -- cleanup\n {% if existing_relation is not none %}\n {{ adapter.rename_relation(existing_relation, backup_relation) }}\n {% endif %}\n\n {{ adapter.rename_relation(intermediate_relation, target_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n -- `COMMIT` happens here\n {{ adapter.commit() }}\n\n -- finally, drop the existing/backup relation after the commit\n {{ drop_relation_if_exists(backup_relation) }}\n {{ drop_relation_if_exists(temp_relation) }}\n\n -- register table into glue\n {%- set plugin_name = config.get('plugin') -%}\n {%- set glue_register = config.get('glue_register', default=false) -%}\n {% if plugin_name is not none or glue_register is true %}\n {% if glue_register %}\n {# legacy hack to set the glue database name, deprecate this #}\n {%- set plugin_name = 'glue|' ~ config.get('glue_database', 'default') -%}\n {% endif %}\n {% do store_relation(plugin_name, target_relation, location, format) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt_duckdb.external_location", "macro.dbt_duckdb.render_write_options", "macro.dbt.load_cached_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", 
"macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.statement", "macro.dbt.create_table_as", "macro.dbt_duckdb.write_to_file", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt_duckdb.store_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.6996531, "supported_languages": ["sql", "python"]}, "macro.dbt_duckdb.materialization_incremental_duckdb": {"name": "materialization_incremental_duckdb", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/materializations/incremental.sql", "original_file_path": "macros/materializations/incremental.sql", "unique_id": "macro.dbt_duckdb.materialization_incremental_duckdb", "macro_sql": "{% materialization incremental, adapter=\"duckdb\", supported_languages=['sql', 'python'] -%}\n\n {%- set language = model['language'] -%}\n\n -- relations\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='table') -%}\n {%- set temp_relation = make_temp_relation(target_relation)-%}\n {%- set intermediate_relation = make_intermediate_relation(target_relation)-%}\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n\n -- configs\n {%- set unique_key = config.get('unique_key') -%}\n {%- set full_refresh_mode = (should_full_refresh() or existing_relation.is_view) -%}\n {%- set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') -%}\n\n -- the temp_ and backup_ relations should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation. 
This has to happen before\n -- BEGIN, in a separate transaction\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation)-%}\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n {% set to_drop = [] %}\n\n {% if existing_relation is none %}\n {% set build_sql = create_table_as(False, target_relation, compiled_code, language) %}\n {% elif full_refresh_mode %}\n {% set build_sql = create_table_as(False, intermediate_relation, compiled_code, language) %}\n {% set need_swap = true %}\n {% else %}\n {% if language == 'python' %}\n {% set build_python = create_table_as(False, temp_relation, compiled_code, language) %}\n {% call statement(\"pre\", language=language) %}\n {{- build_python }}\n {% endcall %}\n {% else %} {# SQL #}\n {% do run_query(create_table_as(True, temp_relation, compiled_code, language)) %}\n {% endif %}\n {% do adapter.expand_target_column_types(\n from_relation=temp_relation,\n to_relation=target_relation) %}\n {#-- Process schema changes. Returns dict of changes if successful. 
Use source columns for upserting/merging --#}\n {% set dest_columns = process_schema_changes(on_schema_change, temp_relation, existing_relation) %}\n {% if not dest_columns %}\n {% set dest_columns = adapter.get_columns_in_relation(existing_relation) %}\n {% endif %}\n\n {#-- Get the incremental_strategy, the macro to use for the strategy, and build the sql --#}\n {% set incremental_strategy = config.get('incremental_strategy') or 'default' %}\n {% set incremental_predicates = config.get('predicates', none) or config.get('incremental_predicates', none) %}\n {% set strategy_sql_macro_func = adapter.get_incremental_strategy_macro(context, incremental_strategy) %}\n {% set strategy_arg_dict = ({'target_relation': target_relation, 'temp_relation': temp_relation, 'unique_key': unique_key, 'dest_columns': dest_columns, 'incremental_predicates': incremental_predicates }) %}\n {% set build_sql = strategy_sql_macro_func(strategy_arg_dict) %}\n {% set language = \"sql\" %}\n\n {% endif %}\n\n {% call statement(\"main\", language=language) %}\n {{- build_sql }}\n {% endcall %}\n\n {% if need_swap %}\n {% do adapter.rename_relation(target_relation, backup_relation) %}\n {% do adapter.rename_relation(intermediate_relation, target_relation) %}\n {% do to_drop.append(backup_relation) %}\n {% endif %}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {% if existing_relation is none or existing_relation.is_view or should_full_refresh() %}\n {% do create_indexes(target_relation) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n -- `COMMIT` happens here\n {% do adapter.commit() %}\n\n {% for rel in to_drop %}\n {% do adapter.drop_relation(rel) %}\n {% endfor %}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{%- 
endmaterialization %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.make_temp_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.should_full_refresh", "macro.dbt.incremental_validate_on_schema_change", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.create_table_as", "macro.dbt.statement", "macro.dbt.run_query", "macro.dbt.process_schema_changes", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt.create_indexes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.705811, "supported_languages": ["sql", "python"]}, "macro.dbt_duckdb.duckdb__dateadd": {"name": "duckdb__dateadd", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/dateadd.sql", "original_file_path": "macros/utils/dateadd.sql", "unique_id": "macro.dbt_duckdb.duckdb__dateadd", "macro_sql": "{% macro duckdb__dateadd(datepart, interval, from_date_or_timestamp) %}\n\n {{ from_date_or_timestamp }} + ((interval '1 {{ datepart }}') * ({{ interval }}))\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.706086, "supported_languages": null}, "macro.dbt_duckdb.duckdb__listagg": {"name": "duckdb__listagg", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/listagg.sql", "original_file_path": "macros/utils/listagg.sql", "unique_id": "macro.dbt_duckdb.duckdb__listagg", "macro_sql": "{% macro duckdb__listagg(measure, delimiter_text, order_by_clause, limit_num) -%}\n {% if limit_num -%}\n list_aggr(\n (array_agg(\n {{ measure }}\n {% if order_by_clause -%}\n {{ order_by_clause }}\n {%- endif %}\n ))[1:{{ limit_num }}],\n 'string_agg',\n {{ delimiter_text }}\n )\n {%- else %}\n string_agg(\n {{ measure }},\n {{ 
delimiter_text }}\n {% if order_by_clause -%}\n {{ order_by_clause }}\n {%- endif %}\n )\n {%- endif %}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.70678, "supported_languages": null}, "macro.dbt_duckdb.duckdb__datediff": {"name": "duckdb__datediff", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/datediff.sql", "original_file_path": "macros/utils/datediff.sql", "unique_id": "macro.dbt_duckdb.duckdb__datediff", "macro_sql": "{% macro duckdb__datediff(first_date, second_date, datepart) -%}\n\n {% if datepart == 'year' %}\n (date_part('year', ({{second_date}})::date) - date_part('year', ({{first_date}})::date))\n {% elif datepart == 'quarter' %}\n ({{ datediff(first_date, second_date, 'year') }} * 4 + date_part('quarter', ({{second_date}})::date) - date_part('quarter', ({{first_date}})::date))\n {% elif datepart == 'month' %}\n ({{ datediff(first_date, second_date, 'year') }} * 12 + date_part('month', ({{second_date}})::date) - date_part('month', ({{first_date}})::date))\n {% elif datepart == 'day' %}\n (({{second_date}})::date - ({{first_date}})::date)\n {% elif datepart == 'week' %}\n ({{ datediff(first_date, second_date, 'day') }} / 7 + case\n when date_part('dow', ({{first_date}})::timestamp) <= date_part('dow', ({{second_date}})::timestamp) then\n case when {{first_date}} <= {{second_date}} then 0 else -1 end\n else\n case when {{first_date}} <= {{second_date}} then 1 else 0 end\n end)\n {% elif datepart == 'hour' %}\n ({{ datediff(first_date, second_date, 'day') }} * 24 + date_part('hour', ({{second_date}})::timestamp) - date_part('hour', ({{first_date}})::timestamp))\n {% elif datepart == 'minute' %}\n ({{ datediff(first_date, second_date, 'hour') }} * 60 + date_part('minute', ({{second_date}})::timestamp) - date_part('minute', ({{first_date}})::timestamp))\n {% elif datepart == 'second' 
%}\n ({{ datediff(first_date, second_date, 'minute') }} * 60 + floor(date_part('second', ({{second_date}})::timestamp)) - floor(date_part('second', ({{first_date}})::timestamp)))\n {% elif datepart == 'millisecond' %}\n ({{ datediff(first_date, second_date, 'minute') }} * 60000 + floor(date_part('millisecond', ({{second_date}})::timestamp)) - floor(date_part('millisecond', ({{first_date}})::timestamp)))\n {% elif datepart == 'microsecond' %}\n ({{ datediff(first_date, second_date, 'minute') }} * 60000000 + floor(date_part('microsecond', ({{second_date}})::timestamp)) - floor(date_part('microsecond', ({{first_date}})::timestamp)))\n {% else %}\n {{ exceptions.raise_compiler_error(\"Unsupported datepart for macro datediff in postgres: {!r}\".format(datepart)) }}\n {% endif %}\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.datediff"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.710272, "supported_languages": null}, "macro.dbt_duckdb.duckdb__any_value": {"name": "duckdb__any_value", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/any_value.sql", "original_file_path": "macros/utils/any_value.sql", "unique_id": "macro.dbt_duckdb.duckdb__any_value", "macro_sql": "{% macro duckdb__any_value(expression) -%}\n\n arbitrary({{ expression }})\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.71043, "supported_languages": null}, "macro.dbt_duckdb.register_upstream_external_models": {"name": "register_upstream_external_models", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/upstream.sql", "original_file_path": "macros/utils/upstream.sql", "unique_id": "macro.dbt_duckdb.register_upstream_external_models", "macro_sql": "{%- macro register_upstream_external_models() -%}\n{% if execute 
%}\n{% set upstream_nodes = {} %}\n{% set upstream_schemas = {} %}\n{% for node in selected_resources %}\n {% for upstream_node in graph['nodes'][node]['depends_on']['nodes'] %}\n {% if upstream_node not in upstream_nodes and upstream_node not in selected_resources %}\n {% do upstream_nodes.update({upstream_node: None}) %}\n {% set upstream = graph['nodes'].get(upstream_node) %}\n {% if upstream\n and upstream.resource_type in ('model', 'seed')\n and upstream.config.materialized=='external'\n %}\n {%- set upstream_rel = api.Relation.create(\n database=upstream['database'],\n schema=upstream['schema'],\n identifier=upstream['alias']\n ) -%}\n {%- set location = upstream.config.get('location', external_location(upstream_rel, upstream.config)) -%}\n {%- set rendered_options = render_write_options(upstream.config) -%}\n {%- set upstream_location = adapter.external_read_location(location, rendered_options) -%}\n {% if upstream_rel.schema not in upstream_schemas %}\n {% call statement('main', language='sql') -%}\n create schema if not exists {{ upstream_rel.schema }}\n {%- endcall %}\n {% do upstream_schemas.update({upstream_rel.schema: None}) %}\n {% endif %}\n {% call statement('main', language='sql') -%}\n create or replace view {{ upstream_rel }} as (\n select * from '{{ upstream_location }}'\n );\n {%- endcall %}\n {%- endif %}\n {% endif %}\n {% endfor %}\n{% endfor %}\n{% do adapter.commit() %}\n{% endif %}\n{%- endmacro -%}", "depends_on": {"macros": ["macro.dbt_duckdb.external_location", "macro.dbt_duckdb.render_write_options", "macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.713131, "supported_languages": null}, "macro.dbt_duckdb.duckdb__split_part": {"name": "duckdb__split_part", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/splitpart.sql", "original_file_path": "macros/utils/splitpart.sql", "unique_id": 
"macro.dbt_duckdb.duckdb__split_part", "macro_sql": "{% macro duckdb__split_part(string_text, delimiter_text, part_number) %}\n string_split({{ string_text }}, {{ delimiter_text }})[ {{ part_number }} ]\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.71337, "supported_languages": null}, "macro.dbt_duckdb.duckdb__last_day": {"name": "duckdb__last_day", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/lastday.sql", "original_file_path": "macros/utils/lastday.sql", "unique_id": "macro.dbt_duckdb.duckdb__last_day", "macro_sql": "{% macro duckdb__last_day(date, datepart) -%}\n\n {%- if datepart == 'quarter' -%}\n -- duckdb dateadd does not support quarter interval.\n cast(\n {{dbt.dateadd('day', '-1',\n dbt.dateadd('month', '3', dbt.date_trunc(datepart, date))\n )}}\n as date)\n {%- else -%}\n {{dbt.default_last_day(date, datepart)}}\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.dateadd", "macro.dbt.date_trunc", "macro.dbt.default_last_day"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.713907, "supported_languages": null}, "macro.dbt_duckdb.external_location": {"name": "external_location", "resource_type": "macro", "package_name": "dbt_duckdb", "path": "macros/utils/external_location.sql", "original_file_path": "macros/utils/external_location.sql", "unique_id": "macro.dbt_duckdb.external_location", "macro_sql": "{%- macro external_location(relation, config) -%}\n {%- if config.get('options', {}).get('partition_by') is none -%}\n {%- set format = config.get('format', 'parquet') -%}\n {{- adapter.external_root() }}/{{ relation.identifier }}.{{ format }}\n {%- else -%}\n {{- adapter.external_root() }}/{{ relation.identifier }}\n {%- endif -%}\n{%- endmacro -%}", "depends_on": {"macros": 
[]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7145119, "supported_languages": null}, "macro.dbt.run_hooks": {"name": "run_hooks", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": "macro.dbt.run_hooks", "macro_sql": "{% macro run_hooks(hooks, inside_transaction=True) %}\n {% for hook in hooks | selectattr('transaction', 'equalto', inside_transaction) %}\n {% if not inside_transaction and loop.first %}\n {% call statement(auto_begin=inside_transaction) %}\n commit;\n {% endcall %}\n {% endif %}\n {% set rendered = render(hook.get('sql')) | trim %}\n {% if (rendered | length) > 0 %}\n {% call statement(auto_begin=inside_transaction) %}\n {{ rendered }}\n {% endcall %}\n {% endif %}\n {% endfor %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.715702, "supported_languages": null}, "macro.dbt.make_hook_config": {"name": "make_hook_config", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": "macro.dbt.make_hook_config", "macro_sql": "{% macro make_hook_config(sql, inside_transaction) %}\n {{ tojson({\"sql\": sql, \"transaction\": inside_transaction}) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.715917, "supported_languages": null}, "macro.dbt.before_begin": {"name": "before_begin", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": 
"macro.dbt.before_begin", "macro_sql": "{% macro before_begin(sql) %}\n {{ make_hook_config(sql, inside_transaction=False) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_hook_config"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.716075, "supported_languages": null}, "macro.dbt.in_transaction": {"name": "in_transaction", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": "macro.dbt.in_transaction", "macro_sql": "{% macro in_transaction(sql) %}\n {{ make_hook_config(sql, inside_transaction=True) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_hook_config"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7162318, "supported_languages": null}, "macro.dbt.after_commit": {"name": "after_commit", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/hooks.sql", "original_file_path": "macros/materializations/hooks.sql", "unique_id": "macro.dbt.after_commit", "macro_sql": "{% macro after_commit(sql) %}\n {{ make_hook_config(sql, inside_transaction=False) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_hook_config"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7163908, "supported_languages": null}, "macro.dbt.set_sql_header": {"name": "set_sql_header", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/configs.sql", "original_file_path": "macros/materializations/configs.sql", "unique_id": "macro.dbt.set_sql_header", "macro_sql": "{% macro set_sql_header(config) -%}\n {{ config.set('sql_header', caller()) }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, 
"docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7168, "supported_languages": null}, "macro.dbt.should_full_refresh": {"name": "should_full_refresh", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/configs.sql", "original_file_path": "macros/materializations/configs.sql", "unique_id": "macro.dbt.should_full_refresh", "macro_sql": "{% macro should_full_refresh() %}\n {% set config_full_refresh = config.get('full_refresh') %}\n {% if config_full_refresh is none %}\n {% set config_full_refresh = flags.FULL_REFRESH %}\n {% endif %}\n {% do return(config_full_refresh) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7171252, "supported_languages": null}, "macro.dbt.should_store_failures": {"name": "should_store_failures", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/configs.sql", "original_file_path": "macros/materializations/configs.sql", "unique_id": "macro.dbt.should_store_failures", "macro_sql": "{% macro should_store_failures() %}\n {% set config_store_failures = config.get('store_failures') %}\n {% if config_store_failures is none %}\n {% set config_store_failures = flags.STORE_FAILURES %}\n {% endif %}\n {% do return(config_store_failures) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.71746, "supported_languages": null}, "macro.dbt.snapshot_merge_sql": {"name": "snapshot_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/snapshot_merge.sql", "original_file_path": "macros/materializations/snapshots/snapshot_merge.sql", "unique_id": "macro.dbt.snapshot_merge_sql", "macro_sql": "{% macro 
snapshot_merge_sql(target, source, insert_cols) -%}\n {{ adapter.dispatch('snapshot_merge_sql', 'dbt')(target, source, insert_cols) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__snapshot_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.717927, "supported_languages": null}, "macro.dbt.default__snapshot_merge_sql": {"name": "default__snapshot_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/snapshot_merge.sql", "original_file_path": "macros/materializations/snapshots/snapshot_merge.sql", "unique_id": "macro.dbt.default__snapshot_merge_sql", "macro_sql": "{% macro default__snapshot_merge_sql(target, source, insert_cols) -%}\n {%- set insert_cols_csv = insert_cols | join(', ') -%}\n\n merge into {{ target }} as DBT_INTERNAL_DEST\n using {{ source }} as DBT_INTERNAL_SOURCE\n on DBT_INTERNAL_SOURCE.dbt_scd_id = DBT_INTERNAL_DEST.dbt_scd_id\n\n when matched\n and DBT_INTERNAL_DEST.dbt_valid_to is null\n and DBT_INTERNAL_SOURCE.dbt_change_type in ('update', 'delete')\n then update\n set dbt_valid_to = DBT_INTERNAL_SOURCE.dbt_valid_to\n\n when not matched\n and DBT_INTERNAL_SOURCE.dbt_change_type = 'insert'\n then insert ({{ insert_cols_csv }})\n values ({{ insert_cols_csv }})\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.71822, "supported_languages": null}, "macro.dbt.strategy_dispatch": {"name": "strategy_dispatch", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.strategy_dispatch", "macro_sql": "{% macro strategy_dispatch(name) -%}\n{% set original_name = name %}\n {% if '.' 
in name %}\n {% set package_name, name = name.split(\".\", 1) %}\n {% else %}\n {% set package_name = none %}\n {% endif %}\n\n {% if package_name is none %}\n {% set package_context = context %}\n {% elif package_name in context %}\n {% set package_context = context[package_name] %}\n {% else %}\n {% set error_msg %}\n Could not find package '{{package_name}}', called with '{{original_name}}'\n {% endset %}\n {{ exceptions.raise_compiler_error(error_msg | trim) }}\n {% endif %}\n\n {%- set search_name = 'snapshot_' ~ name ~ '_strategy' -%}\n\n {% if search_name not in package_context %}\n {% set error_msg %}\n The specified strategy macro '{{name}}' was not found in package '{{ package_name }}'\n {% endset %}\n {{ exceptions.raise_compiler_error(error_msg | trim) }}\n {% endif %}\n {{ return(package_context[search_name]) }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7221382, "supported_languages": null}, "macro.dbt.snapshot_hash_arguments": {"name": "snapshot_hash_arguments", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_hash_arguments", "macro_sql": "{% macro snapshot_hash_arguments(args) -%}\n {{ adapter.dispatch('snapshot_hash_arguments', 'dbt')(args) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__snapshot_hash_arguments"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.722322, "supported_languages": null}, "macro.dbt.default__snapshot_hash_arguments": {"name": "default__snapshot_hash_arguments", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": 
"macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.default__snapshot_hash_arguments", "macro_sql": "{% macro default__snapshot_hash_arguments(args) -%}\n md5({%- for arg in args -%}\n coalesce(cast({{ arg }} as varchar ), '')\n {% if not loop.last %} || '|' || {% endif %}\n {%- endfor -%})\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7225611, "supported_languages": null}, "macro.dbt.snapshot_timestamp_strategy": {"name": "snapshot_timestamp_strategy", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_timestamp_strategy", "macro_sql": "{% macro snapshot_timestamp_strategy(node, snapshotted_rel, current_rel, config, target_exists) %}\n {% set primary_key = config['unique_key'] %}\n {% set updated_at = config['updated_at'] %}\n {% set invalidate_hard_deletes = config.get('invalidate_hard_deletes', false) %}\n\n {#/*\n The snapshot relation might not have an {{ updated_at }} value if the\n snapshot strategy is changed from `check` to `timestamp`. 
We\n should use a dbt-created column for the comparison in the snapshot\n table instead of assuming that the user-supplied {{ updated_at }}\n will be present in the historical data.\n\n See https://github.com/dbt-labs/dbt-core/issues/2350\n */ #}\n {% set row_changed_expr -%}\n ({{ snapshotted_rel }}.dbt_valid_from < {{ current_rel }}.{{ updated_at }})\n {%- endset %}\n\n {% set scd_id_expr = snapshot_hash_arguments([primary_key, updated_at]) %}\n\n {% do return({\n \"unique_key\": primary_key,\n \"updated_at\": updated_at,\n \"row_changed\": row_changed_expr,\n \"scd_id\": scd_id_expr,\n \"invalidate_hard_deletes\": invalidate_hard_deletes\n }) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.snapshot_hash_arguments"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7233448, "supported_languages": null}, "macro.dbt.snapshot_string_as_time": {"name": "snapshot_string_as_time", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_string_as_time", "macro_sql": "{% macro snapshot_string_as_time(timestamp) -%}\n {{ adapter.dispatch('snapshot_string_as_time', 'dbt')(timestamp) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__snapshot_string_as_time"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7235231, "supported_languages": null}, "macro.dbt.default__snapshot_string_as_time": {"name": "default__snapshot_string_as_time", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.default__snapshot_string_as_time", "macro_sql": "{% macro 
default__snapshot_string_as_time(timestamp) %}\n {% do exceptions.raise_not_implemented(\n 'snapshot_string_as_time macro not implemented for adapter '+adapter.type()\n ) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7237089, "supported_languages": null}, "macro.dbt.snapshot_check_all_get_existing_columns": {"name": "snapshot_check_all_get_existing_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_check_all_get_existing_columns", "macro_sql": "{% macro snapshot_check_all_get_existing_columns(node, target_exists, check_cols_config) -%}\n {%- if not target_exists -%}\n {#-- no table yet -> return whatever the query does --#}\n {{ return((false, query_columns)) }}\n {%- endif -%}\n\n {#-- handle any schema changes --#}\n {%- set target_relation = adapter.get_relation(database=node.database, schema=node.schema, identifier=node.alias) -%}\n\n {% if check_cols_config == 'all' %}\n {%- set query_columns = get_columns_in_query(node['compiled_code']) -%}\n\n {% elif check_cols_config is iterable and (check_cols_config | length) > 0 %}\n {#-- query for proper casing/quoting, to support comparison below --#}\n {%- set select_check_cols_from_target -%}\n {#-- N.B. 
The whitespace below is necessary to avoid edge case issue with comments --#}\n {#-- See: https://github.com/dbt-labs/dbt-core/issues/6781 --#}\n select {{ check_cols_config | join(', ') }} from (\n {{ node['compiled_code'] }}\n ) subq\n {%- endset -%}\n {% set query_columns = get_columns_in_query(select_check_cols_from_target) %}\n\n {% else %}\n {% do exceptions.raise_compiler_error(\"Invalid value for 'check_cols': \" ~ check_cols_config) %}\n {% endif %}\n\n {%- set existing_cols = adapter.get_columns_in_relation(target_relation) | map(attribute = 'name') | list -%}\n {%- set ns = namespace() -%} {#-- handle for-loop scoping with a namespace --#}\n {%- set ns.column_added = false -%}\n\n {%- set intersection = [] -%}\n {%- for col in query_columns -%}\n {%- if col in existing_cols -%}\n {%- do intersection.append(adapter.quote(col)) -%}\n {%- else -%}\n {% set ns.column_added = true %}\n {%- endif -%}\n {%- endfor -%}\n {{ return((ns.column_added, intersection)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_columns_in_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7252312, "supported_languages": null}, "macro.dbt.snapshot_check_strategy": {"name": "snapshot_check_strategy", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/strategies.sql", "original_file_path": "macros/materializations/snapshots/strategies.sql", "unique_id": "macro.dbt.snapshot_check_strategy", "macro_sql": "{% macro snapshot_check_strategy(node, snapshotted_rel, current_rel, config, target_exists) %}\n {% set check_cols_config = config['check_cols'] %}\n {% set primary_key = config['unique_key'] %}\n {% set invalidate_hard_deletes = config.get('invalidate_hard_deletes', false) %}\n {% set updated_at = config.get('updated_at', snapshot_get_time()) %}\n\n {% set column_added = false %}\n\n {% set column_added, check_cols = 
snapshot_check_all_get_existing_columns(node, target_exists, check_cols_config) %}\n\n {%- set row_changed_expr -%}\n (\n {%- if column_added -%}\n {{ get_true_sql() }}\n {%- else -%}\n {%- for col in check_cols -%}\n {{ snapshotted_rel }}.{{ col }} != {{ current_rel }}.{{ col }}\n or\n (\n (({{ snapshotted_rel }}.{{ col }} is null) and not ({{ current_rel }}.{{ col }} is null))\n or\n ((not {{ snapshotted_rel }}.{{ col }} is null) and ({{ current_rel }}.{{ col }} is null))\n )\n {%- if not loop.last %} or {% endif -%}\n {%- endfor -%}\n {%- endif -%}\n )\n {%- endset %}\n\n {% set scd_id_expr = snapshot_hash_arguments([primary_key, updated_at]) %}\n\n {% do return({\n \"unique_key\": primary_key,\n \"updated_at\": updated_at,\n \"row_changed\": row_changed_expr,\n \"scd_id\": scd_id_expr,\n \"invalidate_hard_deletes\": invalidate_hard_deletes\n }) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.snapshot_get_time", "macro.dbt.snapshot_check_all_get_existing_columns", "macro.dbt.get_true_sql", "macro.dbt.snapshot_hash_arguments"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7266939, "supported_languages": null}, "macro.dbt.create_columns": {"name": "create_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.create_columns", "macro_sql": "{% macro create_columns(relation, columns) %}\n {{ adapter.dispatch('create_columns', 'dbt')(relation, columns) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__create_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7312472, "supported_languages": null}, "macro.dbt.default__create_columns": {"name": "default__create_columns", "resource_type": "macro", 
"package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__create_columns", "macro_sql": "{% macro default__create_columns(relation, columns) %}\n {% for column in columns %}\n {% call statement() %}\n alter table {{ relation }} add column \"{{ column.name }}\" {{ column.data_type }};\n {% endcall %}\n {% endfor %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.731553, "supported_languages": null}, "macro.dbt.post_snapshot": {"name": "post_snapshot", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.post_snapshot", "macro_sql": "{% macro post_snapshot(staging_relation) %}\n {{ adapter.dispatch('post_snapshot', 'dbt')(staging_relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__post_snapshot"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.731735, "supported_languages": null}, "macro.dbt.default__post_snapshot": {"name": "default__post_snapshot", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__post_snapshot", "macro_sql": "{% macro default__post_snapshot(staging_relation) %}\n {# no-op #}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.731835, "supported_languages": null}, "macro.dbt.get_true_sql": {"name": "get_true_sql", 
"resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.get_true_sql", "macro_sql": "{% macro get_true_sql() %}\n {{ adapter.dispatch('get_true_sql', 'dbt')() }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_true_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7319932, "supported_languages": null}, "macro.dbt.default__get_true_sql": {"name": "default__get_true_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__get_true_sql", "macro_sql": "{% macro default__get_true_sql() %}\n {{ return('TRUE') }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7321222, "supported_languages": null}, "macro.dbt.snapshot_staging_table": {"name": "snapshot_staging_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.snapshot_staging_table", "macro_sql": "{% macro snapshot_staging_table(strategy, source_sql, target_relation) -%}\n {{ adapter.dispatch('snapshot_staging_table', 'dbt')(strategy, source_sql, target_relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__snapshot_staging_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7323468, "supported_languages": null}, "macro.dbt.default__snapshot_staging_table": {"name": "default__snapshot_staging_table", 
"resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__snapshot_staging_table", "macro_sql": "{% macro default__snapshot_staging_table(strategy, source_sql, target_relation) -%}\n\n with snapshot_query as (\n\n {{ source_sql }}\n\n ),\n\n snapshotted_data as (\n\n select *,\n {{ strategy.unique_key }} as dbt_unique_key\n\n from {{ target_relation }}\n where dbt_valid_to is null\n\n ),\n\n insertions_source_data as (\n\n select\n *,\n {{ strategy.unique_key }} as dbt_unique_key,\n {{ strategy.updated_at }} as dbt_updated_at,\n {{ strategy.updated_at }} as dbt_valid_from,\n nullif({{ strategy.updated_at }}, {{ strategy.updated_at }}) as dbt_valid_to,\n {{ strategy.scd_id }} as dbt_scd_id\n\n from snapshot_query\n ),\n\n updates_source_data as (\n\n select\n *,\n {{ strategy.unique_key }} as dbt_unique_key,\n {{ strategy.updated_at }} as dbt_updated_at,\n {{ strategy.updated_at }} as dbt_valid_from,\n {{ strategy.updated_at }} as dbt_valid_to\n\n from snapshot_query\n ),\n\n {%- if strategy.invalidate_hard_deletes %}\n\n deletes_source_data as (\n\n select\n *,\n {{ strategy.unique_key }} as dbt_unique_key\n from snapshot_query\n ),\n {% endif %}\n\n insertions as (\n\n select\n 'insert' as dbt_change_type,\n source_data.*\n\n from insertions_source_data as source_data\n left outer join snapshotted_data on snapshotted_data.dbt_unique_key = source_data.dbt_unique_key\n where snapshotted_data.dbt_unique_key is null\n or (\n snapshotted_data.dbt_unique_key is not null\n and (\n {{ strategy.row_changed }}\n )\n )\n\n ),\n\n updates as (\n\n select\n 'update' as dbt_change_type,\n source_data.*,\n snapshotted_data.dbt_scd_id\n\n from updates_source_data as source_data\n join snapshotted_data on snapshotted_data.dbt_unique_key = source_data.dbt_unique_key\n where (\n {{ strategy.row_changed }}\n )\n )\n\n {%- if 
strategy.invalidate_hard_deletes -%}\n ,\n\n deletes as (\n\n select\n 'delete' as dbt_change_type,\n source_data.*,\n {{ snapshot_get_time() }} as dbt_valid_from,\n {{ snapshot_get_time() }} as dbt_updated_at,\n {{ snapshot_get_time() }} as dbt_valid_to,\n snapshotted_data.dbt_scd_id\n\n from snapshotted_data\n left join deletes_source_data as source_data on snapshotted_data.dbt_unique_key = source_data.dbt_unique_key\n where source_data.dbt_unique_key is null\n )\n {%- endif %}\n\n select * from insertions\n union all\n select * from updates\n {%- if strategy.invalidate_hard_deletes %}\n union all\n select * from deletes\n {%- endif %}\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.snapshot_get_time"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.733315, "supported_languages": null}, "macro.dbt.build_snapshot_table": {"name": "build_snapshot_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.build_snapshot_table", "macro_sql": "{% macro build_snapshot_table(strategy, sql) -%}\n {{ adapter.dispatch('build_snapshot_table', 'dbt')(strategy, sql) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__build_snapshot_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.733523, "supported_languages": null}, "macro.dbt.default__build_snapshot_table": {"name": "default__build_snapshot_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.default__build_snapshot_table", "macro_sql": "{% macro default__build_snapshot_table(strategy, sql) %}\n\n select *,\n {{ 
strategy.scd_id }} as dbt_scd_id,\n {{ strategy.updated_at }} as dbt_updated_at,\n {{ strategy.updated_at }} as dbt_valid_from,\n nullif({{ strategy.updated_at }}, {{ strategy.updated_at }}) as dbt_valid_to\n from (\n {{ sql }}\n ) sbq\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.7338, "supported_languages": null}, "macro.dbt.build_snapshot_staging_table": {"name": "build_snapshot_staging_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/helpers.sql", "original_file_path": "macros/materializations/snapshots/helpers.sql", "unique_id": "macro.dbt.build_snapshot_staging_table", "macro_sql": "{% macro build_snapshot_staging_table(strategy, sql, target_relation) %}\n {% set temp_relation = make_temp_relation(target_relation) %}\n\n {% set select = snapshot_staging_table(strategy, sql, target_relation) %}\n\n {% call statement('build_snapshot_staging_relation') %}\n {{ create_table_as(True, temp_relation, select) }}\n {% endcall %}\n\n {% do return(temp_relation) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.make_temp_relation", "macro.dbt.snapshot_staging_table", "macro.dbt.statement", "macro.dbt.create_table_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.734267, "supported_languages": null}, "macro.dbt.materialization_snapshot_default": {"name": "materialization_snapshot_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/snapshots/snapshot.sql", "original_file_path": "macros/materializations/snapshots/snapshot.sql", "unique_id": "macro.dbt.materialization_snapshot_default", "macro_sql": "{% materialization snapshot, default %}\n {%- set config = model['config'] -%}\n\n {%- set target_table = model.get('alias', model.get('name')) 
-%}\n\n {%- set strategy_name = config.get('strategy') -%}\n {%- set unique_key = config.get('unique_key') %}\n -- grab current tables grants config for comparision later on\n {%- set grant_config = config.get('grants') -%}\n\n {% set target_relation_exists, target_relation = get_or_create_relation(\n database=model.database,\n schema=model.schema,\n identifier=target_table,\n type='table') -%}\n\n {%- if not target_relation.is_table -%}\n {% do exceptions.relation_wrong_type(target_relation, 'table') %}\n {%- endif -%}\n\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n {% set strategy_macro = strategy_dispatch(strategy_name) %}\n {% set strategy = strategy_macro(model, \"snapshotted_data\", \"source_data\", config, target_relation_exists) %}\n\n {% if not target_relation_exists %}\n\n {% set build_sql = build_snapshot_table(strategy, model['compiled_code']) %}\n {% set final_sql = create_table_as(False, target_relation, build_sql) %}\n\n {% else %}\n\n {{ adapter.valid_snapshot_target(target_relation) }}\n\n {% set staging_table = build_snapshot_staging_table(strategy, sql, target_relation) %}\n\n -- this may no-op if the database does not require column expansion\n {% do adapter.expand_target_column_types(from_relation=staging_table,\n to_relation=target_relation) %}\n\n {% set missing_columns = adapter.get_missing_columns(staging_table, target_relation)\n | rejectattr('name', 'equalto', 'dbt_change_type')\n | rejectattr('name', 'equalto', 'DBT_CHANGE_TYPE')\n | rejectattr('name', 'equalto', 'dbt_unique_key')\n | rejectattr('name', 'equalto', 'DBT_UNIQUE_KEY')\n | list %}\n\n {% do create_columns(target_relation, missing_columns) %}\n\n {% set source_columns = adapter.get_columns_in_relation(staging_table)\n | rejectattr('name', 'equalto', 'dbt_change_type')\n | rejectattr('name', 'equalto', 'DBT_CHANGE_TYPE')\n | rejectattr('name', 'equalto', 'dbt_unique_key')\n | rejectattr('name', 'equalto', 
'DBT_UNIQUE_KEY')\n | list %}\n\n {% set quoted_source_columns = [] %}\n {% for column in source_columns %}\n {% do quoted_source_columns.append(adapter.quote(column.name)) %}\n {% endfor %}\n\n {% set final_sql = snapshot_merge_sql(\n target = target_relation,\n source = staging_table,\n insert_cols = quoted_source_columns\n )\n %}\n\n {% endif %}\n\n {% call statement('main') %}\n {{ final_sql }}\n {% endcall %}\n\n {% set should_revoke = should_revoke(target_relation_exists, full_refresh_mode=False) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {% if not target_relation_exists %}\n {% do create_indexes(target_relation) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {{ adapter.commit() }}\n\n {% if staging_table is defined %}\n {% do post_snapshot(staging_table) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.get_or_create_relation", "macro.dbt.run_hooks", "macro.dbt.strategy_dispatch", "macro.dbt.build_snapshot_table", "macro.dbt.create_table_as", "macro.dbt.build_snapshot_staging_table", "macro.dbt.create_columns", "macro.dbt.snapshot_merge_sql", "macro.dbt.statement", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt.create_indexes", "macro.dbt.post_snapshot"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8057458, "supported_languages": ["sql"]}, "macro.dbt.materialization_test_default": {"name": "materialization_test_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/test.sql", "original_file_path": "macros/materializations/tests/test.sql", "unique_id": "macro.dbt.materialization_test_default", "macro_sql": "{%- 
materialization test, default -%}\n\n {% set relations = [] %}\n\n {% if should_store_failures() %}\n\n {% set identifier = model['alias'] %}\n {% set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) %}\n {% set target_relation = api.Relation.create(\n identifier=identifier, schema=schema, database=database, type='table') -%} %}\n\n {% if old_relation %}\n {% do adapter.drop_relation(old_relation) %}\n {% endif %}\n\n {% call statement(auto_begin=True) %}\n {{ create_table_as(False, target_relation, sql) }}\n {% endcall %}\n\n {% do relations.append(target_relation) %}\n\n {% set main_sql %}\n select *\n from {{ target_relation }}\n {% endset %}\n\n {{ adapter.commit() }}\n\n {% else %}\n\n {% set main_sql = sql %}\n\n {% endif %}\n\n {% set limit = config.get('limit') %}\n {% set fail_calc = config.get('fail_calc') %}\n {% set warn_if = config.get('warn_if') %}\n {% set error_if = config.get('error_if') %}\n\n {% call statement('main', fetch_result=True) -%}\n\n {{ get_test_sql(main_sql, fail_calc, warn_if, error_if, limit)}}\n\n {%- endcall %}\n\n {{ return({'relations': relations}) }}\n\n{%- endmaterialization -%}", "depends_on": {"macros": ["macro.dbt.should_store_failures", "macro.dbt.statement", "macro.dbt.create_table_as", "macro.dbt.get_test_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.808222, "supported_languages": ["sql"]}, "macro.dbt.get_test_sql": {"name": "get_test_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/helpers.sql", "original_file_path": "macros/materializations/tests/helpers.sql", "unique_id": "macro.dbt.get_test_sql", "macro_sql": "{% macro get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%}\n {{ adapter.dispatch('get_test_sql', 'dbt')(main_sql, fail_calc, warn_if, error_if, limit) }}\n{%- endmacro %}", "depends_on": {"macros": 
["macro.dbt.default__get_test_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8087199, "supported_languages": null}, "macro.dbt.default__get_test_sql": {"name": "default__get_test_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/helpers.sql", "original_file_path": "macros/materializations/tests/helpers.sql", "unique_id": "macro.dbt.default__get_test_sql", "macro_sql": "{% macro default__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%}\n select\n {{ fail_calc }} as failures,\n {{ fail_calc }} {{ warn_if }} as should_warn,\n {{ fail_calc }} {{ error_if }} as should_error\n from (\n {{ main_sql }}\n {{ \"limit \" ~ limit if limit != none }}\n ) dbt_internal_test\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.809055, "supported_languages": null}, "macro.dbt.get_where_subquery": {"name": "get_where_subquery", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/where_subquery.sql", "original_file_path": "macros/materializations/tests/where_subquery.sql", "unique_id": "macro.dbt.get_where_subquery", "macro_sql": "{% macro get_where_subquery(relation) -%}\n {% do return(adapter.dispatch('get_where_subquery', 'dbt')(relation)) %}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_where_subquery"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8094652, "supported_languages": null}, "macro.dbt.default__get_where_subquery": {"name": "default__get_where_subquery", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/tests/where_subquery.sql", "original_file_path": "macros/materializations/tests/where_subquery.sql", 
"unique_id": "macro.dbt.default__get_where_subquery", "macro_sql": "{% macro default__get_where_subquery(relation) -%}\n {% set where = config.get('where', '') %}\n {% if where %}\n {%- set filtered -%}\n (select * from {{ relation }} where {{ where }}) dbt_subquery\n {%- endset -%}\n {% do return(filtered) %}\n {%- else -%}\n {% do return(relation) %}\n {%- endif -%}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8098822, "supported_languages": null}, "macro.dbt.get_quoted_csv": {"name": "get_quoted_csv", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.get_quoted_csv", "macro_sql": "{% macro get_quoted_csv(column_names) %}\n\n {% set quoted = [] %}\n {% for col in column_names -%}\n {%- do quoted.append(adapter.quote(col)) -%}\n {%- endfor %}\n\n {%- set dest_cols_csv = quoted | join(', ') -%}\n {{ return(dest_cols_csv) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.81164, "supported_languages": null}, "macro.dbt.diff_columns": {"name": "diff_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.diff_columns", "macro_sql": "{% macro diff_columns(source_columns, target_columns) %}\n\n {% set result = [] %}\n {% set source_names = source_columns | map(attribute = 'column') | list %}\n {% set target_names = target_columns | map(attribute = 'column') | list %}\n\n {# --check whether the name attribute exists in the target - 
this does not perform a data type check #}\n {% for sc in source_columns %}\n {% if sc.name not in target_names %}\n {{ result.append(sc) }}\n {% endif %}\n {% endfor %}\n\n {{ return(result) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.812234, "supported_languages": null}, "macro.dbt.diff_column_data_types": {"name": "diff_column_data_types", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.diff_column_data_types", "macro_sql": "{% macro diff_column_data_types(source_columns, target_columns) %}\n\n {% set result = [] %}\n {% for sc in source_columns %}\n {% set tc = target_columns | selectattr(\"name\", \"equalto\", sc.name) | list | first %}\n {% if tc %}\n {% if sc.data_type != tc.data_type and not sc.can_expand_to(other_column=tc) %}\n {{ result.append( { 'column_name': tc.name, 'new_type': sc.data_type } ) }}\n {% endif %}\n {% endif %}\n {% endfor %}\n\n {{ return(result) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8129542, "supported_languages": null}, "macro.dbt.get_merge_update_columns": {"name": "get_merge_update_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.get_merge_update_columns", "macro_sql": "{% macro get_merge_update_columns(merge_update_columns, merge_exclude_columns, dest_columns) %}\n {{ return(adapter.dispatch('get_merge_update_columns', 'dbt')(merge_update_columns, 
merge_exclude_columns, dest_columns)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_merge_update_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.81321, "supported_languages": null}, "macro.dbt.default__get_merge_update_columns": {"name": "default__get_merge_update_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/column_helpers.sql", "original_file_path": "macros/materializations/models/incremental/column_helpers.sql", "unique_id": "macro.dbt.default__get_merge_update_columns", "macro_sql": "{% macro default__get_merge_update_columns(merge_update_columns, merge_exclude_columns, dest_columns) %}\n {%- set default_cols = dest_columns | map(attribute=\"quoted\") | list -%}\n\n {%- if merge_update_columns and merge_exclude_columns -%}\n {{ exceptions.raise_compiler_error(\n 'Model cannot specify merge_update_columns and merge_exclude_columns. 
Please update model to use only one config'\n )}}\n {%- elif merge_update_columns -%}\n {%- set update_columns = merge_update_columns -%}\n {%- elif merge_exclude_columns -%}\n {%- set update_columns = [] -%}\n {%- for column in dest_columns -%}\n {% if column.column | lower not in merge_exclude_columns | map(\"lower\") | list %}\n {%- do update_columns.append(column.quoted) -%}\n {% endif %}\n {%- endfor -%}\n {%- else -%}\n {%- set update_columns = default_cols -%}\n {%- endif -%}\n\n {{ return(update_columns) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.813976, "supported_languages": null}, "macro.dbt.get_merge_sql": {"name": "get_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.get_merge_sql", "macro_sql": "{% macro get_merge_sql(target, source, unique_key, dest_columns, incremental_predicates=none) -%}\n -- back compat for old kwarg name\n {% set incremental_predicates = kwargs.get('predicates', incremental_predicates) %}\n {{ adapter.dispatch('get_merge_sql', 'dbt')(target, source, unique_key, dest_columns, incremental_predicates) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.841076, "supported_languages": null}, "macro.dbt.default__get_merge_sql": {"name": "default__get_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.default__get_merge_sql", "macro_sql": "{% macro 
default__get_merge_sql(target, source, unique_key, dest_columns, incremental_predicates=none) -%}\n {%- set predicates = [] if incremental_predicates is none else [] + incremental_predicates -%}\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n {%- set merge_update_columns = config.get('merge_update_columns') -%}\n {%- set merge_exclude_columns = config.get('merge_exclude_columns') -%}\n {%- set update_columns = get_merge_update_columns(merge_update_columns, merge_exclude_columns, dest_columns) -%}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {% if unique_key %}\n {% if unique_key is sequence and unique_key is not mapping and unique_key is not string %}\n {% for key in unique_key %}\n {% set this_key_match %}\n DBT_INTERNAL_SOURCE.{{ key }} = DBT_INTERNAL_DEST.{{ key }}\n {% endset %}\n {% do predicates.append(this_key_match) %}\n {% endfor %}\n {% else %}\n {% set unique_key_match %}\n DBT_INTERNAL_SOURCE.{{ unique_key }} = DBT_INTERNAL_DEST.{{ unique_key }}\n {% endset %}\n {% do predicates.append(unique_key_match) %}\n {% endif %}\n {% else %}\n {% do predicates.append('FALSE') %}\n {% endif %}\n\n {{ sql_header if sql_header is not none }}\n\n merge into {{ target }} as DBT_INTERNAL_DEST\n using {{ source }} as DBT_INTERNAL_SOURCE\n on {{\"(\" ~ predicates | join(\") and (\") ~ \")\"}}\n\n {% if unique_key %}\n when matched then update set\n {% for column_name in update_columns -%}\n {{ column_name }} = DBT_INTERNAL_SOURCE.{{ column_name }}\n {%- if not loop.last %}, {%- endif %}\n {%- endfor %}\n {% endif %}\n\n when not matched then insert\n ({{ dest_cols_csv }})\n values\n ({{ dest_cols_csv }})\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv", "macro.dbt.get_merge_update_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.845452, "supported_languages": null}, 
"macro.dbt.get_delete_insert_merge_sql": {"name": "get_delete_insert_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.get_delete_insert_merge_sql", "macro_sql": "{% macro get_delete_insert_merge_sql(target, source, unique_key, dest_columns, incremental_predicates) -%}\n {{ adapter.dispatch('get_delete_insert_merge_sql', 'dbt')(target, source, unique_key, dest_columns, incremental_predicates) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_delete_insert_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8457768, "supported_languages": null}, "macro.dbt.default__get_delete_insert_merge_sql": {"name": "default__get_delete_insert_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.default__get_delete_insert_merge_sql", "macro_sql": "{% macro default__get_delete_insert_merge_sql(target, source, unique_key, dest_columns, incremental_predicates) -%}\n\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n\n {% if unique_key %}\n {% if unique_key is sequence and unique_key is not string %}\n delete from {{target }}\n using {{ source }}\n where (\n {% for key in unique_key %}\n {{ source }}.{{ key }} = {{ target }}.{{ key }}\n {{ \"and \" if not loop.last}}\n {% endfor %}\n {% if incremental_predicates %}\n {% for predicate in incremental_predicates %}\n and {{ predicate }}\n {% endfor %}\n {% endif %}\n );\n {% else %}\n delete from {{ target }}\n where (\n {{ unique_key }}) in (\n select ({{ unique_key }})\n from {{ source }}\n )\n {%- if incremental_predicates %}\n 
{% for predicate in incremental_predicates %}\n and {{ predicate }}\n {% endfor %}\n {%- endif -%};\n\n {% endif %}\n {% endif %}\n\n insert into {{ target }} ({{ dest_cols_csv }})\n (\n select {{ dest_cols_csv }}\n from {{ source }}\n )\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.846897, "supported_languages": null}, "macro.dbt.get_insert_overwrite_merge_sql": {"name": "get_insert_overwrite_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.get_insert_overwrite_merge_sql", "macro_sql": "{% macro get_insert_overwrite_merge_sql(target, source, dest_columns, predicates, include_sql_header=false) -%}\n {{ adapter.dispatch('get_insert_overwrite_merge_sql', 'dbt')(target, source, dest_columns, predicates, include_sql_header) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_insert_overwrite_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8471959, "supported_languages": null}, "macro.dbt.default__get_insert_overwrite_merge_sql": {"name": "default__get_insert_overwrite_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/merge.sql", "original_file_path": "macros/materializations/models/incremental/merge.sql", "unique_id": "macro.dbt.default__get_insert_overwrite_merge_sql", "macro_sql": "{% macro default__get_insert_overwrite_merge_sql(target, source, dest_columns, predicates, include_sql_header) -%}\n {#-- The only time include_sql_header is True: --#}\n {#-- BigQuery + insert_overwrite strategy + \"static\" partitions config --#}\n {#-- We should 
consider including the sql header at the materialization level instead --#}\n\n {%- set predicates = [] if predicates is none else [] + predicates -%}\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none and include_sql_header }}\n\n merge into {{ target }} as DBT_INTERNAL_DEST\n using {{ source }} as DBT_INTERNAL_SOURCE\n on FALSE\n\n when not matched by source\n {% if predicates %} and {{ predicates | join(' and ') }} {% endif %}\n then delete\n\n when not matched then insert\n ({{ dest_cols_csv }})\n values\n ({{ dest_cols_csv }})\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8479092, "supported_languages": null}, "macro.dbt.is_incremental": {"name": "is_incremental", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/is_incremental.sql", "original_file_path": "macros/materializations/models/incremental/is_incremental.sql", "unique_id": "macro.dbt.is_incremental", "macro_sql": "{% macro is_incremental() %}\n {#-- do not run introspective queries in parsing #}\n {% if not execute %}\n {{ return(False) }}\n {% else %}\n {% set relation = adapter.get_relation(this.database, this.schema, this.table) %}\n {{ return(relation is not none\n and relation.type == 'table'\n and model.config.materialized == 'incremental'\n and not should_full_refresh()) }}\n {% endif %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.should_full_refresh"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.848629, "supported_languages": null}, "macro.dbt.get_incremental_append_sql": {"name": "get_incremental_append_sql", "resource_type": "macro", 
"package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_append_sql", "macro_sql": "{% macro get_incremental_append_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_append_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_incremental_append_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.849607, "supported_languages": null}, "macro.dbt.default__get_incremental_append_sql": {"name": "default__get_incremental_append_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.default__get_incremental_append_sql", "macro_sql": "{% macro default__get_incremental_append_sql(arg_dict) %}\n\n {% do return(get_insert_into_sql(arg_dict[\"target_relation\"], arg_dict[\"temp_relation\"], arg_dict[\"dest_columns\"])) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_insert_into_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.849874, "supported_languages": null}, "macro.dbt.get_incremental_delete_insert_sql": {"name": "get_incremental_delete_insert_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_delete_insert_sql", "macro_sql": "{% macro get_incremental_delete_insert_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_delete_insert_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", 
"depends_on": {"macros": ["macro.dbt.default__get_incremental_delete_insert_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.881109, "supported_languages": null}, "macro.dbt.default__get_incremental_delete_insert_sql": {"name": "default__get_incremental_delete_insert_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.default__get_incremental_delete_insert_sql", "macro_sql": "{% macro default__get_incremental_delete_insert_sql(arg_dict) %}\n\n {% do return(get_delete_insert_merge_sql(arg_dict[\"target_relation\"], arg_dict[\"temp_relation\"], arg_dict[\"unique_key\"], arg_dict[\"dest_columns\"], arg_dict[\"incremental_predicates\"])) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_delete_insert_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.885153, "supported_languages": null}, "macro.dbt.get_incremental_merge_sql": {"name": "get_incremental_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_merge_sql", "macro_sql": "{% macro get_incremental_merge_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_merge_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_incremental_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.885768, "supported_languages": null}, "macro.dbt.default__get_incremental_merge_sql": {"name": 
"default__get_incremental_merge_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.default__get_incremental_merge_sql", "macro_sql": "{% macro default__get_incremental_merge_sql(arg_dict) %}\n\n {% do return(get_merge_sql(arg_dict[\"target_relation\"], arg_dict[\"temp_relation\"], arg_dict[\"unique_key\"], arg_dict[\"dest_columns\"], arg_dict[\"incremental_predicates\"])) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.887551, "supported_languages": null}, "macro.dbt.get_incremental_insert_overwrite_sql": {"name": "get_incremental_insert_overwrite_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_insert_overwrite_sql", "macro_sql": "{% macro get_incremental_insert_overwrite_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_insert_overwrite_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_incremental_insert_overwrite_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.89052, "supported_languages": null}, "macro.dbt.default__get_incremental_insert_overwrite_sql": {"name": "default__get_incremental_insert_overwrite_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": 
"macro.dbt.default__get_incremental_insert_overwrite_sql", "macro_sql": "{% macro default__get_incremental_insert_overwrite_sql(arg_dict) %}\n\n {% do return(get_insert_overwrite_merge_sql(arg_dict[\"target_relation\"], arg_dict[\"temp_relation\"], arg_dict[\"dest_columns\"], arg_dict[\"incremental_predicates\"])) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_insert_overwrite_merge_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8908842, "supported_languages": null}, "macro.dbt.get_incremental_default_sql": {"name": "get_incremental_default_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_incremental_default_sql", "macro_sql": "{% macro get_incremental_default_sql(arg_dict) %}\n\n {{ return(adapter.dispatch('get_incremental_default_sql', 'dbt')(arg_dict)) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_incremental_default_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8940082, "supported_languages": null}, "macro.dbt.default__get_incremental_default_sql": {"name": "default__get_incremental_default_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.default__get_incremental_default_sql", "macro_sql": "{% macro default__get_incremental_default_sql(arg_dict) %}\n\n {% do return(get_incremental_append_sql(arg_dict)) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_incremental_append_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": 
null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8942301, "supported_languages": null}, "macro.dbt.get_insert_into_sql": {"name": "get_insert_into_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/strategies.sql", "original_file_path": "macros/materializations/models/incremental/strategies.sql", "unique_id": "macro.dbt.get_insert_into_sql", "macro_sql": "{% macro get_insert_into_sql(target_relation, temp_relation, dest_columns) %}\n\n {%- set dest_cols_csv = get_quoted_csv(dest_columns | map(attribute=\"name\")) -%}\n\n insert into {{ target_relation }} ({{ dest_cols_csv }})\n (\n select {{ dest_cols_csv }}\n from {{ temp_relation }}\n )\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_quoted_csv"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.8945558, "supported_languages": null}, "macro.dbt.materialization_incremental_default": {"name": "materialization_incremental_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/incremental.sql", "original_file_path": "macros/materializations/models/incremental/incremental.sql", "unique_id": "macro.dbt.materialization_incremental_default", "macro_sql": "{% materialization incremental, default -%}\n\n -- relations\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='table') -%}\n {%- set temp_relation = make_temp_relation(target_relation)-%}\n {%- set intermediate_relation = make_intermediate_relation(target_relation)-%}\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n\n -- configs\n {%- set unique_key = config.get('unique_key') -%}\n {%- set full_refresh_mode = (should_full_refresh() or 
existing_relation.is_view) -%}\n {%- set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') -%}\n\n -- the temp_ and backup_ relations should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation. This has to happen before\n -- BEGIN, in a separate transaction\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation)-%}\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n {% set to_drop = [] %}\n\n {% if existing_relation is none %}\n {% set build_sql = get_create_table_as_sql(False, target_relation, sql) %}\n {% elif full_refresh_mode %}\n {% set build_sql = get_create_table_as_sql(False, intermediate_relation, sql) %}\n {% set need_swap = true %}\n {% else %}\n {% do run_query(get_create_table_as_sql(True, temp_relation, sql)) %}\n {% do adapter.expand_target_column_types(\n from_relation=temp_relation,\n to_relation=target_relation) %}\n {#-- Process schema changes. Returns dict of changes if successful. 
Use source columns for upserting/merging --#}\n {% set dest_columns = process_schema_changes(on_schema_change, temp_relation, existing_relation) %}\n {% if not dest_columns %}\n {% set dest_columns = adapter.get_columns_in_relation(existing_relation) %}\n {% endif %}\n\n {#-- Get the incremental_strategy, the macro to use for the strategy, and build the sql --#}\n {% set incremental_strategy = config.get('incremental_strategy') or 'default' %}\n {% set incremental_predicates = config.get('predicates', none) or config.get('incremental_predicates', none) %}\n {% set strategy_sql_macro_func = adapter.get_incremental_strategy_macro(context, incremental_strategy) %}\n {% set strategy_arg_dict = ({'target_relation': target_relation, 'temp_relation': temp_relation, 'unique_key': unique_key, 'dest_columns': dest_columns, 'incremental_predicates': incremental_predicates }) %}\n {% set build_sql = strategy_sql_macro_func(strategy_arg_dict) %}\n\n {% endif %}\n\n {% call statement(\"main\") %}\n {{ build_sql }}\n {% endcall %}\n\n {% if need_swap %}\n {% do adapter.rename_relation(target_relation, backup_relation) %}\n {% do adapter.rename_relation(intermediate_relation, target_relation) %}\n {% do to_drop.append(backup_relation) %}\n {% endif %}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {% if existing_relation is none or existing_relation.is_view or should_full_refresh() %}\n {% do create_indexes(target_relation) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n -- `COMMIT` happens here\n {% do adapter.commit() %}\n\n {% for rel in to_drop %}\n {% do adapter.drop_relation(rel) %}\n {% endfor %}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{%- endmaterialization %}", "depends_on": {"macros": 
["macro.dbt.load_cached_relation", "macro.dbt.make_temp_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.should_full_refresh", "macro.dbt.incremental_validate_on_schema_change", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.get_create_table_as_sql", "macro.dbt.run_query", "macro.dbt.process_schema_changes", "macro.dbt.statement", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt.create_indexes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.908556, "supported_languages": ["sql"]}, "macro.dbt.incremental_validate_on_schema_change": {"name": "incremental_validate_on_schema_change", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/on_schema_change.sql", "original_file_path": "macros/materializations/models/incremental/on_schema_change.sql", "unique_id": "macro.dbt.incremental_validate_on_schema_change", "macro_sql": "{% macro incremental_validate_on_schema_change(on_schema_change, default='ignore') %}\n\n {% if on_schema_change not in ['sync_all_columns', 'append_new_columns', 'fail', 'ignore'] %}\n\n {% set log_message = 'Invalid value for on_schema_change (%s) specified. Setting default value of %s.' 
% (on_schema_change, default) %}\n {% do log(log_message) %}\n\n {{ return(default) }}\n\n {% else %}\n\n {{ return(on_schema_change) }}\n\n {% endif %}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.915031, "supported_languages": null}, "macro.dbt.check_for_schema_changes": {"name": "check_for_schema_changes", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/on_schema_change.sql", "original_file_path": "macros/materializations/models/incremental/on_schema_change.sql", "unique_id": "macro.dbt.check_for_schema_changes", "macro_sql": "{% macro check_for_schema_changes(source_relation, target_relation) %}\n\n {% set schema_changed = False %}\n\n {%- set source_columns = adapter.get_columns_in_relation(source_relation) -%}\n {%- set target_columns = adapter.get_columns_in_relation(target_relation) -%}\n {%- set source_not_in_target = diff_columns(source_columns, target_columns) -%}\n {%- set target_not_in_source = diff_columns(target_columns, source_columns) -%}\n\n {% set new_target_types = diff_column_data_types(source_columns, target_columns) %}\n\n {% if source_not_in_target != [] %}\n {% set schema_changed = True %}\n {% elif target_not_in_source != [] or new_target_types != [] %}\n {% set schema_changed = True %}\n {% elif new_target_types != [] %}\n {% set schema_changed = True %}\n {% endif %}\n\n {% set changes_dict = {\n 'schema_changed': schema_changed,\n 'source_not_in_target': source_not_in_target,\n 'target_not_in_source': target_not_in_source,\n 'source_columns': source_columns,\n 'target_columns': target_columns,\n 'new_target_types': new_target_types\n } %}\n\n {% set msg %}\n In {{ target_relation }}:\n Schema changed: {{ schema_changed }}\n Source columns not in target: {{ source_not_in_target }}\n Target columns not in source: {{ target_not_in_source }}\n New 
column types: {{ new_target_types }}\n {% endset %}\n\n {% do log(msg) %}\n\n {{ return(changes_dict) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.diff_columns", "macro.dbt.diff_column_data_types"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.916381, "supported_languages": null}, "macro.dbt.sync_column_schemas": {"name": "sync_column_schemas", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/on_schema_change.sql", "original_file_path": "macros/materializations/models/incremental/on_schema_change.sql", "unique_id": "macro.dbt.sync_column_schemas", "macro_sql": "{% macro sync_column_schemas(on_schema_change, target_relation, schema_changes_dict) %}\n\n {%- set add_to_target_arr = schema_changes_dict['source_not_in_target'] -%}\n\n {%- if on_schema_change == 'append_new_columns'-%}\n {%- if add_to_target_arr | length > 0 -%}\n {%- do alter_relation_add_remove_columns(target_relation, add_to_target_arr, none) -%}\n {%- endif -%}\n\n {% elif on_schema_change == 'sync_all_columns' %}\n {%- set remove_from_target_arr = schema_changes_dict['target_not_in_source'] -%}\n {%- set new_target_types = schema_changes_dict['new_target_types'] -%}\n\n {% if add_to_target_arr | length > 0 or remove_from_target_arr | length > 0 %}\n {%- do alter_relation_add_remove_columns(target_relation, add_to_target_arr, remove_from_target_arr) -%}\n {% endif %}\n\n {% if new_target_types != [] %}\n {% for ntt in new_target_types %}\n {% set column_name = ntt['column_name'] %}\n {% set new_type = ntt['new_type'] %}\n {% do alter_column_type(target_relation, column_name, new_type) %}\n {% endfor %}\n {% endif %}\n\n {% endif %}\n\n {% set schema_change_message %}\n In {{ target_relation }}:\n Schema change approach: {{ on_schema_change }}\n Columns added: {{ add_to_target_arr }}\n Columns removed: {{ remove_from_target_arr }}\n Data types 
changed: {{ new_target_types }}\n {% endset %}\n\n {% do log(schema_change_message) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.alter_relation_add_remove_columns", "macro.dbt.alter_column_type"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.917696, "supported_languages": null}, "macro.dbt.process_schema_changes": {"name": "process_schema_changes", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/incremental/on_schema_change.sql", "original_file_path": "macros/materializations/models/incremental/on_schema_change.sql", "unique_id": "macro.dbt.process_schema_changes", "macro_sql": "{% macro process_schema_changes(on_schema_change, source_relation, target_relation) %}\n\n {% if on_schema_change == 'ignore' %}\n\n {{ return({}) }}\n\n {% else %}\n\n {% set schema_changes_dict = check_for_schema_changes(source_relation, target_relation) %}\n\n {% if schema_changes_dict['schema_changed'] %}\n\n {% if on_schema_change == 'fail' %}\n\n {% set fail_msg %}\n The source and target schemas on this incremental model are out of sync!\n They can be reconciled in several ways:\n - set the `on_schema_change` config to either append_new_columns or sync_all_columns, depending on your situation.\n - Re-run the incremental model with `full_refresh: True` to update the target schema.\n - update the schema manually and re-run the process.\n\n Additional troubleshooting context:\n Source columns not in target: {{ schema_changes_dict['source_not_in_target'] }}\n Target columns not in source: {{ schema_changes_dict['target_not_in_source'] }}\n New column types: {{ schema_changes_dict['new_target_types'] }}\n {% endset %}\n\n {% do exceptions.raise_compiler_error(fail_msg) %}\n\n {# -- unless we ignore, run the sync operation per the config #}\n {% else %}\n\n {% do sync_column_schemas(on_schema_change, target_relation, schema_changes_dict) 
%}\n\n {% endif %}\n\n {% endif %}\n\n {{ return(schema_changes_dict['source_columns']) }}\n\n {% endif %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.check_for_schema_changes", "macro.dbt.sync_column_schemas"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.918633, "supported_languages": null}, "macro.dbt.materialization_materialized_view_default": {"name": "materialization_materialized_view_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialization_materialized_view_default", "macro_sql": "{% materialization materialized_view, default %}\n {% set existing_relation = load_cached_relation(this) %}\n {% set target_relation = this.incorporate(type=this.MaterializedView) %}\n {% set intermediate_relation = make_intermediate_relation(target_relation) %}\n {% set backup_relation_type = target_relation.MaterializedView if existing_relation is none else existing_relation.type %}\n {% set backup_relation = make_backup_relation(target_relation, backup_relation_type) %}\n\n {{ materialized_view_setup(backup_relation, intermediate_relation, pre_hooks) }}\n\n {% set build_sql = materialized_view_get_build_sql(existing_relation, target_relation, backup_relation, intermediate_relation) %}\n\n {% if build_sql == '' %}\n {{ materialized_view_execute_no_op(target_relation) }}\n {% else %}\n {{ materialized_view_execute_build_sql(build_sql, existing_relation, target_relation, post_hooks) }}\n {% endif %}\n\n {{ materialized_view_teardown(backup_relation, intermediate_relation, post_hooks) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", 
"macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.materialized_view_setup", "macro.dbt.materialized_view_get_build_sql", "macro.dbt.materialized_view_execute_no_op", "macro.dbt.materialized_view_execute_build_sql", "macro.dbt.materialized_view_teardown"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.962682, "supported_languages": ["sql"]}, "macro.dbt.materialized_view_setup": {"name": "materialized_view_setup", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_setup", "macro_sql": "{% macro materialized_view_setup(backup_relation, intermediate_relation, pre_hooks) %}\n\n -- backup_relation and intermediate_relation should not already exist in the database\n -- it's possible these exist because of a previous run that exited unexpectedly\n {% set preexisting_backup_relation = load_cached_relation(backup_relation) %}\n {% set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) %}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.963132, "supported_languages": null}, "macro.dbt.materialized_view_teardown": {"name": "materialized_view_teardown", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_teardown", "macro_sql": "{% macro materialized_view_teardown(backup_relation, intermediate_relation, post_hooks) %}\n\n -- drop the temp relations if they exist to leave the database clean for the next run\n {{ drop_relation_if_exists(backup_relation) }}\n {{ drop_relation_if_exists(intermediate_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.963414, "supported_languages": null}, "macro.dbt.materialized_view_get_build_sql": {"name": "materialized_view_get_build_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_get_build_sql", "macro_sql": "{% macro materialized_view_get_build_sql(existing_relation, target_relation, backup_relation, intermediate_relation) %}\n\n {% set full_refresh_mode = should_full_refresh() %}\n\n -- determine the scenario we're in: create, full_refresh, alter, refresh data\n {% if existing_relation is none %}\n {% set build_sql = get_create_materialized_view_as_sql(target_relation, sql) %}\n {% elif full_refresh_mode or not existing_relation.is_materialized_view %}\n {% set build_sql = get_replace_materialized_view_as_sql(target_relation, sql, existing_relation, backup_relation, intermediate_relation) %}\n {% else %}\n\n -- get config options\n {% set on_configuration_change = config.get('on_configuration_change') %}\n {% set configuration_changes = 
get_materialized_view_configuration_changes(existing_relation, config) %}\n\n {% if configuration_changes is none %}\n {% set build_sql = refresh_materialized_view(target_relation) %}\n\n {% elif on_configuration_change == 'apply' %}\n {% set build_sql = get_alter_materialized_view_as_sql(target_relation, configuration_changes, sql, existing_relation, backup_relation, intermediate_relation) %}\n {% elif on_configuration_change == 'continue' %}\n {% set build_sql = '' %}\n {{ exceptions.warn(\"Configuration changes were identified and `on_configuration_change` was set to `continue` for `\" ~ target_relation ~ \"`\") }}\n {% elif on_configuration_change == 'fail' %}\n {{ exceptions.raise_fail_fast_error(\"Configuration changes were identified and `on_configuration_change` was set to `fail` for `\" ~ target_relation ~ \"`\") }}\n\n {% else %}\n -- this only happens if the user provides a value other than `apply`, 'skip', 'fail'\n {{ exceptions.raise_compiler_error(\"Unexpected configuration scenario\") }}\n\n {% endif %}\n\n {% endif %}\n\n {% do return(build_sql) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.should_full_refresh", "macro.dbt.get_create_materialized_view_as_sql", "macro.dbt.get_replace_materialized_view_as_sql", "macro.dbt.get_materialized_view_configuration_changes", "macro.dbt.refresh_materialized_view", "macro.dbt.get_alter_materialized_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.9649112, "supported_languages": null}, "macro.dbt.materialized_view_execute_no_op": {"name": "materialized_view_execute_no_op", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_execute_no_op", "macro_sql": "{% macro 
materialized_view_execute_no_op(target_relation) %}\n {% do store_raw_result(\n name=\"main\",\n message=\"skip \" ~ target_relation,\n code=\"skip\",\n rows_affected=\"-1\"\n ) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.9651809, "supported_languages": null}, "macro.dbt.materialized_view_execute_build_sql": {"name": "materialized_view_execute_build_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/materialized_view.sql", "unique_id": "macro.dbt.materialized_view_execute_build_sql", "macro_sql": "{% macro materialized_view_execute_build_sql(build_sql, existing_relation, target_relation, post_hooks) %}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n {% set grant_config = config.get('grants') %}\n\n {% call statement(name=\"main\") %}\n {{ build_sql }}\n {% endcall %}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {{ adapter.commit() }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_hooks", "macro.dbt.statement", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.965919, "supported_languages": null}, "macro.dbt.get_materialized_view_configuration_changes": {"name": "get_materialized_view_configuration_changes", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/materializations/models/materialized_view/get_materialized_view_configuration_changes.sql", "original_file_path": "macros/materializations/models/materialized_view/get_materialized_view_configuration_changes.sql", "unique_id": "macro.dbt.get_materialized_view_configuration_changes", "macro_sql": "{% macro get_materialized_view_configuration_changes(existing_relation, new_config) %}\n /* {#\n It's recommended that configuration changes be formatted as follows:\n {\"\": [{\"action\": \"\", \"context\": ...}]}\n\n For example:\n {\n \"indexes\": [\n {\"action\": \"drop\", \"context\": \"index_abc\"},\n {\"action\": \"create\", \"context\": {\"columns\": [\"column_1\", \"column_2\"], \"type\": \"hash\", \"unique\": True}},\n ],\n }\n\n Either way, `get_materialized_view_configuration_changes` needs to align with `get_alter_materialized_view_as_sql`.\n #} */\n {{- log('Determining configuration changes on: ' ~ existing_relation) -}}\n {%- do return(adapter.dispatch('get_materialized_view_configuration_changes', 'dbt')(existing_relation, new_config)) -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_materialized_view_configuration_changes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.96641, "supported_languages": null}, "macro.dbt.default__get_materialized_view_configuration_changes": {"name": "default__get_materialized_view_configuration_changes", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/get_materialized_view_configuration_changes.sql", "original_file_path": "macros/materializations/models/materialized_view/get_materialized_view_configuration_changes.sql", "unique_id": "macro.dbt.default__get_materialized_view_configuration_changes", "macro_sql": "{% macro default__get_materialized_view_configuration_changes(existing_relation, new_config) %}\n {{ 
exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.966579, "supported_languages": null}, "macro.dbt.get_alter_materialized_view_as_sql": {"name": "get_alter_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/alter_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/alter_materialized_view.sql", "unique_id": "macro.dbt.get_alter_materialized_view_as_sql", "macro_sql": "{% macro get_alter_materialized_view_as_sql(\n relation,\n configuration_changes,\n sql,\n existing_relation,\n backup_relation,\n intermediate_relation\n) %}\n {{- log('Applying ALTER to: ' ~ relation) -}}\n {{- adapter.dispatch('get_alter_materialized_view_as_sql', 'dbt')(\n relation,\n configuration_changes,\n sql,\n existing_relation,\n backup_relation,\n intermediate_relation\n ) -}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_alter_materialized_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.9671302, "supported_languages": null}, "macro.dbt.default__get_alter_materialized_view_as_sql": {"name": "default__get_alter_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/alter_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/alter_materialized_view.sql", "unique_id": "macro.dbt.default__get_alter_materialized_view_as_sql", "macro_sql": "{% macro default__get_alter_materialized_view_as_sql(\n relation,\n configuration_changes,\n sql,\n existing_relation,\n backup_relation,\n 
intermediate_relation\n) %}\n {{ exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.967341, "supported_languages": null}, "macro.dbt.refresh_materialized_view": {"name": "refresh_materialized_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/refresh_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/refresh_materialized_view.sql", "unique_id": "macro.dbt.refresh_materialized_view", "macro_sql": "{% macro refresh_materialized_view(relation) %}\n {{- log('Applying REFRESH to: ' ~ relation) -}}\n {{- adapter.dispatch('refresh_materialized_view', 'dbt')(relation) -}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__refresh_materialized_view"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.989443, "supported_languages": null}, "macro.dbt.default__refresh_materialized_view": {"name": "default__refresh_materialized_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/refresh_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/refresh_materialized_view.sql", "unique_id": "macro.dbt.default__refresh_materialized_view", "macro_sql": "{% macro default__refresh_materialized_view(relation) %}\n {{ exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.993332, "supported_languages": null}, 
"macro.dbt.get_replace_materialized_view_as_sql": {"name": "get_replace_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/replace_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/replace_materialized_view.sql", "unique_id": "macro.dbt.get_replace_materialized_view_as_sql", "macro_sql": "{% macro get_replace_materialized_view_as_sql(relation, sql, existing_relation, backup_relation, intermediate_relation) %}\n {{- log('Applying REPLACE to: ' ~ relation) -}}\n {{- adapter.dispatch('get_replace_materialized_view_as_sql', 'dbt')(relation, sql, existing_relation, backup_relation, intermediate_relation) -}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_replace_materialized_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.993862, "supported_languages": null}, "macro.dbt.default__get_replace_materialized_view_as_sql": {"name": "default__get_replace_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/replace_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/replace_materialized_view.sql", "unique_id": "macro.dbt.default__get_replace_materialized_view_as_sql", "macro_sql": "{% macro default__get_replace_materialized_view_as_sql(relation, sql, existing_relation, backup_relation, intermediate_relation) %}\n {{ exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.994166, "supported_languages": null}, "macro.dbt.get_create_materialized_view_as_sql": {"name": 
"get_create_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/create_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/create_materialized_view.sql", "unique_id": "macro.dbt.get_create_materialized_view_as_sql", "macro_sql": "{% macro get_create_materialized_view_as_sql(relation, sql) -%}\n {{- log('Applying CREATE to: ' ~ relation) -}}\n {{- adapter.dispatch('get_create_materialized_view_as_sql', 'dbt')(relation, sql) -}}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_create_materialized_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.994543, "supported_languages": null}, "macro.dbt.default__get_create_materialized_view_as_sql": {"name": "default__get_create_materialized_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/materialized_view/create_materialized_view.sql", "original_file_path": "macros/materializations/models/materialized_view/create_materialized_view.sql", "unique_id": "macro.dbt.default__get_create_materialized_view_as_sql", "macro_sql": "{% macro default__get_create_materialized_view_as_sql(relation, sql) -%}\n {{ exceptions.raise_compiler_error(\"Materialized views have not been implemented for this adapter.\") }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.994703, "supported_languages": null}, "macro.dbt.can_clone_table": {"name": "can_clone_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/can_clone_table.sql", "original_file_path": "macros/materializations/models/clone/can_clone_table.sql", "unique_id": "macro.dbt.can_clone_table", "macro_sql": "{% 
macro can_clone_table() %}\n {{ return(adapter.dispatch('can_clone_table', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__can_clone_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.994959, "supported_languages": null}, "macro.dbt.default__can_clone_table": {"name": "default__can_clone_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/can_clone_table.sql", "original_file_path": "macros/materializations/models/clone/can_clone_table.sql", "unique_id": "macro.dbt.default__can_clone_table", "macro_sql": "{% macro default__can_clone_table() %}\n {{ return(False) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.995091, "supported_languages": null}, "macro.dbt.create_or_replace_clone": {"name": "create_or_replace_clone", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/create_or_replace_clone.sql", "original_file_path": "macros/materializations/models/clone/create_or_replace_clone.sql", "unique_id": "macro.dbt.create_or_replace_clone", "macro_sql": "{% macro create_or_replace_clone(this_relation, defer_relation) %}\n {{ return(adapter.dispatch('create_or_replace_clone', 'dbt')(this_relation, defer_relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__create_or_replace_clone"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.9954078, "supported_languages": null}, "macro.dbt.default__create_or_replace_clone": {"name": "default__create_or_replace_clone", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/create_or_replace_clone.sql", "original_file_path": 
"macros/materializations/models/clone/create_or_replace_clone.sql", "unique_id": "macro.dbt.default__create_or_replace_clone", "macro_sql": "{% macro default__create_or_replace_clone(this_relation, defer_relation) %}\n create or replace table {{ this_relation }} clone {{ defer_relation }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.995557, "supported_languages": null}, "macro.dbt.materialization_clone_default": {"name": "materialization_clone_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/clone/clone.sql", "original_file_path": "macros/materializations/models/clone/clone.sql", "unique_id": "macro.dbt.materialization_clone_default", "macro_sql": "{%- materialization clone, default -%}\n\n {%- set relations = {'relations': []} -%}\n\n {%- if not defer_relation -%}\n -- nothing to do\n {{ log(\"No relation found in state manifest for \" ~ model.unique_id, info=True) }}\n {{ return(relations) }}\n {%- endif -%}\n\n {%- set existing_relation = load_cached_relation(this) -%}\n\n {%- if existing_relation and not flags.FULL_REFRESH -%}\n -- noop!\n {{ log(\"Relation \" ~ existing_relation ~ \" already exists\", info=True) }}\n {{ return(relations) }}\n {%- endif -%}\n\n {%- set other_existing_relation = load_cached_relation(defer_relation) -%}\n\n -- If this is a database that can do zero-copy cloning of tables, and the other relation is a table, then this will be a table\n -- Otherwise, this will be a view\n\n {% set can_clone_table = can_clone_table() %}\n\n {%- if other_existing_relation and other_existing_relation.type == 'table' and can_clone_table -%}\n\n {%- set target_relation = this.incorporate(type='table') -%}\n {% if existing_relation is not none and not existing_relation.is_table %}\n {{ log(\"Dropping relation \" ~ existing_relation ~ \" because it is of type \" ~ 
existing_relation.type) }}\n {{ drop_relation_if_exists(existing_relation) }}\n {% endif %}\n\n -- as a general rule, data platforms that can clone tables can also do atomic 'create or replace'\n {% call statement('main') %}\n {{ create_or_replace_clone(target_relation, defer_relation) }}\n {% endcall %}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n {% do persist_docs(target_relation, model) %}\n\n {{ return({'relations': [target_relation]}) }}\n\n {%- else -%}\n\n {%- set target_relation = this.incorporate(type='view') -%}\n\n -- reuse the view materialization\n -- TODO: support actual dispatch for materialization macros\n -- Tracking ticket: https://github.com/dbt-labs/dbt-core/issues/7799\n {% set search_name = \"materialization_view_\" ~ adapter.type() %}\n {% if not search_name in context %}\n {% set search_name = \"materialization_view_default\" %}\n {% endif %}\n {% set materialization_macro = context[search_name] %}\n {% set relations = materialization_macro() %}\n {{ return(relations) }}\n\n {%- endif -%}\n\n{%- endmaterialization -%}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.can_clone_table", "macro.dbt.drop_relation_if_exists", "macro.dbt.statement", "macro.dbt.create_or_replace_clone", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065876.9990652, "supported_languages": ["sql"]}, "macro.dbt.get_table_columns_and_constraints": {"name": "get_table_columns_and_constraints", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.get_table_columns_and_constraints", 
"macro_sql": "{%- macro get_table_columns_and_constraints() -%}\n {{ adapter.dispatch('get_table_columns_and_constraints', 'dbt')() }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__get_table_columns_and_constraints"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0002098, "supported_languages": null}, "macro.dbt.default__get_table_columns_and_constraints": {"name": "default__get_table_columns_and_constraints", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.default__get_table_columns_and_constraints", "macro_sql": "{% macro default__get_table_columns_and_constraints() -%}\n {{ return(table_columns_and_constraints()) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.table_columns_and_constraints"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0003462, "supported_languages": null}, "macro.dbt.table_columns_and_constraints": {"name": "table_columns_and_constraints", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.table_columns_and_constraints", "macro_sql": "{% macro table_columns_and_constraints() %}\n {# loop through user_provided_columns to create DDL with data types and constraints #}\n {%- set raw_column_constraints = adapter.render_raw_columns_constraints(raw_columns=model['columns']) -%}\n {%- set raw_model_constraints = adapter.render_raw_model_constraints(raw_constraints=model['constraints']) -%}\n (\n {% for c in raw_column_constraints -%}\n {{ c }}{{ \",\" if not loop.last or 
raw_model_constraints }}\n {% endfor %}\n {% for c in raw_model_constraints -%}\n {{ c }}{{ \",\" if not loop.last }}\n {% endfor -%}\n )\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0009432, "supported_languages": null}, "macro.dbt.get_assert_columns_equivalent": {"name": "get_assert_columns_equivalent", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.get_assert_columns_equivalent", "macro_sql": "\n\n{%- macro get_assert_columns_equivalent(sql) -%}\n {{ adapter.dispatch('get_assert_columns_equivalent', 'dbt')(sql) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__get_assert_columns_equivalent"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.001129, "supported_languages": null}, "macro.dbt.default__get_assert_columns_equivalent": {"name": "default__get_assert_columns_equivalent", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.default__get_assert_columns_equivalent", "macro_sql": "{% macro default__get_assert_columns_equivalent(sql) -%}\n {{ return(assert_columns_equivalent(sql)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.assert_columns_equivalent"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.001282, "supported_languages": null}, "macro.dbt.assert_columns_equivalent": {"name": "assert_columns_equivalent", "resource_type": "macro", "package_name": "dbt", 
"path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.assert_columns_equivalent", "macro_sql": "{% macro assert_columns_equivalent(sql) %}\n\n {#-- First ensure the user has defined 'columns' in yaml specification --#}\n {%- set user_defined_columns = model['columns'] -%}\n {%- if not user_defined_columns -%}\n {{ exceptions.raise_contract_error([], []) }}\n {%- endif -%}\n\n {#-- Obtain the column schema provided by sql file. #}\n {%- set sql_file_provided_columns = get_column_schema_from_query(sql, config.get('sql_header', none)) -%}\n {#--Obtain the column schema provided by the schema file by generating an 'empty schema' query from the model's columns. #}\n {%- set schema_file_provided_columns = get_column_schema_from_query(get_empty_schema_sql(user_defined_columns)) -%}\n\n {#-- create dictionaries with name and formatted data type and strings for exception #}\n {%- set sql_columns = format_columns(sql_file_provided_columns) -%}\n {%- set yaml_columns = format_columns(schema_file_provided_columns) -%}\n\n {%- if sql_columns|length != yaml_columns|length -%}\n {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}\n {%- endif -%}\n\n {%- for sql_col in sql_columns -%}\n {%- set yaml_col = [] -%}\n {%- for this_col in yaml_columns -%}\n {%- if this_col['name'] == sql_col['name'] -%}\n {%- do yaml_col.append(this_col) -%}\n {%- break -%}\n {%- endif -%}\n {%- endfor -%}\n {%- if not yaml_col -%}\n {#-- Column with name not found in yaml #}\n {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}\n {%- endif -%}\n {%- if sql_col['formatted'] != yaml_col[0]['formatted'] -%}\n {#-- Column data types don't match #}\n {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%}\n {%- endif -%}\n {%- endfor -%}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_column_schema_from_query", 
"macro.dbt.get_empty_schema_sql", "macro.dbt.format_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.002753, "supported_languages": null}, "macro.dbt.format_columns": {"name": "format_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.format_columns", "macro_sql": "{% macro format_columns(columns) %}\n {% set formatted_columns = [] %}\n {% for column in columns %}\n {%- set formatted_column = adapter.dispatch('format_column', 'dbt')(column) -%}\n {%- do formatted_columns.append(formatted_column) -%}\n {% endfor %}\n {{ return(formatted_columns) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__format_column"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.017057, "supported_languages": null}, "macro.dbt.default__format_column": {"name": "default__format_column", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/columns_spec_ddl.sql", "original_file_path": "macros/materializations/models/table/columns_spec_ddl.sql", "unique_id": "macro.dbt.default__format_column", "macro_sql": "{% macro default__format_column(column) -%}\n {% set data_type = column.dtype %}\n {% set formatted = column.column.lower() ~ \" \" ~ data_type %}\n {{ return({'name': column.name, 'data_type': data_type, 'formatted': formatted}) }}\n{%- endmacro -%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.017498, "supported_languages": null}, "macro.dbt.materialization_table_default": {"name": "materialization_table_default", "resource_type": "macro", 
"package_name": "dbt", "path": "macros/materializations/models/table/table.sql", "original_file_path": "macros/materializations/models/table/table.sql", "unique_id": "macro.dbt.materialization_table_default", "macro_sql": "{% materialization table, default %}\n\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='table') %}\n {%- set intermediate_relation = make_intermediate_relation(target_relation) -%}\n -- the intermediate_relation should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%}\n /*\n See ../view/view.sql for more information about this relation.\n */\n {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n -- as above, the backup_relation should not already exist\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% call statement('main') -%}\n {{ get_create_table_as_sql(False, intermediate_relation, sql) }}\n {%- endcall %}\n\n -- cleanup\n {% if existing_relation is not none %}\n /* Do the equivalent of rename_if_exists. 'existing_relation' could have been dropped\n since the variable was first set. 
*/\n {% set existing_relation = load_cached_relation(existing_relation) %}\n {% if existing_relation is not none %}\n {{ adapter.rename_relation(existing_relation, backup_relation) }}\n {% endif %}\n {% endif %}\n\n {{ adapter.rename_relation(intermediate_relation, target_relation) }}\n\n {% do create_indexes(target_relation) %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n -- `COMMIT` happens here\n {{ adapter.commit() }}\n\n -- finally, drop the existing/backup relation after the commit\n {{ drop_relation_if_exists(backup_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.drop_relation_if_exists", "macro.dbt.run_hooks", "macro.dbt.statement", "macro.dbt.get_create_table_as_sql", "macro.dbt.create_indexes", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0235689, "supported_languages": ["sql"]}, "macro.dbt.get_create_table_as_sql": {"name": "get_create_table_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.get_create_table_as_sql", "macro_sql": "{% macro get_create_table_as_sql(temporary, relation, sql) -%}\n {{ adapter.dispatch('get_create_table_as_sql', 'dbt')(temporary, relation, sql) }}\n{%- endmacro %}", "depends_on": {"macros": 
["macro.dbt.default__get_create_table_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.024558, "supported_languages": null}, "macro.dbt.default__get_create_table_as_sql": {"name": "default__get_create_table_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.default__get_create_table_as_sql", "macro_sql": "{% macro default__get_create_table_as_sql(temporary, relation, sql) -%}\n {{ return(create_table_as(temporary, relation, sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.create_table_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.024767, "supported_languages": null}, "macro.dbt.create_table_as": {"name": "create_table_as", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.create_table_as", "macro_sql": "{% macro create_table_as(temporary, relation, compiled_code, language='sql') -%}\n {# backward compatibility for create_table_as that does not support language #}\n {% if language == \"sql\" %}\n {{ adapter.dispatch('create_table_as', 'dbt')(temporary, relation, compiled_code)}}\n {% else %}\n {{ adapter.dispatch('create_table_as', 'dbt')(temporary, relation, compiled_code, language) }}\n {% endif %}\n\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__create_table_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0252411, "supported_languages": null}, "macro.dbt.default__create_table_as": {"name": 
"default__create_table_as", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.default__create_table_as", "macro_sql": "{% macro default__create_table_as(temporary, relation, sql) -%}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none }}\n\n create {% if temporary: -%}temporary{%- endif %} table\n {{ relation.include(database=(not temporary), schema=(not temporary)) }}\n {% set contract_config = config.get('contract') %}\n {% if contract_config.enforced and (not temporary) %}\n {{ get_assert_columns_equivalent(sql) }}\n {{ get_table_columns_and_constraints() }}\n {%- set sql = get_select_subquery(sql) %}\n {% endif %}\n as (\n {{ sql }}\n );\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_assert_columns_equivalent", "macro.dbt.get_table_columns_and_constraints", "macro.dbt.get_select_subquery"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.026002, "supported_languages": null}, "macro.dbt.default__get_column_names": {"name": "default__get_column_names", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.default__get_column_names", "macro_sql": "{% macro default__get_column_names() %}\n {#- loop through user_provided_columns to get column names -#}\n {%- set user_provided_columns = model['columns'] -%}\n {%- for i in user_provided_columns %}\n {%- set col = user_provided_columns[i] -%}\n {%- set col_name = adapter.quote(col['name']) if col.get('quote') else col['name'] -%}\n {{ col_name }}{{ \", \" if not loop.last }}\n {%- endfor -%}\n{% endmacro %}", "depends_on": {"macros": 
[]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.026505, "supported_languages": null}, "macro.dbt.get_select_subquery": {"name": "get_select_subquery", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.get_select_subquery", "macro_sql": "{% macro get_select_subquery(sql) %}\n {{ return(adapter.dispatch('get_select_subquery', 'dbt')(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_select_subquery"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.026709, "supported_languages": null}, "macro.dbt.default__get_select_subquery": {"name": "default__get_select_subquery", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/table/create_table_as.sql", "original_file_path": "macros/materializations/models/table/create_table_as.sql", "unique_id": "macro.dbt.default__get_select_subquery", "macro_sql": "{% macro default__get_select_subquery(sql) %}\n select {{ adapter.dispatch('get_column_names', 'dbt')() }}\n from (\n {{ sql }}\n ) as model_subq\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.get_column_names", "macro.dbt.default__get_column_names"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.026901, "supported_languages": null}, "macro.dbt.materialization_view_default": {"name": "materialization_view_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/view.sql", "original_file_path": "macros/materializations/models/view/view.sql", "unique_id": "macro.dbt.materialization_view_default", "macro_sql": "{%- 
materialization view, default -%}\n\n {%- set existing_relation = load_cached_relation(this) -%}\n {%- set target_relation = this.incorporate(type='view') -%}\n {%- set intermediate_relation = make_intermediate_relation(target_relation) -%}\n\n -- the intermediate_relation should not already exist in the database; get_relation\n -- will return None in that case. Otherwise, we get a relation that we can drop\n -- later, before we try to use this name for the current operation\n {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%}\n /*\n This relation (probably) doesn't exist yet. If it does exist, it's a leftover from\n a previous run, and we're going to try to drop it immediately. At the end of this\n materialization, we're going to rename the \"existing_relation\" to this identifier,\n and then we're going to drop it. In order to make sure we run the correct one of:\n - drop view ...\n - drop table ...\n\n We need to set the type of this relation to be the type of the existing_relation, if it exists,\n or else \"view\" as a sane default if it does not. Note that if the existing_relation does not\n exist, then there is nothing to move out of the way and subsequentally drop. 
In that case,\n this relation will be effectively unused.\n */\n {%- set backup_relation_type = 'view' if existing_relation is none else existing_relation.type -%}\n {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%}\n -- as above, the backup_relation should not already exist\n {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%}\n -- grab current tables grants config for comparision later on\n {% set grant_config = config.get('grants') %}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- drop the temp relations if they exist already in the database\n {{ drop_relation_if_exists(preexisting_intermediate_relation) }}\n {{ drop_relation_if_exists(preexisting_backup_relation) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% call statement('main') -%}\n {{ get_create_view_as_sql(intermediate_relation, sql) }}\n {%- endcall %}\n\n -- cleanup\n -- move the existing view out of the way\n {% if existing_relation is not none %}\n /* Do the equivalent of rename_if_exists. 'existing_relation' could have been dropped\n since the variable was first set. 
*/\n {% set existing_relation = load_cached_relation(existing_relation) %}\n {% if existing_relation is not none %}\n {{ adapter.rename_relation(existing_relation, backup_relation) }}\n {% endif %}\n {% endif %}\n {{ adapter.rename_relation(intermediate_relation, target_relation) }}\n\n {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n {{ adapter.commit() }}\n\n {{ drop_relation_if_exists(backup_relation) }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{%- endmaterialization -%}", "depends_on": {"macros": ["macro.dbt.load_cached_relation", "macro.dbt.make_intermediate_relation", "macro.dbt.make_backup_relation", "macro.dbt.run_hooks", "macro.dbt.drop_relation_if_exists", "macro.dbt.statement", "macro.dbt.get_create_view_as_sql", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.030029, "supported_languages": ["sql"]}, "macro.dbt.handle_existing_table": {"name": "handle_existing_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/helpers.sql", "original_file_path": "macros/materializations/models/view/helpers.sql", "unique_id": "macro.dbt.handle_existing_table", "macro_sql": "{% macro handle_existing_table(full_refresh, old_relation) %}\n {{ adapter.dispatch('handle_existing_table', 'dbt')(full_refresh, old_relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__handle_existing_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0303628, 
"supported_languages": null}, "macro.dbt.default__handle_existing_table": {"name": "default__handle_existing_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/helpers.sql", "original_file_path": "macros/materializations/models/view/helpers.sql", "unique_id": "macro.dbt.default__handle_existing_table", "macro_sql": "{% macro default__handle_existing_table(full_refresh, old_relation) %}\n {{ log(\"Dropping relation \" ~ old_relation ~ \" because it is of type \" ~ old_relation.type) }}\n {{ adapter.drop_relation(old_relation) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.030605, "supported_languages": null}, "macro.dbt.create_or_replace_view": {"name": "create_or_replace_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_or_replace_view.sql", "original_file_path": "macros/materializations/models/view/create_or_replace_view.sql", "unique_id": "macro.dbt.create_or_replace_view", "macro_sql": "{% macro create_or_replace_view() %}\n {%- set identifier = model['alias'] -%}\n\n {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%}\n {%- set exists_as_view = (old_relation is not none and old_relation.is_view) -%}\n\n {%- set target_relation = api.Relation.create(\n identifier=identifier, schema=schema, database=database,\n type='view') -%}\n {% set grant_config = config.get('grants') %}\n\n {{ run_hooks(pre_hooks) }}\n\n -- If there's a table with the same name and we weren't told to full refresh,\n -- that's an error. If we were told to full refresh, drop it. 
This behavior differs\n -- for Snowflake and BigQuery, so multiple dispatch is used.\n {%- if old_relation is not none and old_relation.is_table -%}\n {{ handle_existing_table(should_full_refresh(), old_relation) }}\n {%- endif -%}\n\n -- build model\n {% call statement('main') -%}\n {{ get_create_view_as_sql(target_relation, sql) }}\n {%- endcall %}\n\n {% set should_revoke = should_revoke(exists_as_view, full_refresh_mode=True) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {{ run_hooks(post_hooks) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_hooks", "macro.dbt.handle_existing_table", "macro.dbt.should_full_refresh", "macro.dbt.statement", "macro.dbt.get_create_view_as_sql", "macro.dbt.should_revoke", "macro.dbt.apply_grants"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.032236, "supported_languages": null}, "macro.dbt.get_create_view_as_sql": {"name": "get_create_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_view_as.sql", "original_file_path": "macros/materializations/models/view/create_view_as.sql", "unique_id": "macro.dbt.get_create_view_as_sql", "macro_sql": "{% macro get_create_view_as_sql(relation, sql) -%}\n {{ adapter.dispatch('get_create_view_as_sql', 'dbt')(relation, sql) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_create_view_as_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.032695, "supported_languages": null}, "macro.dbt.default__get_create_view_as_sql": {"name": "default__get_create_view_as_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_view_as.sql", "original_file_path": 
"macros/materializations/models/view/create_view_as.sql", "unique_id": "macro.dbt.default__get_create_view_as_sql", "macro_sql": "{% macro default__get_create_view_as_sql(relation, sql) -%}\n {{ return(create_view_as(relation, sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.create_view_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.032871, "supported_languages": null}, "macro.dbt.create_view_as": {"name": "create_view_as", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_view_as.sql", "original_file_path": "macros/materializations/models/view/create_view_as.sql", "unique_id": "macro.dbt.create_view_as", "macro_sql": "{% macro create_view_as(relation, sql) -%}\n {{ adapter.dispatch('create_view_as', 'dbt')(relation, sql) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__create_view_as"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.033067, "supported_languages": null}, "macro.dbt.default__create_view_as": {"name": "default__create_view_as", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/models/view/create_view_as.sql", "original_file_path": "macros/materializations/models/view/create_view_as.sql", "unique_id": "macro.dbt.default__create_view_as", "macro_sql": "{% macro default__create_view_as(relation, sql) -%}\n {%- set sql_header = config.get('sql_header', none) -%}\n\n {{ sql_header if sql_header is not none }}\n create view {{ relation }}\n {% set contract_config = config.get('contract') %}\n {% if contract_config.enforced %}\n {{ get_assert_columns_equivalent(sql) }}\n {%- endif %}\n as (\n {{ sql }}\n );\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.get_assert_columns_equivalent"]}, "description": "", "meta": {}, "docs": {"show": true, 
"node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.033529, "supported_languages": null}, "macro.dbt.materialization_seed_default": {"name": "materialization_seed_default", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/seed.sql", "original_file_path": "macros/materializations/seeds/seed.sql", "unique_id": "macro.dbt.materialization_seed_default", "macro_sql": "{% materialization seed, default %}\n\n {%- set identifier = model['alias'] -%}\n {%- set full_refresh_mode = (should_full_refresh()) -%}\n\n {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%}\n\n {%- set exists_as_table = (old_relation is not none and old_relation.is_table) -%}\n {%- set exists_as_view = (old_relation is not none and old_relation.is_view) -%}\n\n {%- set grant_config = config.get('grants') -%}\n {%- set agate_table = load_agate_table() -%}\n -- grab current tables grants config for comparison later on\n\n {%- do store_result('agate_table', response='OK', agate_table=agate_table) -%}\n\n {{ run_hooks(pre_hooks, inside_transaction=False) }}\n\n -- `BEGIN` happens here:\n {{ run_hooks(pre_hooks, inside_transaction=True) }}\n\n -- build model\n {% set create_table_sql = \"\" %}\n {% if exists_as_view %}\n {{ exceptions.raise_compiler_error(\"Cannot seed to '{}', it is a view\".format(old_relation)) }}\n {% elif exists_as_table %}\n {% set create_table_sql = reset_csv_table(model, full_refresh_mode, old_relation, agate_table) %}\n {% else %}\n {% set create_table_sql = create_csv_table(model, agate_table) %}\n {% endif %}\n\n {% set code = 'CREATE' if full_refresh_mode else 'INSERT' %}\n {% set rows_affected = (agate_table.rows | length) %}\n {% set sql = load_csv_rows(model, agate_table) %}\n\n {% call noop_statement('main', code ~ ' ' ~ rows_affected, code, rows_affected) %}\n {{ get_csv_sql(create_table_sql, sql) }};\n {% endcall %}\n\n {% set target_relation = 
this.incorporate(type='table') %}\n\n {% set should_revoke = should_revoke(old_relation, full_refresh_mode) %}\n {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %}\n\n {% do persist_docs(target_relation, model) %}\n\n {% if full_refresh_mode or not exists_as_table %}\n {% do create_indexes(target_relation) %}\n {% endif %}\n\n {{ run_hooks(post_hooks, inside_transaction=True) }}\n\n -- `COMMIT` happens here\n {{ adapter.commit() }}\n\n {{ run_hooks(post_hooks, inside_transaction=False) }}\n\n {{ return({'relations': [target_relation]}) }}\n\n{% endmaterialization %}", "depends_on": {"macros": ["macro.dbt.should_full_refresh", "macro.dbt.run_hooks", "macro.dbt.reset_csv_table", "macro.dbt.create_csv_table", "macro.dbt.load_csv_rows", "macro.dbt.noop_statement", "macro.dbt.get_csv_sql", "macro.dbt.should_revoke", "macro.dbt.apply_grants", "macro.dbt.persist_docs", "macro.dbt.create_indexes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.037148, "supported_languages": ["sql"]}, "macro.dbt.create_csv_table": {"name": "create_csv_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.create_csv_table", "macro_sql": "{% macro create_csv_table(model, agate_table) -%}\n {{ adapter.dispatch('create_csv_table', 'dbt')(model, agate_table) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__create_csv_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.08895, "supported_languages": null}, "macro.dbt.default__create_csv_table": {"name": "default__create_csv_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": 
"macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__create_csv_table", "macro_sql": "{% macro default__create_csv_table(model, agate_table) %}\n {%- set column_override = model['config'].get('column_types', {}) -%}\n {%- set quote_seed_column = model['config'].get('quote_columns', None) -%}\n\n {% set sql %}\n create table {{ this.render() }} (\n {%- for col_name in agate_table.column_names -%}\n {%- set inferred_type = adapter.convert_type(agate_table, loop.index0) -%}\n {%- set type = column_override.get(col_name, inferred_type) -%}\n {%- set column_name = (col_name | string) -%}\n {{ adapter.quote_seed_column(column_name, quote_seed_column) }} {{ type }} {%- if not loop.last -%}, {%- endif -%}\n {%- endfor -%}\n )\n {% endset %}\n\n {% call statement('_') -%}\n {{ sql }}\n {%- endcall %}\n\n {{ return(sql) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.089969, "supported_languages": null}, "macro.dbt.reset_csv_table": {"name": "reset_csv_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.reset_csv_table", "macro_sql": "{% macro reset_csv_table(model, full_refresh, old_relation, agate_table) -%}\n {{ adapter.dispatch('reset_csv_table', 'dbt')(model, full_refresh, old_relation, agate_table) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__reset_csv_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.090234, "supported_languages": null}, "macro.dbt.default__reset_csv_table": {"name": "default__reset_csv_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", 
"original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__reset_csv_table", "macro_sql": "{% macro default__reset_csv_table(model, full_refresh, old_relation, agate_table) %}\n {% set sql = \"\" %}\n {% if full_refresh %}\n {{ adapter.drop_relation(old_relation) }}\n {% set sql = create_csv_table(model, agate_table) %}\n {% else %}\n {{ adapter.truncate_relation(old_relation) }}\n {% set sql = \"truncate table \" ~ old_relation %}\n {% endif %}\n\n {{ return(sql) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.create_csv_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.090765, "supported_languages": null}, "macro.dbt.get_csv_sql": {"name": "get_csv_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.get_csv_sql", "macro_sql": "{% macro get_csv_sql(create_or_truncate_sql, insert_sql) %}\n {{ adapter.dispatch('get_csv_sql', 'dbt')(create_or_truncate_sql, insert_sql) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_csv_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.090977, "supported_languages": null}, "macro.dbt.default__get_csv_sql": {"name": "default__get_csv_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__get_csv_sql", "macro_sql": "{% macro default__get_csv_sql(create_or_truncate_sql, insert_sql) %}\n {{ create_or_truncate_sql }};\n -- dbt seed --\n {{ insert_sql }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, 
"patch_path": null, "arguments": [], "created_at": 1697065877.091125, "supported_languages": null}, "macro.dbt.get_binding_char": {"name": "get_binding_char", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.get_binding_char", "macro_sql": "{% macro get_binding_char() -%}\n {{ adapter.dispatch('get_binding_char', 'dbt')() }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_binding_char"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.091278, "supported_languages": null}, "macro.dbt.default__get_binding_char": {"name": "default__get_binding_char", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__get_binding_char", "macro_sql": "{% macro default__get_binding_char() %}\n {{ return('%s') }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.091407, "supported_languages": null}, "macro.dbt.get_batch_size": {"name": "get_batch_size", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.get_batch_size", "macro_sql": "{% macro get_batch_size() -%}\n {{ return(adapter.dispatch('get_batch_size', 'dbt')()) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_batch_size"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.09158, "supported_languages": null}, 
"macro.dbt.default__get_batch_size": {"name": "default__get_batch_size", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__get_batch_size", "macro_sql": "{% macro default__get_batch_size() %}\n {{ return(10000) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0917099, "supported_languages": null}, "macro.dbt.get_seed_column_quoted_csv": {"name": "get_seed_column_quoted_csv", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.get_seed_column_quoted_csv", "macro_sql": "{% macro get_seed_column_quoted_csv(model, column_names) %}\n {%- set quote_seed_column = model['config'].get('quote_columns', None) -%}\n {% set quoted = [] %}\n {% for col in column_names -%}\n {%- do quoted.append(adapter.quote_seed_column(col, quote_seed_column)) -%}\n {%- endfor %}\n\n {%- set dest_cols_csv = quoted | join(', ') -%}\n {{ return(dest_cols_csv) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0922291, "supported_languages": null}, "macro.dbt.load_csv_rows": {"name": "load_csv_rows", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.load_csv_rows", "macro_sql": "{% macro load_csv_rows(model, agate_table) -%}\n {{ adapter.dispatch('load_csv_rows', 'dbt')(model, agate_table) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__load_csv_rows"]}, "description": "", 
"meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.09243, "supported_languages": null}, "macro.dbt.default__load_csv_rows": {"name": "default__load_csv_rows", "resource_type": "macro", "package_name": "dbt", "path": "macros/materializations/seeds/helpers.sql", "original_file_path": "macros/materializations/seeds/helpers.sql", "unique_id": "macro.dbt.default__load_csv_rows", "macro_sql": "{% macro default__load_csv_rows(model, agate_table) %}\n\n {% set batch_size = get_batch_size() %}\n\n {% set cols_sql = get_seed_column_quoted_csv(model, agate_table.column_names) %}\n {% set bindings = [] %}\n\n {% set statements = [] %}\n\n {% for chunk in agate_table.rows | batch(batch_size) %}\n {% set bindings = [] %}\n\n {% for row in chunk %}\n {% do bindings.extend(row) %}\n {% endfor %}\n\n {% set sql %}\n insert into {{ this.render() }} ({{ cols_sql }}) values\n {% for row in chunk -%}\n ({%- for column in agate_table.column_names -%}\n {{ get_binding_char() }}\n {%- if not loop.last%},{%- endif %}\n {%- endfor -%})\n {%- if not loop.last%},{%- endif %}\n {%- endfor %}\n {% endset %}\n\n {% do adapter.add_query(sql, bindings=bindings, abridge_sql_log=True) %}\n\n {% if loop.index0 == 0 %}\n {% do statements.append(sql) %}\n {% endif %}\n {% endfor %}\n\n {# Return SQL so we can render it out into the compiled files #}\n {{ return(statements[0]) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_batch_size", "macro.dbt.get_seed_column_quoted_csv", "macro.dbt.get_binding_char"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.093858, "supported_languages": null}, "macro.dbt.generate_alias_name": {"name": "generate_alias_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_alias.sql", "original_file_path": "macros/get_custom_name/get_custom_alias.sql", 
"unique_id": "macro.dbt.generate_alias_name", "macro_sql": "{% macro generate_alias_name(custom_alias_name=none, node=none) -%}\n {% do return(adapter.dispatch('generate_alias_name', 'dbt')(custom_alias_name, node)) %}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__generate_alias_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.094365, "supported_languages": null}, "macro.dbt.default__generate_alias_name": {"name": "default__generate_alias_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_alias.sql", "original_file_path": "macros/get_custom_name/get_custom_alias.sql", "unique_id": "macro.dbt.default__generate_alias_name", "macro_sql": "{% macro default__generate_alias_name(custom_alias_name=none, node=none) -%}\n\n {%- if custom_alias_name -%}\n\n {{ custom_alias_name | trim }}\n\n {%- elif node.version -%}\n\n {{ return(node.name ~ \"_v\" ~ (node.version | replace(\".\", \"_\"))) }}\n\n {%- else -%}\n\n {{ node.name }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0947871, "supported_languages": null}, "macro.dbt.generate_schema_name": {"name": "generate_schema_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_schema.sql", "original_file_path": "macros/get_custom_name/get_custom_schema.sql", "unique_id": "macro.dbt.generate_schema_name", "macro_sql": "{% macro generate_schema_name(custom_schema_name=none, node=none) -%}\n {{ return(adapter.dispatch('generate_schema_name', 'dbt')(custom_schema_name, node)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__generate_schema_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, 
"arguments": [], "created_at": 1697065877.095365, "supported_languages": null}, "macro.dbt.default__generate_schema_name": {"name": "default__generate_schema_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_schema.sql", "original_file_path": "macros/get_custom_name/get_custom_schema.sql", "unique_id": "macro.dbt.default__generate_schema_name", "macro_sql": "{% macro default__generate_schema_name(custom_schema_name, node) -%}\n\n {%- set default_schema = target.schema -%}\n {%- if custom_schema_name is none -%}\n\n {{ default_schema }}\n\n {%- else -%}\n\n {{ default_schema }}_{{ custom_schema_name | trim }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.095656, "supported_languages": null}, "macro.dbt.generate_schema_name_for_env": {"name": "generate_schema_name_for_env", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_schema.sql", "original_file_path": "macros/get_custom_name/get_custom_schema.sql", "unique_id": "macro.dbt.generate_schema_name_for_env", "macro_sql": "{% macro generate_schema_name_for_env(custom_schema_name, node) -%}\n\n {%- set default_schema = target.schema -%}\n {%- if target.name == 'prod' and custom_schema_name is not none -%}\n\n {{ custom_schema_name | trim }}\n\n {%- else -%}\n\n {{ default_schema }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.095977, "supported_languages": null}, "macro.dbt.generate_database_name": {"name": "generate_database_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_database.sql", "original_file_path": "macros/get_custom_name/get_custom_database.sql", 
"unique_id": "macro.dbt.generate_database_name", "macro_sql": "{% macro generate_database_name(custom_database_name=none, node=none) -%}\n {% do return(adapter.dispatch('generate_database_name', 'dbt')(custom_database_name, node)) %}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__generate_database_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.096415, "supported_languages": null}, "macro.dbt.default__generate_database_name": {"name": "default__generate_database_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/get_custom_name/get_custom_database.sql", "original_file_path": "macros/get_custom_name/get_custom_database.sql", "unique_id": "macro.dbt.default__generate_database_name", "macro_sql": "{% macro default__generate_database_name(custom_database_name=none, node=none) -%}\n {%- set default_database = target.database -%}\n {%- if custom_database_name is none -%}\n\n {{ default_database }}\n\n {%- else -%}\n\n {{ custom_database_name }}\n\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.096698, "supported_languages": null}, "macro.dbt.default__test_relationships": {"name": "default__test_relationships", "resource_type": "macro", "package_name": "dbt", "path": "macros/generic_test_sql/relationships.sql", "original_file_path": "macros/generic_test_sql/relationships.sql", "unique_id": "macro.dbt.default__test_relationships", "macro_sql": "{% macro default__test_relationships(model, column_name, to, field) %}\n\nwith child as (\n select {{ column_name }} as from_field\n from {{ model }}\n where {{ column_name }} is not null\n),\n\nparent as (\n select {{ field }} as to_field\n from {{ to }}\n)\n\nselect\n from_field\n\nfrom child\nleft join parent\n on child.from_field = 
parent.to_field\n\nwhere parent.to_field is null\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0970821, "supported_languages": null}, "macro.dbt.default__test_not_null": {"name": "default__test_not_null", "resource_type": "macro", "package_name": "dbt", "path": "macros/generic_test_sql/not_null.sql", "original_file_path": "macros/generic_test_sql/not_null.sql", "unique_id": "macro.dbt.default__test_not_null", "macro_sql": "{% macro default__test_not_null(model, column_name) %}\n\n{% set column_list = '*' if should_store_failures() else column_name %}\n\nselect {{ column_list }}\nfrom {{ model }}\nwhere {{ column_name }} is null\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.should_store_failures"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.0974112, "supported_languages": null}, "macro.dbt.default__test_unique": {"name": "default__test_unique", "resource_type": "macro", "package_name": "dbt", "path": "macros/generic_test_sql/unique.sql", "original_file_path": "macros/generic_test_sql/unique.sql", "unique_id": "macro.dbt.default__test_unique", "macro_sql": "{% macro default__test_unique(model, column_name) %}\n\nselect\n {{ column_name }} as unique_field,\n count(*) as n_records\n\nfrom {{ model }}\nwhere {{ column_name }} is not null\ngroup by {{ column_name }}\nhaving count(*) > 1\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.097684, "supported_languages": null}, "macro.dbt.default__test_accepted_values": {"name": "default__test_accepted_values", "resource_type": "macro", "package_name": "dbt", "path": "macros/generic_test_sql/accepted_values.sql", "original_file_path": 
"macros/generic_test_sql/accepted_values.sql", "unique_id": "macro.dbt.default__test_accepted_values", "macro_sql": "{% macro default__test_accepted_values(model, column_name, values, quote=True) %}\n\nwith all_values as (\n\n select\n {{ column_name }} as value_field,\n count(*) as n_records\n\n from {{ model }}\n group by {{ column_name }}\n\n)\n\nselect *\nfrom all_values\nwhere value_field not in (\n {% for value in values -%}\n {% if quote -%}\n '{{ value }}'\n {%- else -%}\n {{ value }}\n {%- endif -%}\n {%- if not loop.last -%},{%- endif %}\n {%- endfor %}\n)\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.098306, "supported_languages": null}, "macro.dbt.statement": {"name": "statement", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/statement.sql", "original_file_path": "macros/etc/statement.sql", "unique_id": "macro.dbt.statement", "macro_sql": "\n{%- macro statement(name=None, fetch_result=False, auto_begin=True, language='sql') -%}\n {%- if execute: -%}\n {%- set compiled_code = caller() -%}\n\n {%- if name == 'main' -%}\n {{ log('Writing runtime {} for node \"{}\"'.format(language, model['unique_id'])) }}\n {{ write(compiled_code) }}\n {%- endif -%}\n {%- if language == 'sql'-%}\n {%- set res, table = adapter.execute(compiled_code, auto_begin=auto_begin, fetch=fetch_result) -%}\n {%- elif language == 'python' -%}\n {%- set res = submit_python_job(model, compiled_code) -%}\n {#-- TODO: What should table be for python models? 
--#}\n {%- set table = None -%}\n {%- else -%}\n {% do exceptions.raise_compiler_error(\"statement macro didn't get supported language\") %}\n {%- endif -%}\n\n {%- if name is not none -%}\n {{ store_result(name, response=res, agate_table=table) }}\n {%- endif -%}\n\n {%- endif -%}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.099923, "supported_languages": null}, "macro.dbt.noop_statement": {"name": "noop_statement", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/statement.sql", "original_file_path": "macros/etc/statement.sql", "unique_id": "macro.dbt.noop_statement", "macro_sql": "{% macro noop_statement(name=None, message=None, code=None, rows_affected=None, res=None) -%}\n {%- set sql = caller() -%}\n\n {%- if name == 'main' -%}\n {{ log('Writing runtime SQL for node \"{}\"'.format(model['unique_id'])) }}\n {{ write(sql) }}\n {%- endif -%}\n\n {%- if name is not none -%}\n {{ store_raw_result(name, message=message, code=code, rows_affected=rows_affected, agate_table=res) }}\n {%- endif -%}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.100641, "supported_languages": null}, "macro.dbt.run_query": {"name": "run_query", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/statement.sql", "original_file_path": "macros/etc/statement.sql", "unique_id": "macro.dbt.run_query", "macro_sql": "{% macro run_query(sql) %}\n {% call statement(\"run_query_statement\", fetch_result=true, auto_begin=false) %}\n {{ sql }}\n {% endcall %}\n\n {% do return(load_result(\"run_query_statement\").table) %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, 
"arguments": [], "created_at": 1697065877.100966, "supported_languages": null}, "macro.dbt.convert_datetime": {"name": "convert_datetime", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/datetime.sql", "original_file_path": "macros/etc/datetime.sql", "unique_id": "macro.dbt.convert_datetime", "macro_sql": "{% macro convert_datetime(date_str, date_fmt) %}\n\n {% set error_msg -%}\n The provided partition date '{{ date_str }}' does not match the expected format '{{ date_fmt }}'\n {%- endset %}\n\n {% set res = try_or_compiler_error(error_msg, modules.datetime.datetime.strptime, date_str.strip(), date_fmt) %}\n {{ return(res) }}\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.103025, "supported_languages": null}, "macro.dbt.dates_in_range": {"name": "dates_in_range", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/datetime.sql", "original_file_path": "macros/etc/datetime.sql", "unique_id": "macro.dbt.dates_in_range", "macro_sql": "{% macro dates_in_range(start_date_str, end_date_str=none, in_fmt=\"%Y%m%d\", out_fmt=\"%Y%m%d\") %}\n {% set end_date_str = start_date_str if end_date_str is none else end_date_str %}\n\n {% set start_date = convert_datetime(start_date_str, in_fmt) %}\n {% set end_date = convert_datetime(end_date_str, in_fmt) %}\n\n {% set day_count = (end_date - start_date).days %}\n {% if day_count < 0 %}\n {% set msg -%}\n Partiton start date is after the end date ({{ start_date }}, {{ end_date }})\n {%- endset %}\n\n {{ exceptions.raise_compiler_error(msg, model) }}\n {% endif %}\n\n {% set date_list = [] %}\n {% for i in range(0, day_count + 1) %}\n {% set the_date = (modules.datetime.timedelta(days=i) + start_date) %}\n {% if not out_fmt %}\n {% set _ = date_list.append(the_date) %}\n {% else %}\n {% set _ = date_list.append(the_date.strftime(out_fmt)) %}\n {% endif %}\n {% 
endfor %}\n\n {{ return(date_list) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.convert_datetime"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1043022, "supported_languages": null}, "macro.dbt.partition_range": {"name": "partition_range", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/datetime.sql", "original_file_path": "macros/etc/datetime.sql", "unique_id": "macro.dbt.partition_range", "macro_sql": "{% macro partition_range(raw_partition_date, date_fmt='%Y%m%d') %}\n {% set partition_range = (raw_partition_date | string).split(\",\") %}\n\n {% if (partition_range | length) == 1 %}\n {% set start_date = partition_range[0] %}\n {% set end_date = none %}\n {% elif (partition_range | length) == 2 %}\n {% set start_date = partition_range[0] %}\n {% set end_date = partition_range[1] %}\n {% else %}\n {{ exceptions.raise_compiler_error(\"Invalid partition time. Expected format: {Start Date}[,{End Date}]. 
Got: \" ~ raw_partition_date) }}\n {% endif %}\n\n {{ return(dates_in_range(start_date, end_date, in_fmt=date_fmt)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.dates_in_range"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1051252, "supported_languages": null}, "macro.dbt.py_current_timestring": {"name": "py_current_timestring", "resource_type": "macro", "package_name": "dbt", "path": "macros/etc/datetime.sql", "original_file_path": "macros/etc/datetime.sql", "unique_id": "macro.dbt.py_current_timestring", "macro_sql": "{% macro py_current_timestring() %}\n {% set dt = modules.datetime.datetime.now() %}\n {% do return(dt.strftime(\"%Y%m%d%H%M%S%f\")) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1053839, "supported_languages": null}, "macro.dbt.except": {"name": "except", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/except.sql", "original_file_path": "macros/utils/except.sql", "unique_id": "macro.dbt.except", "macro_sql": "{% macro except() %}\n {{ return(adapter.dispatch('except', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__except"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1056361, "supported_languages": null}, "macro.dbt.default__except": {"name": "default__except", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/except.sql", "original_file_path": "macros/utils/except.sql", "unique_id": "macro.dbt.default__except", "macro_sql": "{% macro default__except() %}\n\n except\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 
1697065877.105721, "supported_languages": null}, "macro.dbt.replace": {"name": "replace", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/replace.sql", "original_file_path": "macros/utils/replace.sql", "unique_id": "macro.dbt.replace", "macro_sql": "{% macro replace(field, old_chars, new_chars) -%}\n {{ return(adapter.dispatch('replace', 'dbt') (field, old_chars, new_chars)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__replace"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.10608, "supported_languages": null}, "macro.dbt.default__replace": {"name": "default__replace", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/replace.sql", "original_file_path": "macros/utils/replace.sql", "unique_id": "macro.dbt.default__replace", "macro_sql": "{% macro default__replace(field, old_chars, new_chars) %}\n\n replace(\n {{ field }},\n {{ old_chars }},\n {{ new_chars }}\n )\n\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.106265, "supported_languages": null}, "macro.dbt.concat": {"name": "concat", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/concat.sql", "original_file_path": "macros/utils/concat.sql", "unique_id": "macro.dbt.concat", "macro_sql": "{% macro concat(fields) -%}\n {{ return(adapter.dispatch('concat', 'dbt')(fields)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__concat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1065311, "supported_languages": null}, "macro.dbt.default__concat": {"name": "default__concat", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/concat.sql", "original_file_path": 
"macros/utils/concat.sql", "unique_id": "macro.dbt.default__concat", "macro_sql": "{% macro default__concat(fields) -%}\n {{ fields|join(' || ') }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.106666, "supported_languages": null}, "macro.dbt.length": {"name": "length", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/length.sql", "original_file_path": "macros/utils/length.sql", "unique_id": "macro.dbt.length", "macro_sql": "{% macro length(expression) -%}\n {{ return(adapter.dispatch('length', 'dbt') (expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__length"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1069381, "supported_languages": null}, "macro.dbt.default__length": {"name": "default__length", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/length.sql", "original_file_path": "macros/utils/length.sql", "unique_id": "macro.dbt.default__length", "macro_sql": "{% macro default__length(expression) %}\n\n length(\n {{ expression }}\n )\n\n{%- endmacro -%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.107061, "supported_languages": null}, "macro.dbt.dateadd": {"name": "dateadd", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/dateadd.sql", "original_file_path": "macros/utils/dateadd.sql", "unique_id": "macro.dbt.dateadd", "macro_sql": "{% macro dateadd(datepart, interval, from_date_or_timestamp) %}\n {{ return(adapter.dispatch('dateadd', 'dbt')(datepart, interval, from_date_or_timestamp)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__dateadd"]}, "description": "", "meta": {}, "docs": {"show": 
true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.107425, "supported_languages": null}, "macro.dbt.default__dateadd": {"name": "default__dateadd", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/dateadd.sql", "original_file_path": "macros/utils/dateadd.sql", "unique_id": "macro.dbt.default__dateadd", "macro_sql": "{% macro default__dateadd(datepart, interval, from_date_or_timestamp) %}\n\n dateadd(\n {{ datepart }},\n {{ interval }},\n {{ from_date_or_timestamp }}\n )\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.107605, "supported_languages": null}, "macro.dbt.intersect": {"name": "intersect", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/intersect.sql", "original_file_path": "macros/utils/intersect.sql", "unique_id": "macro.dbt.intersect", "macro_sql": "{% macro intersect() %}\n {{ return(adapter.dispatch('intersect', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__intersect"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.107845, "supported_languages": null}, "macro.dbt.default__intersect": {"name": "default__intersect", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/intersect.sql", "original_file_path": "macros/utils/intersect.sql", "unique_id": "macro.dbt.default__intersect", "macro_sql": "{% macro default__intersect() %}\n\n intersect\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1079319, "supported_languages": null}, "macro.dbt.escape_single_quotes": {"name": "escape_single_quotes", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/utils/escape_single_quotes.sql", "original_file_path": "macros/utils/escape_single_quotes.sql", "unique_id": "macro.dbt.escape_single_quotes", "macro_sql": "{% macro escape_single_quotes(expression) %}\n {{ return(adapter.dispatch('escape_single_quotes', 'dbt') (expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__escape_single_quotes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.108211, "supported_languages": null}, "macro.dbt.default__escape_single_quotes": {"name": "default__escape_single_quotes", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/escape_single_quotes.sql", "original_file_path": "macros/utils/escape_single_quotes.sql", "unique_id": "macro.dbt.default__escape_single_quotes", "macro_sql": "{% macro default__escape_single_quotes(expression) -%}\n{{ expression | replace(\"'\",\"''\") }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.108365, "supported_languages": null}, "macro.dbt.right": {"name": "right", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/right.sql", "original_file_path": "macros/utils/right.sql", "unique_id": "macro.dbt.right", "macro_sql": "{% macro right(string_text, length_expression) -%}\n {{ return(adapter.dispatch('right', 'dbt') (string_text, length_expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__right"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.10868, "supported_languages": null}, "macro.dbt.default__right": {"name": "default__right", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/right.sql", "original_file_path": "macros/utils/right.sql", "unique_id": 
"macro.dbt.default__right", "macro_sql": "{% macro default__right(string_text, length_expression) %}\n\n right(\n {{ string_text }},\n {{ length_expression }}\n )\n\n{%- endmacro -%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.108904, "supported_languages": null}, "macro.dbt.listagg": {"name": "listagg", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/listagg.sql", "original_file_path": "macros/utils/listagg.sql", "unique_id": "macro.dbt.listagg", "macro_sql": "{% macro listagg(measure, delimiter_text=\"','\", order_by_clause=none, limit_num=none) -%}\n {{ return(adapter.dispatch('listagg', 'dbt') (measure, delimiter_text, order_by_clause, limit_num)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__listagg"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1292741, "supported_languages": null}, "macro.dbt.default__listagg": {"name": "default__listagg", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/listagg.sql", "original_file_path": "macros/utils/listagg.sql", "unique_id": "macro.dbt.default__listagg", "macro_sql": "{% macro default__listagg(measure, delimiter_text, order_by_clause, limit_num) -%}\n\n {% if limit_num -%}\n array_to_string(\n array_slice(\n array_agg(\n {{ measure }}\n ){% if order_by_clause -%}\n within group ({{ order_by_clause }})\n {%- endif %}\n ,0\n ,{{ limit_num }}\n ),\n {{ delimiter_text }}\n )\n {%- else %}\n listagg(\n {{ measure }},\n {{ delimiter_text }}\n )\n {% if order_by_clause -%}\n within group ({{ order_by_clause }})\n {%- endif %}\n {%- endif %}\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1297522, 
"supported_languages": null}, "macro.dbt.datediff": {"name": "datediff", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/datediff.sql", "original_file_path": "macros/utils/datediff.sql", "unique_id": "macro.dbt.datediff", "macro_sql": "{% macro datediff(first_date, second_date, datepart) %}\n {{ return(adapter.dispatch('datediff', 'dbt')(first_date, second_date, datepart)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__datediff"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.130143, "supported_languages": null}, "macro.dbt.default__datediff": {"name": "default__datediff", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/datediff.sql", "original_file_path": "macros/utils/datediff.sql", "unique_id": "macro.dbt.default__datediff", "macro_sql": "{% macro default__datediff(first_date, second_date, datepart) -%}\n\n datediff(\n {{ datepart }},\n {{ first_date }},\n {{ second_date }}\n )\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1303282, "supported_languages": null}, "macro.dbt.safe_cast": {"name": "safe_cast", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/safe_cast.sql", "original_file_path": "macros/utils/safe_cast.sql", "unique_id": "macro.dbt.safe_cast", "macro_sql": "{% macro safe_cast(field, type) %}\n {{ return(adapter.dispatch('safe_cast', 'dbt') (field, type)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__safe_cast"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.130649, "supported_languages": null}, "macro.dbt.default__safe_cast": {"name": "default__safe_cast", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/utils/safe_cast.sql", "original_file_path": "macros/utils/safe_cast.sql", "unique_id": "macro.dbt.default__safe_cast", "macro_sql": "{% macro default__safe_cast(field, type) %}\n {# most databases don't support this function yet\n so we just need to use cast #}\n cast({{field}} as {{type}})\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.130805, "supported_languages": null}, "macro.dbt.hash": {"name": "hash", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/hash.sql", "original_file_path": "macros/utils/hash.sql", "unique_id": "macro.dbt.hash", "macro_sql": "{% macro hash(field) -%}\n {{ return(adapter.dispatch('hash', 'dbt') (field)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__hash"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.13109, "supported_languages": null}, "macro.dbt.default__hash": {"name": "default__hash", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/hash.sql", "original_file_path": "macros/utils/hash.sql", "unique_id": "macro.dbt.default__hash", "macro_sql": "{% macro default__hash(field) -%}\n md5(cast({{ field }} as {{ api.Column.translate_type('string') }}))\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1313999, "supported_languages": null}, "macro.dbt.cast_bool_to_text": {"name": "cast_bool_to_text", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/cast_bool_to_text.sql", "original_file_path": "macros/utils/cast_bool_to_text.sql", "unique_id": "macro.dbt.cast_bool_to_text", "macro_sql": "{% macro cast_bool_to_text(field) %}\n {{ adapter.dispatch('cast_bool_to_text', 'dbt') (field) 
}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__cast_bool_to_text"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.131728, "supported_languages": null}, "macro.dbt.default__cast_bool_to_text": {"name": "default__cast_bool_to_text", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/cast_bool_to_text.sql", "original_file_path": "macros/utils/cast_bool_to_text.sql", "unique_id": "macro.dbt.default__cast_bool_to_text", "macro_sql": "{% macro default__cast_bool_to_text(field) %}\n cast({{ field }} as {{ api.Column.translate_type('string') }})\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.131938, "supported_languages": null}, "macro.dbt.any_value": {"name": "any_value", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/any_value.sql", "original_file_path": "macros/utils/any_value.sql", "unique_id": "macro.dbt.any_value", "macro_sql": "{% macro any_value(expression) -%}\n {{ return(adapter.dispatch('any_value', 'dbt') (expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__any_value"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.137078, "supported_languages": null}, "macro.dbt.default__any_value": {"name": "default__any_value", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/any_value.sql", "original_file_path": "macros/utils/any_value.sql", "unique_id": "macro.dbt.default__any_value", "macro_sql": "{% macro default__any_value(expression) -%}\n\n any_value({{ expression }})\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], 
"created_at": 1697065877.13744, "supported_languages": null}, "macro.dbt.position": {"name": "position", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/position.sql", "original_file_path": "macros/utils/position.sql", "unique_id": "macro.dbt.position", "macro_sql": "{% macro position(substring_text, string_text) -%}\n {{ return(adapter.dispatch('position', 'dbt') (substring_text, string_text)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__position"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.138792, "supported_languages": null}, "macro.dbt.default__position": {"name": "default__position", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/position.sql", "original_file_path": "macros/utils/position.sql", "unique_id": "macro.dbt.default__position", "macro_sql": "{% macro default__position(substring_text, string_text) %}\n\n position(\n {{ substring_text }} in {{ string_text }}\n )\n\n{%- endmacro -%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.138962, "supported_languages": null}, "macro.dbt.string_literal": {"name": "string_literal", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/literal.sql", "original_file_path": "macros/utils/literal.sql", "unique_id": "macro.dbt.string_literal", "macro_sql": "{%- macro string_literal(value) -%}\n {{ return(adapter.dispatch('string_literal', 'dbt') (value)) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__string_literal"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.139265, "supported_languages": null}, "macro.dbt.default__string_literal": {"name": "default__string_literal", "resource_type": "macro", 
"package_name": "dbt", "path": "macros/utils/literal.sql", "original_file_path": "macros/utils/literal.sql", "unique_id": "macro.dbt.default__string_literal", "macro_sql": "{% macro default__string_literal(value) -%}\n '{{ value }}'\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.139394, "supported_languages": null}, "macro.dbt.type_string": {"name": "type_string", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_string", "macro_sql": "\n\n{%- macro type_string() -%}\n {{ return(adapter.dispatch('type_string', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_string"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1404212, "supported_languages": null}, "macro.dbt.default__type_string": {"name": "default__type_string", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_string", "macro_sql": "{% macro default__type_string() %}\n {{ return(api.Column.translate_type(\"string\")) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1408, "supported_languages": null}, "macro.dbt.type_timestamp": {"name": "type_timestamp", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_timestamp", "macro_sql": "\n\n{%- macro type_timestamp() -%}\n {{ return(adapter.dispatch('type_timestamp', 'dbt')()) }}\n{%- endmacro -%}\n\n", 
"depends_on": {"macros": ["macro.dbt.default__type_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1411462, "supported_languages": null}, "macro.dbt.default__type_timestamp": {"name": "default__type_timestamp", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_timestamp", "macro_sql": "{% macro default__type_timestamp() %}\n {{ return(api.Column.translate_type(\"timestamp\")) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.141341, "supported_languages": null}, "macro.dbt.type_float": {"name": "type_float", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_float", "macro_sql": "\n\n{%- macro type_float() -%}\n {{ return(adapter.dispatch('type_float', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_float"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1415339, "supported_languages": null}, "macro.dbt.default__type_float": {"name": "default__type_float", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_float", "macro_sql": "{% macro default__type_float() %}\n {{ return(api.Column.translate_type(\"float\")) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1417038, "supported_languages": 
null}, "macro.dbt.type_numeric": {"name": "type_numeric", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_numeric", "macro_sql": "\n\n{%- macro type_numeric() -%}\n {{ return(adapter.dispatch('type_numeric', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_numeric"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.141888, "supported_languages": null}, "macro.dbt.default__type_numeric": {"name": "default__type_numeric", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_numeric", "macro_sql": "{% macro default__type_numeric() %}\n {{ return(api.Column.numeric_type(\"numeric\", 28, 6)) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1420841, "supported_languages": null}, "macro.dbt.type_bigint": {"name": "type_bigint", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_bigint", "macro_sql": "\n\n{%- macro type_bigint() -%}\n {{ return(adapter.dispatch('type_bigint', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_bigint"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.14226, "supported_languages": null}, "macro.dbt.default__type_bigint": {"name": "default__type_bigint", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", 
"unique_id": "macro.dbt.default__type_bigint", "macro_sql": "{% macro default__type_bigint() %}\n {{ return(api.Column.translate_type(\"bigint\")) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1424289, "supported_languages": null}, "macro.dbt.type_int": {"name": "type_int", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_int", "macro_sql": "\n\n{%- macro type_int() -%}\n {{ return(adapter.dispatch('type_int', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_int"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.142606, "supported_languages": null}, "macro.dbt.default__type_int": {"name": "default__type_int", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_int", "macro_sql": "{%- macro default__type_int() -%}\n {{ return(api.Column.translate_type(\"integer\")) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.142765, "supported_languages": null}, "macro.dbt.type_boolean": {"name": "type_boolean", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.type_boolean", "macro_sql": "\n\n{%- macro type_boolean() -%}\n {{ return(adapter.dispatch('type_boolean', 'dbt')()) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.default__type_boolean"]}, "description": "", "meta": {}, "docs": {"show": 
true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1429381, "supported_languages": null}, "macro.dbt.default__type_boolean": {"name": "default__type_boolean", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/data_types.sql", "original_file_path": "macros/utils/data_types.sql", "unique_id": "macro.dbt.default__type_boolean", "macro_sql": "{%- macro default__type_boolean() -%}\n {{ return(api.Column.translate_type(\"boolean\")) }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.143098, "supported_languages": null}, "macro.dbt.array_concat": {"name": "array_concat", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_concat.sql", "original_file_path": "macros/utils/array_concat.sql", "unique_id": "macro.dbt.array_concat", "macro_sql": "{% macro array_concat(array_1, array_2) -%}\n {{ return(adapter.dispatch('array_concat', 'dbt')(array_1, array_2)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__array_concat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.143426, "supported_languages": null}, "macro.dbt.default__array_concat": {"name": "default__array_concat", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_concat.sql", "original_file_path": "macros/utils/array_concat.sql", "unique_id": "macro.dbt.default__array_concat", "macro_sql": "{% macro default__array_concat(array_1, array_2) -%}\n array_cat({{ array_1 }}, {{ array_2 }})\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.14357, "supported_languages": null}, "macro.dbt.bool_or": {"name": "bool_or", "resource_type": 
"macro", "package_name": "dbt", "path": "macros/utils/bool_or.sql", "original_file_path": "macros/utils/bool_or.sql", "unique_id": "macro.dbt.bool_or", "macro_sql": "{% macro bool_or(expression) -%}\n {{ return(adapter.dispatch('bool_or', 'dbt') (expression)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__bool_or"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.14384, "supported_languages": null}, "macro.dbt.default__bool_or": {"name": "default__bool_or", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/bool_or.sql", "original_file_path": "macros/utils/bool_or.sql", "unique_id": "macro.dbt.default__bool_or", "macro_sql": "{% macro default__bool_or(expression) -%}\n\n bool_or({{ expression }})\n\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.143957, "supported_languages": null}, "macro.dbt.last_day": {"name": "last_day", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/last_day.sql", "original_file_path": "macros/utils/last_day.sql", "unique_id": "macro.dbt.last_day", "macro_sql": "{% macro last_day(date, datepart) %}\n {{ return(adapter.dispatch('last_day', 'dbt') (date, datepart)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__last_day"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.144338, "supported_languages": null}, "macro.dbt.default_last_day": {"name": "default_last_day", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/last_day.sql", "original_file_path": "macros/utils/last_day.sql", "unique_id": "macro.dbt.default_last_day", "macro_sql": "\n\n{%- macro default_last_day(date, datepart) -%}\n cast(\n {{dbt.dateadd('day', '-1',\n 
dbt.dateadd(datepart, '1', dbt.date_trunc(datepart, date))\n )}}\n as date)\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt.dateadd", "macro.dbt.date_trunc"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1446452, "supported_languages": null}, "macro.dbt.default__last_day": {"name": "default__last_day", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/last_day.sql", "original_file_path": "macros/utils/last_day.sql", "unique_id": "macro.dbt.default__last_day", "macro_sql": "{% macro default__last_day(date, datepart) -%}\n {{dbt.default_last_day(date, datepart)}}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default_last_day"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.144814, "supported_languages": null}, "macro.dbt.split_part": {"name": "split_part", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/split_part.sql", "original_file_path": "macros/utils/split_part.sql", "unique_id": "macro.dbt.split_part", "macro_sql": "{% macro split_part(string_text, delimiter_text, part_number) %}\n {{ return(adapter.dispatch('split_part', 'dbt') (string_text, delimiter_text, part_number)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__split_part"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1601658, "supported_languages": null}, "macro.dbt.default__split_part": {"name": "default__split_part", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/split_part.sql", "original_file_path": "macros/utils/split_part.sql", "unique_id": "macro.dbt.default__split_part", "macro_sql": "{% macro default__split_part(string_text, delimiter_text, part_number) %}\n\n split_part(\n {{ string_text }},\n {{ 
delimiter_text }},\n {{ part_number }}\n )\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.160498, "supported_languages": null}, "macro.dbt._split_part_negative": {"name": "_split_part_negative", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/split_part.sql", "original_file_path": "macros/utils/split_part.sql", "unique_id": "macro.dbt._split_part_negative", "macro_sql": "{% macro _split_part_negative(string_text, delimiter_text, part_number) %}\n\n split_part(\n {{ string_text }},\n {{ delimiter_text }},\n length({{ string_text }})\n - length(\n replace({{ string_text }}, {{ delimiter_text }}, '')\n ) + 2 + {{ part_number }}\n )\n\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1607502, "supported_languages": null}, "macro.dbt.date_trunc": {"name": "date_trunc", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/date_trunc.sql", "original_file_path": "macros/utils/date_trunc.sql", "unique_id": "macro.dbt.date_trunc", "macro_sql": "{% macro date_trunc(datepart, date) -%}\n {{ return(adapter.dispatch('date_trunc', 'dbt') (datepart, date)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__date_trunc"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.161074, "supported_languages": null}, "macro.dbt.default__date_trunc": {"name": "default__date_trunc", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/date_trunc.sql", "original_file_path": "macros/utils/date_trunc.sql", "unique_id": "macro.dbt.default__date_trunc", "macro_sql": "{% macro default__date_trunc(datepart, date) -%}\n date_trunc('{{datepart}}', {{date}})\n{%- endmacro 
%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1612182, "supported_languages": null}, "macro.dbt.array_construct": {"name": "array_construct", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_construct.sql", "original_file_path": "macros/utils/array_construct.sql", "unique_id": "macro.dbt.array_construct", "macro_sql": "{% macro array_construct(inputs=[], data_type=api.Column.translate_type('integer')) -%}\n {{ return(adapter.dispatch('array_construct', 'dbt')(inputs, data_type)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__array_construct"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.16163, "supported_languages": null}, "macro.dbt.default__array_construct": {"name": "default__array_construct", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_construct.sql", "original_file_path": "macros/utils/array_construct.sql", "unique_id": "macro.dbt.default__array_construct", "macro_sql": "{% macro default__array_construct(inputs, data_type) -%}\n {% if inputs|length > 0 %}\n array[ {{ inputs|join(' , ') }} ]\n {% else %}\n array[]::{{data_type}}[]\n {% endif %}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1636019, "supported_languages": null}, "macro.dbt.array_append": {"name": "array_append", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_append.sql", "original_file_path": "macros/utils/array_append.sql", "unique_id": "macro.dbt.array_append", "macro_sql": "{% macro array_append(array, new_element) -%}\n {{ return(adapter.dispatch('array_append', 'dbt')(array, new_element)) }}\n{%- endmacro %}", "depends_on": 
{"macros": ["macro.dbt.default__array_append"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1646972, "supported_languages": null}, "macro.dbt.default__array_append": {"name": "default__array_append", "resource_type": "macro", "package_name": "dbt", "path": "macros/utils/array_append.sql", "original_file_path": "macros/utils/array_append.sql", "unique_id": "macro.dbt.default__array_append", "macro_sql": "{% macro default__array_append(array, new_element) -%}\n array_append({{ array }}, {{ new_element }})\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.166338, "supported_languages": null}, "macro.dbt.create_schema": {"name": "create_schema", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/schema.sql", "original_file_path": "macros/adapters/schema.sql", "unique_id": "macro.dbt.create_schema", "macro_sql": "{% macro create_schema(relation) -%}\n {{ adapter.dispatch('create_schema', 'dbt')(relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__create_schema"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.167658, "supported_languages": null}, "macro.dbt.default__create_schema": {"name": "default__create_schema", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/schema.sql", "original_file_path": "macros/adapters/schema.sql", "unique_id": "macro.dbt.default__create_schema", "macro_sql": "{% macro default__create_schema(relation) -%}\n {%- call statement('create_schema') -%}\n create schema if not exists {{ relation.without_identifier() }}\n {% endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, 
"node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.168282, "supported_languages": null}, "macro.dbt.drop_schema": {"name": "drop_schema", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/schema.sql", "original_file_path": "macros/adapters/schema.sql", "unique_id": "macro.dbt.drop_schema", "macro_sql": "{% macro drop_schema(relation) -%}\n {{ adapter.dispatch('drop_schema', 'dbt')(relation) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__drop_schema"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1688251, "supported_languages": null}, "macro.dbt.default__drop_schema": {"name": "default__drop_schema", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/schema.sql", "original_file_path": "macros/adapters/schema.sql", "unique_id": "macro.dbt.default__drop_schema", "macro_sql": "{% macro default__drop_schema(relation) -%}\n {%- call statement('drop_schema') -%}\n drop schema if exists {{ relation.without_identifier() }} cascade\n {% endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1694698, "supported_languages": null}, "macro.dbt.current_timestamp": {"name": "current_timestamp", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.current_timestamp", "macro_sql": "{%- macro current_timestamp() -%}\n {{ adapter.dispatch('current_timestamp', 'dbt')() }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__current_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1721869, 
"supported_languages": null}, "macro.dbt.default__current_timestamp": {"name": "default__current_timestamp", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.default__current_timestamp", "macro_sql": "{% macro default__current_timestamp() -%}\n {{ exceptions.raise_not_implemented(\n 'current_timestamp macro not implemented for adapter ' + adapter.type()) }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.17238, "supported_languages": null}, "macro.dbt.snapshot_get_time": {"name": "snapshot_get_time", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.snapshot_get_time", "macro_sql": "\n\n{%- macro snapshot_get_time() -%}\n {{ adapter.dispatch('snapshot_get_time', 'dbt')() }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__snapshot_get_time"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.172548, "supported_languages": null}, "macro.dbt.default__snapshot_get_time": {"name": "default__snapshot_get_time", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.default__snapshot_get_time", "macro_sql": "{% macro default__snapshot_get_time() %}\n {{ current_timestamp() }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.current_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.172672, "supported_languages": null}, "macro.dbt.current_timestamp_backcompat": 
{"name": "current_timestamp_backcompat", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.current_timestamp_backcompat", "macro_sql": "{% macro current_timestamp_backcompat() %}\n {{ return(adapter.dispatch('current_timestamp_backcompat', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__current_timestamp_backcompat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.172858, "supported_languages": null}, "macro.dbt.default__current_timestamp_backcompat": {"name": "default__current_timestamp_backcompat", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.default__current_timestamp_backcompat", "macro_sql": "{% macro default__current_timestamp_backcompat() %}\n current_timestamp::timestamp\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.172946, "supported_languages": null}, "macro.dbt.current_timestamp_in_utc_backcompat": {"name": "current_timestamp_in_utc_backcompat", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.current_timestamp_in_utc_backcompat", "macro_sql": "{% macro current_timestamp_in_utc_backcompat() %}\n {{ return(adapter.dispatch('current_timestamp_in_utc_backcompat', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__current_timestamp_in_utc_backcompat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.197654, 
"supported_languages": null}, "macro.dbt.default__current_timestamp_in_utc_backcompat": {"name": "default__current_timestamp_in_utc_backcompat", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/timestamps.sql", "original_file_path": "macros/adapters/timestamps.sql", "unique_id": "macro.dbt.default__current_timestamp_in_utc_backcompat", "macro_sql": "{% macro default__current_timestamp_in_utc_backcompat() %}\n {{ return(adapter.dispatch('current_timestamp_backcompat', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.current_timestamp_backcompat", "macro.dbt.default__current_timestamp_backcompat"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.1980531, "supported_languages": null}, "macro.dbt.get_create_index_sql": {"name": "get_create_index_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.get_create_index_sql", "macro_sql": "{% macro get_create_index_sql(relation, index_dict) -%}\n {{ return(adapter.dispatch('get_create_index_sql', 'dbt')(relation, index_dict)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_create_index_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2038138, "supported_languages": null}, "macro.dbt.default__get_create_index_sql": {"name": "default__get_create_index_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.default__get_create_index_sql", "macro_sql": "{% macro default__get_create_index_sql(relation, index_dict) -%}\n {% do return(None) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, 
"node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.20399, "supported_languages": null}, "macro.dbt.create_indexes": {"name": "create_indexes", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.create_indexes", "macro_sql": "{% macro create_indexes(relation) -%}\n {{ adapter.dispatch('create_indexes', 'dbt')(relation) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__create_indexes"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.204183, "supported_languages": null}, "macro.dbt.default__create_indexes": {"name": "default__create_indexes", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.default__create_indexes", "macro_sql": "{% macro default__create_indexes(relation) -%}\n {%- set _indexes = config.get('indexes', default=[]) -%}\n\n {% for _index_dict in _indexes %}\n {% set create_index_sql = get_create_index_sql(relation, _index_dict) %}\n {% if create_index_sql %}\n {% do run_query(create_index_sql) %}\n {% endif %}\n {% endfor %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_create_index_sql", "macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.204635, "supported_languages": null}, "macro.dbt.get_drop_index_sql": {"name": "get_drop_index_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.get_drop_index_sql", "macro_sql": "{% macro get_drop_index_sql(relation, index_name) -%}\n {{ adapter.dispatch('get_drop_index_sql', 'dbt')(relation, index_name) }}\n{%- 
endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_drop_index_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.204842, "supported_languages": null}, "macro.dbt.default__get_drop_index_sql": {"name": "default__get_drop_index_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.default__get_drop_index_sql", "macro_sql": "{% macro default__get_drop_index_sql(relation, index_name) -%}\n {{ exceptions.raise_compiler_error(\"`get_drop_index_sql has not been implemented for this adapter.\") }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2049952, "supported_languages": null}, "macro.dbt.get_show_indexes_sql": {"name": "get_show_indexes_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.get_show_indexes_sql", "macro_sql": "{% macro get_show_indexes_sql(relation) -%}\n {{ adapter.dispatch('get_show_indexes_sql', 'dbt')(relation) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_show_indexes_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.205166, "supported_languages": null}, "macro.dbt.default__get_show_indexes_sql": {"name": "default__get_show_indexes_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/indexes.sql", "original_file_path": "macros/adapters/indexes.sql", "unique_id": "macro.dbt.default__get_show_indexes_sql", "macro_sql": "{% macro default__get_show_indexes_sql(relation) -%}\n {{ exceptions.raise_compiler_error(\"`get_show_indexes_sql 
has not been implemented for this adapter.\") }}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.205308, "supported_languages": null}, "macro.dbt.make_intermediate_relation": {"name": "make_intermediate_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.make_intermediate_relation", "macro_sql": "{% macro make_intermediate_relation(base_relation, suffix='__dbt_tmp') %}\n {{ return(adapter.dispatch('make_intermediate_relation', 'dbt')(base_relation, suffix)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__make_intermediate_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2083871, "supported_languages": null}, "macro.dbt.default__make_intermediate_relation": {"name": "default__make_intermediate_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__make_intermediate_relation", "macro_sql": "{% macro default__make_intermediate_relation(base_relation, suffix) %}\n {{ return(default__make_temp_relation(base_relation, suffix)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__make_temp_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.208574, "supported_languages": null}, "macro.dbt.make_temp_relation": {"name": "make_temp_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.make_temp_relation", "macro_sql": "{% macro 
make_temp_relation(base_relation, suffix='__dbt_tmp') %}\n {{ return(adapter.dispatch('make_temp_relation', 'dbt')(base_relation, suffix)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__make_temp_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.208811, "supported_languages": null}, "macro.dbt.default__make_temp_relation": {"name": "default__make_temp_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__make_temp_relation", "macro_sql": "{% macro default__make_temp_relation(base_relation, suffix) %}\n {%- set temp_identifier = base_relation.identifier ~ suffix -%}\n {%- set temp_relation = base_relation.incorporate(\n path={\"identifier\": temp_identifier}) -%}\n\n {{ return(temp_relation) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.209132, "supported_languages": null}, "macro.dbt.make_backup_relation": {"name": "make_backup_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.make_backup_relation", "macro_sql": "{% macro make_backup_relation(base_relation, backup_relation_type, suffix='__dbt_backup') %}\n {{ return(adapter.dispatch('make_backup_relation', 'dbt')(base_relation, backup_relation_type, suffix)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__make_backup_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.209399, "supported_languages": null}, "macro.dbt.default__make_backup_relation": {"name": 
"default__make_backup_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__make_backup_relation", "macro_sql": "{% macro default__make_backup_relation(base_relation, backup_relation_type, suffix) %}\n {%- set backup_identifier = base_relation.identifier ~ suffix -%}\n {%- set backup_relation = base_relation.incorporate(\n path={\"identifier\": backup_identifier},\n type=backup_relation_type\n ) -%}\n {{ return(backup_relation) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.209753, "supported_languages": null}, "macro.dbt.truncate_relation": {"name": "truncate_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.truncate_relation", "macro_sql": "{% macro truncate_relation(relation) -%}\n {{ return(adapter.dispatch('truncate_relation', 'dbt')(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__truncate_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.209952, "supported_languages": null}, "macro.dbt.default__truncate_relation": {"name": "default__truncate_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__truncate_relation", "macro_sql": "{% macro default__truncate_relation(relation) -%}\n {% call statement('truncate_relation') -%}\n truncate table {{ relation }}\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": 
null}, "patch_path": null, "arguments": [], "created_at": 1697065877.210135, "supported_languages": null}, "macro.dbt.rename_relation": {"name": "rename_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.rename_relation", "macro_sql": "{% macro rename_relation(from_relation, to_relation) -%}\n {{ return(adapter.dispatch('rename_relation', 'dbt')(from_relation, to_relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__rename_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.210358, "supported_languages": null}, "macro.dbt.default__rename_relation": {"name": "default__rename_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__rename_relation", "macro_sql": "{% macro default__rename_relation(from_relation, to_relation) -%}\n {% set target_name = adapter.quote_as_configured(to_relation.identifier, 'identifier') %}\n {% call statement('rename_relation') -%}\n alter table {{ from_relation }} rename to {{ target_name }}\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.210745, "supported_languages": null}, "macro.dbt.get_or_create_relation": {"name": "get_or_create_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.get_or_create_relation", "macro_sql": "{% macro get_or_create_relation(database, schema, identifier, type) -%}\n {{ return(adapter.dispatch('get_or_create_relation', 'dbt')(database, 
schema, identifier, type)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_or_create_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.211017, "supported_languages": null}, "macro.dbt.default__get_or_create_relation": {"name": "default__get_or_create_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.default__get_or_create_relation", "macro_sql": "{% macro default__get_or_create_relation(database, schema, identifier, type) %}\n {%- set target_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) %}\n\n {% if target_relation %}\n {% do return([true, target_relation]) %}\n {% endif %}\n\n {%- set new_relation = api.Relation.create(\n database=database,\n schema=schema,\n identifier=identifier,\n type=type\n ) -%}\n {% do return([false, new_relation]) %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.211626, "supported_languages": null}, "macro.dbt.load_cached_relation": {"name": "load_cached_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.load_cached_relation", "macro_sql": "{% macro load_cached_relation(relation) %}\n {% do return(adapter.get_relation(\n database=relation.database,\n schema=relation.schema,\n identifier=relation.identifier\n )) -%}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.211879, "supported_languages": null}, "macro.dbt.load_relation": {"name": "load_relation", 
"resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.load_relation", "macro_sql": "{% macro load_relation(relation) %}\n {{ return(load_cached_relation(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.load_cached_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2120361, "supported_languages": null}, "macro.dbt.drop_relation_if_exists": {"name": "drop_relation_if_exists", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/relation.sql", "original_file_path": "macros/adapters/relation.sql", "unique_id": "macro.dbt.drop_relation_if_exists", "macro_sql": "{% macro drop_relation_if_exists(relation) %}\n {% if relation is not none %}\n {{ adapter.drop_relation(relation) }}\n {% endif %}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.21225, "supported_languages": null}, "macro.dbt.collect_freshness": {"name": "collect_freshness", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/freshness.sql", "original_file_path": "macros/adapters/freshness.sql", "unique_id": "macro.dbt.collect_freshness", "macro_sql": "{% macro collect_freshness(source, loaded_at_field, filter) %}\n {{ return(adapter.dispatch('collect_freshness', 'dbt')(source, loaded_at_field, filter))}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__collect_freshness"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.212715, "supported_languages": null}, "macro.dbt.default__collect_freshness": {"name": "default__collect_freshness", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/adapters/freshness.sql", "original_file_path": "macros/adapters/freshness.sql", "unique_id": "macro.dbt.default__collect_freshness", "macro_sql": "{% macro default__collect_freshness(source, loaded_at_field, filter) %}\n {% call statement('collect_freshness', fetch_result=True, auto_begin=False) -%}\n select\n max({{ loaded_at_field }}) as max_loaded_at,\n {{ current_timestamp() }} as snapshotted_at\n from {{ source }}\n {% if filter %}\n where {{ filter }}\n {% endif %}\n {% endcall %}\n {{ return(load_result('collect_freshness')) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement", "macro.dbt.current_timestamp"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.213162, "supported_languages": null}, "macro.dbt.validate_sql": {"name": "validate_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/validate_sql.sql", "original_file_path": "macros/adapters/validate_sql.sql", "unique_id": "macro.dbt.validate_sql", "macro_sql": "{% macro validate_sql(sql) -%}\n {{ return(adapter.dispatch('validate_sql', 'dbt')(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__validate_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.213469, "supported_languages": null}, "macro.dbt.default__validate_sql": {"name": "default__validate_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/validate_sql.sql", "original_file_path": "macros/adapters/validate_sql.sql", "unique_id": "macro.dbt.default__validate_sql", "macro_sql": "{% macro default__validate_sql(sql) -%}\n {% call statement('validate_sql') -%}\n explain {{ sql }}\n {% endcall %}\n {{ return(load_result('validate_sql')) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, 
"node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2137148, "supported_languages": null}, "macro.dbt.copy_grants": {"name": "copy_grants", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.copy_grants", "macro_sql": "{% macro copy_grants() %}\n {{ return(adapter.dispatch('copy_grants', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__copy_grants"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2155678, "supported_languages": null}, "macro.dbt.default__copy_grants": {"name": "default__copy_grants", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__copy_grants", "macro_sql": "{% macro default__copy_grants() %}\n {{ return(True) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2156968, "supported_languages": null}, "macro.dbt.support_multiple_grantees_per_dcl_statement": {"name": "support_multiple_grantees_per_dcl_statement", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.support_multiple_grantees_per_dcl_statement", "macro_sql": "{% macro support_multiple_grantees_per_dcl_statement() %}\n {{ return(adapter.dispatch('support_multiple_grantees_per_dcl_statement', 'dbt')()) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__support_multiple_grantees_per_dcl_statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], 
"created_at": 1697065877.2158809, "supported_languages": null}, "macro.dbt.default__support_multiple_grantees_per_dcl_statement": {"name": "default__support_multiple_grantees_per_dcl_statement", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__support_multiple_grantees_per_dcl_statement", "macro_sql": "\n\n{%- macro default__support_multiple_grantees_per_dcl_statement() -%}\n {{ return(True) }}\n{%- endmacro -%}\n\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.216007, "supported_languages": null}, "macro.dbt.should_revoke": {"name": "should_revoke", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.should_revoke", "macro_sql": "{% macro should_revoke(existing_relation, full_refresh_mode=True) %}\n\n {% if not existing_relation %}\n {#-- The table doesn't already exist, so no grants to copy over --#}\n {{ return(False) }}\n {% elif full_refresh_mode %}\n {#-- The object is being REPLACED -- whether grants are copied over depends on the value of user config --#}\n {{ return(copy_grants()) }}\n {% else %}\n {#-- The table is being merged/upserted/inserted -- grants will be carried over --#}\n {{ return(True) }}\n {% endif %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.copy_grants"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.216375, "supported_languages": null}, "macro.dbt.get_show_grant_sql": {"name": "get_show_grant_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", 
"unique_id": "macro.dbt.get_show_grant_sql", "macro_sql": "{% macro get_show_grant_sql(relation) %}\n {{ return(adapter.dispatch(\"get_show_grant_sql\", \"dbt\")(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_show_grant_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.216575, "supported_languages": null}, "macro.dbt.default__get_show_grant_sql": {"name": "default__get_show_grant_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__get_show_grant_sql", "macro_sql": "{% macro default__get_show_grant_sql(relation) %}\n show grants on {{ relation }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2166889, "supported_languages": null}, "macro.dbt.get_grant_sql": {"name": "get_grant_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.get_grant_sql", "macro_sql": "{% macro get_grant_sql(relation, privilege, grantees) %}\n {{ return(adapter.dispatch('get_grant_sql', 'dbt')(relation, privilege, grantees)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_grant_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.216934, "supported_languages": null}, "macro.dbt.default__get_grant_sql": {"name": "default__get_grant_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__get_grant_sql", "macro_sql": "\n\n{%- 
macro default__get_grant_sql(relation, privilege, grantees) -%}\n grant {{ privilege }} on {{ relation }} to {{ grantees | join(', ') }}\n{%- endmacro -%}\n\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2358558, "supported_languages": null}, "macro.dbt.get_revoke_sql": {"name": "get_revoke_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.get_revoke_sql", "macro_sql": "{% macro get_revoke_sql(relation, privilege, grantees) %}\n {{ return(adapter.dispatch('get_revoke_sql', 'dbt')(relation, privilege, grantees)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_revoke_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.236195, "supported_languages": null}, "macro.dbt.default__get_revoke_sql": {"name": "default__get_revoke_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__get_revoke_sql", "macro_sql": "\n\n{%- macro default__get_revoke_sql(relation, privilege, grantees) -%}\n revoke {{ privilege }} on {{ relation }} from {{ grantees | join(', ') }}\n{%- endmacro -%}\n\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.238879, "supported_languages": null}, "macro.dbt.get_dcl_statement_list": {"name": "get_dcl_statement_list", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.get_dcl_statement_list", "macro_sql": "{% macro 
get_dcl_statement_list(relation, grant_config, get_dcl_macro) %}\n {{ return(adapter.dispatch('get_dcl_statement_list', 'dbt')(relation, grant_config, get_dcl_macro)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_dcl_statement_list"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.239446, "supported_languages": null}, "macro.dbt.default__get_dcl_statement_list": {"name": "default__get_dcl_statement_list", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__get_dcl_statement_list", "macro_sql": "\n\n{%- macro default__get_dcl_statement_list(relation, grant_config, get_dcl_macro) -%}\n {#\n -- Unpack grant_config into specific privileges and the set of users who need them granted/revoked.\n -- Depending on whether this database supports multiple grantees per statement, pass in the list of\n -- all grantees per privilege, or (if not) template one statement per privilege-grantee pair.\n -- `get_dcl_macro` will be either `get_grant_sql` or `get_revoke_sql`\n #}\n {%- set dcl_statements = [] -%}\n {%- for privilege, grantees in grant_config.items() %}\n {%- if support_multiple_grantees_per_dcl_statement() and grantees -%}\n {%- set dcl = get_dcl_macro(relation, privilege, grantees) -%}\n {%- do dcl_statements.append(dcl) -%}\n {%- else -%}\n {%- for grantee in grantees -%}\n {% set dcl = get_dcl_macro(relation, privilege, [grantee]) %}\n {%- do dcl_statements.append(dcl) -%}\n {% endfor -%}\n {%- endif -%}\n {%- endfor -%}\n {{ return(dcl_statements) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.support_multiple_grantees_per_dcl_statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.25297, "supported_languages": null}, 
"macro.dbt.call_dcl_statements": {"name": "call_dcl_statements", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.call_dcl_statements", "macro_sql": "{% macro call_dcl_statements(dcl_statement_list) %}\n {{ return(adapter.dispatch(\"call_dcl_statements\", \"dbt\")(dcl_statement_list)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__call_dcl_statements"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.253236, "supported_languages": null}, "macro.dbt.default__call_dcl_statements": {"name": "default__call_dcl_statements", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__call_dcl_statements", "macro_sql": "{% macro default__call_dcl_statements(dcl_statement_list) %}\n {#\n -- By default, supply all grant + revoke statements in a single semicolon-separated block,\n -- so that they're all processed together.\n\n -- Some databases do not support this. 
Those adapters will need to override this macro\n -- to run each statement individually.\n #}\n {% call statement('grants') %}\n {% for dcl_statement in dcl_statement_list %}\n {{ dcl_statement }};\n {% endfor %}\n {% endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2535172, "supported_languages": null}, "macro.dbt.apply_grants": {"name": "apply_grants", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.apply_grants", "macro_sql": "{% macro apply_grants(relation, grant_config, should_revoke) %}\n {{ return(adapter.dispatch(\"apply_grants\", \"dbt\")(relation, grant_config, should_revoke)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__apply_grants"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2537751, "supported_languages": null}, "macro.dbt.default__apply_grants": {"name": "default__apply_grants", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/apply_grants.sql", "original_file_path": "macros/adapters/apply_grants.sql", "unique_id": "macro.dbt.default__apply_grants", "macro_sql": "{% macro default__apply_grants(relation, grant_config, should_revoke=True) %}\n {#-- If grant_config is {} or None, this is a no-op --#}\n {% if grant_config %}\n {% if should_revoke %}\n {#-- We think previous grants may have carried over --#}\n {#-- Show current grants and calculate diffs --#}\n {% set current_grants_table = run_query(get_show_grant_sql(relation)) %}\n {% set current_grants_dict = adapter.standardize_grants_dict(current_grants_table) %}\n {% set needs_granting = diff_of_two_dicts(grant_config, current_grants_dict) %}\n {% set needs_revoking = 
diff_of_two_dicts(current_grants_dict, grant_config) %}\n {% if not (needs_granting or needs_revoking) %}\n {{ log('On ' ~ relation ~': All grants are in place, no revocation or granting needed.')}}\n {% endif %}\n {% else %}\n {#-- We don't think there's any chance of previous grants having carried over. --#}\n {#-- Jump straight to granting what the user has configured. --#}\n {% set needs_revoking = {} %}\n {% set needs_granting = grant_config %}\n {% endif %}\n {% if needs_granting or needs_revoking %}\n {% set revoke_statement_list = get_dcl_statement_list(relation, needs_revoking, get_revoke_sql) %}\n {% set grant_statement_list = get_dcl_statement_list(relation, needs_granting, get_grant_sql) %}\n {% set dcl_statement_list = revoke_statement_list + grant_statement_list %}\n {% if dcl_statement_list %}\n {{ call_dcl_statements(dcl_statement_list) }}\n {% endif %}\n {% endif %}\n {% endif %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query", "macro.dbt.get_show_grant_sql", "macro.dbt.get_dcl_statement_list", "macro.dbt.call_dcl_statements"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.255027, "supported_languages": null}, "macro.dbt.get_show_sql": {"name": "get_show_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/show.sql", "original_file_path": "macros/adapters/show.sql", "unique_id": "macro.dbt.get_show_sql", "macro_sql": "{% macro get_show_sql(compiled_code, sql_header, limit) -%}\n {%- if sql_header -%}\n {{ sql_header }}\n {%- endif -%}\n {%- if limit is not none -%}\n {{ get_limit_subquery_sql(compiled_code, limit) }}\n {%- else -%}\n {{ compiled_code }}\n {%- endif -%}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_limit_subquery_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2556338, 
"supported_languages": null}, "macro.dbt.get_limit_subquery_sql": {"name": "get_limit_subquery_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/show.sql", "original_file_path": "macros/adapters/show.sql", "unique_id": "macro.dbt.get_limit_subquery_sql", "macro_sql": "{% macro get_limit_subquery_sql(sql, limit) %}\n {{ adapter.dispatch('get_limit_subquery_sql', 'dbt')(sql, limit) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_limit_subquery_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2558432, "supported_languages": null}, "macro.dbt.default__get_limit_subquery_sql": {"name": "default__get_limit_subquery_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/show.sql", "original_file_path": "macros/adapters/show.sql", "unique_id": "macro.dbt.default__get_limit_subquery_sql", "macro_sql": "{% macro default__get_limit_subquery_sql(sql, limit) %}\n select *\n from (\n {{ sql }}\n ) as model_limit_subq\n limit {{ limit }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.255991, "supported_languages": null}, "macro.dbt.alter_column_comment": {"name": "alter_column_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.alter_column_comment", "macro_sql": "{% macro alter_column_comment(relation, column_dict) -%}\n {{ return(adapter.dispatch('alter_column_comment', 'dbt')(relation, column_dict)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__alter_column_comment"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.256733, 
"supported_languages": null}, "macro.dbt.default__alter_column_comment": {"name": "default__alter_column_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.default__alter_column_comment", "macro_sql": "{% macro default__alter_column_comment(relation, column_dict) -%}\n {{ exceptions.raise_not_implemented(\n 'alter_column_comment macro not implemented for adapter '+adapter.type()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.256926, "supported_languages": null}, "macro.dbt.alter_relation_comment": {"name": "alter_relation_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.alter_relation_comment", "macro_sql": "{% macro alter_relation_comment(relation, relation_comment) -%}\n {{ return(adapter.dispatch('alter_relation_comment', 'dbt')(relation, relation_comment)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__alter_relation_comment"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.257153, "supported_languages": null}, "macro.dbt.default__alter_relation_comment": {"name": "default__alter_relation_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.default__alter_relation_comment", "macro_sql": "{% macro default__alter_relation_comment(relation, relation_comment) -%}\n {{ exceptions.raise_not_implemented(\n 'alter_relation_comment macro not implemented for adapter '+adapter.type()) }}\n{% endmacro %}", "depends_on": {"macros": 
[]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2573419, "supported_languages": null}, "macro.dbt.persist_docs": {"name": "persist_docs", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.persist_docs", "macro_sql": "{% macro persist_docs(relation, model, for_relation=true, for_columns=true) -%}\n {{ return(adapter.dispatch('persist_docs', 'dbt')(relation, model, for_relation, for_columns)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__persist_docs"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.257635, "supported_languages": null}, "macro.dbt.default__persist_docs": {"name": "default__persist_docs", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/persist_docs.sql", "original_file_path": "macros/adapters/persist_docs.sql", "unique_id": "macro.dbt.default__persist_docs", "macro_sql": "{% macro default__persist_docs(relation, model, for_relation, for_columns) -%}\n {% if for_relation and config.persist_relation_docs() and model.description %}\n {% do run_query(alter_relation_comment(relation, model.description)) %}\n {% endif %}\n\n {% if for_columns and config.persist_column_docs() and model.columns %}\n {% do run_query(alter_column_comment(relation, model.columns)) %}\n {% endif %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query", "macro.dbt.alter_relation_comment", "macro.dbt.alter_column_comment"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.258165, "supported_languages": null}, "macro.dbt.get_catalog": {"name": "get_catalog", "resource_type": "macro", "package_name": "dbt", "path": 
"macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.get_catalog", "macro_sql": "{% macro get_catalog(information_schema, schemas) -%}\n {{ return(adapter.dispatch('get_catalog', 'dbt')(information_schema, schemas)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_catalog"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.259836, "supported_languages": null}, "macro.dbt.default__get_catalog": {"name": "default__get_catalog", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__get_catalog", "macro_sql": "{% macro default__get_catalog(information_schema, schemas) -%}\n\n {% set typename = adapter.type() %}\n {% set msg -%}\n get_catalog not implemented for {{ typename }}\n {%- endset %}\n\n {{ exceptions.raise_compiler_error(msg) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2601228, "supported_languages": null}, "macro.dbt.information_schema_name": {"name": "information_schema_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.information_schema_name", "macro_sql": "{% macro information_schema_name(database) %}\n {{ return(adapter.dispatch('information_schema_name', 'dbt')(database)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__information_schema_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.26033, "supported_languages": null}, "macro.dbt.default__information_schema_name": {"name": 
"default__information_schema_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__information_schema_name", "macro_sql": "{% macro default__information_schema_name(database) -%}\n {%- if database -%}\n {{ database }}.INFORMATION_SCHEMA\n {%- else -%}\n INFORMATION_SCHEMA\n {%- endif -%}\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.260512, "supported_languages": null}, "macro.dbt.list_schemas": {"name": "list_schemas", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.list_schemas", "macro_sql": "{% macro list_schemas(database) -%}\n {{ return(adapter.dispatch('list_schemas', 'dbt')(database)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__list_schemas"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.261281, "supported_languages": null}, "macro.dbt.default__list_schemas": {"name": "default__list_schemas", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__list_schemas", "macro_sql": "{% macro default__list_schemas(database) -%}\n {% set sql %}\n select distinct schema_name\n from {{ information_schema_name(database) }}.SCHEMATA\n where catalog_name ilike '{{ database }}'\n {% endset %}\n {{ return(run_query(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.information_schema_name", "macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 
1697065877.263762, "supported_languages": null}, "macro.dbt.check_schema_exists": {"name": "check_schema_exists", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.check_schema_exists", "macro_sql": "{% macro check_schema_exists(information_schema, schema) -%}\n {{ return(adapter.dispatch('check_schema_exists', 'dbt')(information_schema, schema)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__check_schema_exists"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.268571, "supported_languages": null}, "macro.dbt.default__check_schema_exists": {"name": "default__check_schema_exists", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__check_schema_exists", "macro_sql": "{% macro default__check_schema_exists(information_schema, schema) -%}\n {% set sql -%}\n select count(*)\n from {{ information_schema.replace(information_schema_view='SCHEMATA') }}\n where catalog_name='{{ information_schema.database }}'\n and schema_name='{{ schema }}'\n {%- endset %}\n {{ return(run_query(sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.replace", "macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2850618, "supported_languages": null}, "macro.dbt.list_relations_without_caching": {"name": "list_relations_without_caching", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.list_relations_without_caching", "macro_sql": "{% macro list_relations_without_caching(schema_relation) %}\n {{ 
return(adapter.dispatch('list_relations_without_caching', 'dbt')(schema_relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__list_relations_without_caching"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2853088, "supported_languages": null}, "macro.dbt.default__list_relations_without_caching": {"name": "default__list_relations_without_caching", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/metadata.sql", "original_file_path": "macros/adapters/metadata.sql", "unique_id": "macro.dbt.default__list_relations_without_caching", "macro_sql": "{% macro default__list_relations_without_caching(schema_relation) %}\n {{ exceptions.raise_not_implemented(\n 'list_relations_without_caching macro not implemented for adapter '+adapter.type()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2854989, "supported_languages": null}, "macro.dbt.drop_relation": {"name": "drop_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.drop_relation", "macro_sql": "{% macro drop_relation(relation) -%}\n {{ return(adapter.dispatch('drop_relation', 'dbt')(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__drop_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.286558, "supported_languages": null}, "macro.dbt.default__drop_relation": {"name": "default__drop_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": 
"macro.dbt.default__drop_relation", "macro_sql": "{% macro default__drop_relation(relation) -%}\n {% call statement('drop_relation', auto_begin=False) -%}\n {%- if relation.is_table -%}\n {{- drop_table(relation) -}}\n {%- elif relation.is_view -%}\n {{- drop_view(relation) -}}\n {%- elif relation.is_materialized_view -%}\n {{- drop_materialized_view(relation) -}}\n {%- else -%}\n drop {{ relation.type }} if exists {{ relation }} cascade\n {%- endif -%}\n {%- endcall %}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement", "macro.dbt.drop_table", "macro.dbt.drop_view", "macro.dbt.drop_materialized_view"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.287057, "supported_languages": null}, "macro.dbt.drop_table": {"name": "drop_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.drop_table", "macro_sql": "{% macro drop_table(relation) -%}\n {{ return(adapter.dispatch('drop_table', 'dbt')(relation)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__drop_table"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2872589, "supported_languages": null}, "macro.dbt.default__drop_table": {"name": "default__drop_table", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.default__drop_table", "macro_sql": "{% macro default__drop_table(relation) -%}\n drop table if exists {{ relation }} cascade\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2873752, "supported_languages": null}, 
"macro.dbt.drop_view": {"name": "drop_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.drop_view", "macro_sql": "{% macro drop_view(relation) -%}\n {{ return(adapter.dispatch('drop_view', 'dbt')(relation)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__drop_view"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.287576, "supported_languages": null}, "macro.dbt.default__drop_view": {"name": "default__drop_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.default__drop_view", "macro_sql": "{% macro default__drop_view(relation) -%}\n drop view if exists {{ relation }} cascade\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.287691, "supported_languages": null}, "macro.dbt.drop_materialized_view": {"name": "drop_materialized_view", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.drop_materialized_view", "macro_sql": "{% macro drop_materialized_view(relation) -%}\n {{ return(adapter.dispatch('drop_materialized_view', 'dbt')(relation)) }}\n{%- endmacro %}", "depends_on": {"macros": ["macro.dbt.default__drop_materialized_view"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.287882, "supported_languages": null}, "macro.dbt.default__drop_materialized_view": {"name": "default__drop_materialized_view", "resource_type": "macro", "package_name": "dbt", 
"path": "macros/adapters/drop_relation.sql", "original_file_path": "macros/adapters/drop_relation.sql", "unique_id": "macro.dbt.default__drop_materialized_view", "macro_sql": "{% macro default__drop_materialized_view(relation) -%}\n drop materialized view if exists {{ relation }} cascade\n{%- endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.287997, "supported_languages": null}, "macro.dbt.get_columns_in_relation": {"name": "get_columns_in_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_columns_in_relation", "macro_sql": "{% macro get_columns_in_relation(relation) -%}\n {{ return(adapter.dispatch('get_columns_in_relation', 'dbt')(relation)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__get_columns_in_relation"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2935228, "supported_languages": null}, "macro.dbt.default__get_columns_in_relation": {"name": "default__get_columns_in_relation", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__get_columns_in_relation", "macro_sql": "{% macro default__get_columns_in_relation(relation) -%}\n {{ exceptions.raise_not_implemented(\n 'get_columns_in_relation macro not implemented for adapter '+adapter.type()) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.2937138, "supported_languages": null}, "macro.dbt.sql_convert_columns_in_relation": {"name": "sql_convert_columns_in_relation", 
"resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.sql_convert_columns_in_relation", "macro_sql": "{% macro sql_convert_columns_in_relation(table) -%}\n {% set columns = [] %}\n {% for row in table %}\n {% do columns.append(api.Column(*row)) %}\n {% endfor %}\n {{ return(columns) }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.294059, "supported_languages": null}, "macro.dbt.get_empty_subquery_sql": {"name": "get_empty_subquery_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_empty_subquery_sql", "macro_sql": "{% macro get_empty_subquery_sql(select_sql, select_sql_header=none) -%}\n {{ return(adapter.dispatch('get_empty_subquery_sql', 'dbt')(select_sql, select_sql_header)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_empty_subquery_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.294305, "supported_languages": null}, "macro.dbt.default__get_empty_subquery_sql": {"name": "default__get_empty_subquery_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__get_empty_subquery_sql", "macro_sql": "{% macro default__get_empty_subquery_sql(select_sql, select_sql_header=none) %}\n {%- if select_sql_header is not none -%}\n {{ select_sql_header }}\n {%- endif -%}\n select * from (\n {{ select_sql }}\n ) as __dbt_sbq\n where false\n limit 0\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, 
"patch_path": null, "arguments": [], "created_at": 1697065877.294533, "supported_languages": null}, "macro.dbt.get_empty_schema_sql": {"name": "get_empty_schema_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_empty_schema_sql", "macro_sql": "{% macro get_empty_schema_sql(columns) -%}\n {{ return(adapter.dispatch('get_empty_schema_sql', 'dbt')(columns)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_empty_schema_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.314576, "supported_languages": null}, "macro.dbt.default__get_empty_schema_sql": {"name": "default__get_empty_schema_sql", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__get_empty_schema_sql", "macro_sql": "{% macro default__get_empty_schema_sql(columns) %}\n {%- set col_err = [] -%}\n select\n {% for i in columns %}\n {%- set col = columns[i] -%}\n {%- if col['data_type'] is not defined -%}\n {{ col_err.append(col['name']) }}\n {%- endif -%}\n {% set col_name = adapter.quote(col['name']) if col.get('quote') else col['name'] %}\n cast(null as {{ col['data_type'] }}) as {{ col_name }}{{ \", \" if not loop.last }}\n {%- endfor -%}\n {%- if (col_err | length) > 0 -%}\n {{ exceptions.column_type_missing(column_names=col_err) }}\n {%- endif -%}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.32949, "supported_languages": null}, "macro.dbt.get_column_schema_from_query": {"name": "get_column_schema_from_query", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", 
"original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_column_schema_from_query", "macro_sql": "{% macro get_column_schema_from_query(select_sql, select_sql_header=none) -%}\n {% set columns = [] %}\n {# -- Using an 'empty subquery' here to get the same schema as the given select_sql statement, without necessitating a data scan.#}\n {% set sql = get_empty_subquery_sql(select_sql, select_sql_header) %}\n {% set column_schema = adapter.get_column_schema_from_query(sql) %}\n {{ return(column_schema) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.get_empty_subquery_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.329921, "supported_languages": null}, "macro.dbt.get_columns_in_query": {"name": "get_columns_in_query", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.get_columns_in_query", "macro_sql": "{% macro get_columns_in_query(select_sql) -%}\n {{ return(adapter.dispatch('get_columns_in_query', 'dbt')(select_sql)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__get_columns_in_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.330139, "supported_languages": null}, "macro.dbt.default__get_columns_in_query": {"name": "default__get_columns_in_query", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__get_columns_in_query", "macro_sql": "{% macro default__get_columns_in_query(select_sql) %}\n {% call statement('get_columns_in_query', fetch_result=True, auto_begin=False) -%}\n {{ get_empty_subquery_sql(select_sql) }}\n {% endcall %}\n {{ return(load_result('get_columns_in_query').table.columns 
| map(attribute='name') | list) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement", "macro.dbt.get_empty_subquery_sql"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.330535, "supported_languages": null}, "macro.dbt.alter_column_type": {"name": "alter_column_type", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.alter_column_type", "macro_sql": "{% macro alter_column_type(relation, column_name, new_column_type) -%}\n {{ return(adapter.dispatch('alter_column_type', 'dbt')(relation, column_name, new_column_type)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__alter_column_type"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.330796, "supported_languages": null}, "macro.dbt.default__alter_column_type": {"name": "default__alter_column_type", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__alter_column_type", "macro_sql": "{% macro default__alter_column_type(relation, column_name, new_column_type) -%}\n {#\n 1. Create a new column (w/ temp name and correct type)\n 2. Copy data over to it\n 3. Drop the existing column (cascade!)\n 4. 
Rename the new column to existing column\n #}\n {%- set tmp_column = column_name + \"__dbt_alter\" -%}\n\n {% call statement('alter_column_type') %}\n alter table {{ relation }} add column {{ adapter.quote(tmp_column) }} {{ new_column_type }};\n update {{ relation }} set {{ adapter.quote(tmp_column) }} = {{ adapter.quote(column_name) }};\n alter table {{ relation }} drop column {{ adapter.quote(column_name) }} cascade;\n alter table {{ relation }} rename column {{ adapter.quote(tmp_column) }} to {{ adapter.quote(column_name) }}\n {% endcall %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.statement"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.331477, "supported_languages": null}, "macro.dbt.alter_relation_add_remove_columns": {"name": "alter_relation_add_remove_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.alter_relation_add_remove_columns", "macro_sql": "{% macro alter_relation_add_remove_columns(relation, add_columns = none, remove_columns = none) -%}\n {{ return(adapter.dispatch('alter_relation_add_remove_columns', 'dbt')(relation, add_columns, remove_columns)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt_duckdb.duckdb__alter_relation_add_remove_columns"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.331763, "supported_languages": null}, "macro.dbt.default__alter_relation_add_remove_columns": {"name": "default__alter_relation_add_remove_columns", "resource_type": "macro", "package_name": "dbt", "path": "macros/adapters/columns.sql", "original_file_path": "macros/adapters/columns.sql", "unique_id": "macro.dbt.default__alter_relation_add_remove_columns", "macro_sql": "{% macro default__alter_relation_add_remove_columns(relation, 
add_columns, remove_columns) %}\n\n {% if add_columns is none %}\n {% set add_columns = [] %}\n {% endif %}\n {% if remove_columns is none %}\n {% set remove_columns = [] %}\n {% endif %}\n\n {% set sql -%}\n\n alter {{ relation.type }} {{ relation }}\n\n {% for column in add_columns %}\n add column {{ column.name }} {{ column.data_type }}{{ ',' if not loop.last }}\n {% endfor %}{{ ',' if add_columns and remove_columns }}\n\n {% for column in remove_columns %}\n drop column {{ column.name }}{{ ',' if not loop.last }}\n {% endfor %}\n\n {%- endset -%}\n\n {% do run_query(sql) %}\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.run_query"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.33264, "supported_languages": null}, "macro.dbt.resolve_model_name": {"name": "resolve_model_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.resolve_model_name", "macro_sql": "{% macro resolve_model_name(input_model_name) %}\n {{ return(adapter.dispatch('resolve_model_name', 'dbt')(input_model_name)) }}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.default__resolve_model_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.335671, "supported_languages": null}, "macro.dbt.default__resolve_model_name": {"name": "default__resolve_model_name", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.default__resolve_model_name", "macro_sql": "\n\n{%- macro default__resolve_model_name(input_model_name) -%}\n {{ input_model_name | string | replace('\"', '\\\"') }}\n{%- endmacro -%}\n\n", "depends_on": {"macros": []}, "description": "", "meta": {}, 
"docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.335859, "supported_languages": null}, "macro.dbt.build_ref_function": {"name": "build_ref_function", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.build_ref_function", "macro_sql": "{% macro build_ref_function(model) %}\n\n {%- set ref_dict = {} -%}\n {%- for _ref in model.refs -%}\n {% set _ref_args = [_ref.get('package'), _ref['name']] if _ref.get('package') else [_ref['name'],] %}\n {%- set resolved = ref(*_ref_args, v=_ref.get('version')) -%}\n {%- if _ref.get('version') -%}\n {% do _ref_args.extend([\"v\" ~ _ref['version']]) %}\n {%- endif -%}\n {%- do ref_dict.update({_ref_args | join('.'): resolve_model_name(resolved)}) -%}\n {%- endfor -%}\n\ndef ref(*args, **kwargs):\n refs = {{ ref_dict | tojson }}\n key = '.'.join(args)\n version = kwargs.get(\"v\") or kwargs.get(\"version\")\n if version:\n key += f\".v{version}\"\n dbt_load_df_function = kwargs.get(\"dbt_load_df_function\")\n return dbt_load_df_function(refs[key])\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.resolve_model_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.336771, "supported_languages": null}, "macro.dbt.build_source_function": {"name": "build_source_function", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.build_source_function", "macro_sql": "{% macro build_source_function(model) %}\n\n {%- set source_dict = {} -%}\n {%- for _source in model.sources -%}\n {%- set resolved = source(*_source) -%}\n {%- do source_dict.update({_source | join('.'): resolve_model_name(resolved)}) -%}\n {%- endfor -%}\n\ndef source(*args, 
dbt_load_df_function):\n sources = {{ source_dict | tojson }}\n key = '.'.join(args)\n return dbt_load_df_function(sources[key])\n\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.resolve_model_name"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.337225, "supported_languages": null}, "macro.dbt.build_config_dict": {"name": "build_config_dict", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.build_config_dict", "macro_sql": "{% macro build_config_dict(model) %}\n {%- set config_dict = {} -%}\n {% set config_dbt_used = zip(model.config.config_keys_used, model.config.config_keys_defaults) | list %}\n {%- for key, default in config_dbt_used -%}\n {# weird type testing with enum, would be much easier to write this logic in Python! #}\n {%- if key == \"language\" -%}\n {%- set value = \"python\" -%}\n {%- endif -%}\n {%- set value = model.config.get(key, default) -%}\n {%- do config_dict.update({key: value}) -%}\n {%- endfor -%}\nconfig_dict = {{ config_dict }}\n{% endmacro %}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.3378592, "supported_languages": null}, "macro.dbt.py_script_postfix": {"name": "py_script_postfix", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.py_script_postfix", "macro_sql": "{% macro py_script_postfix(model) %}\n# This part is user provided model code\n# you will need to copy the next section to run the code\n# COMMAND ----------\n# this part is dbt logic for get ref work, do not modify\n\n{{ build_ref_function(model ) }}\n{{ build_source_function(model ) }}\n{{ build_config_dict(model) 
}}\n\nclass config:\n def __init__(self, *args, **kwargs):\n pass\n\n @staticmethod\n def get(key, default=None):\n return config_dict.get(key, default)\n\nclass this:\n \"\"\"dbt.this() or dbt.this.identifier\"\"\"\n database = \"{{ this.database }}\"\n schema = \"{{ this.schema }}\"\n identifier = \"{{ this.identifier }}\"\n {% set this_relation_name = resolve_model_name(this) %}\n def __repr__(self):\n return '{{ this_relation_name }}'\n\n\nclass dbtObj:\n def __init__(self, load_df_function) -> None:\n self.source = lambda *args: source(*args, dbt_load_df_function=load_df_function)\n self.ref = lambda *args, **kwargs: ref(*args, **kwargs, dbt_load_df_function=load_df_function)\n self.config = config\n self.this = this()\n self.is_incremental = {{ is_incremental() }}\n\n# COMMAND ----------\n{{py_script_comment()}}\n{% endmacro %}", "depends_on": {"macros": ["macro.dbt.build_ref_function", "macro.dbt.build_source_function", "macro.dbt.build_config_dict", "macro.dbt.resolve_model_name", "macro.dbt.is_incremental", "macro.dbt.py_script_comment"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.33838, "supported_languages": null}, "macro.dbt.py_script_comment": {"name": "py_script_comment", "resource_type": "macro", "package_name": "dbt", "path": "macros/python_model/python.sql", "original_file_path": "macros/python_model/python.sql", "unique_id": "macro.dbt.py_script_comment", "macro_sql": "{%macro py_script_comment()%}\n{%endmacro%}", "depends_on": {"macros": []}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.3384619, "supported_languages": null}, "macro.dbt.test_unique": {"name": "test_unique", "resource_type": "macro", "package_name": "dbt", "path": "tests/generic/builtin.sql", "original_file_path": "tests/generic/builtin.sql", "unique_id": "macro.dbt.test_unique", "macro_sql": "{% 
test unique(model, column_name) %}\n {% set macro = adapter.dispatch('test_unique', 'dbt') %}\n {{ macro(model, column_name) }}\n{% endtest %}", "depends_on": {"macros": ["macro.dbt.default__test_unique"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.339039, "supported_languages": null}, "macro.dbt.test_not_null": {"name": "test_not_null", "resource_type": "macro", "package_name": "dbt", "path": "tests/generic/builtin.sql", "original_file_path": "tests/generic/builtin.sql", "unique_id": "macro.dbt.test_not_null", "macro_sql": "{% test not_null(model, column_name) %}\n {% set macro = adapter.dispatch('test_not_null', 'dbt') %}\n {{ macro(model, column_name) }}\n{% endtest %}", "depends_on": {"macros": ["macro.dbt.default__test_not_null"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.3392918, "supported_languages": null}, "macro.dbt.test_accepted_values": {"name": "test_accepted_values", "resource_type": "macro", "package_name": "dbt", "path": "tests/generic/builtin.sql", "original_file_path": "tests/generic/builtin.sql", "unique_id": "macro.dbt.test_accepted_values", "macro_sql": "{% test accepted_values(model, column_name, values, quote=True) %}\n {% set macro = adapter.dispatch('test_accepted_values', 'dbt') %}\n {{ macro(model, column_name, values, quote) }}\n{% endtest %}", "depends_on": {"macros": ["macro.dbt.default__test_accepted_values"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.339603, "supported_languages": null}, "macro.dbt.test_relationships": {"name": "test_relationships", "resource_type": "macro", "package_name": "dbt", "path": "tests/generic/builtin.sql", "original_file_path": "tests/generic/builtin.sql", "unique_id": "macro.dbt.test_relationships", "macro_sql": "{% test 
relationships(model, column_name, to, field) %}\n {% set macro = adapter.dispatch('test_relationships', 'dbt') %}\n {{ macro(model, column_name, to, field) }}\n{% endtest %}", "depends_on": {"macros": ["macro.dbt.default__test_relationships"]}, "description": "", "meta": {}, "docs": {"show": true, "node_color": null}, "patch_path": null, "arguments": [], "created_at": 1697065877.3400161, "supported_languages": null}}, "docs": {"doc.dbt.__overview__": {"name": "__overview__", "resource_type": "doc", "package_name": "dbt", "path": "overview.md", "original_file_path": "docs/overview.md", "unique_id": "doc.dbt.__overview__", "block_contents": "### Welcome!\n\nWelcome to the auto-generated documentation for your dbt project!\n\n### Navigation\n\nYou can use the `Project` and `Database` navigation tabs on the left side of the window to explore the models\nin your project.\n\n#### Project Tab\nThe `Project` tab mirrors the directory structure of your dbt project. In this tab, you can see all of the\nmodels defined in your dbt project, as well as models imported from dbt packages.\n\n#### Database Tab\nThe `Database` tab also exposes your models, but in a format that looks more like a database explorer. This view\nshows relations (tables and views) grouped into database schemas. Note that ephemeral models are _not_ shown\nin this interface, as they do not exist in the database.\n\n### Graph Exploration\nYou can click the blue icon on the bottom-right corner of the page to view the lineage graph of your models.\n\nOn model pages, you'll see the immediate parents and children of the model you're exploring. By clicking the `Expand`\nbutton at the top-right of this lineage pane, you'll be able to see all of the models that are used to build,\nor are built from, the model you're exploring.\n\nOnce expanded, you'll be able to use the `--select` and `--exclude` model selection syntax to filter the\nmodels in the graph. 
For more information on model selection, check out the [dbt docs](https://docs.getdbt.com/docs/model-selection-syntax).\n\nNote that you can also right-click on models to interactively filter and explore the graph.\n\n---\n\n### More information\n\n- [What is dbt](https://docs.getdbt.com/docs/introduction)?\n- Read the [dbt viewpoint](https://docs.getdbt.com/docs/viewpoint)\n- [Installation](https://docs.getdbt.com/docs/installation)\n- Join the [dbt Community](https://www.getdbt.com/community/) for questions and discussion"}}, "exposures": {}, "metrics": {}, "groups": {}, "selectors": {}, "disabled": {}, "parent_map": {"model.hacker_news_dbt.comment_daily_stats": ["source.hacker_news_dbt.core.comments"], "model.hacker_news_dbt.activity_daily_stats": ["model.hacker_news_dbt.comment_daily_stats", "model.hacker_news_dbt.story_daily_stats"], "model.hacker_news_dbt.story_daily_stats": ["source.hacker_news_dbt.core.stories"], "test.hacker_news_dbt.assert_true": [], "seed.hacker_news_dbt.full_sample": [], "source.hacker_news_dbt.core.comments": [], "source.hacker_news_dbt.core.stories": []}, "child_map": {"model.hacker_news_dbt.comment_daily_stats": ["model.hacker_news_dbt.activity_daily_stats"], "model.hacker_news_dbt.activity_daily_stats": [], "model.hacker_news_dbt.story_daily_stats": ["model.hacker_news_dbt.activity_daily_stats"], "test.hacker_news_dbt.assert_true": [], "seed.hacker_news_dbt.full_sample": [], "source.hacker_news_dbt.core.comments": ["model.hacker_news_dbt.comment_daily_stats"], "source.hacker_news_dbt.core.stories": ["model.hacker_news_dbt.story_daily_stats"]}, "group_map": {}, "semantic_models": {}} \ No newline at end of file